Line | Count | Source
1 | | /* Handling of recursive HTTP retrieving. |
2 | | Copyright (C) 1996-2012, 2015, 2018-2024 Free Software Foundation, |
3 | | Inc. |
4 | | |
5 | | This file is part of GNU Wget. |
6 | | |
7 | | GNU Wget is free software; you can redistribute it and/or modify |
8 | | it under the terms of the GNU General Public License as published by |
9 | | the Free Software Foundation; either version 3 of the License, or |
10 | | (at your option) any later version. |
11 | | |
12 | | GNU Wget is distributed in the hope that it will be useful, |
13 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | GNU General Public License for more details. |
16 | | |
17 | | You should have received a copy of the GNU General Public License |
18 | | along with Wget. If not, see <http://www.gnu.org/licenses/>. |
19 | | |
20 | | Additional permission under GNU GPL version 3 section 7 |
21 | | |
22 | | If you modify this program, or any covered work, by linking or |
23 | | combining it with the OpenSSL project's OpenSSL library (or a |
24 | | modified version of that library), containing parts covered by the |
25 | | terms of the OpenSSL or SSLeay licenses, the Free Software Foundation |
26 | | grants you additional permission to convey the resulting work. |
27 | | Corresponding Source for a non-source form of such a combination |
28 | | shall include the source code for the parts of OpenSSL used as well |
29 | | as that of the covered work. */ |
30 | | |
31 | | #include "wget.h" |
32 | | |
33 | | #include <stdio.h> |
34 | | #include <stdlib.h> |
35 | | #include <string.h> |
36 | | #include <unistd.h> |
37 | | #include <errno.h> |
38 | | #include <assert.h> |
39 | | |
40 | | #include "url.h" |
41 | | #include "recur.h" |
42 | | #include "utils.h" |
43 | | #include "retr.h" |
44 | | #include "ftp.h" |
45 | | #include "host.h" |
46 | | #include "hash.h" |
47 | | #include "res.h" |
48 | | #include "convert.h" |
49 | | #include "html-url.h" |
50 | | #include "css-url.h" |
51 | | #include "spider.h" |
52 | | #include "exits.h" |
53 | | |
54 | | /* Functions for maintaining the URL queue. */ |
55 | | |
56 | | struct queue_element { |
57 | | const char *url; /* the URL to download */ |
58 | | const char *referer; /* the referring document */ |
59 | | int depth; /* the depth */ |
60 | | bool html_allowed; /* whether the document is allowed to |
61 | | be treated as HTML. */ |
62 | | struct iri *iri; /* IRI (encoding) information for this URL */
63 | | bool css_allowed; /* whether the document is allowed to |
64 | | be treated as CSS. */ |
65 | | struct queue_element *next; /* next element in queue */ |
66 | | }; |
67 | | |
68 | | struct url_queue { |
69 | | struct queue_element *head; |
70 | | struct queue_element *tail; |
71 | | int count, maxcount; |
72 | | }; |
73 | | |
74 | | /* Create a URL queue. */ |
75 | | |
76 | | static struct url_queue * |
77 | | url_queue_new (void) |
78 | 0 | { |
79 | 0 | struct url_queue *queue = xnew0 (struct url_queue); |
80 | 0 | return queue; |
81 | 0 | } |
82 | | |
83 | | /* Delete a URL queue. */ |
84 | | |
85 | | static void |
86 | | url_queue_delete (struct url_queue *queue) |
87 | 0 | { |
88 | 0 | xfree (queue); |
89 | 0 | } |
90 | | |
91 | | /* Enqueue a URL in the queue. The queue is FIFO: the items will be |
92 | | retrieved ("dequeued") from the queue in the order they were placed |
93 | | into it. */ |
94 | | |
95 | | static void |
96 | | url_enqueue (struct url_queue *queue, struct iri *i, |
97 | | const char *url, const char *referer, int depth, |
98 | | bool html_allowed, bool css_allowed) |
99 | 0 | { |
100 | 0 | struct queue_element *qel = xnew (struct queue_element); |
101 | 0 | qel->iri = i; |
102 | 0 | qel->url = url; |
103 | 0 | qel->referer = referer; |
104 | 0 | qel->depth = depth; |
105 | 0 | qel->html_allowed = html_allowed; |
106 | 0 | qel->css_allowed = css_allowed; |
107 | 0 | qel->next = NULL; |
108 | |
109 | 0 | ++queue->count; |
110 | 0 | if (queue->count > queue->maxcount) |
111 | 0 | queue->maxcount = queue->count; |
112 | |
113 | 0 | DEBUGP (("Enqueuing %s at depth %d\n", |
114 | 0 | quotearg_n_style (0, escape_quoting_style, url), depth)); |
115 | 0 | DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); |
116 | |
117 | 0 | if (i) |
118 | 0 | DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url), |
119 | 0 | i->uri_encoding ? quote_n (1, i->uri_encoding) : "None")); |
120 | |
121 | 0 | if (queue->tail) |
122 | 0 | queue->tail->next = qel; |
123 | 0 | queue->tail = qel; |
124 | |
125 | 0 | if (!queue->head) |
126 | 0 | queue->head = queue->tail; |
127 | 0 | } |
128 | | |
129 | | /* Take a URL out of the queue. Return true if this operation |
130 | | succeeded, or false if the queue is empty. */ |
131 | | |
132 | | static bool |
133 | | url_dequeue (struct url_queue *queue, struct iri **i, |
134 | | const char **url, const char **referer, int *depth, |
135 | | bool *html_allowed, bool *css_allowed) |
136 | 0 | { |
137 | 0 | struct queue_element *qel = queue->head; |
138 | |
139 | 0 | if (!qel) |
140 | 0 | return false; |
141 | | |
142 | 0 | queue->head = queue->head->next; |
143 | 0 | if (!queue->head) |
144 | 0 | queue->tail = NULL; |
145 | |
146 | 0 | *i = qel->iri; |
147 | 0 | *url = qel->url; |
148 | 0 | *referer = qel->referer; |
149 | 0 | *depth = qel->depth; |
150 | 0 | *html_allowed = qel->html_allowed; |
151 | 0 | *css_allowed = qel->css_allowed; |
152 | |
153 | 0 | --queue->count; |
154 | |
155 | 0 | DEBUGP (("Dequeuing %s at depth %d\n", |
156 | 0 | quotearg_n_style (0, escape_quoting_style, qel->url), qel->depth)); |
157 | 0 | DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount)); |
158 | |
159 | 0 | xfree (qel); |
160 | 0 | return true; |
161 | 0 | } |
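| |
| | /* A self-contained sketch of the FIFO discipline implemented by
| |    url_enqueue/url_dequeue above, using plain malloc/free/strdup instead of
| |    Wget's xnew/xstrdup/xfree wrappers so it compiles on its own.  It is an
| |    illustration of the queue logic only, not part of recur.c. */
| |
| | #include <stdio.h>
| | #include <stdlib.h>
| | #include <string.h>
| |
| | struct node { char *url; int depth; struct node *next; };
| | struct fifo { struct node *head, *tail; };
| |
| | static void
| | fifo_push (struct fifo *q, const char *url, int depth)
| | {
| |   struct node *n = malloc (sizeof *n);
| |   n->url = strdup (url);
| |   n->depth = depth;
| |   n->next = NULL;
| |   if (q->tail)                  /* append at the tail... */
| |     q->tail->next = n;
| |   q->tail = n;
| |   if (!q->head)
| |     q->head = q->tail;
| | }
| |
| | static int
| | fifo_pop (struct fifo *q, char **url, int *depth)
| | {
| |   struct node *n = q->head;     /* ...remove from the head (FIFO order) */
| |   if (!n)
| |     return 0;
| |   q->head = n->next;
| |   if (!q->head)
| |     q->tail = NULL;
| |   *url = n->url;
| |   *depth = n->depth;
| |   free (n);
| |   return 1;
| | }
| |
| | int
| | main (void)
| | {
| |   struct fifo q = { NULL, NULL };
| |   char *url;
| |   int depth;
| |
| |   fifo_push (&q, "http://example.com/", 0);
| |   fifo_push (&q, "http://example.com/a.html", 1);
| |   while (fifo_pop (&q, &url, &depth))   /* dequeued in enqueue order */
| |     {
| |       printf ("%s (depth %d)\n", url, depth);
| |       free (url);
| |     }
| |   return 0;
| | }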
162 | | |
163 | | static void blacklist_add (struct hash_table *blacklist, const char *url) |
164 | 0 | { |
165 | 0 | char *url_unescaped = xstrdup (url); |
166 | |
167 | 0 | url_unescape (url_unescaped); |
168 | 0 | string_set_add (blacklist, url_unescaped); |
169 | 0 | xfree (url_unescaped); |
170 | 0 | } |
171 | | |
172 | | static int blacklist_contains (struct hash_table *blacklist, const char *url) |
173 | 0 | { |
174 | 0 | char *url_unescaped = xstrdup(url); |
175 | 0 | int ret; |
176 | |
177 | 0 | url_unescape (url_unescaped); |
178 | 0 | ret = string_set_contains (blacklist, url_unescaped); |
179 | 0 | xfree (url_unescaped); |
180 | |
181 | 0 | return ret; |
182 | 0 | } |
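| |
| | /* The blacklist stores and looks up URLs in percent-decoded form, so two
| |    spellings of the same URL (e.g. "%7Euser" vs. "~user") collapse to one
| |    entry.  Below is a self-contained sketch of that normalization idea;
| |    the decoder is deliberately simplified and is not Wget's url_unescape. */
| |
| | #include <stdio.h>
| | #include <stdlib.h>
| | #include <string.h>
| |
| | /* Decode %XX sequences in place (no validation of the hex digits). */
| | static void
| | percent_decode (char *s)
| | {
| |   char *d = s;
| |   for (; *s; s++, d++)
| |     if (*s == '%' && s[1] && s[2])
| |       {
| |         char hex[3] = { s[1], s[2], '\0' };
| |         *d = (char) strtol (hex, NULL, 16);
| |         s += 2;
| |       }
| |     else
| |       *d = *s;
| |   *d = '\0';
| | }
| |
| | int
| | main (void)
| | {
| |   char a[] = "http://example.com/%7Euser/index.html";
| |   char b[] = "http://example.com/~user/index.html";
| |
| |   percent_decode (a);
| |   percent_decode (b);
| |   /* Both decode to the same key, so the second one would be treated as
| |      already blacklisted. */
| |   printf ("%s\n%s\n%s\n", a, b, strcmp (a, b) ? "different keys" : "same key");
| |   return 0;
| | }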
183 | | |
184 | | typedef enum |
185 | | { |
186 | | WG_RR_SUCCESS, WG_RR_BLACKLIST, WG_RR_NOTHTTPS, WG_RR_NONHTTP, WG_RR_ABSOLUTE, |
187 | | WG_RR_DOMAIN, WG_RR_PARENT, WG_RR_LIST, WG_RR_REGEX, WG_RR_RULES, |
188 | | WG_RR_SPANNEDHOST, WG_RR_ROBOTS |
189 | | } reject_reason; |
190 | | |
191 | | static reject_reason download_child (const struct urlpos *, struct url *, int, |
192 | | struct url *, struct hash_table *, struct iri *); |
193 | | static reject_reason descend_redirect (const char *, struct url *, int, |
194 | | struct url *, struct hash_table *, struct iri *); |
195 | | static void write_reject_log_header (FILE *); |
196 | | static void write_reject_log_reason (FILE *, reject_reason, |
197 | | const struct url *, const struct url *); |
198 | | |
199 | | /* Retrieve a part of the web beginning with START_URL. This used to |
200 | | be called "recursive retrieval", because the old function was |
201 | | recursive and implemented depth-first search. retrieve_tree on the |
202 | | other hand implements breadth-first traversal of the tree, which
203 | | results in much nicer ordering of downloads. |
204 | | |
205 | | The algorithm this function uses is simple: |
206 | | |
207 | | 1. put START_URL in the queue. |
208 | | 2. while there are URLs in the queue: |
209 | | |
210 | | 3. get next URL from the queue. |
211 | | 4. download it. |
212 | | 5. if the URL is HTML and its depth does not exceed maximum depth, |
213 | | get the list of URLs embedded therein. |
214 | | 6. for each of those URLs do the following: |
215 | | |
216 | | 7. if the URL is not one of those downloaded before, and if it |
217 | | satisfies the criteria specified by the various command-line |
218 | | options, add it to the queue. */ |
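| |
| | /* The same loop reduced to a compilable toy: a breadth-first walk over a
| |    hard-coded link graph with a "seen" set and a depth limit.  The array
| |    queue stands in for url_enqueue/url_dequeue and the seen[] flags for the
| |    blacklist; it is an illustration only, not part of recur.c. */
| |
| | #include <stdio.h>
| | #include <string.h>
| |
| | #define NPAGES 4
| |
| | static const char *pages[NPAGES] = { "/", "/a", "/b", "/a/x" };
| | static const char *links[NPAGES][3] = {
| |   { "/a", "/b", NULL },          /* "/"    links to /a and /b         */
| |   { "/a/x", "/", NULL },         /* "/a"   links to /a/x and back to / */
| |   { NULL },                      /* "/b"   has no links               */
| |   { NULL },                      /* "/a/x" has no links               */
| | };
| |
| | int
| | main (void)
| | {
| |   int queue[NPAGES], depth[NPAGES], head = 0, tail = 0;
| |   int seen[NPAGES] = { 1, 0, 0, 0 };
| |   int max_depth = 1;
| |
| |   queue[tail] = 0; depth[tail] = 0; tail++;        /* 1. enqueue START_URL */
| |   while (head < tail)                              /* 2. queue not empty   */
| |     {
| |       int cur = queue[head], d = depth[head];      /* 3. dequeue next URL  */
| |       head++;
| |       printf ("download %s (depth %d)\n", pages[cur], d);  /* 4. download  */
| |       if (d >= max_depth)                          /* 5. depth check       */
| |         continue;
| |       for (int j = 0; links[cur][j]; j++)          /* 6. embedded URLs     */
| |         for (int k = 0; k < NPAGES; k++)
| |           if (!strcmp (links[cur][j], pages[k]) && !seen[k])
| |             {                                      /* 7. new: enqueue it   */
| |               seen[k] = 1;
| |               queue[tail] = k; depth[tail] = d + 1; tail++;
| |             }
| |     }
| |   return 0;
| | }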
219 | | |
220 | | uerr_t |
221 | | retrieve_tree (struct url *start_url_parsed, struct iri *pi) |
222 | 0 | { |
223 | 0 | uerr_t status = RETROK; |
224 | | |
225 | | /* The queue of URLs we need to load. */ |
226 | 0 | struct url_queue *queue; |
227 | | |
228 | | /* The URLs we do not wish to enqueue again, because they have already
229 | | been enqueued (and possibly downloaded). */
230 | 0 | struct hash_table *blacklist; |
231 | |
232 | 0 | struct iri *i = iri_new (); |
233 | |
234 | 0 | FILE *rejectedlog = NULL; /* Don't write a rejected log. */ |
235 | | |
236 | | /* Duplicate pi struct if not NULL */ |
237 | 0 | if (pi) |
238 | 0 | { |
239 | 0 | #define COPYSTR(x) (x) ? xstrdup(x) : NULL; |
240 | 0 | i->uri_encoding = COPYSTR (pi->uri_encoding); |
241 | 0 | i->content_encoding = COPYSTR (pi->content_encoding); |
242 | 0 | i->utf8_encode = pi->utf8_encode; |
243 | 0 | #undef COPYSTR |
244 | 0 | } |
245 | 0 | #ifdef ENABLE_IRI |
246 | 0 | else |
247 | 0 | set_uri_encoding (i, opt.locale, true); |
248 | 0 | #endif |
249 | |
250 | 0 | queue = url_queue_new (); |
251 | 0 | blacklist = make_string_hash_table (0); |
252 | | |
253 | | /* Enqueue the starting URL. Use start_url_parsed->url rather than |
254 | | just URL so we enqueue the canonical form of the URL. */ |
255 | 0 | url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true, |
256 | 0 | false); |
257 | 0 | blacklist_add (blacklist, start_url_parsed->url); |
258 | |
259 | 0 | if (opt.rejected_log) |
260 | 0 | { |
261 | 0 | rejectedlog = fopen (opt.rejected_log, "w"); |
262 | 0 | write_reject_log_header (rejectedlog); |
263 | 0 | if (!rejectedlog) |
264 | 0 | logprintf (LOG_NOTQUIET, "%s: %s\n", opt.rejected_log, strerror (errno)); |
265 | 0 | } |
266 | |
267 | 0 | while (1) |
268 | 0 | { |
269 | 0 | bool descend = false; |
270 | 0 | char *url, *referer, *file = NULL; |
271 | 0 | int depth; |
272 | 0 | bool html_allowed, css_allowed; |
273 | 0 | bool is_css = false; |
274 | 0 | bool dash_p_leaf_HTML = false; |
275 | |
276 | 0 | if (opt.quota && total_downloaded_bytes > opt.quota) |
277 | 0 | break; |
278 | 0 | if (status == FWRITEERR) |
279 | 0 | break; |
280 | | |
281 | | /* Get the next URL from the queue... */ |
282 | | |
283 | 0 | if (!url_dequeue (queue, (struct iri **) &i, |
284 | 0 | (const char **)&url, (const char **)&referer, |
285 | 0 | &depth, &html_allowed, &css_allowed)) |
286 | 0 | break; |
287 | | |
288 | | /* ...and download it. Note that this download is in most cases |
289 | | unconditional, as download_child already makes sure a file |
290 | | doesn't get enqueued twice -- and yet this check is here, and |
291 | | not in download_child. This is so that if you run `wget -r |
292 | | URL1 URL2', and a random URL is encountered once under URL1 |
293 | | and again under URL2, but at a different (possibly smaller) |
294 | | depth, we want the URL's children to be taken into account |
295 | | the second time. */ |
296 | 0 | if (dl_url_file_map && hash_table_contains (dl_url_file_map, url)) |
297 | 0 | { |
298 | 0 | bool is_css_bool; |
299 | |
300 | 0 | file = xstrdup (hash_table_get (dl_url_file_map, url)); |
301 | |
302 | 0 | DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n", |
303 | 0 | url, file)); |
304 | |
305 | 0 | if ((is_css_bool = (css_allowed |
306 | 0 | && downloaded_css_set |
307 | 0 | && string_set_contains (downloaded_css_set, file))) |
308 | 0 | || (html_allowed |
309 | 0 | && downloaded_html_set |
310 | 0 | && string_set_contains (downloaded_html_set, file))) |
311 | 0 | { |
312 | 0 | descend = true; |
313 | 0 | is_css = is_css_bool; |
314 | 0 | } |
315 | 0 | } |
316 | 0 | else |
317 | 0 | { |
318 | 0 | int dt = 0, url_err; |
319 | 0 | char *redirected = NULL; |
320 | 0 | struct url *url_parsed = url_parse (url, &url_err, i, true); |
321 | |
322 | 0 | if (!url_parsed) |
323 | 0 | { |
324 | 0 | logprintf (LOG_NOTQUIET, "%s: %s.\n",url, url_error (url_err)); |
325 | 0 | inform_exit_status (URLERROR); |
326 | 0 | } |
327 | 0 | else |
328 | 0 | { |
329 | |
330 | 0 | status = retrieve_url (url_parsed, url, &file, &redirected, referer, |
331 | 0 | &dt, false, i, true); |
332 | |
333 | 0 | if (html_allowed && file && status == RETROK |
334 | 0 | && (dt & RETROKF) && (dt & TEXTHTML)) |
335 | 0 | { |
336 | 0 | descend = true; |
337 | 0 | is_css = false; |
338 | 0 | } |
339 | | |
340 | | /* A little different: css_allowed can override the content type,
341 | | since lots of web servers serve CSS with an incorrect content
342 | | type. */
343 | 0 | if (file && status == RETROK |
344 | 0 | && (dt & RETROKF) && |
345 | 0 | ((dt & TEXTCSS) || css_allowed)) |
346 | 0 | { |
347 | 0 | descend = true; |
348 | 0 | is_css = true; |
349 | 0 | } |
350 | |
351 | 0 | if (redirected) |
352 | 0 | { |
353 | | /* We have been redirected, possibly to another host, or |
354 | | different path, or wherever. Check whether we really |
355 | | want to follow it. */ |
356 | 0 | if (descend) |
357 | 0 | { |
358 | 0 | reject_reason r = descend_redirect (redirected, url_parsed, |
359 | 0 | depth, start_url_parsed, blacklist, i); |
360 | 0 | if (r == WG_RR_SUCCESS) |
361 | 0 | { |
362 | | /* Make sure that the old pre-redirect form gets |
363 | | blacklisted. */ |
364 | 0 | blacklist_add (blacklist, url); |
365 | 0 | } |
366 | 0 | else |
367 | 0 | { |
368 | 0 | write_reject_log_reason (rejectedlog, r, url_parsed, start_url_parsed); |
369 | 0 | descend = false; |
370 | 0 | } |
371 | 0 | } |
372 | |
373 | 0 | xfree (url); |
374 | 0 | url = redirected; |
375 | 0 | } |
376 | 0 | else |
377 | 0 | { |
378 | 0 | xfree (url); |
379 | 0 | url = xstrdup (url_parsed->url); |
380 | 0 | } |
381 | 0 | url_free (url_parsed); |
382 | 0 | } |
383 | 0 | } |
384 | |
385 | 0 | if (opt.spider) |
386 | 0 | { |
387 | 0 | visited_url (url, referer); |
388 | 0 | } |
389 | |
390 | 0 | if (descend |
391 | 0 | && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION) |
392 | 0 | { |
393 | 0 | if (opt.page_requisites |
394 | 0 | && (depth == opt.reclevel || depth == opt.reclevel + 1)) |
395 | 0 | { |
396 | | /* When -p is specified, we are allowed to exceed the |
397 | | maximum depth, but only for the "inline" links, |
398 | | i.e. those that are needed to display the page. |
399 | | Originally this could exceed the depth at most by |
400 | | one, but we allow one more level so that the leaf |
401 | | pages that contain frames can be loaded |
402 | | correctly. */ |
403 | 0 | dash_p_leaf_HTML = true; |
404 | 0 | } |
405 | 0 | else |
406 | 0 | { |
407 | | /* Either -p wasn't specified or it was and we've |
408 | | already spent the two extra (pseudo-)levels that it |
409 | | affords us, so we need to bail out. */ |
410 | 0 | DEBUGP (("Not descending further; at depth %d, max. %d.\n", |
411 | 0 | depth, opt.reclevel)); |
412 | 0 | descend = false; |
413 | 0 | } |
414 | 0 | } |
415 | | |
416 | | /* If the downloaded document was HTML or CSS, parse it and enqueue the |
417 | | links it contains. */ |
418 | |
419 | 0 | if (descend) |
420 | 0 | { |
421 | 0 | bool meta_disallow_follow = false; |
422 | 0 | struct urlpos *children |
423 | 0 | = is_css ? get_urls_css_file (file, url) : |
424 | 0 | get_urls_html (file, url, &meta_disallow_follow, i); |
425 | |
426 | 0 | if (opt.use_robots && meta_disallow_follow) |
427 | 0 | { |
428 | 0 | logprintf(LOG_VERBOSE, _("nofollow attribute found in %s. Will not follow any links on this page\n"), file); |
429 | 0 | free_urlpos (children); |
430 | 0 | children = NULL; |
431 | 0 | } |
432 | |
433 | 0 | if (children) |
434 | 0 | { |
435 | 0 | struct urlpos *child = children; |
436 | 0 | struct url *url_parsed = url_parse (url, NULL, i, true); |
437 | 0 | struct iri *ci; |
438 | 0 | char *referer_url = url; |
439 | 0 | bool strip_auth; |
440 | |
441 | 0 | assert (url_parsed != NULL); |
442 | |
443 | 0 | if (!url_parsed) |
444 | 0 | continue; |
445 | | |
446 | 0 | strip_auth = (url_parsed && url_parsed->user); |
447 | | |
448 | | /* Strip auth info if present */ |
449 | 0 | if (strip_auth) |
450 | 0 | referer_url = url_string (url_parsed, URL_AUTH_HIDE); |
451 | |
452 | 0 | for (; child; child = child->next) |
453 | 0 | { |
454 | 0 | reject_reason r; |
455 | |
456 | 0 | if (child->ignore_when_downloading) |
457 | 0 | { |
458 | 0 | DEBUGP (("Not following due to 'ignore' flag: %s\n", child->url->url)); |
459 | 0 | continue; |
460 | 0 | } |
461 | | |
462 | 0 | if (dash_p_leaf_HTML && !child->link_inline_p) |
463 | 0 | { |
464 | 0 | DEBUGP (("Not following due to 'link inline' flag: %s\n", child->url->url)); |
465 | 0 | continue; |
466 | 0 | } |
467 | | |
468 | 0 | r = download_child (child, url_parsed, depth, |
469 | 0 | start_url_parsed, blacklist, i); |
470 | 0 | if (r == WG_RR_SUCCESS) |
471 | 0 | { |
472 | 0 | ci = iri_new (); |
473 | 0 | set_uri_encoding (ci, i->content_encoding, false); |
474 | 0 | url_enqueue (queue, ci, xstrdup (child->url->url), |
475 | 0 | xstrdup (referer_url), depth + 1, |
476 | 0 | child->link_expect_html, |
477 | 0 | child->link_expect_css); |
478 | | /* We blacklist the URL we have enqueued, because we |
479 | | don't want to enqueue (and hence download) the |
480 | | same URL twice. */ |
481 | 0 | blacklist_add (blacklist, child->url->url); |
482 | 0 | } |
483 | 0 | else |
484 | 0 | { |
485 | 0 | write_reject_log_reason (rejectedlog, r, child->url, url_parsed); |
486 | 0 | } |
487 | 0 | } |
488 | |
489 | 0 | if (strip_auth) |
490 | 0 | xfree (referer_url); |
491 | 0 | url_free (url_parsed); |
492 | 0 | free_urlpos (children); |
493 | 0 | } |
494 | 0 | } |
495 | | |
496 | 0 | if (file |
497 | 0 | && (opt.delete_after |
498 | 0 | || opt.spider /* opt.recursive is implicitly true */ |
499 | 0 | || !acceptable (file))) |
500 | 0 | { |
501 | | /* Either --delete-after was specified, or we loaded this |
502 | | (otherwise unneeded because of --spider or rejected by -R) |
503 | | HTML file just to harvest its hyperlinks -- in either case, |
504 | | delete the local file. */ |
505 | 0 | DEBUGP (("Removing file due to %s in recursive_retrieve():\n", |
506 | 0 | opt.delete_after ? "--delete-after" : |
507 | 0 | (opt.spider ? "--spider" : |
508 | 0 | "recursive rejection criteria"))); |
509 | 0 | logprintf (LOG_VERBOSE, |
510 | 0 | (opt.delete_after || opt.spider |
511 | 0 | ? _("Removing %s.\n") |
512 | 0 | : _("Removing %s since it should be rejected.\n")), |
513 | 0 | file); |
514 | 0 | if (unlink (file)) |
515 | 0 | logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno)); |
516 | 0 | logputs (LOG_VERBOSE, "\n"); |
517 | 0 | register_delete_file (file); |
518 | 0 | } |
519 | |
520 | 0 | xfree (url); |
521 | 0 | xfree (referer); |
522 | 0 | xfree (file); |
523 | 0 | iri_free (i); |
524 | 0 | } |
525 | |
526 | 0 | if (rejectedlog) |
527 | 0 | { |
528 | 0 | fclose (rejectedlog); |
529 | 0 | rejectedlog = NULL; |
530 | 0 | } |
531 | | |
532 | | /* If anything is left of the queue due to a premature exit, free it |
533 | | now. */ |
534 | 0 | { |
535 | 0 | char *d1, *d2; |
536 | 0 | int d3; |
537 | 0 | bool d4, d5; |
538 | 0 | struct iri *d6; |
539 | 0 | while (url_dequeue (queue, (struct iri **)&d6, |
540 | 0 | (const char **)&d1, (const char **)&d2, &d3, &d4, &d5)) |
541 | 0 | { |
542 | 0 | iri_free (d6); |
543 | 0 | xfree (d1); |
544 | 0 | xfree (d2); |
545 | 0 | } |
546 | 0 | } |
547 | 0 | url_queue_delete (queue); |
548 | |
549 | 0 | string_set_free (blacklist); |
550 | |
551 | 0 | if (opt.quota && total_downloaded_bytes > opt.quota) |
552 | 0 | return QUOTEXC; |
553 | 0 | else if (status == FWRITEERR) |
554 | 0 | return FWRITEERR; |
555 | 0 | else |
556 | 0 | return RETROK; |
557 | 0 | } |
558 | | |
559 | | /* Based on the context provided by retrieve_tree, decide whether a |
560 | | URL is to be descended to. This is only ever called from |
561 | | retrieve_tree, but is in a separate function for clarity. |
562 | | |
563 | | The most expensive checks (such as those for robots) are memoized |
564 | | by storing these URLs to BLACKLIST. This may or may not help. It |
565 | | will help if those URLs are encountered many times. */ |
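| |
| | /* The memoization mentioned above amounts to "do the expensive check once,
| |    remember the verdict".  A self-contained sketch follows, with a
| |    linear-scan string set standing in for Wget's hash table and a stub in
| |    place of the real robots.txt retrieval; names and policy are invented
| |    for illustration and are not part of recur.c. */
| |
| | #include <stdio.h>
| | #include <string.h>
| |
| | #define MAX_REJECTED 64
| |
| | static const char *rejected[MAX_REJECTED];
| | static int nrejected;
| |
| | static int
| | expensive_check (const char *url)
| | {
| |   printf ("fetching robots.txt for %s ...\n", url); /* pretend this is costly */
| |   return strstr (url, "/private/") == NULL;         /* toy policy */
| | }
| |
| | static int
| | url_allowed (const char *url)
| | {
| |   for (int k = 0; k < nrejected; k++)
| |     if (!strcmp (rejected[k], url))
| |       return 0;                   /* seen before: skip the expensive check */
| |   if (expensive_check (url))
| |     return 1;
| |   if (nrejected < MAX_REJECTED)
| |     rejected[nrejected++] = url;  /* memoize the rejection (keeps the
| |                                      caller's pointer) */
| |   return 0;
| | }
| |
| | int
| | main (void)
| | {
| |   const char *u = "http://example.com/private/a.html";
| |   url_allowed (u);                /* does the check, rejects, remembers it */
| |   url_allowed (u);                /* rejected immediately, no second fetch */
| |   return 0;
| | }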
566 | | |
567 | | static reject_reason |
568 | | download_child (const struct urlpos *upos, struct url *parent, int depth, |
569 | | struct url *start_url_parsed, struct hash_table *blacklist, |
570 | | struct iri *iri) |
571 | 0 | { |
572 | 0 | struct url *u = upos->url; |
573 | 0 | const char *url = u->url; |
574 | 0 | bool u_scheme_like_http; |
575 | 0 | reject_reason reason = WG_RR_SUCCESS; |
576 | |
577 | 0 | DEBUGP (("Deciding whether to enqueue \"%s\".\n", url)); |
578 | |
579 | 0 | if (blacklist_contains (blacklist, url)) |
580 | 0 | { |
581 | 0 | if (opt.spider) |
582 | 0 | { |
583 | 0 | char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD); |
584 | 0 | DEBUGP (("download_child: parent->url is: %s\n", quote (parent->url))); |
585 | 0 | visited_url (url, referrer); |
586 | 0 | xfree (referrer); |
587 | 0 | } |
588 | 0 | DEBUGP (("Already on the black list.\n")); |
589 | 0 | reason = WG_RR_BLACKLIST; |
590 | 0 | goto out; |
591 | 0 | } |
592 | | |
593 | | /* Several things to check for: |
594 | | 1. if scheme is not https and https_only requested |
595 | | 2. if scheme is not http, and we don't load it |
596 | | 3. check for relative links (if relative_only is set) |
597 | | 4. check for domain |
598 | | 5. check for no-parent |
599 | | 6. check for excludes && includes |
600 | | 7. check for suffix |
601 | | 8. check for same host (if spanhost is unset), with possible |
602 | | gethostbyname baggage |
603 | | 9. check for robots.txt |
604 | | |
605 | | Addendum: If the URL is FTP, and it is to be loaded, only the |
606 | | domain and suffix settings are "stronger". |
607 | | |
608 | | Note that .html files will get loaded regardless of suffix rules |
609 | | (but that is remedied later with unlink) unless the depth equals |
610 | | the maximum depth. |
611 | | |
612 | | More time- and memory- consuming tests should be put later on |
613 | | the list. */ |
614 | | |
615 | 0 | #ifdef HAVE_SSL |
616 | 0 | if (opt.https_only && u->scheme != SCHEME_HTTPS) |
617 | 0 | { |
618 | 0 | DEBUGP (("Not following non-HTTPS links.\n")); |
619 | 0 | reason = WG_RR_NOTHTTPS; |
620 | 0 | goto out; |
621 | 0 | } |
622 | 0 | #endif |
623 | | |
624 | | /* Determine whether URL under consideration has a HTTP-like scheme. */ |
625 | 0 | u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP); |
626 | | |
627 | | /* 1. Schemes other than HTTP are normally not recursed into. */ |
628 | 0 | if (!u_scheme_like_http && !((u->scheme == SCHEME_FTP |
629 | 0 | #ifdef HAVE_SSL |
630 | 0 | || u->scheme == SCHEME_FTPS |
631 | 0 | #endif |
632 | 0 | ) && opt.follow_ftp)) |
633 | 0 | { |
634 | 0 | DEBUGP (("Not following non-HTTP schemes.\n")); |
635 | 0 | reason = WG_RR_NONHTTP; |
636 | 0 | goto out; |
637 | 0 | } |
638 | | |
639 | | /* 2. If it is an absolute link and they are not followed, throw it |
640 | | out. */ |
641 | 0 | if (u_scheme_like_http) |
642 | 0 | if (opt.relative_only && !upos->link_relative_p) |
643 | 0 | { |
644 | 0 | DEBUGP (("It doesn't really look like a relative link.\n")); |
645 | 0 | reason = WG_RR_ABSOLUTE; |
646 | 0 | goto out; |
647 | 0 | } |
648 | | |
649 | | /* 3. If its domain is not to be accepted/looked-up, chuck it |
650 | | out. */ |
651 | 0 | if (!accept_domain (u)) |
652 | 0 | { |
653 | 0 | DEBUGP (("The domain was not accepted.\n")); |
654 | 0 | reason = WG_RR_DOMAIN; |
655 | 0 | goto out; |
656 | 0 | } |
657 | | |
658 | | /* 4. Check for parent directory. |
659 | | |
660 | | If we descended to a different host or changed the scheme, ignore |
661 | | opt.no_parent. Also ignore it for documents needed to display |
662 | | the parent page when in -p mode. */ |
663 | 0 | if (opt.no_parent |
664 | 0 | && schemes_are_similar_p (u->scheme, start_url_parsed->scheme) |
665 | 0 | && 0 == strcasecmp (u->host, start_url_parsed->host) |
666 | 0 | && (u->scheme != start_url_parsed->scheme |
667 | 0 | || u->port == start_url_parsed->port) |
668 | 0 | && !(opt.page_requisites && upos->link_inline_p)) |
669 | 0 | { |
670 | 0 | if (!subdir_p (start_url_parsed->dir, u->dir)) |
671 | 0 | { |
672 | 0 | DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n", |
673 | 0 | u->dir, start_url_parsed->dir)); |
674 | 0 | reason = WG_RR_PARENT; |
675 | 0 | goto out; |
676 | 0 | } |
677 | 0 | } |
678 | | |
679 | | /* 5. If the file does not match the acceptance list, or is on the |
680 | | rejection list, chuck it out. The same goes for the directory |
681 | | exclusion and inclusion lists. */ |
682 | 0 | if (opt.includes || opt.excludes) |
683 | 0 | { |
684 | 0 | if (!accdir (u->dir)) |
685 | 0 | { |
686 | 0 | DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir)); |
687 | 0 | reason = WG_RR_LIST; |
688 | 0 | goto out; |
689 | 0 | } |
690 | 0 | } |
691 | 0 | if (!accept_url (url)) |
692 | 0 | { |
693 | 0 | DEBUGP (("%s is excluded/not-included through regex.\n", url)); |
694 | 0 | reason = WG_RR_REGEX; |
695 | 0 | goto out; |
696 | 0 | } |
697 | | |
698 | | /* 6. Check for acceptance/rejection rules. We ignore these rules |
699 | | for directories (no file name to match) and for non-leaf HTMLs, |
700 | | which can lead to other files that do need to be downloaded. (-p |
701 | | automatically implies non-leaf because with -p we can, if |
702 | | necessary, overstep the maximum depth to get the page requisites.) */ |
703 | 0 | if (u->file[0] != '\0' |
704 | 0 | && !(has_html_suffix_p (u->file) |
705 | | /* The exception only applies to non-leaf HTMLs (but -p |
706 | | always implies non-leaf because we can overstep the |
707 | | maximum depth to get the requisites): */ |
708 | 0 | && (/* non-leaf */ |
709 | 0 | opt.reclevel == INFINITE_RECURSION |
710 | | /* also non-leaf */ |
711 | 0 | || depth < opt.reclevel - 1 |
712 | | /* -p, which implies non-leaf (see above) */ |
713 | 0 | || opt.page_requisites))) |
714 | 0 | { |
715 | 0 | if (!acceptable (u->file)) |
716 | 0 | { |
717 | 0 | DEBUGP (("%s (%s) does not match acc/rej rules.\n", |
718 | 0 | url, u->file)); |
719 | 0 | reason = WG_RR_RULES; |
720 | 0 | goto out; |
721 | 0 | } |
722 | 0 | } |
723 | | |
724 | | /* 7. */ |
725 | 0 | if (schemes_are_similar_p (u->scheme, parent->scheme)) |
726 | 0 | if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host)) |
727 | 0 | { |
728 | 0 | DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n", |
729 | 0 | u->host, parent->host)); |
730 | 0 | reason = WG_RR_SPANNEDHOST; |
731 | 0 | goto out; |
732 | 0 | } |
733 | | |
734 | | /* 8. */ |
735 | 0 | if (opt.use_robots && u_scheme_like_http) |
736 | 0 | { |
737 | 0 | struct robot_specs *specs = res_get_specs (u->host, u->port); |
738 | 0 | if (!specs) |
739 | 0 | { |
740 | 0 | char *rfile; |
741 | 0 | if (res_retrieve_file (url, &rfile, iri)) |
742 | 0 | { |
743 | 0 | specs = res_parse_from_file (rfile); |
744 | | |
745 | | /* Delete the robots.txt file if we chose to delete files after
746 | | downloading, we are just running a spider, or it was saved under a
747 | | temporary (.tmp) name (e.g. with page requisites or pattern matching). */
748 | 0 | if (opt.delete_after || opt.spider || match_tail(rfile, ".tmp", false)) |
749 | 0 | { |
750 | 0 | logprintf (LOG_VERBOSE, _("Removing %s.\n"), rfile); |
751 | 0 | if (unlink (rfile)) |
752 | 0 | logprintf (LOG_NOTQUIET, "unlink: %s\n", |
753 | 0 | strerror (errno)); |
754 | 0 | } |
755 | |
756 | 0 | xfree (rfile); |
757 | 0 | } |
758 | 0 | else |
759 | 0 | { |
760 | | /* If we cannot get real specs, at least produce |
761 | | dummy ones so that we can register them and stop |
762 | | trying to retrieve them. */ |
763 | 0 | specs = res_parse ("", 0); |
764 | 0 | } |
765 | 0 | res_register_specs (u->host, u->port, specs); |
766 | 0 | } |
767 | | |
768 | | /* Now that we have (or don't have) robots.txt specs, we can |
769 | | check what they say. */ |
770 | 0 | if (!res_match_path (specs, u->path)) |
771 | 0 | { |
772 | 0 | DEBUGP (("Not following %s because robots.txt forbids it.\n", url)); |
773 | 0 | blacklist_add (blacklist, url); |
774 | 0 | reason = WG_RR_ROBOTS; |
775 | 0 | goto out; |
776 | 0 | } |
777 | 0 | } |
778 | | |
779 | 0 | out: |
780 | |
781 | 0 | if (reason == WG_RR_SUCCESS) |
782 | | /* The URL has passed all the tests. It can be placed in the |
783 | | download queue. */ |
784 | 0 | DEBUGP (("Decided to load it.\n")); |
785 | 0 | else |
786 | 0 | DEBUGP (("Decided NOT to load it.\n")); |
787 | |
788 | 0 | return reason; |
789 | 0 | } |
790 | | |
791 | | /* This function determines whether we will consider downloading the |
792 | | children of a URL whose download resulted in a redirection, |
793 | | possibly to another host, etc. It is needed very rarely, and thus |
794 | | it is merely a simple-minded wrapper around download_child. */ |
795 | | |
796 | | static reject_reason |
797 | | descend_redirect (const char *redirected, struct url *orig_parsed, int depth, |
798 | | struct url *start_url_parsed, struct hash_table *blacklist, |
799 | | struct iri *iri) |
800 | 0 | { |
801 | 0 | struct url *new_parsed; |
802 | 0 | struct urlpos *upos; |
803 | 0 | reject_reason reason; |
804 | |
805 | 0 | assert (orig_parsed != NULL); |
806 | |
807 | 0 | new_parsed = url_parse (redirected, NULL, NULL, false); |
808 | 0 | assert (new_parsed != NULL); |
809 | |
810 | 0 | upos = xnew0 (struct urlpos); |
811 | 0 | upos->url = new_parsed; |
812 | |
813 | 0 | reason = download_child (upos, orig_parsed, depth, |
814 | 0 | start_url_parsed, blacklist, iri); |
815 | |
816 | 0 | if (reason == WG_RR_SUCCESS) |
817 | 0 | blacklist_add (blacklist, upos->url->url); |
818 | 0 | else if (reason == WG_RR_LIST || reason == WG_RR_REGEX) |
819 | 0 | { |
820 | 0 | DEBUGP (("Ignoring decision for redirects, decided to load it.\n")); |
821 | 0 | blacklist_add (blacklist, upos->url->url); |
822 | 0 | reason = WG_RR_SUCCESS; |
823 | 0 | } |
824 | 0 | else |
825 | 0 | DEBUGP (("Redirection \"%s\" failed the test.\n", redirected)); |
826 | |
827 | 0 | url_free (new_parsed); |
828 | 0 | xfree (upos); |
829 | |
830 | 0 | return reason; |
831 | 0 | } |
832 | | |
833 | | |
834 | | /* This function writes the rejected log header. */ |
835 | | static void |
836 | | write_reject_log_header (FILE *f) |
837 | 0 | { |
838 | 0 | if (!f) |
839 | 0 | return; |
840 | | |
841 | | /* Note: Update this header when columns change in any way. */ |
842 | 0 | fprintf (f, "REASON\t" |
843 | 0 | "U_URL\tU_SCHEME\tU_HOST\tU_PORT\tU_PATH\tU_PARAMS\tU_QUERY\tU_FRAGMENT\t" |
844 | 0 | "P_URL\tP_SCHEME\tP_HOST\tP_PORT\tP_PATH\tP_PARAMS\tP_QUERY\tP_FRAGMENT\n"); |
845 | 0 | } |
846 | | |
847 | | /* This function writes a URL to the reject log. Internal use only. */ |
848 | | static void |
849 | | write_reject_log_url (FILE *fp, const struct url *url) |
850 | 0 | { |
851 | 0 | const char *escaped_str; |
852 | 0 | const char *scheme_str; |
853 | |
854 | 0 | if (!fp) |
855 | 0 | return; |
856 | | |
857 | 0 | escaped_str = url_escape (url->url); |
858 | |
859 | 0 | switch (url->scheme) |
860 | 0 | { |
861 | 0 | case SCHEME_HTTP: scheme_str = "SCHEME_HTTP"; break; |
862 | 0 | #ifdef HAVE_SSL |
863 | 0 | case SCHEME_HTTPS: scheme_str = "SCHEME_HTTPS"; break; |
864 | 0 | case SCHEME_FTPS: scheme_str = "SCHEME_FTPS"; break; |
865 | 0 | #endif |
866 | 0 | case SCHEME_FTP: scheme_str = "SCHEME_FTP"; break; |
867 | 0 | default: scheme_str = "SCHEME_INVALID"; break; |
868 | 0 | } |
869 | | |
870 | 0 | fprintf (fp, "%s\t%s\t%s\t%i\t%s\t%s\t%s\t%s", |
871 | 0 | escaped_str, |
872 | 0 | scheme_str, |
873 | 0 | url->host, |
874 | 0 | url->port, |
875 | 0 | url->path, |
876 | 0 | url->params ? url->params : "", |
877 | 0 | url->query ? url->query : "", |
878 | 0 | url->fragment ? url->fragment : ""); |
879 | |
880 | 0 | xfree (escaped_str); |
881 | 0 | } |
882 | | |
883 | | /* This function writes out information on why a URL was rejected and its |
884 | | context from download_child, such as the URL being rejected and its
885 | | parent's URL. The format is like CSV, but tab-separated. */
886 | | static void |
887 | | write_reject_log_reason (FILE *fp, reject_reason reason, |
888 | | const struct url *url, const struct url *parent) |
889 | 0 | { |
890 | 0 | const char *reason_str; |
891 | |
892 | 0 | if (!fp) |
893 | 0 | return; |
894 | | |
895 | 0 | switch (reason) |
896 | 0 | { |
897 | 0 | case WG_RR_SUCCESS: reason_str = "SUCCESS"; break; |
898 | 0 | case WG_RR_BLACKLIST: reason_str = "BLACKLIST"; break; |
899 | 0 | case WG_RR_NOTHTTPS: reason_str = "NOTHTTPS"; break; |
900 | 0 | case WG_RR_NONHTTP: reason_str = "NONHTTP"; break; |
901 | 0 | case WG_RR_ABSOLUTE: reason_str = "ABSOLUTE"; break; |
902 | 0 | case WG_RR_DOMAIN: reason_str = "DOMAIN"; break; |
903 | 0 | case WG_RR_PARENT: reason_str = "PARENT"; break; |
904 | 0 | case WG_RR_LIST: reason_str = "LIST"; break; |
905 | 0 | case WG_RR_REGEX: reason_str = "REGEX"; break; |
906 | 0 | case WG_RR_RULES: reason_str = "RULES"; break; |
907 | 0 | case WG_RR_SPANNEDHOST: reason_str = "SPANNEDHOST"; break; |
908 | 0 | case WG_RR_ROBOTS: reason_str = "ROBOTS"; break; |
909 | 0 | default: reason_str = "UNKNOWN"; break; |
910 | 0 | } |
911 | | |
912 | 0 | fprintf (fp, "%s\t", reason_str); |
913 | 0 | write_reject_log_url (fp, url); |
914 | 0 | fprintf (fp, "\t"); |
915 | 0 | write_reject_log_url (fp, parent); |
916 | 0 | fprintf (fp, "\n"); |
917 | 0 | } |
918 | | |
919 | | /* vim:set sts=2 sw=2 cino+={s: */ |