Coverage Report

Created: 2024-07-23 07:36

/src/wget/src/recur.c
Every instrumented line below reports an execution count of 0; the file is entirely uncovered. Source listing follows.
/* Handling of recursive HTTP retrieving.
   Copyright (C) 1996-2012, 2015, 2018-2024 Free Software Foundation,
   Inc.

This file is part of GNU Wget.

GNU Wget is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or
 (at your option) any later version.

GNU Wget is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */

#include "wget.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <errno.h>
#include <assert.h>

#include "url.h"
#include "recur.h"
#include "utils.h"
#include "retr.h"
#include "ftp.h"
#include "host.h"
#include "hash.h"
#include "res.h"
#include "convert.h"
#include "html-url.h"
#include "css-url.h"
#include "spider.h"
#include "exits.h"
/* Functions for maintaining the URL queue.  */

struct queue_element {
  const char *url;              /* the URL to download */
  const char *referer;          /* the referring document */
  int depth;                    /* the depth */
  bool html_allowed;            /* whether the document is allowed to
                                   be treated as HTML. */
  struct iri *iri;                /* sXXXav */
  bool css_allowed;             /* whether the document is allowed to
                                   be treated as CSS. */
  struct queue_element *next;   /* next element in queue */
};

struct url_queue {
  struct queue_element *head;
  struct queue_element *tail;
  int count, maxcount;
};

/* Create a URL queue. */

static struct url_queue *
url_queue_new (void)
{
  struct url_queue *queue = xnew0 (struct url_queue);
  return queue;
}

/* Delete a URL queue. */

static void
url_queue_delete (struct url_queue *queue)
{
  xfree (queue);
}

/* Enqueue a URL in the queue.  The queue is FIFO: the items will be
   retrieved ("dequeued") from the queue in the order they were placed
   into it.  */

static void
url_enqueue (struct url_queue *queue, struct iri *i,
             const char *url, const char *referer, int depth,
             bool html_allowed, bool css_allowed)
{
  struct queue_element *qel = xnew (struct queue_element);
  qel->iri = i;
  qel->url = url;
  qel->referer = referer;
  qel->depth = depth;
  qel->html_allowed = html_allowed;
  qel->css_allowed = css_allowed;
  qel->next = NULL;

  ++queue->count;
  if (queue->count > queue->maxcount)
    queue->maxcount = queue->count;

  DEBUGP (("Enqueuing %s at depth %d\n",
           quotearg_n_style (0, escape_quoting_style, url), depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  if (i)
    DEBUGP (("[IRI Enqueuing %s with %s\n", quote_n (0, url),
             i->uri_encoding ? quote_n (1, i->uri_encoding) : "None"));

  if (queue->tail)
    queue->tail->next = qel;
  queue->tail = qel;

  if (!queue->head)
    queue->head = queue->tail;
}

/* Take a URL out of the queue.  Return true if this operation
   succeeded, or false if the queue is empty.  */

static bool
url_dequeue (struct url_queue *queue, struct iri **i,
             const char **url, const char **referer, int *depth,
             bool *html_allowed, bool *css_allowed)
{
  struct queue_element *qel = queue->head;

  if (!qel)
    return false;

  queue->head = queue->head->next;
  if (!queue->head)
    queue->tail = NULL;

  *i = qel->iri;
  *url = qel->url;
  *referer = qel->referer;
  *depth = qel->depth;
  *html_allowed = qel->html_allowed;
  *css_allowed = qel->css_allowed;

  --queue->count;

  DEBUGP (("Dequeuing %s at depth %d\n",
           quotearg_n_style (0, escape_quoting_style, qel->url), qel->depth));
  DEBUGP (("Queue count %d, maxcount %d.\n", queue->count, queue->maxcount));

  xfree (qel);
  return true;
}
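The comment above describes the FIFO discipline these two helpers implement: URLs come back out of the queue in the order they were put in, via a singly linked list with head and tail pointers. As an aside, here is a minimal standalone sketch of the same discipline, reduced to a string payload; the names (struct str_queue, sq_push, sq_pop) are illustrative only and are not part of recur.c.

/* Minimal standalone sketch (not wget code) of the head/tail FIFO
   discipline used by url_enqueue and url_dequeue above.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

struct sq_node { char *s; struct sq_node *next; };
struct str_queue { struct sq_node *head, *tail; int count; };

static void sq_push (struct str_queue *q, const char *s)
{
  struct sq_node *n = malloc (sizeof *n);
  n->s = strdup (s);
  n->next = NULL;
  if (q->tail)                  /* append at the tail... */
    q->tail->next = n;
  q->tail = n;
  if (!q->head)                 /* ...and fix up head on first insert */
    q->head = q->tail;
  q->count++;
}

static char *sq_pop (struct str_queue *q)   /* returns NULL when empty */
{
  struct sq_node *n = q->head;
  char *s;
  if (!n)
    return NULL;
  q->head = n->next;
  if (!q->head)
    q->tail = NULL;
  s = n->s;
  free (n);
  q->count--;
  return s;
}

int main (void)
{
  struct str_queue q = { NULL, NULL, 0 };
  char *s;
  sq_push (&q, "http://example.com/");        /* enqueued first... */
  sq_push (&q, "http://example.com/a.html");
  while ((s = sq_pop (&q)) != NULL)           /* ...dequeued first (FIFO) */
    {
      printf ("%s\n", s);
      free (s);
    }
  return 0;
}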
static void blacklist_add (struct hash_table *blacklist, const char *url)
{
  char *url_unescaped = xstrdup (url);

  url_unescape (url_unescaped);
  string_set_add (blacklist, url_unescaped);
  xfree (url_unescaped);
}

static int blacklist_contains (struct hash_table *blacklist, const char *url)
{
  char *url_unescaped = xstrdup(url);
  int ret;

  url_unescape (url_unescaped);
  ret = string_set_contains (blacklist, url_unescaped);
  xfree (url_unescaped);

  return ret;
}

typedef enum
{
  WG_RR_SUCCESS, WG_RR_BLACKLIST, WG_RR_NOTHTTPS, WG_RR_NONHTTP, WG_RR_ABSOLUTE,
  WG_RR_DOMAIN, WG_RR_PARENT, WG_RR_LIST, WG_RR_REGEX, WG_RR_RULES,
  WG_RR_SPANNEDHOST, WG_RR_ROBOTS
} reject_reason;

static reject_reason download_child (const struct urlpos *, struct url *, int,
                              struct url *, struct hash_table *, struct iri *);
static reject_reason descend_redirect (const char *, struct url *, int,
                              struct url *, struct hash_table *, struct iri *);
static void write_reject_log_header (FILE *);
static void write_reject_log_reason (FILE *, reject_reason,
                              const struct url *, const struct url *);
/* Retrieve a part of the web beginning with START_URL.  This used to
   be called "recursive retrieval", because the old function was
   recursive and implemented depth-first search.  retrieve_tree on the
   other hand implements breadth-search traversal of the tree, which
   results in much nicer ordering of downloads.

   The algorithm this function uses is simple:

   1. put START_URL in the queue.
   2. while there are URLs in the queue:

     3. get next URL from the queue.
     4. download it.
     5. if the URL is HTML and its depth does not exceed maximum depth,
        get the list of URLs embedded therein.
     6. for each of those URLs do the following:

       7. if the URL is not one of those downloaded before, and if it
          satisfies the criteria specified by the various command-line
          options, add it to the queue. */

uerr_t
retrieve_tree (struct url *start_url_parsed, struct iri *pi)
{
  uerr_t status = RETROK;

  /* The queue of URLs we need to load. */
  struct url_queue *queue;

  /* The URLs we do not wish to enqueue, because they are already in
     the queue, but haven't been downloaded yet.  */
  struct hash_table *blacklist;

  struct iri *i = iri_new ();

  FILE *rejectedlog = NULL; /* Don't write a rejected log. */

  /* Duplicate pi struct if not NULL */
  if (pi)
    {
#define COPYSTR(x)  (x) ? xstrdup(x) : NULL;
      i->uri_encoding = COPYSTR (pi->uri_encoding);
      i->content_encoding = COPYSTR (pi->content_encoding);
      i->utf8_encode = pi->utf8_encode;
#undef COPYSTR
    }
#ifdef ENABLE_IRI
  else
    set_uri_encoding (i, opt.locale, true);
#endif

  queue = url_queue_new ();
  blacklist = make_string_hash_table (0);

  /* Enqueue the starting URL.  Use start_url_parsed->url rather than
     just URL so we enqueue the canonical form of the URL.  */
  url_enqueue (queue, i, xstrdup (start_url_parsed->url), NULL, 0, true,
               false);
  blacklist_add (blacklist, start_url_parsed->url);

  if (opt.rejected_log)
    {
      rejectedlog = fopen (opt.rejected_log, "w");
      write_reject_log_header (rejectedlog);
      if (!rejectedlog)
        logprintf (LOG_NOTQUIET, "%s: %s\n", opt.rejected_log, strerror (errno));
    }
  while (1)
    {
      bool descend = false;
      char *url, *referer, *file = NULL;
      int depth;
      bool html_allowed, css_allowed;
      bool is_css = false;
      bool dash_p_leaf_HTML = false;

      if (opt.quota && total_downloaded_bytes > opt.quota)
        break;
      if (status == FWRITEERR)
        break;

      /* Get the next URL from the queue... */

      if (!url_dequeue (queue, (struct iri **) &i,
                        (const char **)&url, (const char **)&referer,
                        &depth, &html_allowed, &css_allowed))
        break;

      /* ...and download it.  Note that this download is in most cases
         unconditional, as download_child already makes sure a file
         doesn't get enqueued twice -- and yet this check is here, and
         not in download_child.  This is so that if you run `wget -r
         URL1 URL2', and a random URL is encountered once under URL1
         and again under URL2, but at a different (possibly smaller)
         depth, we want the URL's children to be taken into account
         the second time.  */
      if (dl_url_file_map && hash_table_contains (dl_url_file_map, url))
        {
          bool is_css_bool;

          file = xstrdup (hash_table_get (dl_url_file_map, url));

          DEBUGP (("Already downloaded \"%s\", reusing it from \"%s\".\n",
                   url, file));

          if ((is_css_bool = (css_allowed
                  && downloaded_css_set
                  && string_set_contains (downloaded_css_set, file)))
              || (html_allowed
                && downloaded_html_set
                && string_set_contains (downloaded_html_set, file)))
            {
              descend = true;
              is_css = is_css_bool;
            }
        }
      else
        {
          int dt = 0, url_err;
          char *redirected = NULL;
          struct url *url_parsed = url_parse (url, &url_err, i, true);

          if (!url_parsed)
            {
              logprintf (LOG_NOTQUIET, "%s: %s.\n",url, url_error (url_err));
              inform_exit_status (URLERROR);
            }
          else
            {

              status = retrieve_url (url_parsed, url, &file, &redirected, referer,
                                     &dt, false, i, true);

              if (html_allowed && file && status == RETROK
                  && (dt & RETROKF) && (dt & TEXTHTML))
                {
                  descend = true;
                  is_css = false;
                }

              /* a little different, css_allowed can override content type
                 lots of web servers serve css with an incorrect content type
              */
              if (file && status == RETROK
                  && (dt & RETROKF) &&
                  ((dt & TEXTCSS) || css_allowed))
                {
                  descend = true;
                  is_css = true;
                }

              if (redirected)
                {
                  /* We have been redirected, possibly to another host, or
                     different path, or wherever.  Check whether we really
                     want to follow it.  */
                  if (descend)
                    {
                      reject_reason r = descend_redirect (redirected, url_parsed,
                                        depth, start_url_parsed, blacklist, i);
                      if (r == WG_RR_SUCCESS)
                        {
                          /* Make sure that the old pre-redirect form gets
                             blacklisted. */
                          blacklist_add (blacklist, url);
                        }
                      else
                        {
                          write_reject_log_reason (rejectedlog, r, url_parsed, start_url_parsed);
                          descend = false;
                        }
                    }

                  xfree (url);
                  url = redirected;
                }
              else
                {
                  xfree (url);
                  url = xstrdup (url_parsed->url);
                }
              url_free (url_parsed);
            }
        }

      if (opt.spider)
        {
          visited_url (url, referer);
        }

      if (descend
          && depth >= opt.reclevel && opt.reclevel != INFINITE_RECURSION)
        {
          if (opt.page_requisites
              && (depth == opt.reclevel || depth == opt.reclevel + 1))
            {
              /* When -p is specified, we are allowed to exceed the
                 maximum depth, but only for the "inline" links,
                 i.e. those that are needed to display the page.
                 Originally this could exceed the depth at most by
                 one, but we allow one more level so that the leaf
                 pages that contain frames can be loaded
                 correctly.  */
              dash_p_leaf_HTML = true;
            }
          else
            {
              /* Either -p wasn't specified or it was and we've
                 already spent the two extra (pseudo-)levels that it
                 affords us, so we need to bail out. */
              DEBUGP (("Not descending further; at depth %d, max. %d.\n",
                       depth, opt.reclevel));
              descend = false;
            }
        }

      /* If the downloaded document was HTML or CSS, parse it and enqueue the
         links it contains. */

      if (descend)
        {
          bool meta_disallow_follow = false;
          struct urlpos *children
            = is_css ? get_urls_css_file (file, url) :
                       get_urls_html (file, url, &meta_disallow_follow, i);

          if (opt.use_robots && meta_disallow_follow)
            {
              logprintf(LOG_VERBOSE, _("nofollow attribute found in %s. Will not follow any links on this page\n"), file);
              free_urlpos (children);
              children = NULL;
            }

          if (children)
            {
              struct urlpos *child = children;
              struct url *url_parsed = url_parse (url, NULL, i, true);
              struct iri *ci;
              char *referer_url = url;
              bool strip_auth;

              assert (url_parsed != NULL);

              if (!url_parsed)
                continue;

              strip_auth = (url_parsed && url_parsed->user);

              /* Strip auth info if present */
              if (strip_auth)
                referer_url = url_string (url_parsed, URL_AUTH_HIDE);

              for (; child; child = child->next)
                {
                  reject_reason r;

                  if (child->ignore_when_downloading)
                    {
                      DEBUGP (("Not following due to 'ignore' flag: %s\n", child->url->url));
                      continue;
                    }

                  if (dash_p_leaf_HTML && !child->link_inline_p)
                    {
                      DEBUGP (("Not following due to 'link inline' flag: %s\n", child->url->url));
                      continue;
                    }

                  r = download_child (child, url_parsed, depth,
                                      start_url_parsed, blacklist, i);
                  if (r == WG_RR_SUCCESS)
                    {
                      ci = iri_new ();
                      set_uri_encoding (ci, i->content_encoding, false);
                      url_enqueue (queue, ci, xstrdup (child->url->url),
                                   xstrdup (referer_url), depth + 1,
                                   child->link_expect_html,
                                   child->link_expect_css);
                      /* We blacklist the URL we have enqueued, because we
                         don't want to enqueue (and hence download) the
                         same URL twice.  */
                      blacklist_add (blacklist, child->url->url);
                    }
                  else
                    {
                      write_reject_log_reason (rejectedlog, r, child->url, url_parsed);
                    }
                }

              if (strip_auth)
                xfree (referer_url);
              url_free (url_parsed);
              free_urlpos (children);
            }
        }

      if (file
          && (opt.delete_after
              || opt.spider /* opt.recursive is implicitly true */
              || !acceptable (file)))
        {
          /* Either --delete-after was specified, or we loaded this
             (otherwise unneeded because of --spider or rejected by -R)
             HTML file just to harvest its hyperlinks -- in either case,
             delete the local file. */
          DEBUGP (("Removing file due to %s in recursive_retrieve():\n",
                   opt.delete_after ? "--delete-after" :
                   (opt.spider ? "--spider" :
                    "recursive rejection criteria")));
          logprintf (LOG_VERBOSE,
                     (opt.delete_after || opt.spider
                      ? _("Removing %s.\n")
                      : _("Removing %s since it should be rejected.\n")),
                     file);
          if (unlink (file))
            logprintf (LOG_NOTQUIET, "unlink: %s\n", strerror (errno));
          logputs (LOG_VERBOSE, "\n");
          register_delete_file (file);
        }

      xfree (url);
      xfree (referer);
      xfree (file);
      iri_free (i);
    }

  if (rejectedlog)
    {
      fclose (rejectedlog);
      rejectedlog = NULL;
    }

  /* If anything is left of the queue due to a premature exit, free it
     now.  */
  {
    char *d1, *d2;
    int d3;
    bool d4, d5;
    struct iri *d6;
    while (url_dequeue (queue, (struct iri **)&d6,
                        (const char **)&d1, (const char **)&d2, &d3, &d4, &d5))
      {
        iri_free (d6);
        xfree (d1);
        xfree (d2);
      }
  }
  url_queue_delete (queue);

  string_set_free (blacklist);

  if (opt.quota && total_downloaded_bytes > opt.quota)
    return QUOTEXC;
  else if (status == FWRITEERR)
    return FWRITEERR;
  else
    return RETROK;
}
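The numbered algorithm in the comment above retrieve_tree maps directly onto the body of the function. The following standalone sketch is only a compressed illustration of that breadth-first loop; fetch_document, extract_links, passes_filters and already_seen are hypothetical stand-ins for retrieve_url, get_urls_html/get_urls_css_file, download_child and the blacklist, and the fixed-size array replaces the real url_queue.

/* Schematic outline (not wget code) of the breadth-first crawl loop.  */
#include <stdbool.h>
#include <stdio.h>

#define MAXQ 1024

struct item { const char *url; int depth; };

static struct item queue[MAXQ];
static int head, tail;

static void enqueue (const char *url, int depth)
{
  if (tail < MAXQ)
    queue[tail++] = (struct item){ url, depth };
}

/* Hypothetical stand-ins so the sketch compiles; the real code consults
   the network, the HTML/CSS parsers and the option set here.  */
static bool fetch_document (const char *url) { (void) url; return true; }
static int  extract_links (const char *url, const char **out, int max)
{ (void) url; (void) out; (void) max; return 0; }
static bool passes_filters (const char *url) { (void) url; return true; }
static bool already_seen (const char *url) { (void) url; return false; }

static void crawl (const char *start_url, int max_depth)
{
  enqueue (start_url, 0);                      /* step 1 */
  while (head < tail)                          /* step 2 */
    {
      struct item cur = queue[head++];         /* step 3 */
      if (!fetch_document (cur.url))           /* step 4 */
        continue;
      if (cur.depth >= max_depth)              /* step 5: depth limit */
        continue;
      const char *links[64];
      int n = extract_links (cur.url, links, 64);
      for (int k = 0; k < n; k++)              /* step 6 */
        if (!already_seen (links[k]) && passes_filters (links[k]))
          enqueue (links[k], cur.depth + 1);   /* step 7 */
    }
}

int main (void)
{
  crawl ("http://example.com/", 5);
  return 0;
}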
/* Based on the context provided by retrieve_tree, decide whether a
   URL is to be descended to.  This is only ever called from
   retrieve_tree, but is in a separate function for clarity.

   The most expensive checks (such as those for robots) are memoized
   by storing these URLs to BLACKLIST.  This may or may not help.  It
   will help if those URLs are encountered many times.  */

static reject_reason
download_child (const struct urlpos *upos, struct url *parent, int depth,
                  struct url *start_url_parsed, struct hash_table *blacklist,
                  struct iri *iri)
{
  struct url *u = upos->url;
  const char *url = u->url;
  bool u_scheme_like_http;
  reject_reason reason = WG_RR_SUCCESS;

  DEBUGP (("Deciding whether to enqueue \"%s\".\n", url));

  if (blacklist_contains (blacklist, url))
    {
      if (opt.spider)
        {
          char *referrer = url_string (parent, URL_AUTH_HIDE_PASSWD);
          DEBUGP (("download_child: parent->url is: %s\n", quote (parent->url)));
          visited_url (url, referrer);
          xfree (referrer);
        }
      DEBUGP (("Already on the black list.\n"));
      reason = WG_RR_BLACKLIST;
      goto out;
    }

  /* Several things to check for:
     1. if scheme is not https and https_only requested
     2. if scheme is not http, and we don't load it
     3. check for relative links (if relative_only is set)
     4. check for domain
     5. check for no-parent
     6. check for excludes && includes
     7. check for suffix
     8. check for same host (if spanhost is unset), with possible
     gethostbyname baggage
     9. check for robots.txt

     Addendum: If the URL is FTP, and it is to be loaded, only the
     domain and suffix settings are "stronger".

     Note that .html files will get loaded regardless of suffix rules
     (but that is remedied later with unlink) unless the depth equals
     the maximum depth.

     More time- and memory- consuming tests should be put later on
     the list.  */

#ifdef HAVE_SSL
  if (opt.https_only && u->scheme != SCHEME_HTTPS)
    {
      DEBUGP (("Not following non-HTTPS links.\n"));
      reason = WG_RR_NOTHTTPS;
      goto out;
    }
#endif

  /* Determine whether URL under consideration has a HTTP-like scheme. */
  u_scheme_like_http = schemes_are_similar_p (u->scheme, SCHEME_HTTP);

  /* 1. Schemes other than HTTP are normally not recursed into. */
  if (!u_scheme_like_http && !((u->scheme == SCHEME_FTP
#ifdef HAVE_SSL
      || u->scheme == SCHEME_FTPS
#endif
      ) && opt.follow_ftp))
    {
      DEBUGP (("Not following non-HTTP schemes.\n"));
      reason = WG_RR_NONHTTP;
      goto out;
    }

  /* 2. If it is an absolute link and they are not followed, throw it
     out.  */
  if (u_scheme_like_http)
    if (opt.relative_only && !upos->link_relative_p)
      {
        DEBUGP (("It doesn't really look like a relative link.\n"));
        reason = WG_RR_ABSOLUTE;
        goto out;
      }

  /* 3. If its domain is not to be accepted/looked-up, chuck it
     out.  */
  if (!accept_domain (u))
    {
      DEBUGP (("The domain was not accepted.\n"));
      reason = WG_RR_DOMAIN;
      goto out;
    }

  /* 4. Check for parent directory.

     If we descended to a different host or changed the scheme, ignore
     opt.no_parent.  Also ignore it for documents needed to display
     the parent page when in -p mode.  */
  if (opt.no_parent
      && schemes_are_similar_p (u->scheme, start_url_parsed->scheme)
      && 0 == strcasecmp (u->host, start_url_parsed->host)
      && (u->scheme != start_url_parsed->scheme
          || u->port == start_url_parsed->port)
      && !(opt.page_requisites && upos->link_inline_p))
    {
      if (!subdir_p (start_url_parsed->dir, u->dir))
        {
          DEBUGP (("Going to \"%s\" would escape \"%s\" with no_parent on.\n",
                   u->dir, start_url_parsed->dir));
          reason = WG_RR_PARENT;
          goto out;
        }
    }

  /* 5. If the file does not match the acceptance list, or is on the
     rejection list, chuck it out.  The same goes for the directory
     exclusion and inclusion lists.  */
  if (opt.includes || opt.excludes)
    {
      if (!accdir (u->dir))
        {
          DEBUGP (("%s (%s) is excluded/not-included.\n", url, u->dir));
          reason = WG_RR_LIST;
          goto out;
        }
    }
  if (!accept_url (url))
    {
      DEBUGP (("%s is excluded/not-included through regex.\n", url));
      reason = WG_RR_REGEX;
      goto out;
    }

  /* 6. Check for acceptance/rejection rules.  We ignore these rules
     for directories (no file name to match) and for non-leaf HTMLs,
     which can lead to other files that do need to be downloaded.  (-p
     automatically implies non-leaf because with -p we can, if
     necessary, overstep the maximum depth to get the page requisites.)  */
  if (u->file[0] != '\0'
      && !(has_html_suffix_p (u->file)
           /* The exception only applies to non-leaf HTMLs (but -p
              always implies non-leaf because we can overstep the
              maximum depth to get the requisites): */
           && (/* non-leaf */
               opt.reclevel == INFINITE_RECURSION
               /* also non-leaf */
               || depth < opt.reclevel - 1
               /* -p, which implies non-leaf (see above) */
               || opt.page_requisites)))
    {
      if (!acceptable (u->file))
        {
          DEBUGP (("%s (%s) does not match acc/rej rules.\n",
                   url, u->file));
          reason = WG_RR_RULES;
          goto out;
        }
    }

  /* 7. */
  if (schemes_are_similar_p (u->scheme, parent->scheme))
    if (!opt.spanhost && 0 != strcasecmp (parent->host, u->host))
      {
        DEBUGP (("This is not the same hostname as the parent's (%s and %s).\n",
                 u->host, parent->host));
        reason = WG_RR_SPANNEDHOST;
        goto out;
      }

  /* 8. */
  if (opt.use_robots && u_scheme_like_http)
    {
      struct robot_specs *specs = res_get_specs (u->host, u->port);
      if (!specs)
        {
          char *rfile;
          if (res_retrieve_file (url, &rfile, iri))
            {
              specs = res_parse_from_file (rfile);

              /* Delete the robots.txt file if we chose to either delete the
                 files after downloading or we're just running a spider or
                 we use page requisites or pattern matching. */
              if (opt.delete_after || opt.spider || match_tail(rfile, ".tmp", false))
                {
                  logprintf (LOG_VERBOSE, _("Removing %s.\n"), rfile);
                  if (unlink (rfile))
                      logprintf (LOG_NOTQUIET, "unlink: %s\n",
                                 strerror (errno));
                }

              xfree (rfile);
            }
          else
            {
              /* If we cannot get real specs, at least produce
                 dummy ones so that we can register them and stop
                 trying to retrieve them.  */
              specs = res_parse ("", 0);
            }
          res_register_specs (u->host, u->port, specs);
        }

      /* Now that we have (or don't have) robots.txt specs, we can
         check what they say.  */
      if (!res_match_path (specs, u->path))
        {
          DEBUGP (("Not following %s because robots.txt forbids it.\n", url));
          blacklist_add (blacklist, url);
          reason = WG_RR_ROBOTS;
          goto out;
        }
    }

 out:

  if (reason == WG_RR_SUCCESS)
    /* The URL has passed all the tests.  It can be placed in the
       download queue. */
    DEBUGP (("Decided to load it.\n"));
  else
    DEBUGP (("Decided NOT to load it.\n"));

  return reason;
}
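The "no parent" check above (labelled 4. in the code, item 5 in the checklist comment) hinges on subdir_p, which decides whether the candidate directory lies at or below the start directory. The following is a simplified standalone illustration of that kind of directory-prefix test; dir_is_under is a hypothetical helper, not wget's subdir_p.

/* Simplified illustration (not wget code) of a "stays below the start
   directory" test like the one download_child relies on.  */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool dir_is_under (const char *start_dir, const char *dir)
{
  size_t len = strlen (start_dir);
  if (strncmp (dir, start_dir, len) != 0)
    return false;                 /* does not share the prefix at all */
  /* The prefix must end at a component boundary, so that "/doc" does
     not count as a parent of "/documents".  */
  return dir[len] == '\0' || dir[len] == '/' || len == 0
         || start_dir[len - 1] == '/';
}

int main (void)
{
  printf ("%d\n", dir_is_under ("/manual", "/manual/html"));   /* 1 */
  printf ("%d\n", dir_is_under ("/manual", "/manuals"));       /* 0 */
  printf ("%d\n", dir_is_under ("/manual", "/"));              /* 0 */
  return 0;
}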
/* This function determines whether we will consider downloading the
   children of a URL whose download resulted in a redirection,
   possibly to another host, etc.  It is needed very rarely, and thus
   it is merely a simple-minded wrapper around download_child.  */

static reject_reason
descend_redirect (const char *redirected, struct url *orig_parsed, int depth,
                    struct url *start_url_parsed, struct hash_table *blacklist,
                    struct iri *iri)
{
  struct url *new_parsed;
  struct urlpos *upos;
  reject_reason reason;

  assert (orig_parsed != NULL);

  new_parsed = url_parse (redirected, NULL, NULL, false);
  assert (new_parsed != NULL);

  upos = xnew0 (struct urlpos);
  upos->url = new_parsed;

  reason = download_child (upos, orig_parsed, depth,
                              start_url_parsed, blacklist, iri);

  if (reason == WG_RR_SUCCESS)
    blacklist_add (blacklist, upos->url->url);
  else if (reason == WG_RR_LIST || reason == WG_RR_REGEX)
    {
      DEBUGP (("Ignoring decision for redirects, decided to load it.\n"));
      blacklist_add (blacklist, upos->url->url);
      reason = WG_RR_SUCCESS;
    }
  else
    DEBUGP (("Redirection \"%s\" failed the test.\n", redirected));

  url_free (new_parsed);
  xfree (upos);

  return reason;
}
/* This function writes the rejected log header. */
static void
write_reject_log_header (FILE *f)
{
  if (!f)
    return;

  /* Note: Update this header when columns change in any way. */
  fprintf (f, "REASON\t"
    "U_URL\tU_SCHEME\tU_HOST\tU_PORT\tU_PATH\tU_PARAMS\tU_QUERY\tU_FRAGMENT\t"
    "P_URL\tP_SCHEME\tP_HOST\tP_PORT\tP_PATH\tP_PARAMS\tP_QUERY\tP_FRAGMENT\n");
}

/* This function writes a URL to the reject log. Internal use only. */
static void
write_reject_log_url (FILE *fp, const struct url *url)
{
  const char *escaped_str;
  const char *scheme_str;

  if (!fp)
    return;

  escaped_str = url_escape (url->url);

  switch (url->scheme)
    {
      case SCHEME_HTTP:  scheme_str = "SCHEME_HTTP";    break;
#ifdef HAVE_SSL
      case SCHEME_HTTPS: scheme_str = "SCHEME_HTTPS";   break;
      case SCHEME_FTPS:  scheme_str = "SCHEME_FTPS";    break;
#endif
      case SCHEME_FTP:   scheme_str = "SCHEME_FTP";     break;
      default:           scheme_str = "SCHEME_INVALID"; break;
    }

  fprintf (fp, "%s\t%s\t%s\t%i\t%s\t%s\t%s\t%s",
    escaped_str,
    scheme_str,
    url->host,
    url->port,
    url->path,
    url->params ? url->params : "",
    url->query ? url->query : "",
    url->fragment ? url->fragment : "");

  xfree (escaped_str);
}

/* This function writes out information on why a URL was rejected and its
   context from download_child such as the URL being rejected and it's
   parent's URL. The format it uses is comma separated values but with tabs. */
static void
write_reject_log_reason (FILE *fp, reject_reason reason,
                         const struct url *url, const struct url *parent)
{
  const char *reason_str;

  if (!fp)
    return;

  switch (reason)
    {
      case WG_RR_SUCCESS:     reason_str = "SUCCESS";     break;
      case WG_RR_BLACKLIST:   reason_str = "BLACKLIST";   break;
      case WG_RR_NOTHTTPS:    reason_str = "NOTHTTPS";    break;
      case WG_RR_NONHTTP:     reason_str = "NONHTTP";     break;
      case WG_RR_ABSOLUTE:    reason_str = "ABSOLUTE";    break;
      case WG_RR_DOMAIN:      reason_str = "DOMAIN";      break;
      case WG_RR_PARENT:      reason_str = "PARENT";      break;
      case WG_RR_LIST:        reason_str = "LIST";        break;
      case WG_RR_REGEX:       reason_str = "REGEX";       break;
      case WG_RR_RULES:       reason_str = "RULES";       break;
      case WG_RR_SPANNEDHOST: reason_str = "SPANNEDHOST"; break;
      case WG_RR_ROBOTS:      reason_str = "ROBOTS";      break;
      default:                reason_str = "UNKNOWN";     break;
    }

  fprintf (fp, "%s\t", reason_str);
  write_reject_log_url (fp, url);
  fprintf (fp, "\t");
  write_reject_log_url (fp, parent);
  fprintf (fp, "\n");
}

/* vim:set sts=2 sw=2 cino+={s: */
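Taken together, write_reject_log_reason and write_reject_log_url emit one tab-separated record per rejected URL: the reason string, eight fields describing the rejected URL, and eight describing its parent, in the column order given by write_reject_log_header. The standalone snippet below only mirrors those fprintf formats with invented field values, as a rough illustration of the record layout.

/* Standalone illustration (not wget code) of one rejected-log record;
   every field value here is invented.  */
#include <stdio.h>

int main (void)
{
  /* Reason column, as in write_reject_log_reason.  */
  printf ("%s\t", "ROBOTS");
  /* Eight columns for the rejected URL, as in write_reject_log_url.  */
  printf ("%s\t%s\t%s\t%i\t%s\t%s\t%s\t%s",
          "http://example.com/private/", "SCHEME_HTTP", "example.com", 80,
          "private/", "", "", "");
  /* Separator tab, then eight columns for the parent URL.  */
  printf ("\t");
  printf ("%s\t%s\t%s\t%i\t%s\t%s\t%s\t%s",
          "http://example.com/", "SCHEME_HTTP", "example.com", 80,
          "", "", "", "");
  printf ("\n");
  return 0;
}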