Coverage Report

Created: 2025-03-06 06:58

/src/wget/src/html-url.c
Line
Count
Source (jump to first uncovered line)
1
/* Collect URLs from HTML source.
2
   Copyright (C) 1998-2012, 2015, 2018-2024 Free Software Foundation,
3
   Inc.
4
5
This file is part of GNU Wget.
6
7
GNU Wget is free software; you can redistribute it and/or modify
8
it under the terms of the GNU General Public License as published by
9
the Free Software Foundation; either version 3 of the License, or
10
 (at your option) any later version.
11
12
GNU Wget is distributed in the hope that it will be useful,
13
but WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
GNU General Public License for more details.
16
17
You should have received a copy of the GNU General Public License
18
along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19
20
Additional permission under GNU GPL version 3 section 7
21
22
If you modify this program, or any covered work, by linking or
23
combining it with the OpenSSL project's OpenSSL library (or a
24
modified version of that library), containing parts covered by the
25
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26
grants you additional permission to convey the resulting work.
27
Corresponding Source for a non-source form of such a combination
28
shall include the source code for the parts of OpenSSL used as well
29
as that of the covered work.  */
30
31
#include "wget.h"
32
33
#include <stdio.h>
34
#include <string.h>
35
#include <stdlib.h>
36
#include <errno.h>
37
#include <assert.h>
38
39
#include "exits.h"
40
#include "html-parse.h"
41
#include "url.h"
42
#include "utils.h"
43
#include "hash.h"
44
#include "convert.h"
45
#include "recur.h"
46
#include "html-url.h"
47
#include "css-url.h"
48
#include "c-strcase.h"
49
50
/* Signature shared by all tag handlers: the numeric tag id (one of
   the TAG_* constants), the parsed tag, and the collection context.  */
typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *);

/* Forward-declare a file-local handler with the tag_handler_t
   signature, so it can be referenced from known_tags[] before its
   definition.  */
#define DECLARE_TAG_HANDLER(fun)                                \
  static void fun (int, struct taginfo *, struct map_context *)

DECLARE_TAG_HANDLER (tag_find_urls);
DECLARE_TAG_HANDLER (tag_handle_base);
DECLARE_TAG_HANDLER (tag_handle_form);
DECLARE_TAG_HANDLER (tag_handle_img);
DECLARE_TAG_HANDLER (tag_handle_link);
DECLARE_TAG_HANDLER (tag_handle_meta);
61
62
/* Numeric ids of the tags this file knows about.  They key both
   known_tags[] and tag_url_attributes[].  */
enum {
  TAG_A,
  TAG_APPLET,
  TAG_AREA,
  TAG_BASE,
  TAG_BGSOUND,
  TAG_BODY,
  TAG_EMBED,
  TAG_FIG,
  TAG_FORM,
  TAG_FRAME,
  TAG_IFRAME,
  TAG_IMG,
  TAG_INPUT,
  TAG_LAYER,
  TAG_LINK,
  TAG_META,
  TAG_OBJECT,
  TAG_OVERLAY,
  TAG_SCRIPT,
  TAG_TABLE,
  TAG_TD,
  TAG_TH,
  TAG_VIDEO,
  TAG_AUDIO,
  TAG_SOURCE
};
89
90
/* The list of known tags and functions used for handling them.  Most
91
   tags are simply harvested for URLs. */
92
static struct known_tag {
93
  int tagid;
94
  const char *name;
95
  tag_handler_t handler;
96
} known_tags[] = {
97
  { TAG_A,       "a",           tag_find_urls },
98
  { TAG_APPLET,  "applet",      tag_find_urls },
99
  { TAG_AREA,    "area",        tag_find_urls },
100
  { TAG_BASE,    "base",        tag_handle_base },
101
  { TAG_BGSOUND, "bgsound",     tag_find_urls },
102
  { TAG_BODY,    "body",        tag_find_urls },
103
  { TAG_EMBED,   "embed",       tag_find_urls },
104
  { TAG_FIG,     "fig",         tag_find_urls },
105
  { TAG_FORM,    "form",        tag_handle_form },
106
  { TAG_FRAME,   "frame",       tag_find_urls },
107
  { TAG_IFRAME,  "iframe",      tag_find_urls },
108
  { TAG_IMG,     "img",         tag_handle_img }, // tag_find_urls() plus handling "srcset"
109
  { TAG_INPUT,   "input",       tag_find_urls },
110
  { TAG_LAYER,   "layer",       tag_find_urls },
111
  { TAG_LINK,    "link",        tag_handle_link },
112
  { TAG_META,    "meta",        tag_handle_meta },
113
  { TAG_OBJECT,  "object",      tag_find_urls },
114
  { TAG_OVERLAY, "overlay",     tag_find_urls },
115
  { TAG_SCRIPT,  "script",      tag_find_urls },
116
  { TAG_TABLE,   "table",       tag_find_urls },
117
  { TAG_TD,      "td",          tag_find_urls },
118
  { TAG_TH,      "th",          tag_find_urls },
119
  { TAG_VIDEO,   "video",       tag_find_urls },
120
  { TAG_AUDIO,   "audio",       tag_find_urls },
121
  { TAG_SOURCE,  "source",      tag_handle_img } // tag_find_urls() plus handling "srcset"
122
};
123
124
/* tag_url_attributes documents which attributes of which tags contain
   URLs to harvest.  It is used by tag_find_urls.  */

/* Bit flags stored in the FLAGS member of tag_url_attributes[].  */

/* The link is "inline": it must be retrieved for this document to be
   rendered correctly.  Inline links include inlined images,
   stylesheets, children frames, etc.  */
#define ATTR_INLINE     1

/* The link is expected to yield HTML contents.  It's important not to
   try to follow HTML obtained by following e.g. <img src="...">
   regardless of content-type.  Doing this causes infinite loops for
   "images" that return non-404 error pages with links to the same
   image.  */
#define ATTR_HTML       2
140
141
/* For tags handled by tag_find_urls: attributes that contain URLs to
142
   download. */
143
static struct {
144
  int tagid;
145
  const char *attr_name;
146
  int flags;
147
} tag_url_attributes[] = {
148
  { TAG_A,              "href",         ATTR_HTML },
149
  { TAG_APPLET,         "code",         ATTR_INLINE },
150
  { TAG_AREA,           "href",         ATTR_HTML },
151
  { TAG_BGSOUND,        "src",          ATTR_INLINE },
152
  { TAG_BODY,           "background",   ATTR_INLINE },
153
  { TAG_EMBED,          "href",         ATTR_HTML },
154
  { TAG_EMBED,          "src",          ATTR_INLINE | ATTR_HTML },
155
  { TAG_FIG,            "src",          ATTR_INLINE },
156
  { TAG_FRAME,          "src",          ATTR_INLINE | ATTR_HTML },
157
  { TAG_IFRAME,         "src",          ATTR_INLINE | ATTR_HTML },
158
  { TAG_IMG,            "href",         ATTR_INLINE },
159
  { TAG_IMG,            "lowsrc",       ATTR_INLINE },
160
  { TAG_IMG,            "src",          ATTR_INLINE },
161
  { TAG_INPUT,          "src",          ATTR_INLINE },
162
  { TAG_LAYER,          "src",          ATTR_INLINE | ATTR_HTML },
163
  { TAG_OBJECT,         "data",         ATTR_INLINE },
164
  { TAG_OVERLAY,        "src",          ATTR_INLINE | ATTR_HTML },
165
  { TAG_SCRIPT,         "src",          ATTR_INLINE },
166
  { TAG_TABLE,          "background",   ATTR_INLINE },
167
  { TAG_TD,             "background",   ATTR_INLINE },
168
  { TAG_TH,             "background",   ATTR_INLINE },
169
  { TAG_VIDEO,          "src",          ATTR_INLINE },
170
  { TAG_VIDEO,          "poster",       ATTR_INLINE },
171
  { TAG_AUDIO,          "src",          ATTR_INLINE },
172
  { TAG_AUDIO,          "poster",       ATTR_INLINE },
173
  { TAG_SOURCE,         "src",          ATTR_INLINE },
174
};
175
176
/* The lists of interesting tags and attributes are built dynamically
   from the tables above.  Some handlers, however, look at attributes
   that are not listed in tag_url_attributes; those are added to the
   set of interesting attributes by hand.  */
static const char *additional_attributes[] = {
  "rel",                        /* used by tag_handle_link  */
  "type",                       /* used by tag_handle_link  */
  "http-equiv",                 /* used by tag_handle_meta  */
  "name",                       /* used by tag_handle_meta  */
  "content",                    /* used by tag_handle_meta  */
  "action",                     /* used by tag_handle_form  */
  "style",                      /* used by check_style_attr */
  "srcset",                     /* used by tag_handle_img   */
};
189
190
/* Lookup tables built once by init_interesting: tag name -> entry of
   known_tags[], and the set of attribute names we care about.  */
static struct hash_table *interesting_tags;
static struct hash_table *interesting_attributes;

/* Holds the (last) charset found in an 'http-equiv=content-type'
   meta tag.  */
static char *meta_charset;
196
197
static void
198
init_interesting (void)
199
0
{
200
  /* Init the variables interesting_tags and interesting_attributes
201
     that are used by the HTML parser to know which tags and
202
     attributes we're interested in.  We initialize this only once,
203
     for performance reasons.
204
205
     Here we also make sure that what we put in interesting_tags
206
     matches the user's preferences as specified through --ignore-tags
207
     and --follow-tags.  */
208
209
0
  size_t i;
210
0
  interesting_tags = make_nocase_string_hash_table (countof (known_tags));
211
212
  /* First, add all the tags we know hot to handle, mapped to their
213
     respective entries in known_tags.  */
214
0
  for (i = 0; i < countof (known_tags); i++)
215
0
    hash_table_put (interesting_tags, known_tags[i].name, known_tags + i);
216
217
  /* Then remove the tags ignored through --ignore-tags.  */
218
0
  if (opt.ignore_tags)
219
0
    {
220
0
      char **ignored;
221
0
      for (ignored = opt.ignore_tags; *ignored; ignored++)
222
0
        hash_table_remove (interesting_tags, *ignored);
223
0
    }
224
225
  /* If --follow-tags is specified, use only those tags.  */
226
0
  if (opt.follow_tags)
227
0
    {
228
      /* Create a new table intersecting --follow-tags and known_tags,
229
         and use it as interesting_tags.  */
230
0
      struct hash_table *intersect = make_nocase_string_hash_table (0);
231
0
      char **followed;
232
0
      for (followed = opt.follow_tags; *followed; followed++)
233
0
        {
234
0
          struct known_tag *t = hash_table_get (interesting_tags, *followed);
235
0
          if (!t)
236
0
            continue;           /* ignore unknown --follow-tags entries. */
237
0
          hash_table_put (intersect, *followed, t);
238
0
        }
239
0
      hash_table_destroy (interesting_tags);
240
0
      interesting_tags = intersect;
241
0
    }
242
243
  /* Add the attributes we care about. */
244
0
  interesting_attributes = make_nocase_string_hash_table (10);
245
0
  for (i = 0; i < countof (additional_attributes); i++)
246
0
    hash_table_put (interesting_attributes, additional_attributes[i], "1");
247
0
  for (i = 0; i < countof (tag_url_attributes); i++)
248
0
    hash_table_put (interesting_attributes,
249
0
                    tag_url_attributes[i].attr_name, "1");
250
0
}
251
252
/* Find the value of attribute named NAME in the taginfo TAG.  If the
253
   attribute is not present, return NULL.  If ATTRIND is non-NULL, the
254
   index of the attribute in TAG will be stored there.  */
255
256
static char *
257
find_attr (struct taginfo *tag, const char *name, int *attrind)
258
0
{
259
0
  int i;
260
0
  for (i = 0; i < tag->nattrs; i++)
261
0
    if (!c_strcasecmp (tag->attrs[i].name, name))
262
0
      {
263
0
        if (attrind)
264
0
          *attrind = i;
265
0
        return tag->attrs[i].value;
266
0
      }
267
0
  return NULL;
268
0
}
269
270
/* Helpers for calls to append_url.

   ATTR_POS yields the offset of the raw value of attribute ATTRIND
   of TAG from the start of the document text held in CTX.
   ATTR_SIZE yields the size of that raw value.

   Every argument is parenthesised so that expressions such as
   `&tag' or `i + 1' expand with the intended precedence.  */
#define ATTR_POS(tag, attrind, ctx) \
 ((tag)->attrs[(attrind)].value_raw_beginning - (ctx)->text)
#define ATTR_SIZE(tag, attrind) \
 ((tag)->attrs[(attrind)].value_raw_size)
275
276
/* Append LINK_URI to the urlpos structure that is being built.
277
278
   LINK_URI will be merged with the current document base.
279
*/
280
281
struct urlpos *
282
append_url (const char *link_uri, int position, int size,
283
            struct map_context *ctx)
284
0
{
285
0
  int link_has_scheme = url_has_scheme (link_uri);
286
0
  struct urlpos *newel;
287
0
  const char *base = ctx->base ? ctx->base : ctx->parent_base;
288
0
  struct url *url;
289
290
0
  struct iri *iri = iri_new ();
291
0
  set_uri_encoding (iri, opt.locale, true);
292
0
  iri->utf8_encode = true;
293
294
0
  if (!base)
295
0
    {
296
0
      DEBUGP (("%s: no base, merge will use \"%s\".\n",
297
0
               ctx->document_file, link_uri));
298
299
0
      if (!link_has_scheme)
300
0
        {
301
          /* Base URL is unavailable, and the link does not have a
302
             location attached to it -- we have to give up.  Since
303
             this can only happen when using `--force-html -i', print
304
             a warning.  */
305
0
          logprintf (LOG_NOTQUIET,
306
0
                     _("%s: Cannot resolve incomplete link %s.\n"),
307
0
                     ctx->document_file, link_uri);
308
0
          iri_free (iri);
309
0
          return NULL;
310
0
        }
311
312
0
      url = url_parse (link_uri, NULL, iri, false);
313
0
      if (!url)
314
0
        {
315
0
          DEBUGP (("%s: link \"%s\" doesn't parse.\n",
316
0
                   ctx->document_file, link_uri));
317
0
          iri_free (iri);
318
0
          return NULL;
319
0
        }
320
0
    }
321
0
  else
322
0
    {
323
      /* Merge BASE with LINK_URI, but also make sure the result is
324
         canonicalized, i.e. that "../" have been resolved.
325
         (parse_url will do that for us.) */
326
327
0
      char *complete_uri = uri_merge (base, link_uri);
328
329
0
      DEBUGP (("%s: merge(%s, %s) -> %s\n",
330
0
               quotearg_n_style (0, escape_quoting_style, ctx->document_file),
331
0
               quote_n (1, base),
332
0
               quote_n (2, link_uri),
333
0
               quotearg_n_style (3, escape_quoting_style, complete_uri)));
334
335
0
      url = url_parse (complete_uri, NULL, iri, false);
336
0
      if (!url)
337
0
        {
338
0
          DEBUGP (("%s: merged link \"%s\" doesn't parse.\n",
339
0
                   ctx->document_file, complete_uri));
340
0
          xfree (complete_uri);
341
0
          iri_free (iri);
342
0
          return NULL;
343
0
        }
344
0
      xfree (complete_uri);
345
0
    }
346
347
0
  iri_free (iri);
348
349
0
  DEBUGP (("appending %s to urlpos.\n", quote (url->url)));
350
351
0
  newel = xnew0 (struct urlpos);
352
0
  newel->url = url;
353
0
  newel->pos = position;
354
0
  newel->size = size;
355
356
  /* A URL is relative if the host is not named, and the name does not
357
     start with `/'.  */
358
0
  if (!link_has_scheme && *link_uri != '/')
359
0
    newel->link_relative_p = 1;
360
0
  else if (link_has_scheme)
361
0
    newel->link_complete_p = 1;
362
363
  /* Append the new URL maintaining the order by position.  */
364
0
  if (ctx->head == NULL)
365
0
    ctx->head = newel;
366
0
  else
367
0
    {
368
0
      struct urlpos *it, *prev = NULL;
369
370
0
      it = ctx->head;
371
0
      while (it && position > it->pos)
372
0
        {
373
0
          prev = it;
374
0
          it = it->next;
375
0
        }
376
377
0
      newel->next = it;
378
379
0
      if (prev)
380
0
        prev->next = newel;
381
0
      else
382
0
        ctx->head = newel;
383
0
    }
384
385
0
  return newel;
386
0
}
387
388
static void
389
check_style_attr (struct taginfo *tag, struct map_context *ctx)
390
0
{
391
0
  int attrind;
392
0
  int raw_start;
393
0
  int raw_len;
394
0
  char *style = find_attr (tag, "style", &attrind);
395
0
  if (!style)
396
0
    return;
397
398
  /* raw pos and raw size include the quotes, skip them when they are
399
     present.  */
400
0
  raw_start = ATTR_POS (tag, attrind, ctx);
401
0
  raw_len  = ATTR_SIZE (tag, attrind);
402
0
  if( *(char *)(ctx->text + raw_start) == '\''
403
0
      || *(char *)(ctx->text + raw_start) == '"')
404
0
    {
405
0
      raw_start += 1;
406
0
      raw_len -= 2;
407
0
    }
408
409
0
  if(raw_len <= 0)
410
0
       return;
411
412
0
  get_urls_css (ctx, raw_start, raw_len);
413
0
}
414
415
/* All the tag_* functions are called from collect_tags_mapper, as
416
   specified by KNOWN_TAGS.  */
417
418
/* Default tag handler: collect URLs from attributes specified for
419
   this tag by tag_url_attributes.  */
420
421
static void
422
tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx)
423
0
{
424
0
  size_t i;
425
0
  int attrind;
426
0
  int first = -1;
427
428
0
  for (i = 0; i < countof (tag_url_attributes); i++)
429
0
    if (tag_url_attributes[i].tagid == tagid)
430
0
      {
431
        /* We've found the index of tag_url_attributes where the
432
           attributes of our tag begin.  */
433
0
        first = i;
434
0
        break;
435
0
      }
436
0
  assert (first != -1);
437
438
  /* Loop over the "interesting" attributes of this tag.  In this
439
     example, it will loop over "src" and "lowsrc".
440
441
       <img src="foo.png" lowsrc="bar.png">
442
443
     This has to be done in the outer loop so that the attributes are
444
     processed in the same order in which they appear in the page.
445
     This is required when converting links.  */
446
447
0
  for (attrind = 0; attrind < tag->nattrs; attrind++)
448
0
    {
449
      /* Find whether TAG/ATTRIND is a combination that contains a
450
         URL. */
451
0
      char *link = tag->attrs[attrind].value;
452
0
      const size_t size = countof (tag_url_attributes);
453
454
      /* If you're cringing at the inefficiency of the nested loops,
455
         remember that they both iterate over a very small number of
456
         items.  The worst-case inner loop is for the IMG tag, which
457
         has three attributes.  */
458
0
      for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++)
459
0
        {
460
0
          if (0 == strcasecmp (tag->attrs[attrind].name,
461
0
                               tag_url_attributes[i].attr_name))
462
0
            {
463
0
              struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx),
464
0
                                              ATTR_SIZE(tag,attrind), ctx);
465
0
              if (up)
466
0
                {
467
0
                  int flags = tag_url_attributes[i].flags;
468
0
                  if (flags & ATTR_INLINE)
469
0
                    up->link_inline_p = 1;
470
0
                  if (flags & ATTR_HTML)
471
0
                    up->link_expect_html = 1;
472
0
                }
473
0
            }
474
0
        }
475
0
    }
476
0
}
477
478
/* Handle the BASE tag, for <base href=...>. */
479
480
static void
481
tag_handle_base (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
482
0
{
483
0
  struct urlpos *base_urlpos;
484
0
  int attrind;
485
0
  char *newbase = find_attr (tag, "href", &attrind);
486
0
  if (!newbase)
487
0
    return;
488
489
0
  base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx),
490
0
                            ATTR_SIZE(tag,attrind), ctx);
491
0
  if (!base_urlpos)
492
0
    return;
493
0
  base_urlpos->ignore_when_downloading = 1;
494
0
  base_urlpos->link_base_p = 1;
495
496
0
  xfree (ctx->base);
497
0
  if (ctx->parent_base)
498
0
    ctx->base = uri_merge (ctx->parent_base, newbase);
499
0
  else
500
0
    ctx->base = xstrdup (newbase);
501
0
}
502
503
/* Mark the URL found in <form action=...> for conversion. */
504
505
static void
506
tag_handle_form (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
507
0
{
508
0
  int attrind;
509
0
  char *action = find_attr (tag, "action", &attrind);
510
511
0
  if (action)
512
0
    {
513
0
      struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx),
514
0
                                      ATTR_SIZE(tag,attrind), ctx);
515
0
      if (up)
516
0
        up->ignore_when_downloading = 1;
517
0
    }
518
0
}
519
520
/* Handle the LINK tag.  It requires special handling because how its
521
   links will be followed in -p mode depends on the REL attribute.  */
522
523
static void
524
tag_handle_link (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
525
0
{
526
0
  int attrind;
527
0
  char *href = find_attr (tag, "href", &attrind);
528
529
  /* All <link href="..."> link references are external, except those
530
     known not to be, such as style sheet and shortcut icon:
531
532
     <link rel="stylesheet" href="..."> or <link rel="alternate stylesheet" href="...">
533
     <link rel="shortcut icon" href="..."> or <link rel="icon" href="...">
534
  */
535
0
  if (href)
536
0
    {
537
0
      struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx),
538
0
                                      ATTR_SIZE(tag,attrind), ctx);
539
0
      if (up)
540
0
        {
541
0
          char *rel = find_attr (tag, "rel", NULL);
542
0
          if (rel)
543
0
            {
544
0
              if (0 == c_strcasecmp (rel, "stylesheet") || 0 == c_strcasecmp (rel, "alternate stylesheet"))
545
0
                {
546
0
                  up->link_inline_p = 1;
547
0
                  up->link_expect_css = 1;
548
0
                }
549
0
              else if (0 == c_strcasecmp (rel, "shortcut icon") || 0 == c_strcasecmp (rel, "icon"))
550
0
                {
551
0
                  up->link_inline_p = 1;
552
0
                }
553
0
              else if (0 == c_strcasecmp (rel, "manifest"))
554
0
                {
555
0
                  up->link_inline_p = 1;
556
0
                }
557
0
              else
558
0
                {
559
                  /* The external ones usually point to HTML pages, such as
560
                     <link rel="next" href="...">
561
                     except when the type attribute says otherwise:
562
                     <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" />
563
                  */
564
0
                  char *type = find_attr (tag, "type", NULL);
565
0
                  if (!type || c_strcasecmp (type, "text/html") == 0)
566
0
                    up->link_expect_html = 1;
567
0
                }
568
0
            }
569
0
        }
570
0
    }
571
0
}
572
573
/* Handle the META tag.  This requires special handling because of the
574
   refresh feature and because of robot exclusion.  */
575
576
static void
577
tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx)
578
0
{
579
0
  char *name = find_attr (tag, "name", NULL);
580
0
  char *http_equiv = find_attr (tag, "http-equiv", NULL);
581
582
0
  if (http_equiv && 0 == c_strcasecmp (http_equiv, "refresh"))
583
0
    {
584
      /* Some pages use a META tag to specify that the page be
585
         refreshed by a new page after a given number of seconds.  The
586
         general format for this is:
587
588
           <meta http-equiv=Refresh content="NUMBER; URL=index2.html">
589
590
         So we just need to skip past the "NUMBER; URL=" garbage to
591
         get to the URL.  */
592
593
0
      struct urlpos *entry;
594
0
      int attrind;
595
0
      int timeout;
596
0
      char *p;
597
598
0
      char *refresh = find_attr (tag, "content", &attrind);
599
0
      if (!refresh)
600
0
        return;
601
602
0
      timeout = strtol(refresh, &p, 10);
603
604
0
      if (timeout < 0 || *p++ != ';')
605
0
        return;
606
607
0
      while (c_isspace (*p))
608
0
        ++p;
609
0
      if (!(   c_toupper (*p)       == 'U'
610
0
            && c_toupper (*(p + 1)) == 'R'
611
0
            && c_toupper (*(p + 2)) == 'L'
612
0
            &&          *(p + 3)  == '='))
613
0
        return;
614
0
      p += 4;
615
0
      while (c_isspace (*p))
616
0
        ++p;
617
618
0
      entry = append_url (p, ATTR_POS(tag,attrind,ctx),
619
0
                          ATTR_SIZE(tag,attrind), ctx);
620
0
      if (entry)
621
0
        {
622
0
          entry->link_refresh_p = 1;
623
0
          entry->refresh_timeout = timeout;
624
0
          entry->link_expect_html = 1;
625
0
        }
626
0
    }
627
0
  else if (http_equiv && 0 == c_strcasecmp (http_equiv, "content-type"))
628
0
    {
629
      /* Handle stuff like:
630
         <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */
631
632
0
      char *mcharset;
633
0
      char *content = find_attr (tag, "content", NULL);
634
0
      if (!content)
635
0
        return;
636
637
0
      mcharset = parse_charset (content);
638
0
      if (!mcharset)
639
0
        return;
640
641
0
      xfree (meta_charset);
642
0
      meta_charset = mcharset;
643
0
    }
644
0
  else if (name && 0 == c_strcasecmp (name, "robots"))
645
0
    {
646
      /* Handle stuff like:
647
         <meta name="robots" content="index,nofollow"> */
648
0
      char *content = find_attr (tag, "content", NULL);
649
0
      if (!content)
650
0
        return;
651
0
      if (!c_strcasecmp (content, "none"))
652
0
        ctx->nofollow = true;
653
0
      else
654
0
        {
655
0
          while (*content)
656
0
            {
657
0
              char *end;
658
              /* Skip any initial whitespace. */
659
0
              content += strspn (content, " \f\n\r\t\v");
660
              /* Find the next occurrence of ',' or whitespace,
661
               * or the end of the string.  */
662
0
              end = content + strcspn (content, ", \f\n\r\t\v");
663
0
              if (!c_strncasecmp (content, "nofollow", end - content))
664
0
                ctx->nofollow = true;
665
              /* Skip past the next comma, if any. */
666
0
              if (*end == ',')
667
0
                ++end;
668
0
              else
669
0
                {
670
0
                  end = strchr (end, ',');
671
0
                  if (end)
672
0
                    ++end;
673
0
                  else
674
0
                    end = content + strlen (content);
675
0
                }
676
0
              content = end;
677
0
            }
678
0
        }
679
0
    }
680
0
}
681
682
/* Handle the IMG tag.  This requires special handling for the srcset attr,
683
   while the traditional src/lowsrc/href attributes can be handled generically.
684
*/
685
686
static void
687
0
tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx) {
688
0
  int attrind;
689
0
  char *srcset;
690
691
  /* Use the generic approach for the attributes without special syntax. */
692
0
  tag_find_urls(tagid, tag, ctx);
693
694
0
  srcset = find_attr (tag, "srcset", &attrind);
695
0
  if (srcset)
696
0
    {
697
      /* These are relative to the input text. */
698
0
      int base_ind = ATTR_POS (tag,attrind,ctx);
699
0
      int size = strlen (srcset);
700
701
      /* These are relative to srcset. */
702
0
      int offset, url_start, url_end;
703
704
      /* Make sure to line up base_ind with srcset[0], not outside quotes. */
705
0
      if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'')
706
0
        ++base_ind;
707
708
0
      offset = 0;
709
0
      while (offset < size)
710
0
        {
711
0
          bool has_descriptor = true;
712
713
          /* Skip over initial whitespace and commas. Note there is no \v
714
            in HTML5 whitespace. */
715
0
          url_start = offset + strspn (srcset + offset, " \f\n\r\t,");
716
717
0
          if (url_start == size)
718
0
            return;
719
720
          /* URL is any non-whitespace chars (including commas) - but with
721
             trailing commas removed. */
722
0
          url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t");
723
0
          while ((url_end - 1) > url_start && srcset[url_end - 1] == ',')
724
0
            {
725
0
              has_descriptor = false;
726
0
              --url_end;
727
0
            }
728
729
0
          if (url_end > url_start)
730
0
            {
731
0
              char *url_text = strdupdelim (srcset + url_start,
732
0
                                            srcset + url_end);
733
0
              struct urlpos *up = append_url (url_text, base_ind + url_start,
734
0
                                              url_end - url_start, ctx);
735
0
              if (up)
736
0
                {
737
0
                  up->link_inline_p = 1;
738
0
                  up->link_noquote_html_p = 1;
739
0
                }
740
0
              xfree (url_text);
741
0
            }
742
743
          /* If the URL wasn't terminated by a , there may also be a descriptor
744
             which we just skip. */
745
0
          if (has_descriptor)
746
0
            {
747
              /* This is comma-terminated, except there may be one level of
748
                 parentheses escaping that. */
749
0
              bool in_paren = false;
750
0
              for (offset = url_end; offset < size; ++offset)
751
0
                {
752
0
                  char c = srcset[offset];
753
0
                  if (c == '(')
754
0
                    in_paren = true;
755
0
                  else if (c == ')' && in_paren)
756
0
                    in_paren = false;
757
0
                  else if (c == ',' && !in_paren)
758
0
                    break;
759
0
                }
760
0
            }
761
0
          else
762
0
            offset = url_end;
763
0
        }
764
0
    }
765
0
}
766
767
/* Dispatch the tag handler appropriate for the tag we're mapping
768
   over.  See known_tags[] for definition of tag handlers.  */
769
770
static void
771
collect_tags_mapper (struct taginfo *tag, void *arg)
772
0
{
773
0
  struct map_context *ctx = (struct map_context *)arg;
774
775
  /* Find the tag in our table of tags.  This must not fail because
776
     map_html_tags only returns tags found in interesting_tags.
777
778
     I've changed this for now, I'm passing NULL as interesting_tags
779
     to map_html_tags.  This way we can check all tags for a style
780
     attribute.
781
  */
782
0
  struct known_tag *t = hash_table_get (interesting_tags, tag->name);
783
784
0
  if (t != NULL)
785
0
    t->handler (t->tagid, tag, ctx);
786
787
0
  check_style_attr (tag, ctx);
788
789
0
  if (tag->end_tag_p && (0 == c_strcasecmp (tag->name, "style"))
790
0
      && tag->contents_begin && tag->contents_end
791
0
      && tag->contents_begin <= tag->contents_end)
792
0
  {
793
    /* parse contents */
794
0
    get_urls_css (ctx, tag->contents_begin - ctx->text,
795
0
                  tag->contents_end - tag->contents_begin);
796
0
  }
797
0
}
798
799
/* Analyze HTML tags FILE and construct a list of URLs referenced from
800
   it.  It merges relative links in FILE with URL.  It is aware of
801
   <base href=...> and does the right thing.  */
802
803
struct urlpos *
804
get_urls_html_fm (const char *file, const struct file_memory *fm,
805
                    const char *url, bool *meta_disallow_follow,
806
                    struct iri *iri)
807
0
{
808
0
  struct map_context ctx;
809
0
  int flags;
810
811
0
  ctx.text = fm->content;
812
0
  ctx.head = NULL;
813
0
  ctx.base = NULL;
814
0
  ctx.parent_base = url ? url : opt.base_href;
815
0
  ctx.document_file = file;
816
0
  ctx.nofollow = false;
817
818
0
  if (!interesting_tags)
819
0
    init_interesting ();
820
821
  /* Specify MHT_TRIM_VALUES because of buggy HTML generators that
822
     generate <a href=" foo"> instead of <a href="foo"> (browsers
823
     ignore spaces as well.)  If you really mean space, use &32; or
824
     %20.  MHT_TRIM_VALUES also causes squashing of embedded newlines,
825
     e.g. in <img src="foo.[newline]html">.  Such newlines are also
826
     ignored by IE and Mozilla and are presumably introduced by
827
     writing HTML with editors that force word wrap.  */
828
0
  flags = MHT_TRIM_VALUES;
829
0
  if (opt.strict_comments)
830
0
    flags |= MHT_STRICT_COMMENTS;
831
832
  /* the NULL here used to be interesting_tags */
833
0
  map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags,
834
0
                 NULL, interesting_attributes);
835
836
0
#ifdef ENABLE_IRI
837
  /* Meta charset is only valid if there was no HTTP header Content-Type charset. */
838
  /* This is true for HTTP 1.0 and 1.1. */
839
0
  if (iri && !iri->content_encoding && meta_charset)
840
0
    set_content_encoding (iri, meta_charset);
841
0
#endif
842
0
  xfree (meta_charset);
843
844
0
  DEBUGP (("nofollow in %s: %d\n", file, ctx.nofollow));
845
846
0
  if (meta_disallow_follow)
847
0
    *meta_disallow_follow = ctx.nofollow;
848
849
0
  xfree (ctx.base);
850
0
  return ctx.head;
851
0
}
852
853
struct urlpos *
854
get_urls_html (const char *file, const char *url, bool *meta_disallow_follow,
855
                 struct iri *iri)
856
0
{
857
0
  struct urlpos *urls;
858
0
  struct file_memory *fm;
859
860
0
  fm = wget_read_file (file);
861
0
  if (!fm)
862
0
    {
863
0
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
864
0
      return NULL;
865
0
    }
866
0
  DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
867
868
0
  urls = get_urls_html_fm (file, fm, url, meta_disallow_follow, iri);
869
0
  wget_read_file_free (fm);
870
0
  return urls;
871
0
}
872
873
/* This doesn't really have anything to do with HTML, but it's similar
874
   to get_urls_html, so we put it here.  */
875
876
struct urlpos *
877
get_urls_file (const char *file, bool *read_again)
878
0
{
879
0
  struct file_memory *fm;
880
0
  struct urlpos *head, *tail;
881
0
  const char *text, *text_end;
882
883
  /* Load the file.  */
884
0
  fm = wget_read_from_file (file, read_again);
885
0
  if (!fm)
886
0
    {
887
0
      logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno));
888
0
      return NULL;
889
0
    }
890
0
  if (fm->length)
891
0
    DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length)));
892
893
0
  head = tail = NULL;
894
0
  text = fm->content;
895
0
  text_end = fm->content + fm->length;
896
0
  while (text < text_end)
897
0
    {
898
0
      int up_error_code;
899
0
      char *url_text;
900
0
      char *new_url;
901
0
      struct urlpos *entry;
902
0
      struct url *url;
903
904
0
      const char *line_beg = text;
905
0
      const char *line_end = memchr (text, '\n', text_end - text);
906
0
      if (!line_end)
907
0
        line_end = text_end;
908
0
      else
909
0
        ++line_end;
910
0
      text = line_end;
911
912
      /* Strip whitespace from the beginning and end of line. */
913
0
      while (line_beg < line_end && c_isspace (*line_beg))
914
0
        ++line_beg;
915
0
      while (line_end > line_beg && c_isspace (*(line_end - 1)))
916
0
        --line_end;
917
918
0
      if (line_beg == line_end)
919
0
        continue;
920
921
      /* The URL is in the [line_beg, line_end) region. */
922
923
      /* We must copy the URL to a zero-terminated string, and we
924
         can't use alloca because we're in a loop.  *sigh*.  */
925
0
      url_text = strdupdelim (line_beg, line_end);
926
927
0
      if (opt.base_href)
928
0
        {
929
          /* Merge opt.base_href with URL. */
930
0
          char *merged = uri_merge (opt.base_href, url_text);
931
0
          xfree (url_text);
932
0
          url_text = merged;
933
0
        }
934
935
0
      new_url = maybe_prepend_scheme (url_text);
936
0
      if (new_url)
937
0
        {
938
0
          xfree (url_text);
939
0
          url_text = new_url;
940
0
        }
941
942
0
      url = url_parse (url_text, &up_error_code, NULL, false);
943
0
      if (!url)
944
0
        {
945
0
          logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"),
946
0
                     file, url_text, url_error (up_error_code));
947
0
          xfree (url_text);
948
0
          inform_exit_status (URLERROR);
949
0
          continue;
950
0
        }
951
0
      xfree (url_text);
952
953
0
      entry = xnew0 (struct urlpos);
954
0
      entry->url = url;
955
956
0
      if (!head)
957
0
        head = entry;
958
0
      else
959
0
        tail->next = entry;
960
0
      tail = entry;
961
0
    }
962
0
  wget_read_file_free (fm);
963
0
  return head;
964
0
}
965
966
#if defined DEBUG_MALLOC || defined TESTING
/* Destroy the hash tables built by init_interesting.  The hash table
   keys and values are not allocated by this code, so we don't need to
   free them here.

   The pointers are reset to NULL afterwards, so a later call to
   get_urls_html_fm rebuilds the tables instead of using freed
   memory.  */
void
cleanup_html_url (void)
{
  if (interesting_tags)
    {
      hash_table_destroy (interesting_tags);
      interesting_tags = NULL;
    }
  if (interesting_attributes)
    {
      hash_table_destroy (interesting_attributes);
      interesting_attributes = NULL;
    }
}
#endif