Coverage Report

Created: 2024-03-08 06:32

/src/wget2/libwget/html_url.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * Copyright (c) 2013 Tim Ruehsen
3
 * Copyright (c) 2015-2024 Free Software Foundation, Inc.
4
 *
5
 * This file is part of libwget.
6
 *
7
 * Libwget is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as published by
9
 * the Free Software Foundation, either version 3 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * Libwget is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19
 *
20
 *
21
 * Extracting URLs from HTML
22
 *
23
 * Changelog
24
 * 26.09.2013  Tim Ruehsen  created
25
 *
26
 */
27
28
#include <config.h>
29
30
#include <unistd.h>
31
#include <stdlib.h>
32
#include <string.h>
33
#include <c-ctype.h>
34
35
#include <wget.h>
36
#include "private.h"
37
38
typedef struct {
39
  wget_html_parsed_result
40
    result;
41
  wget_vector *
42
    additional_tags;
43
  wget_vector *
44
    ignore_tags;
45
  wget_string
46
    download;
47
  int
48
    uri_index;
49
  size_t
50
    css_start_offset;
51
  char
52
    found_robots,
53
    found_content_type,
54
    link_inline;
55
  const char
56
    * html,
57
    * css_attr,
58
    * css_dir;
59
} html_context;
60
61
// see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
62
// Cheap pre-filter indexed by character code: non-zero for each lowercase
// letter that starts at least one name in the static URL attribute list.
// Lets the caller skip the binary search for attributes that cannot match.
static const char maybe[256] = {
  ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1,
  ['f'] = 1, ['h'] = 1, ['i'] = 1, ['l'] = 1,
  ['m'] = 1, ['p'] = 1, ['s'] = 1, ['u'] = 1,
};
76
// Names of HTML attributes whose value is a URL (or a list of URLs).
//
// This table is searched with bsearch() and therefore MUST stay sorted in
// ascending (case-insensitive ASCII) order; an out-of-order entry makes
// lookups for some names fail silently. Each name must fit into 11 bytes
// plus the terminating NUL.
static const char attrs[][12] = {
  "action", "archive",
  "background",
  "cite", "classid", "code", "codebase",
  "data",
  "formaction",
  "href",
  "icon",
  "longdesc", "lowsrc",
  "manifest",
  "poster", "profile",
  "src", "srcset",
  "usemap"
};
90
91
// Callback for wget_css_parse_buffer(): records one URL found inside CSS that
// is embedded in the HTML document (a style= attribute or a <style> element).
//
// context: the html_context of the running HTML parse
// url:     unused; the stored URL points into ctx->html instead
// len:     length of the URL in bytes
// pos:     position of the URL as reported by the CSS parser; it is added to
//          ctx->css_start_offset to locate the URL within ctx->html
//
// The new entry is marked as inline, tagged with ctx->css_dir / ctx->css_attr
// and appended to ctx->result.uris, which is created on first use.
// On allocation failure the URL is silently dropped.
static void css_parse_uri(void *context, const char *url WGET_GCC_UNUSED, size_t len, size_t pos)
{
  html_context *ctx = context;
  wget_html_parsed_result *res = &ctx->result;
  wget_html_parsed_url *parsed_url;

  if (!(parsed_url = wget_malloc(sizeof(*parsed_url))))
    return;

  parsed_url->link_inline = 1;
  wget_strscpy(parsed_url->attr, ctx->css_attr, sizeof(parsed_url->attr));
  wget_strscpy(parsed_url->tag, ctx->css_dir, sizeof(parsed_url->tag));
  parsed_url->url.p = (const char *) (ctx->html + ctx->css_start_offset + pos);
  parsed_url->url.len = len;
  parsed_url->download.p = NULL;
  parsed_url->download.len = 0;

  if (!res->uris)
    res->uris = wget_vector_create(32, NULL);

  // A negative return means the entry was not stored (e.g. the vector could
  // not be created or grown), so it is still ours to release.
  if (wget_vector_add(res->uris, parsed_url) < 0)
    xfree(parsed_url);
}
113
114
// Callback function, called from HTML parser for each URI found.
115
static void html_get_url(void *context, int flags, const char *tag, const char *attr, const char *val, size_t len, size_t pos WGET_GCC_UNUSED)
116
33.8k
{
117
33.8k
  html_context *ctx = context;
118
119
  // Read the encoding from META tag, e.g. from
120
  //   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">.
121
  // It overrides the encoding from the HTTP response resp. from the CLI.
122
  //
123
  // Also ,we are interested in ROBOTS e.g.
124
  //   <META name="ROBOTS" content="NOINDEX, NOFOLLOW">
125
33.8k
  if ((flags & XML_FLG_BEGIN)) {
126
8.53k
    if ((*tag|0x20) == 'a' && (tag[1] == 0 || !wget_strcasecmp_ascii(tag, "area"))) {
127
      // The download attribute is only valid for 'a' and 'area' tags.
128
      // S 4.6.5 in https://html.spec.whatwg.org/multipage/links.html#downloading-resources
129
483
      ctx->uri_index = -1;
130
483
      ctx->download.p = NULL;
131
483
      ctx->download.len = 0;
132
483
    }
133
8.05k
    else if ((*tag|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta")) {
134
1.25k
      ctx->found_robots = ctx->found_content_type = 0;
135
1.25k
    }
136
6.80k
    else if ((*tag|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) {
137
407
      ctx->link_inline = 0;
138
407
      ctx->uri_index = -1;
139
407
    }
140
8.53k
  }
141
142
33.8k
  if ((flags & XML_FLG_ATTRIBUTE) && val) {
143
12.9k
    wget_html_parsed_result *res = &ctx->result;
144
145
//    debug_printf("%02X %s %s '%.*s' %zu %zu\n", (unsigned) flags, tag, attr, (int) len, val, len, pos);
146
147
12.9k
    if ((*tag|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta")) {
148
5.23k
      if (!ctx->found_robots) {
149
4.31k
        if (!wget_strcasecmp_ascii(attr, "name") && !wget_strncasecmp_ascii(val, "robots", len)) {
150
367
          ctx->found_robots = 1;
151
367
          return;
152
367
        }
153
4.31k
      } else if (ctx->found_robots && !wget_strcasecmp_ascii(attr, "content")) {
154
683
        char valbuf[256], *valp;
155
683
        const char *value;
156
157
683
        if (!(value = valp = wget_strmemcpy_a(valbuf, sizeof(valbuf), val, len)))
158
0
          return;
159
160
9.59k
        while (*value) {
161
9.14k
          const char *p;
162
163
9.84k
          while (c_isspace(*value)) value++;
164
9.14k
          if (*value == ',') { value++; continue; }
165
63.2k
          for (p = value; *p && !c_isspace(*p) && *p != ','; p++);
166
8.77k
          if (p == value) break;
167
168
          // debug_printf("ROBOTS='%.*s'\n", (int)(p - value), value);
169
8.53k
          if (!wget_strncasecmp_ascii(value, "all", p - value) || !wget_strncasecmp_ascii(value, "follow", p - value))
170
3.13k
            res->follow = 1;
171
5.40k
          else if (!wget_strncasecmp_ascii(value, "nofollow", p - value) || !wget_strncasecmp_ascii(value, "none", p - value))
172
658
            res->follow = 0;
173
174
8.53k
          value = *p  ? p + 1 : p;
175
8.53k
        }
176
177
683
        if (valp != valbuf)
178
107
          xfree(valp);
179
180
683
        return;
181
683
      }
182
183
4.18k
      if (ctx->found_content_type && !res->encoding) {
184
2.18k
        if (!wget_strcasecmp_ascii(attr, "content")) {
185
1.96k
          char valbuf[256];
186
1.96k
          const char *value;
187
188
1.96k
          if (!(value = wget_strmemcpy_a(valbuf, sizeof(valbuf), val, len)))
189
0
            return;
190
191
1.96k
          wget_http_parse_content_type(value, NULL, &res->encoding);
192
193
1.96k
          if (value != valbuf)
194
112
            xfree(value);
195
1.96k
        }
196
2.18k
      }
197
2.00k
      else if (!ctx->found_content_type && !res->encoding) {
198
1.61k
        if (!wget_strcasecmp_ascii(attr, "http-equiv") && !wget_strncasecmp_ascii(val, "Content-Type", len)) {
199
591
          ctx->found_content_type = 1;
200
591
        }
201
1.02k
        else if (!wget_strcasecmp_ascii(attr, "charset")) {
202
9
          res->encoding = wget_strmemdup(val, len);
203
9
        }
204
1.61k
      }
205
206
4.18k
      return;
207
4.18k
    }
208
209
7.68k
    if (ctx->ignore_tags) {
210
0
      if (wget_vector_find(ctx->ignore_tags, &(wget_html_tag){ .name = tag, .attribute = NULL } ) != -1
211
0
        || wget_vector_find(ctx->ignore_tags, &(wget_html_tag){ .name = tag, .attribute = attr } ) != -1)
212
0
        return;
213
0
    }
214
215
7.68k
    if ((*attr|0x20) == 's' && !wget_strcasecmp_ascii(attr, "style") && len) {
216
752
      ctx->css_dir = tag;
217
752
      ctx->css_attr = "style";
218
752
      ctx->css_start_offset = val - ctx->html;
219
752
      wget_css_parse_buffer(val, len, css_parse_uri, NULL, context);
220
752
      return;
221
752
    }
222
223
6.93k
    if ((*tag|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) {
224
1.99k
      if (!wget_strcasecmp_ascii(attr, "rel")) {
225
1.70k
        ctx->link_inline = 0;
226
227
        // "rel" contains a space separated list of items.
228
        //   see https://html.spec.whatwg.org/multipage/semantics.html#attr-link-rel
229
        //   see https://html.spec.whatwg.org/multipage/links.html#linkTypes
230
2.91k
        while (len) {
231
2.59k
          const char *p;
232
233
38.2k
          for (p = val;len && !c_isspace(*val); val++, len--); // find end of item
234
2.59k
          if (p == val) { val++; len--; continue; } // found a delimiter
235
236
          // Check for items that may be important to display the page.
237
2.04k
          if (!wget_strncasecmp_ascii(p, "icon", val - p)
238
2.04k
            || !wget_strncasecmp_ascii(p, "manifest", val - p)
239
2.04k
            || !wget_strncasecmp_ascii(p, "modulepreload", val - p)
240
2.04k
            || !wget_strncasecmp_ascii(p, "stylesheet", val - p)
241
2.04k
            || !wget_strncasecmp_ascii(p, "prefetch", val - p)
242
2.04k
            || !wget_strncasecmp_ascii(p, "preload", val - p))
243
1.38k
          {
244
1.38k
            ctx->link_inline = 1;
245
1.38k
            break;
246
1.38k
          }
247
2.04k
        }
248
249
1.70k
        if (ctx->uri_index >= 0) {
250
          // href= came before rel=
251
218
          wget_html_parsed_url *url = wget_vector_get(res->uris, ctx->uri_index);
252
218
          if (url)
253
218
            url->link_inline = ctx->link_inline;
254
218
        }
255
1.70k
        return;
256
1.70k
      }
257
1.99k
    }
258
259
5.23k
    if ((*tag|0x20) == 'a' && (tag[1] == 0 || !wget_strcasecmp_ascii(tag, "area"))
260
5.23k
      && !wget_strcasecmp_ascii(attr, "download"))
261
0
    {
262
0
      if (!val)
263
0
        return;
264
265
0
      for (;len && c_isspace(*val); val++, len--); // skip leading spaces
266
0
      for (;len && c_isspace(val[len - 1]); len--);  // skip trailing spaces
267
0
      if (!len)
268
0
        return;
269
270
      // remember for later
271
0
      ctx->download.p = val;
272
0
      ctx->download.len = len;
273
274
0
      if (ctx->uri_index >= 0) {
275
        // href= came before download=
276
0
        wget_html_parsed_url *url = wget_vector_get(res->uris, ctx->uri_index);
277
0
        url->download.p = val;
278
0
        url->download.len = len;
279
0
      }
280
281
0
      return;
282
0
    }
283
284
    // shortcut to avoid unneeded calls to bsearch()
285
5.23k
    int found = 0;
286
287
    // search the static list for a tag/attr match
288
5.23k
    if (maybe[(unsigned char)*attr|0x20] && attr[1] && attr[2])
289
2.56k
      found = bsearch(attr, attrs, countof(attrs), sizeof(attrs[0]), (int(*)(const void *, const void *))wget_strcasecmp_ascii) != NULL;
290
291
    // search the dynamic list for a tag/attr match
292
5.23k
    if (!found && ctx->additional_tags) {
293
0
      if (wget_vector_find(ctx->additional_tags, &(wget_html_tag){ .name = tag, .attribute = NULL } ) != -1
294
0
        || wget_vector_find(ctx->additional_tags, &(wget_html_tag){ .name = tag, .attribute = attr } ) != -1)
295
0
        found = 1;
296
0
    }
297
298
5.23k
    if (found) {
299
2.73k
      for (;len && c_isspace(*val); val++, len--); // skip leading spaces
300
2.75k
      for (;len && c_isspace(val[len - 1]); len--);  // skip trailing spaces
301
302
2.14k
      if ((*tag|0x20) == 'b' && !wget_strcasecmp_ascii(tag, "base")) {
303
        // found a <BASE href="...">
304
194
        res->base.p = val;
305
194
        res->base.len = len;
306
194
        return;
307
194
      }
308
309
1.95k
      if (!res->uris)
310
383
        res->uris = wget_vector_create(32, NULL);
311
312
1.95k
      wget_html_parsed_url url;
313
314
1.95k
      if (!wget_strcasecmp_ascii(attr, "srcset")) {
315
        // value is a list of URLs, see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
316
        // See also https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
317
54.0k
        while (len) {
318
53.0k
          const char *p;
319
320
53.8k
          for (;len && c_isspace(*val); val++, len--); // skip leading spaces
321
138k
          for (p = val;len && !c_isspace(*val) && *val != ','; val++, len--); // find end of URL
322
53.0k
          if (p != val) {
323
            // The 'data:' URL contains a single comma: https://datatracker.ietf.org/doc/html/rfc2397
324
50.2k
            if (len && *val == ',' && !wget_strncasecmp_ascii(p, "data:", 5)) {
325
              // advance to the end of the 'data:' URL
326
3.59k
              for (val++, len--;len && !c_isspace(*val) && *val != ','; val++, len--);
327
1.72k
            }
328
50.2k
            url.download.p = NULL;
329
50.2k
            url.download.len = 0;
330
50.2k
            url.link_inline = ctx->link_inline;
331
50.2k
            wget_strscpy(url.attr, attr, sizeof(url.attr));
332
50.2k
            wget_strscpy(url.tag, tag, sizeof(url.tag));
333
50.2k
            url.url.p = p;
334
50.2k
            url.url.len = val - p;
335
50.2k
            wget_vector_add_memdup(res->uris, &url, sizeof(url));
336
50.2k
          }
337
59.7k
          for (;len && *val != ','; val++, len--); // skip optional width/density descriptor
338
53.0k
          if (len && *val == ',') { val++; len--; }
339
53.0k
        }
340
341
1.04k
      } else {
342
        // value is a single URL
343
914
        url.download.p = ctx->download.p;
344
914
        url.download.len = ctx->download.len;
345
914
        url.link_inline = ctx->link_inline;
346
914
        wget_strscpy(url.attr, attr, sizeof(url.attr));
347
914
        wget_strscpy(url.tag, tag, sizeof(url.tag));
348
914
        url.url.p = val;
349
914
        url.url.len = len;
350
914
        ctx->uri_index = wget_vector_add_memdup(res->uris, &url, sizeof(url));
351
914
      }
352
1.95k
    }
353
5.23k
  }
354
355
25.9k
  if (flags & XML_FLG_CONTENT && val && len && !wget_strcasecmp_ascii(tag, "style")) {
356
933
    ctx->css_dir = "style";
357
933
    ctx->css_attr = "";
358
933
    ctx->css_start_offset = val - ctx->html;
359
933
    wget_css_parse_buffer(val, len, css_parse_uri, NULL, context);
360
933
  }
361
25.9k
}
362
363
// Releases a parse result: its encoding string, its URL vector and the
// result object itself. Does nothing if 'res' or '*res' is NULL.
void wget_html_free_urls_inline (wget_html_parsed_result **res)
{
  if (!res || !*res)
    return;

  wget_html_parsed_result *parsed = *res;

  xfree(parsed->encoding);
  wget_vector_free(&parsed->uris);
  xfree(*res);
}
371
372
// Parses the HTML document 'html' and collects the URLs it references.
//
// html:            the HTML text handed to wget_html_parse_buffer()
// additional_tags: optional vector of wget_html_tag entries that are treated
//                  as URL carriers in addition to the built-in attribute list
// ignore_tags:     optional vector of wget_html_tag entries to skip
//
// Returns a heap-allocated copy of the collected result ('follow' defaults
// to 1), or NULL if that copy could not be allocated. Release it with
// wget_html_free_urls_inline(). The recorded URLs point into 'html'.
wget_html_parsed_result *wget_html_get_urls_inline(const char *html, wget_vector *additional_tags, wget_vector *ignore_tags)
{
  html_context context = {
    .result.follow = 1,
    .additional_tags = additional_tags,
    .ignore_tags = ignore_tags,
    .html = html,
  };

  wget_html_parse_buffer(html, html_get_url, &context, HTML_HINT_REMOVE_EMPTY_CONTENT);

  wget_html_parsed_result *res = wget_memdup(&context.result, sizeof(context.result));

  if (!res) {
    // The caller gets nothing to free, so release what the callbacks collected.
    xfree(context.result.encoding);
    wget_vector_free(&context.result.uris);
  }

  return res;
}