/src/wget2/libwget/html_url.c

Source (jump to first uncovered line)
/*
 * Copyright (c) 2013 Tim Ruehsen
 * Copyright (c) 2015-2024 Free Software Foundation, Inc.
 *
 * This file is part of libwget.
 *
 * Libwget is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Libwget is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * Extracting URLs from HTML
 *
 * Changelog
 * 26.09.2013  Tim Ruehsen  created
 *
 */

#include <config.h>

#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <c-ctype.h>

#include <wget.h>
#include "private.h"

// State carried through one HTML parse; handed to the HTML and CSS callbacks
// as their opaque 'context' pointer.
typedef struct {
  wget_html_parsed_result result;  // collected output (URIs, base, encoding, follow flag)
  wget_vector *additional_tags;    // optional caller-supplied tag/attribute pairs to also scan
  wget_vector *ignore_tags;        // optional caller-supplied tag/attribute pairs to skip
  wget_string download;            // last 'download' attribute value seen on an a/area tag
  int uri_index;                   // index in result.uris of the last single URL stored, -1 if none
  size_t css_start_offset;         // offset of the CSS text being parsed, relative to 'html'
  char found_robots;               // current <meta> had name="robots"
  char found_content_type;         // current <meta> had http-equiv="Content-Type"
  char link_inline;                // value copied into the link_inline field of stored URLs
  const char *html;                // start of the HTML buffer being parsed
  const char *css_attr;            // 'attr' string recorded for URLs found in CSS
  const char *css_dir;             // 'tag' string recorded for URLs found in CSS
} html_context;

// see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
// Cheap prefilter used before the attribute-name lookup: an entry is non-zero
// for each lowercase letter that begins at least one name in attrs[].
// All other entries are implicitly zero.
static const char maybe[256] = {
  ['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1,
  ['f'] = 1, ['h'] = 1, ['i'] = 1, ['l'] = 1,
  ['m'] = 1, ['p'] = 1, ['s'] = 1, ['u'] = 1,
};
// Names of attributes whose value is a URL (or, for srcset, a list of URLs).
//
// This table is searched with bsearch(), so it MUST stay sorted in ascending
// (ASCII, lowercase) order. An unsorted entry is silently never found.
// Each name must fit into 11 characters plus the terminating NUL.
static const char attrs[][12] = {
  "action", "archive",
  "background",
  "cite", "classid", "code", "codebase",
  "data",
  "formaction",
  "href",
  "icon",
  "longdesc", "lowsrc",
  "manifest",
  "poster", "profile",
  "src", "srcset",
  "usemap"
};

// Callback function, called from the CSS parser for each URI found in a
// style attribute or in the content of a <style> element.
//
// context: the html_context of the running HTML parse
// url:     unused; the stored pointer is computed from the HTML buffer instead
// len:     length of the URI in bytes
// pos:     offset of the URI relative to the start of the parsed CSS text
//
// The URI is appended to the result vector as an 'inline' URL that points
// into the original HTML buffer (no copy of the URL text is made).
static void css_parse_uri(void *context, const char *url WGET_GCC_UNUSED, size_t len, size_t pos)
{
  html_context *ctx = context;
  wget_html_parsed_result *res = &ctx->result;
  wget_html_parsed_url *parsed_url;

  // Make sure there is a vector to add to before allocating the element,
  // so that a failure here can't leak the element.
  if (!res->uris && !(res->uris = wget_vector_create(32, NULL)))
    return;

  if (!(parsed_url = wget_malloc(sizeof(*parsed_url))))
    return;

  parsed_url->link_inline = 1;
  wget_strscpy(parsed_url->attr, ctx->css_attr, sizeof(parsed_url->attr));
  wget_strscpy(parsed_url->tag, ctx->css_dir, sizeof(parsed_url->tag));
  parsed_url->url.p = (const char *) (ctx->html + ctx->css_start_offset + pos);
  parsed_url->url.len = len;
  parsed_url->download.p = NULL;
  parsed_url->download.len = 0;

  // On success the vector holds the element; on error (negative return value)
  // it is still ours and has to be released here.
  if (wget_vector_add(res->uris, parsed_url) < 0)
    xfree(parsed_url);
}

// Returns non-zero if 'tag' is "a" or "area" (ASCII case-insensitive).
static int html_is_anchor_tag(const char *tag)
{
  return (*tag|0x20) == 'a' && (tag[1] == 0 || !wget_strcasecmp_ascii(tag, "area"));
}

// Returns non-zero if the 'len' bytes at 's' are exactly 'word' (ASCII case-insensitive).
// A bare wget_strncasecmp_ascii(s, word, len) would also accept any prefix of
// 'word', including the empty string.
static int html_token_equals(const char *s, size_t len, const char *word)
{
  return len == strlen(word) && !wget_strncasecmp_ascii(s, word, len);
}

// bsearch() comparator: 'key' is the attribute name, 'elem' an entry of attrs[].
static int html_compare_attr(const void *key, const void *elem)
{
  return wget_strcasecmp_ascii(key, elem);
}

// Callback function, called from HTML parser for each URI found.
//
// context: the html_context of the running parse
// flags:   XML_FLG_* bits telling what kind of item is reported
// tag:     name of the current element
// attr:    name of the current attribute (used when XML_FLG_ATTRIBUTE is set)
// val,len: attribute value resp. element content, given as pointer + length
// pos:     unused
//
// Collects URLs (pointing into the HTML buffer) into ctx->result.uris and
// records <base href>, the encoding from <meta>, and the robots follow flag.
static void html_get_url(void *context, int flags, const char *tag, const char *attr, const char *val, size_t len, size_t pos WGET_GCC_UNUSED)
{
  html_context *ctx = context;

  // Read the encoding from META tag, e.g. from
  //   <meta http-equiv="Content-Type" content="text/html; charset=utf-8">.
  // It overrides the encoding from the HTTP response resp. from the CLI.
  //
  // Also, we are interested in ROBOTS e.g.
  //   <META name="ROBOTS" content="NOINDEX, NOFOLLOW">
  if ((flags & XML_FLG_BEGIN)) {
    // A new element starts: reset the per-element state.
    if (html_is_anchor_tag(tag)) {
      // The download attribute is only valid for 'a' and 'area' tags.
      // S 4.6.5 in https://html.spec.whatwg.org/multipage/links.html#downloading-resources
      ctx->uri_index = -1;
      ctx->download.p = NULL;
      ctx->download.len = 0;
    }
    else if ((*tag|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta")) {
      ctx->found_robots = ctx->found_content_type = 0;
    }
    else if ((*tag|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) {
      ctx->link_inline = 0;
      ctx->uri_index = -1;
    }
  }

  if ((flags & XML_FLG_ATTRIBUTE) && val) {
    wget_html_parsed_result *res = &ctx->result;

//    debug_printf("%02X %s %s '%.*s' %zu %zu\n", (unsigned) flags, tag, attr, (int) len, val, len, pos);

    if ((*tag|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta")) {
      if (!ctx->found_robots) {
        if (!wget_strcasecmp_ascii(attr, "name") && html_token_equals(val, len, "robots")) {
          ctx->found_robots = 1;
          return;
        }
      } else if (!wget_strcasecmp_ascii(attr, "content")) {
        // name="robots" was seen before: evaluate the comma/space separated directives
        char valbuf[256], *valp;
        const char *value;

        // NUL-terminated copy of the value (heap allocated if it doesn't fit into valbuf)
        if (!(value = valp = wget_strmemcpy_a(valbuf, sizeof(valbuf), val, len)))
          return;

        while (*value) {
          const char *p;

          while (c_isspace(*value)) value++;
          if (*value == ',') { value++; continue; }
          for (p = value; *p && !c_isspace(*p) && *p != ','; p++);
          if (p == value) break;

          // debug_printf("ROBOTS='%.*s'\n", (int)(p - value), value);
          size_t toklen = (size_t) (p - value);

          if (html_token_equals(value, toklen, "all") || html_token_equals(value, toklen, "follow"))
            res->follow = 1;
          else if (html_token_equals(value, toklen, "nofollow") || html_token_equals(value, toklen, "none"))
            res->follow = 0;

          value = *p  ? p + 1 : p;
        }

        if (valp != valbuf)
          xfree(valp);

        return;
      }

      if (ctx->found_content_type && !res->encoding) {
        if (!wget_strcasecmp_ascii(attr, "content")) {
          char valbuf[256];
          const char *value;

          if (!(value = wget_strmemcpy_a(valbuf, sizeof(valbuf), val, len)))
            return;

          wget_http_parse_content_type(value, NULL, &res->encoding);

          if (value != valbuf)
            xfree(value);
        }
      }
      else if (!ctx->found_content_type && !res->encoding) {
        if (!wget_strcasecmp_ascii(attr, "http-equiv") && html_token_equals(val, len, "Content-Type")) {
          ctx->found_content_type = 1;
        }
        else if (!wget_strcasecmp_ascii(attr, "charset")) {
          res->encoding = wget_strmemdup(val, len);
        }
      }

      return;
    }

    if (ctx->ignore_tags) {
      if (wget_vector_find(ctx->ignore_tags, &(wget_html_tag){ .name = tag, .attribute = NULL } ) != -1
        || wget_vector_find(ctx->ignore_tags, &(wget_html_tag){ .name = tag, .attribute = attr } ) != -1)
        return;
    }

    if ((*attr|0x20) == 's' && !wget_strcasecmp_ascii(attr, "style") && len) {
      // inline CSS: URLs are reported via css_parse_uri(), relative to 'val'
      ctx->css_dir = tag;
      ctx->css_attr = "style";
      ctx->css_start_offset = val - ctx->html;
      wget_css_parse_buffer(val, len, css_parse_uri, NULL, context);
      return;
    }

    if ((*tag|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) {
      if (!wget_strcasecmp_ascii(attr, "rel")) {
        ctx->link_inline = 0;

        // "rel" contains a space separated list of items.
        //   see https://html.spec.whatwg.org/multipage/semantics.html#attr-link-rel
        //   see https://html.spec.whatwg.org/multipage/links.html#linkTypes
        while (len) {
          const char *p;

          for (p = val;len && !c_isspace(*val); val++, len--); // find end of item
          if (p == val) { val++; len--; continue; } // found a delimiter

          // Check for items that may be important to display the page.
          size_t itemlen = (size_t) (val - p);

          if (html_token_equals(p, itemlen, "icon")
            || html_token_equals(p, itemlen, "manifest")
            || html_token_equals(p, itemlen, "modulepreload")
            || html_token_equals(p, itemlen, "stylesheet")
            || html_token_equals(p, itemlen, "prefetch")
            || html_token_equals(p, itemlen, "preload"))
          {
            ctx->link_inline = 1;
            break;
          }
        }

        if (ctx->uri_index >= 0) {
          // href= came before rel=
          wget_html_parsed_url *url = wget_vector_get(res->uris, ctx->uri_index);
          if (url)
            url->link_inline = ctx->link_inline;
        }
        return;
      }
    }

    const int is_anchor = html_is_anchor_tag(tag);

    if (is_anchor && !wget_strcasecmp_ascii(attr, "download")) {
      for (;len && c_isspace(*val); val++, len--); // skip leading spaces
      for (;len && c_isspace(val[len - 1]); len--);  // skip trailing spaces
      if (!len)
        return;

      // remember for later
      ctx->download.p = val;
      ctx->download.len = len;

      if (ctx->uri_index >= 0) {
        // href= came before download=
        wget_html_parsed_url *url = wget_vector_get(res->uris, ctx->uri_index);
        if (url) {
          url->download.p = val;
          url->download.len = len;
        }
      }

      return;
    }

    // shortcut to avoid unneeded calls to bsearch()
    int found = 0;

    // search the static list for a tag/attr match
    if (maybe[(unsigned char)*attr|0x20] && attr[1] && attr[2])
      found = bsearch(attr, attrs, countof(attrs), sizeof(attrs[0]), html_compare_attr) != NULL;

    // search the dynamic list for a tag/attr match
    if (!found && ctx->additional_tags) {
      if (wget_vector_find(ctx->additional_tags, &(wget_html_tag){ .name = tag, .attribute = NULL } ) != -1
        || wget_vector_find(ctx->additional_tags, &(wget_html_tag){ .name = tag, .attribute = attr } ) != -1)
        found = 1;
    }

    if (found) {
      for (;len && c_isspace(*val); val++, len--); // skip leading spaces
      for (;len && c_isspace(val[len - 1]); len--);  // skip trailing spaces

      if ((*tag|0x20) == 'b' && !wget_strcasecmp_ascii(tag, "base")) {
        // found a <BASE href="...">
        res->base.p = val;
        res->base.len = len;
        return;
      }

      if (!res->uris)
        res->uris = wget_vector_create(32, NULL);

      wget_html_parsed_url url;

      if (!wget_strcasecmp_ascii(attr, "srcset")) {
        // value is a list of URLs, see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
        // See also https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
        while (len) {
          const char *p;

          for (;len && c_isspace(*val); val++, len--); // skip leading spaces
          for (p = val;len && !c_isspace(*val) && *val != ','; val++, len--); // find end of URL
          if (p != val) {
            // The 'data:' URL contains a single comma: https://datatracker.ietf.org/doc/html/rfc2397
            if (len && *val == ',' && !wget_strncasecmp_ascii(p, "data:", 5)) {
              // advance to the end of the 'data:' URL
              for (val++, len--;len && !c_isspace(*val) && *val != ','; val++, len--);
            }
            url.download.p = NULL;
            url.download.len = 0;
            url.link_inline = ctx->link_inline;
            wget_strscpy(url.attr, attr, sizeof(url.attr));
            wget_strscpy(url.tag, tag, sizeof(url.tag));
            url.url.p = p;
            url.url.len = val - p;
            wget_vector_add_memdup(res->uris, &url, sizeof(url));
          }
          for (;len && *val != ','; val++, len--); // skip optional width/density descriptor
          if (len && *val == ',') { val++; len--; }
        }

      } else {
        // value is a single URL
        if (is_anchor) {
          // download= came before href=
          url.download.p = ctx->download.p;
          url.download.len = ctx->download.len;
        } else {
          // The remembered download name belongs to an earlier 'a'/'area' tag,
          // it must not leak into URLs of other elements.
          url.download.p = NULL;
          url.download.len = 0;
        }
        url.link_inline = ctx->link_inline;
        wget_strscpy(url.attr, attr, sizeof(url.attr));
        wget_strscpy(url.tag, tag, sizeof(url.tag));
        url.url.p = val;
        url.url.len = len;
        // remember the index, so that a later rel= or download= can amend this entry
        ctx->uri_index = wget_vector_add_memdup(res->uris, &url, sizeof(url));
      }
    }
  }

  if ((flags & XML_FLG_CONTENT) && val && len && !wget_strcasecmp_ascii(tag, "style")) {
    // content of a <style> element: URLs are reported via css_parse_uri()
    ctx->css_dir = "style";
    ctx->css_attr = "";
    ctx->css_start_offset = val - ctx->html;
    wget_css_parse_buffer(val, len, css_parse_uri, NULL, context);
  }
}

// Releases a result obtained from wget_html_get_urls_inline():
// the encoding string, the URI vector and the result structure itself.
// Does nothing if 'res' or '*res' is NULL.
void wget_html_free_urls_inline (wget_html_parsed_result **res)
{
  if (!res || !*res)
    return;

  wget_html_parsed_result *parsed = *res;

  xfree(parsed->encoding);
  wget_vector_free(&parsed->uris);
  xfree(*res);
}

// Parses 'html' and returns a newly allocated result holding the URLs found
// (as pointer/length pairs into 'html'), plus base, encoding and follow flag.
//
// html:            the HTML text to parse
// additional_tags: optional vector of extra tag/attribute pairs to scan for URLs
// ignore_tags:     optional vector of tag/attribute pairs to skip
//
// Returns NULL if the result structure could not be allocated.
// Free the result with wget_html_free_urls_inline().
wget_html_parsed_result *wget_html_get_urls_inline(const char *html, wget_vector *additional_tags, wget_vector *ignore_tags)
{
  html_context context = {
    .result.follow = 1, // following links is allowed unless a robots <meta> says otherwise
    .additional_tags = additional_tags,
    .ignore_tags = ignore_tags,
    .html = html,
  };

  wget_html_parse_buffer(html, html_get_url, &context, HTML_HINT_REMOVE_EMPTY_CONTENT);

  wget_html_parsed_result *res = wget_memdup(&context.result, sizeof(context.result));

  if (!res) {
    // Nobody else can reach what the parse collected - release it here.
    xfree(context.result.encoding);
    wget_vector_free(&context.result.uris);
  }

  return res;
}

Coverage Report

Created: 2024-03-08 06:32

Line	Count	Source (jump to first uncovered line)
1		/*
2		* Copyright (c) 2013 Tim Ruehsen
3		* Copyright (c) 2015-2024 Free Software Foundation, Inc.
4		*
5		* This file is part of libwget.
6		*
7		* Libwget is free software: you can redistribute it and/or modify
8		* it under the terms of the GNU Lesser General Public License as published by
9		* the Free Software Foundation, either version 3 of the License, or
10		* (at your option) any later version.
11		*
12		* Libwget is distributed in the hope that it will be useful,
13		* but WITHOUT ANY WARRANTY; without even the implied warranty of
14		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15		* GNU Lesser General Public License for more details.
16		*
17		* You should have received a copy of the GNU Lesser General Public License
18		* along with libwget. If not, see <https://www.gnu.org/licenses/>.
19		*
20		*
21		* Extracting URLs from HTML
22		*
23		* Changelog
24		* 26.09.2013 Tim Ruehsen created
25		*
26		*/
27
28		#include <config.h>
29
30		#include <unistd.h>
31		#include <stdlib.h>
32		#include <string.h>
33		#include <c-ctype.h>
34
35		#include <wget.h>
36		#include "private.h"
37
38		typedef struct {
39		wget_html_parsed_result
40		result;
41		wget_vector *
42		additional_tags;
43		wget_vector *
44		ignore_tags;
45		wget_string
46		download;
47		int
48		uri_index;
49		size_t
50		css_start_offset;
51		char
52		found_robots,
53		found_content_type,
54		link_inline;
55		const char
56		* html,
57		* css_attr,
58		* css_dir;
59		} html_context;
60
61		// see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
62		static const char maybe[256] = {
63		['a'] = 1,
64		['b'] = 1,
65		['c'] = 1,
66		['d'] = 1,
67		['f'] = 1,
68		['h'] = 1,
69		['i'] = 1,
70		['l'] = 1,
71		['m'] = 1,
72		['p'] = 1,
73		['s'] = 1,
74		['u'] = 1,
75		};
76		static const char attrs[][12] = {
77		"action", "archive",
78		"background",
79		"code", "codebase", "cite", "classid",
80		"data",
81		"formaction",
82		"href",
83		"icon",
84		"lowsrc", "longdesc",
85		"manifest",
86		"profile", "poster",
87		"src", "srcset",
88		"usemap"
89		};
90
91		static void css_parse_uri(void *context, const char *url WGET_GCC_UNUSED, size_t len, size_t pos)
92	1.43k	{
93	1.43k	html_context *ctx = context;
94	1.43k	wget_html_parsed_result *res = &ctx->result;
95	1.43k	wget_html_parsed_url *parsed_url;
96
97	1.43k	if (!(parsed_url = wget_malloc(sizeof(wget_html_parsed_url))))
98	0	return;
99
100	1.43k	parsed_url->link_inline = 1;
101	1.43k	wget_strscpy(parsed_url->attr, ctx->css_attr, sizeof(parsed_url->attr));
102	1.43k	wget_strscpy(parsed_url->tag, ctx->css_dir, sizeof(parsed_url->tag));
103	1.43k	parsed_url->url.p = (const char *) (ctx->html + ctx->css_start_offset + pos);
104	1.43k	parsed_url->url.len = len;
105	1.43k	parsed_url->download.p = NULL;
106	1.43k	parsed_url->download.len = 0;
107
108	1.43k	if (!res->uris)
109	122	res->uris = wget_vector_create(32, NULL);
110
111	1.43k	wget_vector_add(res->uris, parsed_url);
112	1.43k	}
113
114		// Callback function, called from HTML parser for each URI found.
115		static void html_get_url(void *context, int flags, const char *tag, const char *attr, const char *val, size_t len, size_t pos WGET_GCC_UNUSED)
116	33.8k	{
117	33.8k	html_context *ctx = context;
118
119		// Read the encoding from META tag, e.g. from
120		// <meta http-equiv="Content-Type" content="text/html; charset=utf-8">.
121		// It overrides the encoding from the HTTP response resp. from the CLI.
122		//
123		// Also ,we are interested in ROBOTS e.g.
124		// <META name="ROBOTS" content="NOINDEX, NOFOLLOW">
125	33.8k	if ((flags & XML_FLG_BEGIN)) {
126	8.53k	if ((*tag\|0x20) == 'a' && (tag[1] == 0 \|\| !wget_strcasecmp_ascii(tag, "area"))) {
127		// The download attribute is only valid for 'a' and 'area' tags.
128		// S 4.6.5 in https://html.spec.whatwg.org/multipage/links.html#downloading-resources
129	483	ctx->uri_index = -1;
130	483	ctx->download.p = NULL;
131	483	ctx->download.len = 0;
132	483	}
133	8.05k	else if ((*tag\|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta")) {
134	1.25k	ctx->found_robots = ctx->found_content_type = 0;
135	1.25k	}
136	6.80k	else if ((*tag\|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) {
137	407	ctx->link_inline = 0;
138	407	ctx->uri_index = -1;
139	407	}
140	8.53k	}
141
142	33.8k	if ((flags & XML_FLG_ATTRIBUTE) && val) {
143	12.9k	wget_html_parsed_result *res = &ctx->result;
144
145		// debug_printf("%02X %s %s '%.*s' %zu %zu\n", (unsigned) flags, tag, attr, (int) len, val, len, pos);
146
147	12.9k	if ((*tag\|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta")) {
148	5.23k	if (!ctx->found_robots) {
149	4.31k	if (!wget_strcasecmp_ascii(attr, "name") && !wget_strncasecmp_ascii(val, "robots", len)) {
150	367	ctx->found_robots = 1;
151	367	return;
152	367	}
153	4.31k	} else if (ctx->found_robots && !wget_strcasecmp_ascii(attr, "content")) {
154	683	char valbuf[256], *valp;
155	683	const char *value;
156
157	683	if (!(value = valp = wget_strmemcpy_a(valbuf, sizeof(valbuf), val, len)))
158	0	return;
159
160	9.59k	while (*value) {
161	9.14k	const char *p;
162
163	9.84k	while (c_isspace(*value)) value++;
164	9.14k	if (*value == ',') { value++; continue; }
165	63.2k	for (p = value; *p && !c_isspace(*p) && *p != ','; p++);
166	8.77k	if (p == value) break;
167
168		// debug_printf("ROBOTS='%.*s'\n", (int)(p - value), value);
169	8.53k	if (!wget_strncasecmp_ascii(value, "all", p - value) \|\| !wget_strncasecmp_ascii(value, "follow", p - value))
170	3.13k	res->follow = 1;
171	5.40k	else if (!wget_strncasecmp_ascii(value, "nofollow", p - value) \|\| !wget_strncasecmp_ascii(value, "none", p - value))
172	658	res->follow = 0;
173
174	8.53k	value = *p ? p + 1 : p;
175	8.53k	}
176
177	683	if (valp != valbuf)
178	107	xfree(valp);
179
180	683	return;
181	683	}
182
183	4.18k	if (ctx->found_content_type && !res->encoding) {
184	2.18k	if (!wget_strcasecmp_ascii(attr, "content")) {
185	1.96k	char valbuf[256];
186	1.96k	const char *value;
187
188	1.96k	if (!(value = wget_strmemcpy_a(valbuf, sizeof(valbuf), val, len)))
189	0	return;
190
191	1.96k	wget_http_parse_content_type(value, NULL, &res->encoding);
192
193	1.96k	if (value != valbuf)
194	112	xfree(value);
195	1.96k	}
196	2.18k	}
197	2.00k	else if (!ctx->found_content_type && !res->encoding) {
198	1.61k	if (!wget_strcasecmp_ascii(attr, "http-equiv") && !wget_strncasecmp_ascii(val, "Content-Type", len)) {
199	591	ctx->found_content_type = 1;
200	591	}
201	1.02k	else if (!wget_strcasecmp_ascii(attr, "charset")) {
202	9	res->encoding = wget_strmemdup(val, len);
203	9	}
204	1.61k	}
205
206	4.18k	return;
207	4.18k	}
208
209	7.68k	if (ctx->ignore_tags) {
210	0	if (wget_vector_find(ctx->ignore_tags, &(wget_html_tag){ .name = tag, .attribute = NULL } ) != -1
211	0	\|\| wget_vector_find(ctx->ignore_tags, &(wget_html_tag){ .name = tag, .attribute = attr } ) != -1)
212	0	return;
213	0	}
214
215	7.68k	if ((*attr\|0x20) == 's' && !wget_strcasecmp_ascii(attr, "style") && len) {
216	752	ctx->css_dir = tag;
217	752	ctx->css_attr = "style";
218	752	ctx->css_start_offset = val - ctx->html;
219	752	wget_css_parse_buffer(val, len, css_parse_uri, NULL, context);
220	752	return;
221	752	}
222
223	6.93k	if ((*tag\|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) {
224	1.99k	if (!wget_strcasecmp_ascii(attr, "rel")) {
225	1.70k	ctx->link_inline = 0;
226
227		// "rel" contains a space separated list of items.
228		// see https://html.spec.whatwg.org/multipage/semantics.html#attr-link-rel
229		// see https://html.spec.whatwg.org/multipage/links.html#linkTypes
230	2.91k	while (len) {
231	2.59k	const char *p;
232
233	38.2k	for (p = val;len && !c_isspace(*val); val++, len--); // find end of item
234	2.59k	if (p == val) { val++; len--; continue; } // found a delimiter
235
236		// Check for items that may be important to display the page.
237	2.04k	if (!wget_strncasecmp_ascii(p, "icon", val - p)
238	2.04k	\|\| !wget_strncasecmp_ascii(p, "manifest", val - p)
239	2.04k	\|\| !wget_strncasecmp_ascii(p, "modulepreload", val - p)
240	2.04k	\|\| !wget_strncasecmp_ascii(p, "stylesheet", val - p)
241	2.04k	\|\| !wget_strncasecmp_ascii(p, "prefetch", val - p)
242	2.04k	\|\| !wget_strncasecmp_ascii(p, "preload", val - p))
243	1.38k	{
244	1.38k	ctx->link_inline = 1;
245	1.38k	break;
246	1.38k	}
247	2.04k	}
248
249	1.70k	if (ctx->uri_index >= 0) {
250		// href= came before rel=
251	218	wget_html_parsed_url *url = wget_vector_get(res->uris, ctx->uri_index);
252	218	if (url)
253	218	url->link_inline = ctx->link_inline;
254	218	}
255	1.70k	return;
256	1.70k	}
257	1.99k	}
258
259	5.23k	if ((*tag\|0x20) == 'a' && (tag[1] == 0 \|\| !wget_strcasecmp_ascii(tag, "area"))
260	5.23k	&& !wget_strcasecmp_ascii(attr, "download"))
261	0	{
262	0	if (!val)
263	0	return;
264
265	0	for (;len && c_isspace(*val); val++, len--); // skip leading spaces
266	0	for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces
267	0	if (!len)
268	0	return;
269
270		// remember for later
271	0	ctx->download.p = val;
272	0	ctx->download.len = len;
273
274	0	if (ctx->uri_index >= 0) {
275		// href= came before download=
276	0	wget_html_parsed_url *url = wget_vector_get(res->uris, ctx->uri_index);
277	0	url->download.p = val;
278	0	url->download.len = len;
279	0	}
280
281	0	return;
282	0	}
283
284		// shortcut to avoid unneeded calls to bsearch()
285	5.23k	int found = 0;
286
287		// search the static list for a tag/attr match
288	5.23k	if (maybe[(unsigned char)*attr\|0x20] && attr[1] && attr[2])
289	2.56k	found = bsearch(attr, attrs, countof(attrs), sizeof(attrs[0]), (int(*)(const void *, const void *))wget_strcasecmp_ascii) != NULL;
290
291		// search the dynamic list for a tag/attr match
292	5.23k	if (!found && ctx->additional_tags) {
293	0	if (wget_vector_find(ctx->additional_tags, &(wget_html_tag){ .name = tag, .attribute = NULL } ) != -1
294	0	\|\| wget_vector_find(ctx->additional_tags, &(wget_html_tag){ .name = tag, .attribute = attr } ) != -1)
295	0	found = 1;
296	0	}
297
298	5.23k	if (found) {
299	2.73k	for (;len && c_isspace(*val); val++, len--); // skip leading spaces
300	2.75k	for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces
301
302	2.14k	if ((*tag\|0x20) == 'b' && !wget_strcasecmp_ascii(tag, "base")) {
303		// found a <BASE href="...">
304	194	res->base.p = val;
305	194	res->base.len = len;
306	194	return;
307	194	}
308
309	1.95k	if (!res->uris)
310	383	res->uris = wget_vector_create(32, NULL);
311
312	1.95k	wget_html_parsed_url url;
313
314	1.95k	if (!wget_strcasecmp_ascii(attr, "srcset")) {
315		// value is a list of URLs, see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset
316		// See also https://html.spec.whatwg.org/multipage/images.html#srcset-attribute
317	54.0k	while (len) {
318	53.0k	const char *p;
319
320	53.8k	for (;len && c_isspace(*val); val++, len--); // skip leading spaces
321	138k	for (p = val;len && !c_isspace(*val) && *val != ','; val++, len--); // find end of URL
322	53.0k	if (p != val) {
323		// The 'data:' URL contains a single comma: https://datatracker.ietf.org/doc/html/rfc2397
324	50.2k	if (len && *val == ',' && !wget_strncasecmp_ascii(p, "data:", 5)) {
325		// advance to the end of the 'data:' URL
326	3.59k	for (val++, len--;len && !c_isspace(*val) && *val != ','; val++, len--);
327	1.72k	}
328	50.2k	url.download.p = NULL;
329	50.2k	url.download.len = 0;
330	50.2k	url.link_inline = ctx->link_inline;
331	50.2k	wget_strscpy(url.attr, attr, sizeof(url.attr));
332	50.2k	wget_strscpy(url.tag, tag, sizeof(url.tag));
333	50.2k	url.url.p = p;
334	50.2k	url.url.len = val - p;
335	50.2k	wget_vector_add_memdup(res->uris, &url, sizeof(url));
336	50.2k	}
337	59.7k	for (;len && *val != ','; val++, len--); // skip optional width/density descriptor
338	53.0k	if (len && *val == ',') { val++; len--; }
339	53.0k	}
340
341	1.04k	} else {
342		// value is a single URL
343	914	url.download.p = ctx->download.p;
344	914	url.download.len = ctx->download.len;
345	914	url.link_inline = ctx->link_inline;
346	914	wget_strscpy(url.attr, attr, sizeof(url.attr));
347	914	wget_strscpy(url.tag, tag, sizeof(url.tag));
348	914	url.url.p = val;
349	914	url.url.len = len;
350	914	ctx->uri_index = wget_vector_add_memdup(res->uris, &url, sizeof(url));
351	914	}
352	1.95k	}
353	5.23k	}
354
355	25.9k	if (flags & XML_FLG_CONTENT && val && len && !wget_strcasecmp_ascii(tag, "style")) {
356	933	ctx->css_dir = "style";
357	933	ctx->css_attr = "";
358	933	ctx->css_start_offset = val - ctx->html;
359	933	wget_css_parse_buffer(val, len, css_parse_uri, NULL, context);
360	933	}
361	25.9k	}
362
363		void wget_html_free_urls_inline (wget_html_parsed_result **res)
364	3.64k	{
365	3.64k	if (res && *res) {
366	3.64k	xfree((*res)->encoding);
367	3.64k	wget_vector_free(&(*res)->uris);
368	3.64k	xfree(*res);
369	3.64k	}
370	3.64k	}
371
372		wget_html_parsed_result *wget_html_get_urls_inline(const char *html, wget_vector *additional_tags, wget_vector *ignore_tags)
373	3.64k	{
374	3.64k	html_context context = {
375	3.64k	.result.follow = 1,
376	3.64k	.additional_tags = additional_tags,
377	3.64k	.ignore_tags = ignore_tags,
378	3.64k	.html = html,
379	3.64k	};
380
381		// context.result.uris = wget_vector_create(32, -2, NULL);
382	3.64k	wget_html_parse_buffer(html, html_get_url, &context, HTML_HINT_REMOVE_EMPTY_CONTENT);
383
384	3.64k	return wget_memdup(&context.result, sizeof(context.result));
385	3.64k	}