/src/wget2/libwget/html_url.c
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2013 Tim Ruehsen |
3 | | * Copyright (c) 2015-2024 Free Software Foundation, Inc. |
4 | | * |
5 | | * This file is part of libwget. |
6 | | * |
7 | | * Libwget is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as published by |
9 | | * the Free Software Foundation, either version 3 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * Libwget is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libwget. If not, see <https://www.gnu.org/licenses/>. |
19 | | * |
20 | | * |
21 | | * Extracting URLs from HTML |
22 | | * |
23 | | * Changelog |
24 | | * 26.09.2013 Tim Ruehsen created |
25 | | * |
26 | | */ |
27 | | |
28 | | #include <config.h> |
29 | | |
30 | | #include <unistd.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | #include <c-ctype.h> |
34 | | |
35 | | #include <wget.h> |
36 | | #include "private.h" |
37 | | |
38 | | typedef struct { |
39 | | wget_html_parsed_result |
40 | | result; |
41 | | wget_vector * |
42 | | additional_tags; |
43 | | wget_vector * |
44 | | ignore_tags; |
45 | | wget_string |
46 | | download; |
47 | | int |
48 | | uri_index; |
49 | | size_t |
50 | | css_start_offset; |
51 | | char |
52 | | found_robots, |
53 | | found_content_type, |
54 | | link_inline; |
55 | | const char |
56 | | * html, |
57 | | * css_attr, |
58 | | * css_dir; |
59 | | } html_context; |
60 | | |
// Quick pre-filter, indexed by the lowercased first character of an attribute
// name: non-zero if some entry of the URL attribute list starts with that letter.
// see https://stackoverflow.com/questions/2725156/complete-list-of-html-tag-attributes-which-have-a-url-value
static const char maybe[256] = {
	['a'] = 1, ['b'] = 1, ['c'] = 1, ['d'] = 1,
	['f'] = 1, ['h'] = 1, ['i'] = 1, ['l'] = 1,
	['m'] = 1, ['p'] = 1, ['s'] = 1, ['u'] = 1,
};
// Names of HTML attributes whose value is a URL (or, for srcset, a list of URLs).
//
// This table is searched with bsearch(), so it MUST stay sorted in ascending
// (case-insensitive ASCII) order. Unsorted entries are silently never found.
// Each name must fit into 11 characters plus the terminating NUL.
static const char attrs[][12] = {
	"action", "archive",
	"background",
	"cite", "classid", "code", "codebase",
	"data",
	"formaction",
	"href",
	"icon",
	"longdesc", "lowsrc",
	"manifest",
	"poster", "profile",
	"src", "srcset",
	"usemap"
};
90 | | |
91 | | static void css_parse_uri(void *context, const char *url WGET_GCC_UNUSED, size_t len, size_t pos) |
92 | 1.43k | { |
93 | 1.43k | html_context *ctx = context; |
94 | 1.43k | wget_html_parsed_result *res = &ctx->result; |
95 | 1.43k | wget_html_parsed_url *parsed_url; |
96 | | |
97 | 1.43k | if (!(parsed_url = wget_malloc(sizeof(wget_html_parsed_url)))) |
98 | 0 | return; |
99 | | |
100 | 1.43k | parsed_url->link_inline = 1; |
101 | 1.43k | wget_strscpy(parsed_url->attr, ctx->css_attr, sizeof(parsed_url->attr)); |
102 | 1.43k | wget_strscpy(parsed_url->tag, ctx->css_dir, sizeof(parsed_url->tag)); |
103 | 1.43k | parsed_url->url.p = (const char *) (ctx->html + ctx->css_start_offset + pos); |
104 | 1.43k | parsed_url->url.len = len; |
105 | 1.43k | parsed_url->download.p = NULL; |
106 | 1.43k | parsed_url->download.len = 0; |
107 | | |
108 | 1.43k | if (!res->uris) |
109 | 122 | res->uris = wget_vector_create(32, NULL); |
110 | | |
111 | 1.43k | wget_vector_add(res->uris, parsed_url); |
112 | 1.43k | } |
113 | | |
114 | | // Callback function, called from HTML parser for each URI found. |
115 | | static void html_get_url(void *context, int flags, const char *tag, const char *attr, const char *val, size_t len, size_t pos WGET_GCC_UNUSED) |
116 | 33.8k | { |
117 | 33.8k | html_context *ctx = context; |
118 | | |
119 | | // Read the encoding from META tag, e.g. from |
120 | | // <meta http-equiv="Content-Type" content="text/html; charset=utf-8">. |
121 | | // It overrides the encoding from the HTTP response resp. from the CLI. |
122 | | // |
123 | | // Also ,we are interested in ROBOTS e.g. |
124 | | // <META name="ROBOTS" content="NOINDEX, NOFOLLOW"> |
125 | 33.8k | if ((flags & XML_FLG_BEGIN)) { |
126 | 8.53k | if ((*tag|0x20) == 'a' && (tag[1] == 0 || !wget_strcasecmp_ascii(tag, "area"))) { |
127 | | // The download attribute is only valid for 'a' and 'area' tags. |
128 | | // S 4.6.5 in https://html.spec.whatwg.org/multipage/links.html#downloading-resources |
129 | 483 | ctx->uri_index = -1; |
130 | 483 | ctx->download.p = NULL; |
131 | 483 | ctx->download.len = 0; |
132 | 483 | } |
133 | 8.05k | else if ((*tag|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta")) { |
134 | 1.25k | ctx->found_robots = ctx->found_content_type = 0; |
135 | 1.25k | } |
136 | 6.80k | else if ((*tag|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) { |
137 | 407 | ctx->link_inline = 0; |
138 | 407 | ctx->uri_index = -1; |
139 | 407 | } |
140 | 8.53k | } |
141 | | |
142 | 33.8k | if ((flags & XML_FLG_ATTRIBUTE) && val) { |
143 | 12.9k | wget_html_parsed_result *res = &ctx->result; |
144 | | |
145 | | // debug_printf("%02X %s %s '%.*s' %zu %zu\n", (unsigned) flags, tag, attr, (int) len, val, len, pos); |
146 | | |
147 | 12.9k | if ((*tag|0x20) == 'm' && !wget_strcasecmp_ascii(tag, "meta")) { |
148 | 5.23k | if (!ctx->found_robots) { |
149 | 4.31k | if (!wget_strcasecmp_ascii(attr, "name") && !wget_strncasecmp_ascii(val, "robots", len)) { |
150 | 367 | ctx->found_robots = 1; |
151 | 367 | return; |
152 | 367 | } |
153 | 4.31k | } else if (ctx->found_robots && !wget_strcasecmp_ascii(attr, "content")) { |
154 | 683 | char valbuf[256], *valp; |
155 | 683 | const char *value; |
156 | | |
157 | 683 | if (!(value = valp = wget_strmemcpy_a(valbuf, sizeof(valbuf), val, len))) |
158 | 0 | return; |
159 | | |
160 | 9.59k | while (*value) { |
161 | 9.14k | const char *p; |
162 | | |
163 | 9.84k | while (c_isspace(*value)) value++; |
164 | 9.14k | if (*value == ',') { value++; continue; } |
165 | 63.2k | for (p = value; *p && !c_isspace(*p) && *p != ','; p++); |
166 | 8.77k | if (p == value) break; |
167 | | |
168 | | // debug_printf("ROBOTS='%.*s'\n", (int)(p - value), value); |
169 | 8.53k | if (!wget_strncasecmp_ascii(value, "all", p - value) || !wget_strncasecmp_ascii(value, "follow", p - value)) |
170 | 3.13k | res->follow = 1; |
171 | 5.40k | else if (!wget_strncasecmp_ascii(value, "nofollow", p - value) || !wget_strncasecmp_ascii(value, "none", p - value)) |
172 | 658 | res->follow = 0; |
173 | | |
174 | 8.53k | value = *p ? p + 1 : p; |
175 | 8.53k | } |
176 | | |
177 | 683 | if (valp != valbuf) |
178 | 107 | xfree(valp); |
179 | | |
180 | 683 | return; |
181 | 683 | } |
182 | | |
183 | 4.18k | if (ctx->found_content_type && !res->encoding) { |
184 | 2.18k | if (!wget_strcasecmp_ascii(attr, "content")) { |
185 | 1.96k | char valbuf[256]; |
186 | 1.96k | const char *value; |
187 | | |
188 | 1.96k | if (!(value = wget_strmemcpy_a(valbuf, sizeof(valbuf), val, len))) |
189 | 0 | return; |
190 | | |
191 | 1.96k | wget_http_parse_content_type(value, NULL, &res->encoding); |
192 | | |
193 | 1.96k | if (value != valbuf) |
194 | 112 | xfree(value); |
195 | 1.96k | } |
196 | 2.18k | } |
197 | 2.00k | else if (!ctx->found_content_type && !res->encoding) { |
198 | 1.61k | if (!wget_strcasecmp_ascii(attr, "http-equiv") && !wget_strncasecmp_ascii(val, "Content-Type", len)) { |
199 | 591 | ctx->found_content_type = 1; |
200 | 591 | } |
201 | 1.02k | else if (!wget_strcasecmp_ascii(attr, "charset")) { |
202 | 9 | res->encoding = wget_strmemdup(val, len); |
203 | 9 | } |
204 | 1.61k | } |
205 | | |
206 | 4.18k | return; |
207 | 4.18k | } |
208 | | |
209 | 7.68k | if (ctx->ignore_tags) { |
210 | 0 | if (wget_vector_find(ctx->ignore_tags, &(wget_html_tag){ .name = tag, .attribute = NULL } ) != -1 |
211 | 0 | || wget_vector_find(ctx->ignore_tags, &(wget_html_tag){ .name = tag, .attribute = attr } ) != -1) |
212 | 0 | return; |
213 | 0 | } |
214 | | |
215 | 7.68k | if ((*attr|0x20) == 's' && !wget_strcasecmp_ascii(attr, "style") && len) { |
216 | 752 | ctx->css_dir = tag; |
217 | 752 | ctx->css_attr = "style"; |
218 | 752 | ctx->css_start_offset = val - ctx->html; |
219 | 752 | wget_css_parse_buffer(val, len, css_parse_uri, NULL, context); |
220 | 752 | return; |
221 | 752 | } |
222 | | |
223 | 6.93k | if ((*tag|0x20) == 'l' && !wget_strcasecmp_ascii(tag, "link")) { |
224 | 1.99k | if (!wget_strcasecmp_ascii(attr, "rel")) { |
225 | 1.70k | ctx->link_inline = 0; |
226 | | |
227 | | // "rel" contains a space separated list of items. |
228 | | // see https://html.spec.whatwg.org/multipage/semantics.html#attr-link-rel |
229 | | // see https://html.spec.whatwg.org/multipage/links.html#linkTypes |
230 | 2.91k | while (len) { |
231 | 2.59k | const char *p; |
232 | | |
233 | 38.2k | for (p = val;len && !c_isspace(*val); val++, len--); // find end of item |
234 | 2.59k | if (p == val) { val++; len--; continue; } // found a delimiter |
235 | | |
236 | | // Check for items that may be important to display the page. |
237 | 2.04k | if (!wget_strncasecmp_ascii(p, "icon", val - p) |
238 | 2.04k | || !wget_strncasecmp_ascii(p, "manifest", val - p) |
239 | 2.04k | || !wget_strncasecmp_ascii(p, "modulepreload", val - p) |
240 | 2.04k | || !wget_strncasecmp_ascii(p, "stylesheet", val - p) |
241 | 2.04k | || !wget_strncasecmp_ascii(p, "prefetch", val - p) |
242 | 2.04k | || !wget_strncasecmp_ascii(p, "preload", val - p)) |
243 | 1.38k | { |
244 | 1.38k | ctx->link_inline = 1; |
245 | 1.38k | break; |
246 | 1.38k | } |
247 | 2.04k | } |
248 | | |
249 | 1.70k | if (ctx->uri_index >= 0) { |
250 | | // href= came before rel= |
251 | 218 | wget_html_parsed_url *url = wget_vector_get(res->uris, ctx->uri_index); |
252 | 218 | if (url) |
253 | 218 | url->link_inline = ctx->link_inline; |
254 | 218 | } |
255 | 1.70k | return; |
256 | 1.70k | } |
257 | 1.99k | } |
258 | | |
259 | 5.23k | if ((*tag|0x20) == 'a' && (tag[1] == 0 || !wget_strcasecmp_ascii(tag, "area")) |
260 | 5.23k | && !wget_strcasecmp_ascii(attr, "download")) |
261 | 0 | { |
262 | 0 | if (!val) |
263 | 0 | return; |
264 | | |
265 | 0 | for (;len && c_isspace(*val); val++, len--); // skip leading spaces |
266 | 0 | for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces |
267 | 0 | if (!len) |
268 | 0 | return; |
269 | | |
270 | | // remember for later |
271 | 0 | ctx->download.p = val; |
272 | 0 | ctx->download.len = len; |
273 | |
|
274 | 0 | if (ctx->uri_index >= 0) { |
275 | | // href= came before download= |
276 | 0 | wget_html_parsed_url *url = wget_vector_get(res->uris, ctx->uri_index); |
277 | 0 | url->download.p = val; |
278 | 0 | url->download.len = len; |
279 | 0 | } |
280 | |
|
281 | 0 | return; |
282 | 0 | } |
283 | | |
284 | | // shortcut to avoid unneeded calls to bsearch() |
285 | 5.23k | int found = 0; |
286 | | |
287 | | // search the static list for a tag/attr match |
288 | 5.23k | if (maybe[(unsigned char)*attr|0x20] && attr[1] && attr[2]) |
289 | 2.56k | found = bsearch(attr, attrs, countof(attrs), sizeof(attrs[0]), (int(*)(const void *, const void *))wget_strcasecmp_ascii) != NULL; |
290 | | |
291 | | // search the dynamic list for a tag/attr match |
292 | 5.23k | if (!found && ctx->additional_tags) { |
293 | 0 | if (wget_vector_find(ctx->additional_tags, &(wget_html_tag){ .name = tag, .attribute = NULL } ) != -1 |
294 | 0 | || wget_vector_find(ctx->additional_tags, &(wget_html_tag){ .name = tag, .attribute = attr } ) != -1) |
295 | 0 | found = 1; |
296 | 0 | } |
297 | | |
298 | 5.23k | if (found) { |
299 | 2.73k | for (;len && c_isspace(*val); val++, len--); // skip leading spaces |
300 | 2.75k | for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces |
301 | | |
302 | 2.14k | if ((*tag|0x20) == 'b' && !wget_strcasecmp_ascii(tag, "base")) { |
303 | | // found a <BASE href="..."> |
304 | 194 | res->base.p = val; |
305 | 194 | res->base.len = len; |
306 | 194 | return; |
307 | 194 | } |
308 | | |
309 | 1.95k | if (!res->uris) |
310 | 383 | res->uris = wget_vector_create(32, NULL); |
311 | | |
312 | 1.95k | wget_html_parsed_url url; |
313 | | |
314 | 1.95k | if (!wget_strcasecmp_ascii(attr, "srcset")) { |
315 | | // value is a list of URLs, see https://html.spec.whatwg.org/multipage/embedded-content.html#attr-img-srcset |
316 | | // See also https://html.spec.whatwg.org/multipage/images.html#srcset-attribute |
317 | 54.0k | while (len) { |
318 | 53.0k | const char *p; |
319 | | |
320 | 53.8k | for (;len && c_isspace(*val); val++, len--); // skip leading spaces |
321 | 138k | for (p = val;len && !c_isspace(*val) && *val != ','; val++, len--); // find end of URL |
322 | 53.0k | if (p != val) { |
323 | | // The 'data:' URL contains a single comma: https://datatracker.ietf.org/doc/html/rfc2397 |
324 | 50.2k | if (len && *val == ',' && !wget_strncasecmp_ascii(p, "data:", 5)) { |
325 | | // advance to the end of the 'data:' URL |
326 | 3.59k | for (val++, len--;len && !c_isspace(*val) && *val != ','; val++, len--); |
327 | 1.72k | } |
328 | 50.2k | url.download.p = NULL; |
329 | 50.2k | url.download.len = 0; |
330 | 50.2k | url.link_inline = ctx->link_inline; |
331 | 50.2k | wget_strscpy(url.attr, attr, sizeof(url.attr)); |
332 | 50.2k | wget_strscpy(url.tag, tag, sizeof(url.tag)); |
333 | 50.2k | url.url.p = p; |
334 | 50.2k | url.url.len = val - p; |
335 | 50.2k | wget_vector_add_memdup(res->uris, &url, sizeof(url)); |
336 | 50.2k | } |
337 | 59.7k | for (;len && *val != ','; val++, len--); // skip optional width/density descriptor |
338 | 53.0k | if (len && *val == ',') { val++; len--; } |
339 | 53.0k | } |
340 | | |
341 | 1.04k | } else { |
342 | | // value is a single URL |
343 | 914 | url.download.p = ctx->download.p; |
344 | 914 | url.download.len = ctx->download.len; |
345 | 914 | url.link_inline = ctx->link_inline; |
346 | 914 | wget_strscpy(url.attr, attr, sizeof(url.attr)); |
347 | 914 | wget_strscpy(url.tag, tag, sizeof(url.tag)); |
348 | 914 | url.url.p = val; |
349 | 914 | url.url.len = len; |
350 | 914 | ctx->uri_index = wget_vector_add_memdup(res->uris, &url, sizeof(url)); |
351 | 914 | } |
352 | 1.95k | } |
353 | 5.23k | } |
354 | | |
355 | 25.9k | if (flags & XML_FLG_CONTENT && val && len && !wget_strcasecmp_ascii(tag, "style")) { |
356 | 933 | ctx->css_dir = "style"; |
357 | 933 | ctx->css_attr = ""; |
358 | 933 | ctx->css_start_offset = val - ctx->html; |
359 | 933 | wget_css_parse_buffer(val, len, css_parse_uri, NULL, context); |
360 | 933 | } |
361 | 25.9k | } |
362 | | |
363 | | void wget_html_free_urls_inline (wget_html_parsed_result **res) |
364 | 3.64k | { |
365 | 3.64k | if (res && *res) { |
366 | 3.64k | xfree((*res)->encoding); |
367 | 3.64k | wget_vector_free(&(*res)->uris); |
368 | 3.64k | xfree(*res); |
369 | 3.64k | } |
370 | 3.64k | } |
371 | | |
372 | | wget_html_parsed_result *wget_html_get_urls_inline(const char *html, wget_vector *additional_tags, wget_vector *ignore_tags) |
373 | 3.64k | { |
374 | 3.64k | html_context context = { |
375 | 3.64k | .result.follow = 1, |
376 | 3.64k | .additional_tags = additional_tags, |
377 | 3.64k | .ignore_tags = ignore_tags, |
378 | 3.64k | .html = html, |
379 | 3.64k | }; |
380 | | |
381 | | // context.result.uris = wget_vector_create(32, -2, NULL); |
382 | 3.64k | wget_html_parse_buffer(html, html_get_url, &context, HTML_HINT_REMOVE_EMPTY_CONTENT); |
383 | | |
384 | 3.64k | return wget_memdup(&context.result, sizeof(context.result)); |
385 | 3.64k | } |