Line | Count | Source (jump to first uncovered line) |
1 | | /* Collect URLs from HTML source. |
2 | | Copyright (C) 1998-2012, 2015, 2018-2024 Free Software Foundation, |
3 | | Inc. |
4 | | |
5 | | This file is part of GNU Wget. |
6 | | |
7 | | GNU Wget is free software; you can redistribute it and/or modify |
8 | | it under the terms of the GNU General Public License as published by |
9 | | the Free Software Foundation; either version 3 of the License, or |
10 | | (at your option) any later version. |
11 | | |
12 | | GNU Wget is distributed in the hope that it will be useful, |
13 | | but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | GNU General Public License for more details. |
16 | | |
17 | | You should have received a copy of the GNU General Public License |
18 | | along with Wget. If not, see <http://www.gnu.org/licenses/>. |
19 | | |
20 | | Additional permission under GNU GPL version 3 section 7 |
21 | | |
22 | | If you modify this program, or any covered work, by linking or |
23 | | combining it with the OpenSSL project's OpenSSL library (or a |
24 | | modified version of that library), containing parts covered by the |
25 | | terms of the OpenSSL or SSLeay licenses, the Free Software Foundation |
26 | | grants you additional permission to convey the resulting work. |
27 | | Corresponding Source for a non-source form of such a combination |
28 | | shall include the source code for the parts of OpenSSL used as well |
29 | | as that of the covered work. */ |
30 | | |
31 | | #include "wget.h" |
32 | | |
33 | | #include <stdio.h> |
34 | | #include <string.h> |
35 | | #include <stdlib.h> |
36 | | #include <errno.h> |
37 | | #include <assert.h> |
38 | | |
39 | | #include "exits.h" |
40 | | #include "html-parse.h" |
41 | | #include "url.h" |
42 | | #include "utils.h" |
43 | | #include "hash.h" |
44 | | #include "convert.h" |
45 | | #include "recur.h" |
46 | | #include "html-url.h" |
47 | | #include "css-url.h" |
48 | | #include "c-strcase.h" |
49 | | /* Signature shared by every tag handler: the tag id taken from known_tags[], the parsed tag, and the per-document map context. */ |
50 | | typedef void (*tag_handler_t) (int, struct taginfo *, struct map_context *); |
51 | | /* Declares a file-local handler with the tag_handler_t signature. */ |
52 | | #define DECLARE_TAG_HANDLER(fun) \ |
53 | | static void fun (int, struct taginfo *, struct map_context *) |
54 | | /* Forward declarations, so that known_tags[] below can refer to the handlers before they are defined. */ |
55 | | DECLARE_TAG_HANDLER (tag_find_urls); |
56 | | DECLARE_TAG_HANDLER (tag_handle_base); |
57 | | DECLARE_TAG_HANDLER (tag_handle_form); |
58 | | DECLARE_TAG_HANDLER (tag_handle_img); |
59 | | DECLARE_TAG_HANDLER (tag_handle_link); |
60 | | DECLARE_TAG_HANDLER (tag_handle_meta); |
61 | | /* Ids of the tags listed in known_tags[]; the same ids key the rows of tag_url_attributes[]. */ |
62 | | enum { |
63 | | TAG_A, |
64 | | TAG_APPLET, |
65 | | TAG_AREA, |
66 | | TAG_BASE, |
67 | | TAG_BGSOUND, |
68 | | TAG_BODY, |
69 | | TAG_EMBED, |
70 | | TAG_FIG, |
71 | | TAG_FORM, |
72 | | TAG_FRAME, |
73 | | TAG_IFRAME, |
74 | | TAG_IMG, |
75 | | TAG_INPUT, |
76 | | TAG_LAYER, |
77 | | TAG_LINK, |
78 | | TAG_META, |
79 | | TAG_OBJECT, |
80 | | TAG_OVERLAY, |
81 | | TAG_SCRIPT, |
82 | | TAG_TABLE, |
83 | | TAG_TD, |
84 | | TAG_TH, |
85 | | TAG_VIDEO, |
86 | | TAG_AUDIO, |
87 | | TAG_SOURCE |
88 | | }; |
89 | | |
90 | | /* The list of known tags and the functions used for handling them. Most |
91 | | tags are simply harvested for URLs by tag_find_urls; the others get a dedicated handler. init_interesting maps each NAME to its entry in this array. */ |
92 | | static struct known_tag { |
93 | | int tagid; |
94 | | const char *name; |
95 | | tag_handler_t handler; |
96 | | } known_tags[] = { |
97 | | { TAG_A, "a", tag_find_urls }, |
98 | | { TAG_APPLET, "applet", tag_find_urls }, |
99 | | { TAG_AREA, "area", tag_find_urls }, |
100 | | { TAG_BASE, "base", tag_handle_base }, |
101 | | { TAG_BGSOUND, "bgsound", tag_find_urls }, |
102 | | { TAG_BODY, "body", tag_find_urls }, |
103 | | { TAG_EMBED, "embed", tag_find_urls }, |
104 | | { TAG_FIG, "fig", tag_find_urls }, |
105 | | { TAG_FORM, "form", tag_handle_form }, |
106 | | { TAG_FRAME, "frame", tag_find_urls }, |
107 | | { TAG_IFRAME, "iframe", tag_find_urls }, |
108 | | { TAG_IMG, "img", tag_handle_img }, // tag_find_urls() plus handling "srcset" |
109 | | { TAG_INPUT, "input", tag_find_urls }, |
110 | | { TAG_LAYER, "layer", tag_find_urls }, |
111 | | { TAG_LINK, "link", tag_handle_link }, |
112 | | { TAG_META, "meta", tag_handle_meta }, |
113 | | { TAG_OBJECT, "object", tag_find_urls }, |
114 | | { TAG_OVERLAY, "overlay", tag_find_urls }, |
115 | | { TAG_SCRIPT, "script", tag_find_urls }, |
116 | | { TAG_TABLE, "table", tag_find_urls }, |
117 | | { TAG_TD, "td", tag_find_urls }, |
118 | | { TAG_TH, "th", tag_find_urls }, |
119 | | { TAG_VIDEO, "video", tag_find_urls }, |
120 | | { TAG_AUDIO, "audio", tag_find_urls }, |
121 | | { TAG_SOURCE, "source", tag_handle_img } // tag_find_urls() plus handling "srcset" |
122 | | }; |
123 | | |
124 | | /* tag_url_attributes documents which attributes of which tags contain |
125 | | URLs to harvest. It is used by tag_find_urls, which copies each row's FLAGS onto the urlpos it creates. */ |
126 | | |
127 | | /* Defines for the FLAGS. */ |
128 | | |
129 | | /* The link is "inline", i.e. needs to be retrieved for this document |
130 | | to be correctly rendered. Inline links include inlined images, |
131 | | stylesheets, children frames, etc. */ |
132 | 0 | #define ATTR_INLINE 1 |
133 | | |
134 | | /* The link is expected to yield HTML contents. It's important not to |
135 | | try to follow HTML obtained by following e.g. <img src="..."> |
136 | | regardless of content-type. Doing this causes infinite loops for |
137 | | "images" that return non-404 error pages with links to the same |
138 | | image. */ |
139 | 0 | #define ATTR_HTML 2 |
140 | | |
141 | | /* For tags handled by tag_find_urls: attributes that contain URLs to |
142 | | download. Rows belonging to one tag must stay adjacent: tag_find_urls starts at the first row with a matching tagid and stops at the first row whose tagid differs. */ |
143 | | static struct { |
144 | | int tagid; |
145 | | const char *attr_name; |
146 | | int flags; |
147 | | } tag_url_attributes[] = { |
148 | | { TAG_A, "href", ATTR_HTML }, |
149 | | { TAG_APPLET, "code", ATTR_INLINE }, |
150 | | { TAG_AREA, "href", ATTR_HTML }, |
151 | | { TAG_BGSOUND, "src", ATTR_INLINE }, |
152 | | { TAG_BODY, "background", ATTR_INLINE }, |
153 | | { TAG_EMBED, "href", ATTR_HTML }, |
154 | | { TAG_EMBED, "src", ATTR_INLINE | ATTR_HTML }, |
155 | | { TAG_FIG, "src", ATTR_INLINE }, |
156 | | { TAG_FRAME, "src", ATTR_INLINE | ATTR_HTML }, |
157 | | { TAG_IFRAME, "src", ATTR_INLINE | ATTR_HTML }, |
158 | | { TAG_IMG, "href", ATTR_INLINE }, |
159 | | { TAG_IMG, "lowsrc", ATTR_INLINE }, |
160 | | { TAG_IMG, "src", ATTR_INLINE }, |
161 | | { TAG_INPUT, "src", ATTR_INLINE }, |
162 | | { TAG_LAYER, "src", ATTR_INLINE | ATTR_HTML }, |
163 | | { TAG_OBJECT, "data", ATTR_INLINE }, |
164 | | { TAG_OVERLAY, "src", ATTR_INLINE | ATTR_HTML }, |
165 | | { TAG_SCRIPT, "src", ATTR_INLINE }, |
166 | | { TAG_TABLE, "background", ATTR_INLINE }, |
167 | | { TAG_TD, "background", ATTR_INLINE }, |
168 | | { TAG_TH, "background", ATTR_INLINE }, |
169 | | { TAG_VIDEO, "src", ATTR_INLINE }, |
170 | | { TAG_VIDEO, "poster", ATTR_INLINE }, |
171 | | { TAG_AUDIO, "src", ATTR_INLINE }, |
172 | | { TAG_AUDIO, "poster", ATTR_INLINE }, |
173 | | { TAG_SOURCE, "src", ATTR_INLINE }, |
174 | | }; |
175 | | |
176 | | /* The lists of interesting tags and attributes are built dynamically |
177 | | by init_interesting from the tables above. However, some places in the code look up |
178 | | attributes that are not mentioned there. We list those by hand here. */ |
179 | | static const char *additional_attributes[] = { |
180 | | "rel", /* used by tag_handle_link */ |
181 | | "type", /* used by tag_handle_link */ |
182 | | "http-equiv", /* used by tag_handle_meta */ |
183 | | "name", /* used by tag_handle_meta */ |
184 | | "content", /* used by tag_handle_meta */ |
185 | | "action", /* used by tag_handle_form */ |
186 | | "style", /* used by check_style_attr */ |
187 | | "srcset", /* used by tag_handle_img */ |
188 | | }; |
189 | | |
190 | | static struct hash_table *interesting_tags; |
191 | | static struct hash_table *interesting_attributes; |
192 | | |
193 | | /* Holds the (last) charset found in an 'http-equiv=content-type' |
194 | | meta tag while a document is parsed; set by tag_handle_meta and released by get_urls_html_fm. */ |
195 | | static char *meta_charset; |
196 | | |
197 | | static void |
198 | | init_interesting (void) |
199 | 0 | { |
200 | | /* Init the variables interesting_tags and interesting_attributes |
201 | | that are used by the HTML parser to know which tags and |
202 | | attributes we're interested in. We initialize this only once, |
203 | | for performance reasons. |
204 | | |
205 | | Here we also make sure that what we put in interesting_tags |
206 | | matches the user's preferences as specified through --ignore-tags |
207 | | and --follow-tags. */ |
208 | |
|
209 | 0 | size_t i; |
210 | 0 | interesting_tags = make_nocase_string_hash_table (countof (known_tags)); |
211 | | |
212 | | /* First, add all the tags we know hot to handle, mapped to their |
213 | | respective entries in known_tags. */ |
214 | 0 | for (i = 0; i < countof (known_tags); i++) |
215 | 0 | hash_table_put (interesting_tags, known_tags[i].name, known_tags + i); |
216 | | |
217 | | /* Then remove the tags ignored through --ignore-tags. */ |
218 | 0 | if (opt.ignore_tags) |
219 | 0 | { |
220 | 0 | char **ignored; |
221 | 0 | for (ignored = opt.ignore_tags; *ignored; ignored++) |
222 | 0 | hash_table_remove (interesting_tags, *ignored); |
223 | 0 | } |
224 | | |
225 | | /* If --follow-tags is specified, use only those tags. */ |
226 | 0 | if (opt.follow_tags) |
227 | 0 | { |
228 | | /* Create a new table intersecting --follow-tags and known_tags, |
229 | | and use it as interesting_tags. */ |
230 | 0 | struct hash_table *intersect = make_nocase_string_hash_table (0); |
231 | 0 | char **followed; |
232 | 0 | for (followed = opt.follow_tags; *followed; followed++) |
233 | 0 | { |
234 | 0 | struct known_tag *t = hash_table_get (interesting_tags, *followed); |
235 | 0 | if (!t) |
236 | 0 | continue; /* ignore unknown --follow-tags entries. */ |
237 | 0 | hash_table_put (intersect, *followed, t); |
238 | 0 | } |
239 | 0 | hash_table_destroy (interesting_tags); |
240 | 0 | interesting_tags = intersect; |
241 | 0 | } |
242 | | |
243 | | /* Add the attributes we care about. */ |
244 | 0 | interesting_attributes = make_nocase_string_hash_table (10); |
245 | 0 | for (i = 0; i < countof (additional_attributes); i++) |
246 | 0 | hash_table_put (interesting_attributes, additional_attributes[i], "1"); |
247 | 0 | for (i = 0; i < countof (tag_url_attributes); i++) |
248 | 0 | hash_table_put (interesting_attributes, |
249 | 0 | tag_url_attributes[i].attr_name, "1"); |
250 | 0 | } |
251 | | |
252 | | /* Find the value of attribute named NAME in the taginfo TAG. If the |
253 | | attribute is not present, return NULL. If ATTRIND is non-NULL, the |
254 | | index of the attribute in TAG will be stored there. */ |
255 | | |
256 | | static char * |
257 | | find_attr (struct taginfo *tag, const char *name, int *attrind) |
258 | 0 | { |
259 | 0 | int i; |
260 | 0 | for (i = 0; i < tag->nattrs; i++) |
261 | 0 | if (!c_strcasecmp (tag->attrs[i].name, name)) |
262 | 0 | { |
263 | 0 | if (attrind) |
264 | 0 | *attrind = i; |
265 | 0 | return tag->attrs[i].value; |
266 | 0 | } |
267 | 0 | return NULL; |
268 | 0 | } |
269 | | |
/* Used for calls to append_url.  ATTR_POS yields the offset of attribute
   ATTRIND's raw value from the start of CTX's text; ATTR_SIZE yields the
   size of that raw value.  Every argument is parenthesized so that
   expressions such as `&tag' or `i + 1' expand correctly.  */
#define ATTR_POS(tag, attrind, ctx)                                     \
  ((tag)->attrs[(attrind)].value_raw_beginning - (ctx)->text)
#define ATTR_SIZE(tag, attrind)                                         \
  ((tag)->attrs[(attrind)].value_raw_size)
275 | | |
276 | | /* Append LINK_URI to the urlpos structure that is being built. |
277 | | |
278 | | LINK_URI will be merged with the current document base. |
279 | | */ |
280 | | |
281 | | struct urlpos * |
282 | | append_url (const char *link_uri, int position, int size, |
283 | | struct map_context *ctx) |
284 | 0 | { |
285 | 0 | int link_has_scheme = url_has_scheme (link_uri); |
286 | 0 | struct urlpos *newel; |
287 | 0 | const char *base = ctx->base ? ctx->base : ctx->parent_base; |
288 | 0 | struct url *url; |
289 | |
|
290 | 0 | struct iri *iri = iri_new (); |
291 | 0 | set_uri_encoding (iri, opt.locale, true); |
292 | 0 | iri->utf8_encode = true; |
293 | |
|
294 | 0 | if (!base) |
295 | 0 | { |
296 | 0 | DEBUGP (("%s: no base, merge will use \"%s\".\n", |
297 | 0 | ctx->document_file, link_uri)); |
298 | |
|
299 | 0 | if (!link_has_scheme) |
300 | 0 | { |
301 | | /* Base URL is unavailable, and the link does not have a |
302 | | location attached to it -- we have to give up. Since |
303 | | this can only happen when using `--force-html -i', print |
304 | | a warning. */ |
305 | 0 | logprintf (LOG_NOTQUIET, |
306 | 0 | _("%s: Cannot resolve incomplete link %s.\n"), |
307 | 0 | ctx->document_file, link_uri); |
308 | 0 | iri_free (iri); |
309 | 0 | return NULL; |
310 | 0 | } |
311 | | |
312 | 0 | url = url_parse (link_uri, NULL, iri, false); |
313 | 0 | if (!url) |
314 | 0 | { |
315 | 0 | DEBUGP (("%s: link \"%s\" doesn't parse.\n", |
316 | 0 | ctx->document_file, link_uri)); |
317 | 0 | iri_free (iri); |
318 | 0 | return NULL; |
319 | 0 | } |
320 | 0 | } |
321 | 0 | else |
322 | 0 | { |
323 | | /* Merge BASE with LINK_URI, but also make sure the result is |
324 | | canonicalized, i.e. that "../" have been resolved. |
325 | | (parse_url will do that for us.) */ |
326 | |
|
327 | 0 | char *complete_uri = uri_merge (base, link_uri); |
328 | |
|
329 | 0 | DEBUGP (("%s: merge(%s, %s) -> %s\n", |
330 | 0 | quotearg_n_style (0, escape_quoting_style, ctx->document_file), |
331 | 0 | quote_n (1, base), |
332 | 0 | quote_n (2, link_uri), |
333 | 0 | quotearg_n_style (3, escape_quoting_style, complete_uri))); |
334 | |
|
335 | 0 | url = url_parse (complete_uri, NULL, iri, false); |
336 | 0 | if (!url) |
337 | 0 | { |
338 | 0 | DEBUGP (("%s: merged link \"%s\" doesn't parse.\n", |
339 | 0 | ctx->document_file, complete_uri)); |
340 | 0 | xfree (complete_uri); |
341 | 0 | iri_free (iri); |
342 | 0 | return NULL; |
343 | 0 | } |
344 | 0 | xfree (complete_uri); |
345 | 0 | } |
346 | | |
347 | 0 | iri_free (iri); |
348 | |
|
349 | 0 | DEBUGP (("appending %s to urlpos.\n", quote (url->url))); |
350 | |
|
351 | 0 | newel = xnew0 (struct urlpos); |
352 | 0 | newel->url = url; |
353 | 0 | newel->pos = position; |
354 | 0 | newel->size = size; |
355 | | |
356 | | /* A URL is relative if the host is not named, and the name does not |
357 | | start with `/'. */ |
358 | 0 | if (!link_has_scheme && *link_uri != '/') |
359 | 0 | newel->link_relative_p = 1; |
360 | 0 | else if (link_has_scheme) |
361 | 0 | newel->link_complete_p = 1; |
362 | | |
363 | | /* Append the new URL maintaining the order by position. */ |
364 | 0 | if (ctx->head == NULL) |
365 | 0 | ctx->head = newel; |
366 | 0 | else |
367 | 0 | { |
368 | 0 | struct urlpos *it, *prev = NULL; |
369 | |
|
370 | 0 | it = ctx->head; |
371 | 0 | while (it && position > it->pos) |
372 | 0 | { |
373 | 0 | prev = it; |
374 | 0 | it = it->next; |
375 | 0 | } |
376 | |
|
377 | 0 | newel->next = it; |
378 | |
|
379 | 0 | if (prev) |
380 | 0 | prev->next = newel; |
381 | 0 | else |
382 | 0 | ctx->head = newel; |
383 | 0 | } |
384 | |
|
385 | 0 | return newel; |
386 | 0 | } |
387 | | |
388 | | static void |
389 | | check_style_attr (struct taginfo *tag, struct map_context *ctx) |
390 | 0 | { |
391 | 0 | int attrind; |
392 | 0 | int raw_start; |
393 | 0 | int raw_len; |
394 | 0 | char *style = find_attr (tag, "style", &attrind); |
395 | 0 | if (!style) |
396 | 0 | return; |
397 | | |
398 | | /* raw pos and raw size include the quotes, skip them when they are |
399 | | present. */ |
400 | 0 | raw_start = ATTR_POS (tag, attrind, ctx); |
401 | 0 | raw_len = ATTR_SIZE (tag, attrind); |
402 | 0 | if( *(char *)(ctx->text + raw_start) == '\'' |
403 | 0 | || *(char *)(ctx->text + raw_start) == '"') |
404 | 0 | { |
405 | 0 | raw_start += 1; |
406 | 0 | raw_len -= 2; |
407 | 0 | } |
408 | |
|
409 | 0 | if(raw_len <= 0) |
410 | 0 | return; |
411 | | |
412 | 0 | get_urls_css (ctx, raw_start, raw_len); |
413 | 0 | } |
414 | | |
415 | | /* All the tag_* functions are called from collect_tags_mapper, as |
416 | | specified by KNOWN_TAGS. */ |
417 | | |
418 | | /* Default tag handler: collect URLs from attributes specified for |
419 | | this tag by tag_url_attributes. */ |
420 | | |
421 | | static void |
422 | | tag_find_urls (int tagid, struct taginfo *tag, struct map_context *ctx) |
423 | 0 | { |
424 | 0 | size_t i; |
425 | 0 | int attrind; |
426 | 0 | int first = -1; |
427 | |
|
428 | 0 | for (i = 0; i < countof (tag_url_attributes); i++) |
429 | 0 | if (tag_url_attributes[i].tagid == tagid) |
430 | 0 | { |
431 | | /* We've found the index of tag_url_attributes where the |
432 | | attributes of our tag begin. */ |
433 | 0 | first = i; |
434 | 0 | break; |
435 | 0 | } |
436 | 0 | assert (first != -1); |
437 | | |
438 | | /* Loop over the "interesting" attributes of this tag. In this |
439 | | example, it will loop over "src" and "lowsrc". |
440 | | |
441 | | <img src="foo.png" lowsrc="bar.png"> |
442 | | |
443 | | This has to be done in the outer loop so that the attributes are |
444 | | processed in the same order in which they appear in the page. |
445 | | This is required when converting links. */ |
446 | |
|
447 | 0 | for (attrind = 0; attrind < tag->nattrs; attrind++) |
448 | 0 | { |
449 | | /* Find whether TAG/ATTRIND is a combination that contains a |
450 | | URL. */ |
451 | 0 | char *link = tag->attrs[attrind].value; |
452 | 0 | const size_t size = countof (tag_url_attributes); |
453 | | |
454 | | /* If you're cringing at the inefficiency of the nested loops, |
455 | | remember that they both iterate over a very small number of |
456 | | items. The worst-case inner loop is for the IMG tag, which |
457 | | has three attributes. */ |
458 | 0 | for (i = first; i < size && tag_url_attributes[i].tagid == tagid; i++) |
459 | 0 | { |
460 | 0 | if (0 == strcasecmp (tag->attrs[attrind].name, |
461 | 0 | tag_url_attributes[i].attr_name)) |
462 | 0 | { |
463 | 0 | struct urlpos *up = append_url (link, ATTR_POS(tag,attrind,ctx), |
464 | 0 | ATTR_SIZE(tag,attrind), ctx); |
465 | 0 | if (up) |
466 | 0 | { |
467 | 0 | int flags = tag_url_attributes[i].flags; |
468 | 0 | if (flags & ATTR_INLINE) |
469 | 0 | up->link_inline_p = 1; |
470 | 0 | if (flags & ATTR_HTML) |
471 | 0 | up->link_expect_html = 1; |
472 | 0 | } |
473 | 0 | } |
474 | 0 | } |
475 | 0 | } |
476 | 0 | } |
477 | | |
478 | | /* Handle the BASE tag, for <base href=...>. */ |
479 | | |
480 | | static void |
481 | | tag_handle_base (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx) |
482 | 0 | { |
483 | 0 | struct urlpos *base_urlpos; |
484 | 0 | int attrind; |
485 | 0 | char *newbase = find_attr (tag, "href", &attrind); |
486 | 0 | if (!newbase) |
487 | 0 | return; |
488 | | |
489 | 0 | base_urlpos = append_url (newbase, ATTR_POS(tag,attrind,ctx), |
490 | 0 | ATTR_SIZE(tag,attrind), ctx); |
491 | 0 | if (!base_urlpos) |
492 | 0 | return; |
493 | 0 | base_urlpos->ignore_when_downloading = 1; |
494 | 0 | base_urlpos->link_base_p = 1; |
495 | |
|
496 | 0 | xfree (ctx->base); |
497 | 0 | if (ctx->parent_base) |
498 | 0 | ctx->base = uri_merge (ctx->parent_base, newbase); |
499 | 0 | else |
500 | 0 | ctx->base = xstrdup (newbase); |
501 | 0 | } |
502 | | |
503 | | /* Mark the URL found in <form action=...> for conversion. */ |
504 | | |
505 | | static void |
506 | | tag_handle_form (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx) |
507 | 0 | { |
508 | 0 | int attrind; |
509 | 0 | char *action = find_attr (tag, "action", &attrind); |
510 | |
|
511 | 0 | if (action) |
512 | 0 | { |
513 | 0 | struct urlpos *up = append_url (action, ATTR_POS(tag,attrind,ctx), |
514 | 0 | ATTR_SIZE(tag,attrind), ctx); |
515 | 0 | if (up) |
516 | 0 | up->ignore_when_downloading = 1; |
517 | 0 | } |
518 | 0 | } |
519 | | |
520 | | /* Handle the LINK tag. It requires special handling because how its |
521 | | links will be followed in -p mode depends on the REL attribute. */ |
522 | | |
523 | | static void |
524 | | tag_handle_link (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx) |
525 | 0 | { |
526 | 0 | int attrind; |
527 | 0 | char *href = find_attr (tag, "href", &attrind); |
528 | | |
529 | | /* All <link href="..."> link references are external, except those |
530 | | known not to be, such as style sheet and shortcut icon: |
531 | | |
532 | | <link rel="stylesheet" href="..."> or <link rel="alternate stylesheet" href="..."> |
533 | | <link rel="shortcut icon" href="..."> or <link rel="icon" href="..."> |
534 | | */ |
535 | 0 | if (href) |
536 | 0 | { |
537 | 0 | struct urlpos *up = append_url (href, ATTR_POS(tag,attrind,ctx), |
538 | 0 | ATTR_SIZE(tag,attrind), ctx); |
539 | 0 | if (up) |
540 | 0 | { |
541 | 0 | char *rel = find_attr (tag, "rel", NULL); |
542 | 0 | if (rel) |
543 | 0 | { |
544 | 0 | if (0 == c_strcasecmp (rel, "stylesheet") || 0 == c_strcasecmp (rel, "alternate stylesheet")) |
545 | 0 | { |
546 | 0 | up->link_inline_p = 1; |
547 | 0 | up->link_expect_css = 1; |
548 | 0 | } |
549 | 0 | else if (0 == c_strcasecmp (rel, "shortcut icon") || 0 == c_strcasecmp (rel, "icon")) |
550 | 0 | { |
551 | 0 | up->link_inline_p = 1; |
552 | 0 | } |
553 | 0 | else if (0 == c_strcasecmp (rel, "manifest")) |
554 | 0 | { |
555 | 0 | up->link_inline_p = 1; |
556 | 0 | } |
557 | 0 | else |
558 | 0 | { |
559 | | /* The external ones usually point to HTML pages, such as |
560 | | <link rel="next" href="..."> |
561 | | except when the type attribute says otherwise: |
562 | | <link rel="alternate" type="application/rss+xml" href=".../?feed=rss2" /> |
563 | | */ |
564 | 0 | char *type = find_attr (tag, "type", NULL); |
565 | 0 | if (!type || c_strcasecmp (type, "text/html") == 0) |
566 | 0 | up->link_expect_html = 1; |
567 | 0 | } |
568 | 0 | } |
569 | 0 | } |
570 | 0 | } |
571 | 0 | } |
572 | | |
573 | | /* Handle the META tag. This requires special handling because of the |
574 | | refresh feature and because of robot exclusion. */ |
575 | | |
576 | | static void |
577 | | tag_handle_meta (int tagid _GL_UNUSED, struct taginfo *tag, struct map_context *ctx) |
578 | 0 | { |
579 | 0 | char *name = find_attr (tag, "name", NULL); |
580 | 0 | char *http_equiv = find_attr (tag, "http-equiv", NULL); |
581 | |
|
582 | 0 | if (http_equiv && 0 == c_strcasecmp (http_equiv, "refresh")) |
583 | 0 | { |
584 | | /* Some pages use a META tag to specify that the page be |
585 | | refreshed by a new page after a given number of seconds. The |
586 | | general format for this is: |
587 | | |
588 | | <meta http-equiv=Refresh content="NUMBER; URL=index2.html"> |
589 | | |
590 | | So we just need to skip past the "NUMBER; URL=" garbage to |
591 | | get to the URL. */ |
592 | |
|
593 | 0 | struct urlpos *entry; |
594 | 0 | int attrind; |
595 | 0 | int timeout; |
596 | 0 | char *p; |
597 | |
|
598 | 0 | char *refresh = find_attr (tag, "content", &attrind); |
599 | 0 | if (!refresh) |
600 | 0 | return; |
601 | | |
602 | 0 | timeout = strtol(refresh, &p, 10); |
603 | |
|
604 | 0 | if (timeout < 0 || *p++ != ';') |
605 | 0 | return; |
606 | | |
607 | 0 | while (c_isspace (*p)) |
608 | 0 | ++p; |
609 | 0 | if (!( c_toupper (*p) == 'U' |
610 | 0 | && c_toupper (*(p + 1)) == 'R' |
611 | 0 | && c_toupper (*(p + 2)) == 'L' |
612 | 0 | && *(p + 3) == '=')) |
613 | 0 | return; |
614 | 0 | p += 4; |
615 | 0 | while (c_isspace (*p)) |
616 | 0 | ++p; |
617 | |
|
618 | 0 | entry = append_url (p, ATTR_POS(tag,attrind,ctx), |
619 | 0 | ATTR_SIZE(tag,attrind), ctx); |
620 | 0 | if (entry) |
621 | 0 | { |
622 | 0 | entry->link_refresh_p = 1; |
623 | 0 | entry->refresh_timeout = timeout; |
624 | 0 | entry->link_expect_html = 1; |
625 | 0 | } |
626 | 0 | } |
627 | 0 | else if (http_equiv && 0 == c_strcasecmp (http_equiv, "content-type")) |
628 | 0 | { |
629 | | /* Handle stuff like: |
630 | | <meta http-equiv="Content-Type" content="text/html; charset=CHARSET"> */ |
631 | |
|
632 | 0 | char *mcharset; |
633 | 0 | char *content = find_attr (tag, "content", NULL); |
634 | 0 | if (!content) |
635 | 0 | return; |
636 | | |
637 | 0 | mcharset = parse_charset (content); |
638 | 0 | if (!mcharset) |
639 | 0 | return; |
640 | | |
641 | 0 | xfree (meta_charset); |
642 | 0 | meta_charset = mcharset; |
643 | 0 | } |
644 | 0 | else if (name && 0 == c_strcasecmp (name, "robots")) |
645 | 0 | { |
646 | | /* Handle stuff like: |
647 | | <meta name="robots" content="index,nofollow"> */ |
648 | 0 | char *content = find_attr (tag, "content", NULL); |
649 | 0 | if (!content) |
650 | 0 | return; |
651 | 0 | if (!c_strcasecmp (content, "none")) |
652 | 0 | ctx->nofollow = true; |
653 | 0 | else |
654 | 0 | { |
655 | 0 | while (*content) |
656 | 0 | { |
657 | 0 | char *end; |
658 | | /* Skip any initial whitespace. */ |
659 | 0 | content += strspn (content, " \f\n\r\t\v"); |
660 | | /* Find the next occurrence of ',' or whitespace, |
661 | | * or the end of the string. */ |
662 | 0 | end = content + strcspn (content, ", \f\n\r\t\v"); |
663 | 0 | if (!c_strncasecmp (content, "nofollow", end - content)) |
664 | 0 | ctx->nofollow = true; |
665 | | /* Skip past the next comma, if any. */ |
666 | 0 | if (*end == ',') |
667 | 0 | ++end; |
668 | 0 | else |
669 | 0 | { |
670 | 0 | end = strchr (end, ','); |
671 | 0 | if (end) |
672 | 0 | ++end; |
673 | 0 | else |
674 | 0 | end = content + strlen (content); |
675 | 0 | } |
676 | 0 | content = end; |
677 | 0 | } |
678 | 0 | } |
679 | 0 | } |
680 | 0 | } |
681 | | |
682 | | /* Handle the IMG tag. This requires special handling for the srcset attr, |
683 | | while the traditional src/lowsrc/href attributes can be handled generically. |
684 | | */ |
685 | | |
686 | | static void |
687 | 0 | tag_handle_img (int tagid, struct taginfo *tag, struct map_context *ctx) { |
688 | 0 | int attrind; |
689 | 0 | char *srcset; |
690 | | |
691 | | /* Use the generic approach for the attributes without special syntax. */ |
692 | 0 | tag_find_urls(tagid, tag, ctx); |
693 | |
|
694 | 0 | srcset = find_attr (tag, "srcset", &attrind); |
695 | 0 | if (srcset) |
696 | 0 | { |
697 | | /* These are relative to the input text. */ |
698 | 0 | int base_ind = ATTR_POS (tag,attrind,ctx); |
699 | 0 | int size = strlen (srcset); |
700 | | |
701 | | /* These are relative to srcset. */ |
702 | 0 | int offset, url_start, url_end; |
703 | | |
704 | | /* Make sure to line up base_ind with srcset[0], not outside quotes. */ |
705 | 0 | if (ctx->text[base_ind] == '"' || ctx->text[base_ind] == '\'') |
706 | 0 | ++base_ind; |
707 | |
|
708 | 0 | offset = 0; |
709 | 0 | while (offset < size) |
710 | 0 | { |
711 | 0 | bool has_descriptor = true; |
712 | | |
713 | | /* Skip over initial whitespace and commas. Note there is no \v |
714 | | in HTML5 whitespace. */ |
715 | 0 | url_start = offset + strspn (srcset + offset, " \f\n\r\t,"); |
716 | |
|
717 | 0 | if (url_start == size) |
718 | 0 | return; |
719 | | |
720 | | /* URL is any non-whitespace chars (including commas) - but with |
721 | | trailing commas removed. */ |
722 | 0 | url_end = url_start + strcspn (srcset + url_start, " \f\n\r\t"); |
723 | 0 | while ((url_end - 1) > url_start && srcset[url_end - 1] == ',') |
724 | 0 | { |
725 | 0 | has_descriptor = false; |
726 | 0 | --url_end; |
727 | 0 | } |
728 | |
|
729 | 0 | if (url_end > url_start) |
730 | 0 | { |
731 | 0 | char *url_text = strdupdelim (srcset + url_start, |
732 | 0 | srcset + url_end); |
733 | 0 | struct urlpos *up = append_url (url_text, base_ind + url_start, |
734 | 0 | url_end - url_start, ctx); |
735 | 0 | if (up) |
736 | 0 | { |
737 | 0 | up->link_inline_p = 1; |
738 | 0 | up->link_noquote_html_p = 1; |
739 | 0 | } |
740 | 0 | xfree (url_text); |
741 | 0 | } |
742 | | |
743 | | /* If the URL wasn't terminated by a , there may also be a descriptor |
744 | | which we just skip. */ |
745 | 0 | if (has_descriptor) |
746 | 0 | { |
747 | | /* This is comma-terminated, except there may be one level of |
748 | | parentheses escaping that. */ |
749 | 0 | bool in_paren = false; |
750 | 0 | for (offset = url_end; offset < size; ++offset) |
751 | 0 | { |
752 | 0 | char c = srcset[offset]; |
753 | 0 | if (c == '(') |
754 | 0 | in_paren = true; |
755 | 0 | else if (c == ')' && in_paren) |
756 | 0 | in_paren = false; |
757 | 0 | else if (c == ',' && !in_paren) |
758 | 0 | break; |
759 | 0 | } |
760 | 0 | } |
761 | 0 | else |
762 | 0 | offset = url_end; |
763 | 0 | } |
764 | 0 | } |
765 | 0 | } |
766 | | |
767 | | /* Dispatch the tag handler appropriate for the tag we're mapping |
768 | | over. See known_tags[] for definition of tag handlers. */ |
769 | | |
770 | | static void |
771 | | collect_tags_mapper (struct taginfo *tag, void *arg) |
772 | 0 | { |
773 | 0 | struct map_context *ctx = (struct map_context *)arg; |
774 | | |
775 | | /* Find the tag in our table of tags. This must not fail because |
776 | | map_html_tags only returns tags found in interesting_tags. |
777 | | |
778 | | I've changed this for now, I'm passing NULL as interesting_tags |
779 | | to map_html_tags. This way we can check all tags for a style |
780 | | attribute. |
781 | | */ |
782 | 0 | struct known_tag *t = hash_table_get (interesting_tags, tag->name); |
783 | |
|
784 | 0 | if (t != NULL) |
785 | 0 | t->handler (t->tagid, tag, ctx); |
786 | |
|
787 | 0 | check_style_attr (tag, ctx); |
788 | |
|
789 | 0 | if (tag->end_tag_p && (0 == c_strcasecmp (tag->name, "style")) |
790 | 0 | && tag->contents_begin && tag->contents_end |
791 | 0 | && tag->contents_begin <= tag->contents_end) |
792 | 0 | { |
793 | | /* parse contents */ |
794 | 0 | get_urls_css (ctx, tag->contents_begin - ctx->text, |
795 | 0 | tag->contents_end - tag->contents_begin); |
796 | 0 | } |
797 | 0 | } |
798 | | |
799 | | /* Analyze HTML tags FILE and construct a list of URLs referenced from |
800 | | it. It merges relative links in FILE with URL. It is aware of |
801 | | <base href=...> and does the right thing. */ |
802 | | |
803 | | struct urlpos * |
804 | | get_urls_html_fm (const char *file, const struct file_memory *fm, |
805 | | const char *url, bool *meta_disallow_follow, |
806 | | struct iri *iri) |
807 | 0 | { |
808 | 0 | struct map_context ctx; |
809 | 0 | int flags; |
810 | |
|
811 | 0 | ctx.text = fm->content; |
812 | 0 | ctx.head = NULL; |
813 | 0 | ctx.base = NULL; |
814 | 0 | ctx.parent_base = url ? url : opt.base_href; |
815 | 0 | ctx.document_file = file; |
816 | 0 | ctx.nofollow = false; |
817 | |
|
818 | 0 | if (!interesting_tags) |
819 | 0 | init_interesting (); |
820 | | |
821 | | /* Specify MHT_TRIM_VALUES because of buggy HTML generators that |
822 | | generate <a href=" foo"> instead of <a href="foo"> (browsers |
823 | | ignore spaces as well.) If you really mean space, use &32; or |
824 | | %20. MHT_TRIM_VALUES also causes squashing of embedded newlines, |
825 | | e.g. in <img src="foo.[newline]html">. Such newlines are also |
826 | | ignored by IE and Mozilla and are presumably introduced by |
827 | | writing HTML with editors that force word wrap. */ |
828 | 0 | flags = MHT_TRIM_VALUES; |
829 | 0 | if (opt.strict_comments) |
830 | 0 | flags |= MHT_STRICT_COMMENTS; |
831 | | |
832 | | /* the NULL here used to be interesting_tags */ |
833 | 0 | map_html_tags (fm->content, fm->length, collect_tags_mapper, &ctx, flags, |
834 | 0 | NULL, interesting_attributes); |
835 | |
|
836 | 0 | #ifdef ENABLE_IRI |
837 | | /* Meta charset is only valid if there was no HTTP header Content-Type charset. */ |
838 | | /* This is true for HTTP 1.0 and 1.1. */ |
839 | 0 | if (iri && !iri->content_encoding && meta_charset) |
840 | 0 | set_content_encoding (iri, meta_charset); |
841 | 0 | #endif |
842 | 0 | xfree (meta_charset); |
843 | |
|
844 | 0 | DEBUGP (("nofollow in %s: %d\n", file, ctx.nofollow)); |
845 | |
|
846 | 0 | if (meta_disallow_follow) |
847 | 0 | *meta_disallow_follow = ctx.nofollow; |
848 | |
|
849 | 0 | xfree (ctx.base); |
850 | 0 | return ctx.head; |
851 | 0 | } |
852 | | |
853 | | struct urlpos * |
854 | | get_urls_html (const char *file, const char *url, bool *meta_disallow_follow, |
855 | | struct iri *iri) |
856 | 0 | { |
857 | 0 | struct urlpos *urls; |
858 | 0 | struct file_memory *fm; |
859 | |
|
860 | 0 | fm = wget_read_file (file); |
861 | 0 | if (!fm) |
862 | 0 | { |
863 | 0 | logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); |
864 | 0 | return NULL; |
865 | 0 | } |
866 | 0 | DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); |
867 | |
|
868 | 0 | urls = get_urls_html_fm (file, fm, url, meta_disallow_follow, iri); |
869 | 0 | wget_read_file_free (fm); |
870 | 0 | return urls; |
871 | 0 | } |
872 | | |
873 | | /* This doesn't really have anything to do with HTML, but it's similar |
874 | | to get_urls_html, so we put it here. */ |
875 | | |
876 | | struct urlpos * |
877 | | get_urls_file (const char *file, bool *read_again) |
878 | 0 | { |
879 | 0 | struct file_memory *fm; |
880 | 0 | struct urlpos *head, *tail; |
881 | 0 | const char *text, *text_end; |
882 | | |
883 | | /* Load the file. */ |
884 | 0 | fm = wget_read_from_file (file, read_again); |
885 | 0 | if (!fm) |
886 | 0 | { |
887 | 0 | logprintf (LOG_NOTQUIET, "%s: %s\n", file, strerror (errno)); |
888 | 0 | return NULL; |
889 | 0 | } |
890 | 0 | if (fm->length) |
891 | 0 | DEBUGP (("Loaded %s (size %s).\n", file, number_to_static_string (fm->length))); |
892 | |
|
893 | 0 | head = tail = NULL; |
894 | 0 | text = fm->content; |
895 | 0 | text_end = fm->content + fm->length; |
896 | 0 | while (text < text_end) |
897 | 0 | { |
898 | 0 | int up_error_code; |
899 | 0 | char *url_text; |
900 | 0 | char *new_url; |
901 | 0 | struct urlpos *entry; |
902 | 0 | struct url *url; |
903 | |
|
904 | 0 | const char *line_beg = text; |
905 | 0 | const char *line_end = memchr (text, '\n', text_end - text); |
906 | 0 | if (!line_end) |
907 | 0 | line_end = text_end; |
908 | 0 | else |
909 | 0 | ++line_end; |
910 | 0 | text = line_end; |
911 | | |
912 | | /* Strip whitespace from the beginning and end of line. */ |
913 | 0 | while (line_beg < line_end && c_isspace (*line_beg)) |
914 | 0 | ++line_beg; |
915 | 0 | while (line_end > line_beg && c_isspace (*(line_end - 1))) |
916 | 0 | --line_end; |
917 | |
|
918 | 0 | if (line_beg == line_end) |
919 | 0 | continue; |
920 | | |
921 | | /* The URL is in the [line_beg, line_end) region. */ |
922 | | |
923 | | /* We must copy the URL to a zero-terminated string, and we |
924 | | can't use alloca because we're in a loop. *sigh*. */ |
925 | 0 | url_text = strdupdelim (line_beg, line_end); |
926 | |
|
927 | 0 | if (opt.base_href) |
928 | 0 | { |
929 | | /* Merge opt.base_href with URL. */ |
930 | 0 | char *merged = uri_merge (opt.base_href, url_text); |
931 | 0 | xfree (url_text); |
932 | 0 | url_text = merged; |
933 | 0 | } |
934 | |
|
935 | 0 | new_url = maybe_prepend_scheme (url_text); |
936 | 0 | if (new_url) |
937 | 0 | { |
938 | 0 | xfree (url_text); |
939 | 0 | url_text = new_url; |
940 | 0 | } |
941 | |
|
942 | 0 | url = url_parse (url_text, &up_error_code, NULL, false); |
943 | 0 | if (!url) |
944 | 0 | { |
945 | 0 | logprintf (LOG_NOTQUIET, _("%s: Invalid URL %s: %s\n"), |
946 | 0 | file, url_text, url_error (up_error_code)); |
947 | 0 | xfree (url_text); |
948 | 0 | inform_exit_status (URLERROR); |
949 | 0 | continue; |
950 | 0 | } |
951 | 0 | xfree (url_text); |
952 | |
|
953 | 0 | entry = xnew0 (struct urlpos); |
954 | 0 | entry->url = url; |
955 | |
|
956 | 0 | if (!head) |
957 | 0 | head = entry; |
958 | 0 | else |
959 | 0 | tail->next = entry; |
960 | 0 | tail = entry; |
961 | 0 | } |
962 | 0 | wget_read_file_free (fm); |
963 | 0 | return head; |
964 | 0 | } |
965 | | |
#if defined DEBUG_MALLOC || defined TESTING
/* Release the lookup tables built by init_interesting.

   The hash table keys and values are not allocated by this code, so
   only the tables themselves are destroyed.  Both pointers are reset to
   NULL afterwards: get_urls_html_fm rebuilds the tables only when
   interesting_tags is NULL, so leaving stale pointers behind would make
   a later parse use freed tables.  Calling this more than once is
   harmless.  */
void
cleanup_html_url (void)
{
  if (interesting_tags)
    {
      hash_table_destroy (interesting_tags);
      interesting_tags = NULL;
    }
  if (interesting_attributes)
    {
      hash_table_destroy (interesting_attributes);
      interesting_attributes = NULL;
    }
}
#endif