Line | Count | Source (jump to first uncovered line) |
1 | | /* Support for Robot Exclusion Standard (RES). |
2 | | Copyright (C) 2001, 2006-2011, 2015, 2018-2023 Free Software |
3 | | Foundation, Inc. |
4 | | |
5 | | This file is part of Wget. |
6 | | |
7 | | This program is free software; you can redistribute it and/or modify |
8 | | it under the terms of the GNU General Public License as published by |
9 | | the Free Software Foundation; either version 3 of the License, or (at |
10 | | your option) any later version. |
11 | | |
12 | | This program is distributed in the hope that it will be useful, but |
13 | | WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
15 | | General Public License for more details. |
16 | | |
17 | | You should have received a copy of the GNU General Public License |
18 | | along with Wget. If not, see <http://www.gnu.org/licenses/>. |
19 | | |
20 | | Additional permission under GNU GPL version 3 section 7 |
21 | | |
22 | | If you modify this program, or any covered work, by linking or |
23 | | combining it with the OpenSSL project's OpenSSL library (or a |
24 | | modified version of that library), containing parts covered by the |
25 | | terms of the OpenSSL or SSLeay licenses, the Free Software Foundation |
26 | | grants you additional permission to convey the resulting work. |
27 | | Corresponding Source for a non-source form of such a combination |
28 | | shall include the source code for the parts of OpenSSL used as well |
29 | | as that of the covered work. */ |
30 | | |
31 | | /* This file implements the Robot Exclusion Standard (RES). |
32 | | |
33 | | RES is a simple protocol that enables site admins to signalize to |
34 | | the web crawlers that certain parts of the site should not be |
35 | | accessed. All the admin needs to do is create a "robots.txt" file |
36 | | in the web server root, and use simple commands to allow or |
37 | | disallow access to certain parts of the site. |
38 | | |
39 | | The first specification was written by Martijn Koster in 1994, and |
40 | | is still available at <http://www.robotstxt.org/orig.html>. |
41 | | In 1996, Martijn wrote an Internet Draft specifying an improved RES |
42 | | specification; however, that work was apparently abandoned since |
43 | | the draft has expired in 1997 and hasn't been replaced since. The |
44 | | draft is available at |
45 | | <http://www.robotstxt.org/norobots-rfc.txt>. |
46 | | |
47 | | This file implements RES as specified by the draft. Note that this |
48 | | only handles the "robots.txt" support. The META tag that controls |
49 | | whether the links should be followed is handled in `html-url.c'. |
50 | | |
51 | | Known deviations: |
52 | | |
53 | | * The end-of-line comment recognition is more in the spirit of the |
54 | | Bourne Shell (as specified by RES-1994). That means that |
55 | | "foo#bar" is taken literally, whereas "foo #bar" is interpreted |
56 | | as "foo". The Draft apparently specifies that both should be |
57 | | interpreted as "foo". |
58 | | |
59 | | * We don't recognize sole CR as the line ending. |
60 | | |
61 | | * We don't implement expiry mechanism for /robots.txt specs. I |
62 | | consider it non-necessary for a relatively short-lived |
63 | | application such as Wget. Besides, it is highly questionable |
64 | | whether anyone deploys the recommended expiry scheme for |
65 | | robots.txt. |
66 | | |
67 | | Entry points are functions res_parse, res_parse_from_file, |
68 | | res_match_path, res_register_specs, res_get_specs, and |
69 | | res_retrieve_file. */ |
70 | | |
71 | | #include "wget.h" |
72 | | |
73 | | #include <stdio.h> |
74 | | #include <stdlib.h> |
75 | | #include <string.h> |
76 | | #include <errno.h> |
77 | | #include <assert.h> |
78 | | |
79 | | #include "utils.h" |
80 | | #include "hash.h" |
81 | | #include "url.h" |
82 | | #include "retr.h" |
83 | | #include "res.h" |
84 | | #include "c-strcase.h" |
85 | | |
86 | | #ifdef TESTING |
87 | | #include "../tests/unit-tests.h" |
88 | | #endif |
89 | | |
/* A single Allow/Disallow rule parsed from robots.txt. */
struct path_info {
  char *path;                   /* rule path, stored without the leading '/' */
  bool allowedp;                /* true for "Allow", false for "Disallow" */
  bool user_agent_exact_p;      /* true if rule came from "User-Agent: wget"
                                   rather than "User-Agent: *" */
};

/* A growable array of path rules for one server. */
struct robot_specs {
  int count;                    /* number of rules in PATHS */
  int size;                     /* allocated capacity of PATHS */
  struct path_info *paths;
};
101 | | |
102 | | /* Parsing the robot spec. */ |
103 | | |
104 | | /* Check whether AGENT (a string of length LENGTH) equals "wget" or |
105 | | "*". If it is either of them, *matches is set to one. If it is |
106 | | "wget", *exact_match is set to one. */ |
107 | | |
108 | | static void |
109 | | match_user_agent (const char *agent, int length, |
110 | | bool *matches, bool *exact_match) |
111 | 1.63k | { |
112 | 1.63k | if (length == 1 && *agent == '*') |
113 | 517 | { |
114 | 517 | *matches = true; |
115 | 517 | *exact_match = false; |
116 | 517 | } |
117 | 1.11k | else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget")) |
118 | 243 | { |
119 | 243 | *matches = true; |
120 | 243 | *exact_match = true; |
121 | 243 | } |
122 | 870 | else |
123 | 870 | { |
124 | 870 | *matches = false; |
125 | 870 | *exact_match = false; |
126 | 870 | } |
127 | 1.63k | } |
128 | | |
129 | | /* Add a path specification between PATH_B and PATH_E as one of the |
130 | | paths in SPECS. */ |
131 | | |
132 | | static void |
133 | | add_path (struct robot_specs *specs, const char *path_b, const char *path_e, |
134 | | bool allowedp, bool exactp) |
135 | 11.7k | { |
136 | 11.7k | struct path_info pp; |
137 | 11.7k | if (path_b < path_e && *path_b == '/') |
138 | | /* Our path representation doesn't use a leading slash, so remove |
139 | | one from theirs. */ |
140 | 393 | ++path_b; |
141 | 11.7k | pp.path = strdupdelim (path_b, path_e); |
142 | 11.7k | pp.allowedp = allowedp; |
143 | 11.7k | pp.user_agent_exact_p = exactp; |
144 | 11.7k | ++specs->count; |
145 | 11.7k | if (specs->count > specs->size) |
146 | 1.28k | { |
147 | 1.28k | if (specs->size == 0) |
148 | 353 | specs->size = 1; |
149 | 928 | else |
150 | 928 | specs->size <<= 1; |
151 | 1.28k | specs->paths = xrealloc (specs->paths, |
152 | 1.28k | specs->size * sizeof (struct path_info)); |
153 | 1.28k | } |
154 | 11.7k | specs->paths[specs->count - 1] = pp; |
155 | 11.7k | } |
156 | | |
157 | | /* Recreate SPECS->paths with only those paths that have |
158 | | user_agent_exact_p set to true. */ |
159 | | |
160 | | static void |
161 | | prune_non_exact (struct robot_specs *specs) |
162 | 57 | { |
163 | 57 | struct path_info *newpaths; |
164 | 57 | int i, j, cnt; |
165 | 57 | cnt = 0; |
166 | 3.17k | for (i = 0; i < specs->count; i++) |
167 | 3.11k | if (specs->paths[i].user_agent_exact_p) |
168 | 2.91k | ++cnt; |
169 | 57 | newpaths = xnew_array (struct path_info, cnt); |
170 | 3.17k | for (i = 0, j = 0; i < specs->count; i++) |
171 | 3.11k | if (specs->paths[i].user_agent_exact_p) |
172 | 2.91k | newpaths[j++] = specs->paths[i]; |
173 | 194 | else |
174 | 194 | xfree (specs->paths[i].path); |
175 | 57 | assert (j == cnt); |
176 | 57 | xfree (specs->paths); |
177 | 57 | specs->paths = newpaths; |
178 | 57 | specs->count = cnt; |
179 | 57 | specs->size = cnt; |
180 | 57 | } |
181 | | |
/* True when P has reached the (logical) end of the current line.
   Relies on a `lineend' variable being in scope at the use site. */
#define EOL(p) ((p) >= lineend)

/* Advance P past whitespace, never moving past the end of line. */
#define SKIP_SPACE(p) do {                      \
  while (!EOL (p) && c_isspace (*p))            \
    ++p;                                        \
} while (0)

/* Case-insensitively compare the field name delimited by the
   `field_b'/`field_e' variables in scope against STRING_LITERAL. */
#define FIELD_IS(string_literal)                \
  BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
191 | | |
192 | | /* Parse textual RES specs beginning with SOURCE of length LENGTH. |
193 | | Return a specs objects ready to be fed to res_match_path. |
194 | | |
195 | | The parsing itself is trivial, but creating a correct SPECS object |
196 | | is trickier than it seems, because RES is surprisingly byzantine if |
197 | | you attempt to implement it correctly. |
198 | | |
199 | | A "record" is a block of one or more `User-Agent' lines followed by |
200 | | one or more `Allow' or `Disallow' lines. Record is accepted by |
201 | | Wget if one of the `User-Agent' lines was "wget", or if the user |
202 | | agent line was "*". |
203 | | |
204 | | After all the lines have been read, we examine whether an exact |
205 | | ("wget") user-agent field was specified. If so, we delete all the |
206 | | lines read under "User-Agent: *" blocks because we have our own |
207 | | Wget-specific blocks. This enables the admin to say: |
208 | | |
209 | | User-Agent: * |
210 | | Disallow: / |
211 | | |
212 | | User-Agent: google |
213 | | User-Agent: wget |
214 | | Disallow: /cgi-bin |
215 | | |
216 | | This means that to Wget and to Google, /cgi-bin is disallowed, |
217 | | whereas for all other crawlers, everything is disallowed. |
218 | | res_parse is implemented so that the order of records doesn't |
219 | | matter. In the case above, the "User-Agent: *" could have come |
220 | | after the other one. */ |
221 | | |
struct robot_specs *
res_parse (const char *source, int length)
{
  int line_count = 1;           /* 1-based line number, for debug messages */

  const char *p = source;
  const char *end = source + length;

  /* true if last applicable user-agent field matches Wget. */
  bool user_agent_applies = false;

  /* true if last applicable user-agent field *exactly* matches
     Wget. */
  bool user_agent_exact = false;

  /* whether we ever encountered exact user agent. */
  bool found_exact = false;

  /* count of allow/disallow lines in the current "record", i.e. after
     the last `user-agent' instructions. */
  int record_count = 0;

  struct robot_specs *specs = xnew0 (struct robot_specs);

  while (1)
    {
      const char *lineend, *lineend_real;
      const char *field_b, *field_e;
      const char *value_b, *value_e;

      if (p == end)
        break;
      /* lineend_real points one past the '\n' (or at END on the last,
         unterminated line); the `next' label resumes parsing there. */
      lineend_real = memchr (p, '\n', end - p);
      if (lineend_real)
        ++lineend_real;
      else
        lineend_real = end;
      lineend = lineend_real;

      /* Before doing anything else, check whether the line is empty
         or comment-only. */
      SKIP_SPACE (p);
      if (EOL (p) || *p == '#')
        goto next;

      /* Make sure the end-of-line comments are respected by setting
         lineend to a location preceding the first comment.  Real line
         ending remains in lineend_real.  Only a '#' at line start or
         preceded by whitespace opens a comment (Bourne-shell style;
         see the deviations note at the top of the file). */
      for (lineend = p; lineend < lineend_real; lineend++)
        if ((lineend == p || c_isspace (*(lineend - 1)))
            && *lineend == '#')
          break;

      /* Ignore trailing whitespace in the same way. */
      while (lineend > p && c_isspace (*(lineend - 1)))
        --lineend;

      assert (!EOL (p));

      /* Scan the field name: letters, digits and '-' only. */
      field_b = p;
      while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
        ++p;
      field_e = p;

      SKIP_SPACE (p);
      if (field_b == field_e || EOL (p) || *p != ':')
        {
          DEBUGP (("Ignoring malformed line %d\n", line_count));
          goto next;
        }
      ++p;                      /* skip ':' */
      SKIP_SPACE (p);

      /* The value is everything up to the (comment-trimmed) line end. */
      value_b = p;
      while (!EOL (p))
        ++p;
      value_e = p;

      /* Finally, we have a syntactically valid line. */
      if (FIELD_IS ("user-agent"))
        {
          /* We have to support several cases:

             --previous records--

             User-Agent: foo
             User-Agent: Wget
             User-Agent: bar
             ... matching record ...

             User-Agent: baz
             User-Agent: qux
             ... non-matching record ...

             User-Agent: *
             ... matching record, but will be pruned later ...

             We have to respect `User-Agent' at the beginning of each
             new record simply because we don't know if we're going to
             encounter "Wget" among the agents or not.  Hence,
             match_user_agent is called when record_count != 0.

             But if record_count is 0, we have to keep calling it
             until it matches, and if that happens, we must not call
             it any more, until the next record.  Hence the other part
             of the condition. */
          if (record_count != 0 || user_agent_applies == false)
            match_user_agent (value_b, value_e - value_b,
                              &user_agent_applies, &user_agent_exact);
          if (user_agent_exact)
            found_exact = true;
          record_count = 0;
        }
      else if (FIELD_IS ("allow"))
        {
          if (user_agent_applies)
            {
              add_path (specs, value_b, value_e, true, user_agent_exact);
            }
          ++record_count;
        }
      else if (FIELD_IS ("disallow"))
        {
          if (user_agent_applies)
            {
              bool allowed = false;
              if (value_b == value_e)
                /* Empty "disallow" line means everything is *allowed*! */
                allowed = true;
              add_path (specs, value_b, value_e, allowed, user_agent_exact);
            }
          ++record_count;
        }
      else
        {
          DEBUGP (("Ignoring unknown field at line %d\n", line_count));
          goto next;
        }

    next:
      p = lineend_real;
      ++line_count;
    }

  if (found_exact)
    {
      /* We've encountered an exactly matching user-agent.  Throw out
         all the stuff with user-agent: *. */
      prune_non_exact (specs);
    }
  else if (specs->size > specs->count)
    {
      /* add_path normally over-allocates specs->paths.  Reallocate it
         to the correct size in order to conserve some memory. */
      specs->paths = xrealloc (specs->paths,
                               specs->count * sizeof (struct path_info));
      specs->size = specs->count;
    }

  return specs;
}
383 | | |
384 | | /* The same like res_parse, but first map the FILENAME into memory, |
385 | | and then parse it. */ |
386 | | |
387 | | struct robot_specs * |
388 | | res_parse_from_file (const char *filename) |
389 | 0 | { |
390 | 0 | struct robot_specs *specs; |
391 | 0 | struct file_memory *fm = wget_read_file (filename); |
392 | 0 | if (!fm) |
393 | 0 | { |
394 | 0 | logprintf (LOG_NOTQUIET, _("Cannot open %s: %s\n"), |
395 | 0 | filename, strerror (errno)); |
396 | 0 | return NULL; |
397 | 0 | } |
398 | 0 | specs = res_parse (fm->content, fm->length); |
399 | 0 | wget_read_file_free (fm); |
400 | 0 | return specs; |
401 | 0 | } |
402 | | |
403 | | static void |
404 | | free_specs (struct robot_specs *specs) |
405 | 1.27k | { |
406 | 1.27k | int i; |
407 | 12.8k | for (i = 0; i < specs->count; i++) |
408 | 11.5k | xfree (specs->paths[i].path); |
409 | 1.27k | xfree (specs->paths); |
410 | 1.27k | xfree (specs); |
411 | 1.27k | } |
412 | | |
413 | | /* Matching of a path according to the specs. */ |
414 | | |
/* If C is '%' and (ptr[1], ptr[2]) form a hexadecimal number, and if
   that number is not a numerical representation of '/', decode C and
   advance the pointer.  '/' is deliberately kept in its encoded form
   so that an escaped slash never matches a literal path separator.
   Reading ptr[2] is safe: it happens only when ptr[1] is a hex digit,
   i.e. not the terminating NUL. */

#define DECODE_MAYBE(c, ptr) do {                               \
  if (c == '%' && c_isxdigit (ptr[1]) && c_isxdigit (ptr[2]))   \
    {                                                           \
      unsigned char decoded = X2DIGITS_TO_NUM (ptr[1], ptr[2]); \
      if (decoded != '/')                                       \
        {                                                       \
          c = decoded;                                          \
          ptr += 2;                                             \
        }                                                       \
    }                                                           \
} while (0)
430 | | |
431 | | /* The inner matching engine: return true if RECORD_PATH matches |
432 | | URL_PATH. The rules for matching are described at |
433 | | <http://www.robotstxt.org/norobots-rfc.txt>, section 3.2.2. */ |
434 | | |
435 | | static bool |
436 | | matches (const char *record_path, const char *url_path) |
437 | 3.09k | { |
438 | 3.09k | const char *rp = record_path; |
439 | 3.09k | const char *up = url_path; |
440 | | |
441 | 4.24k | for (; ; ++rp, ++up) |
442 | 7.33k | { |
443 | 7.33k | char rc = *rp; |
444 | 7.33k | char uc = *up; |
445 | 7.33k | if (!rc) |
446 | 144 | return true; |
447 | 7.19k | if (!uc) |
448 | 237 | return false; |
449 | 6.95k | DECODE_MAYBE(rc, rp); |
450 | 6.95k | DECODE_MAYBE(uc, up); |
451 | 6.95k | if (rc != uc) |
452 | 2.71k | return false; |
453 | 6.95k | } |
454 | 3.09k | } |
455 | | |
456 | | /* Iterate through all paths in SPECS. For the first one that |
457 | | matches, return its allow/reject status. If none matches, |
458 | | retrieval is by default allowed. */ |
459 | | |
460 | | bool |
461 | | res_match_path (const struct robot_specs *specs, const char *path) |
462 | 1.27k | { |
463 | 1.27k | int i; |
464 | 1.27k | if (!specs) |
465 | 0 | return true; |
466 | 4.22k | for (i = 0; i < specs->count; i++) |
467 | 3.09k | if (matches (specs->paths[i].path, path)) |
468 | 144 | { |
469 | 144 | bool allowedp = specs->paths[i].allowedp; |
470 | 144 | DEBUGP (("%s path %s because of rule %s.\n", |
471 | 144 | allowedp ? "Allowing" : "Rejecting", |
472 | 144 | path, quote (specs->paths[i].path))); |
473 | 144 | return allowedp; |
474 | 144 | } |
475 | 1.13k | return true; |
476 | 1.27k | } |
477 | | |
478 | | /* Registering the specs. */ |
479 | | |
480 | | static struct hash_table *registered_specs; |
481 | | |
482 | | /* Register RES specs that below to server on HOST:PORT. They will |
483 | | later be retrievable using res_get_specs. */ |
484 | | |
void
res_register_specs (const char *host, int port, struct robot_specs *specs)
{
  struct robot_specs *old;
  char buf[256], *hp, *hp_old;

  /* Build the "host:port" lookup key.  Use the stack buffer when it
     fits; fall back to heap allocation for very long host names. */
  if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf))
    hp = aprintf("%s:%d", host, port);
    else
    hp = buf;

  if (!registered_specs)
    registered_specs = make_nocase_string_hash_table (0);

  if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
    {
      /* Key already present: reuse the table's own key (hp_old), free
         our temporary heap key if we made one, and replace the old
         specs with the new ones. */
      if (hp != buf)
        xfree (hp);
      if (old)
        free_specs (old);
      hash_table_put (registered_specs, hp_old, specs);
    }
  else
    {
      /* New entry: the table takes ownership of a heap-allocated key
         (either our aprintf result or a fresh copy of the stack buf). */
      hash_table_put (registered_specs, hp == buf ? xstrdup (hp) : hp, specs);
    }
}
512 | | |
513 | | /* Get the specs that belong to HOST:PORT. */ |
514 | | |
515 | | struct robot_specs * |
516 | | res_get_specs (const char *host, int port) |
517 | 0 | { |
518 | 0 | char buf[256], *hp; |
519 | |
|
520 | 0 | if (!registered_specs) |
521 | 0 | return NULL; |
522 | | |
523 | 0 | if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf)) |
524 | 0 | hp = aprintf("%s:%d", host, port); |
525 | 0 | else |
526 | 0 | hp = buf; |
527 | |
|
528 | 0 | return hash_table_get (registered_specs, hp); |
529 | 0 | } |
530 | | |
531 | | /* Loading the robots file. */ |
532 | | |
/* Path of the robots file relative to the server root. */
#define RES_SPECS_LOCATION "/robots.txt"
534 | | |
535 | | /* Retrieve the robots.txt from the server root of the server that |
536 | | serves URL. The file will be named according to the currently |
537 | | active rules, and the file name will be returned in *file. |
538 | | |
539 | | Return true if robots were retrieved OK, false otherwise. */ |
540 | | |
bool
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
  struct iri *i = iri_new ();
  uerr_t err;
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  /* Save the global options we override below so they can be
     restored before returning. */
  int saved_ts_val = opt.timestamping;
  int saved_sp_val = opt.spider, url_err;
  struct url * url_parsed;

  /* Copy server URI encoding for a possible IDNA transformation, no need to
     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
  set_uri_encoding (i, iri->uri_encoding, false);
  i->utf8_encode = false;

  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  *file = NULL;
  /* Timestamping and spider mode would interfere with a plain
     one-shot download of robots.txt; disable both temporarily. */
  opt.timestamping = false;
  opt.spider = false;

  url_parsed = url_parse (robots_url, &url_err, i, true);
  if (!url_parsed)
    {
      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, url_error (url_err));
      err = URLERROR;
    }
  else
    {
      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
                          false, i, false);
      url_free(url_parsed);
    }

  /* Restore the saved option values and release local resources. */
  opt.timestamping = saved_ts_val;
  opt.spider = saved_sp_val;
  xfree (robots_url);
  iri_free (i);

  if (err != RETROK && *file != NULL)
    {
      /* If the file is not retrieved correctly, but retrieve_url
         allocated the file name, deallocate is here so that the
         caller doesn't have to worry about it. */
      xfree (*file);
    }
  return err == RETROK;
}
588 | | |
589 | | bool |
590 | | is_robots_txt_url (const char *url) |
591 | 0 | { |
592 | 0 | char *robots_url = uri_merge (url, RES_SPECS_LOCATION); |
593 | 0 | bool ret = are_urls_equal (url, robots_url); |
594 | |
|
595 | 0 | xfree (robots_url); |
596 | |
|
597 | 0 | return ret; |
598 | 0 | } |
599 | | |
600 | | #if defined DEBUG_MALLOC || defined TESTING |
void
res_cleanup (void)
{
  /* Free every registered spec and the registry itself, so leak
     checkers see a clean heap at exit. */
  if (registered_specs)
    {
      hash_table_iterator iter;
      for (hash_table_iterate (registered_specs, &iter);
           hash_table_iter_next (&iter);
           )
        {
          xfree (iter.key);         /* the "host:port" key the table owns */
          free_specs (iter.value);
        }
      hash_table_destroy (registered_specs);
      registered_specs = NULL;
    }
}
618 | | #endif |
619 | | |
620 | | #ifdef TESTING |
621 | | |
622 | | const char * |
623 | | test_is_robots_txt_url(void) |
624 | 0 | { |
625 | 0 | unsigned i; |
626 | 0 | static const struct { |
627 | 0 | const char *url; |
628 | 0 | bool expected_result; |
629 | 0 | } test_array[] = { |
630 | 0 | { "http://www.yoyodyne.com/robots.txt", true }, |
631 | 0 | { "http://www.yoyodyne.com/somepath/", false }, |
632 | 0 | { "http://www.yoyodyne.com/somepath/robots.txt", false }, |
633 | 0 | }; |
634 | |
|
635 | 0 | for (i = 0; i < countof(test_array); ++i) |
636 | 0 | { |
637 | 0 | mu_assert ("test_is_robots_txt_url: wrong result", |
638 | 0 | is_robots_txt_url (test_array[i].url) == test_array[i].expected_result); |
639 | 0 | } |
640 | | |
641 | 0 | return NULL; |
642 | 0 | } |
643 | | |
644 | | #endif /* TESTING */ |
645 | | |
646 | | /* |
647 | | * vim: et ts=2 sw=2 |
648 | | */ |