Coverage Report

Created: 2023-11-19 07:52

/src/wget/src/res.c
Line
Count
Source (jump to first uncovered line)
1
/* Support for Robot Exclusion Standard (RES).
2
   Copyright (C) 2001, 2006-2011, 2015, 2018-2023 Free Software
3
   Foundation, Inc.
4
5
This file is part of Wget.
6
7
This program is free software; you can redistribute it and/or modify
8
it under the terms of the GNU General Public License as published by
9
the Free Software Foundation; either version 3 of the License, or (at
10
your option) any later version.
11
12
This program is distributed in the hope that it will be useful, but
13
WITHOUT ANY WARRANTY; without even the implied warranty of
14
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
General Public License for more details.
16
17
You should have received a copy of the GNU General Public License
18
along with Wget.  If not, see <http://www.gnu.org/licenses/>.
19
20
Additional permission under GNU GPL version 3 section 7
21
22
If you modify this program, or any covered work, by linking or
23
combining it with the OpenSSL project's OpenSSL library (or a
24
modified version of that library), containing parts covered by the
25
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26
grants you additional permission to convey the resulting work.
27
Corresponding Source for a non-source form of such a combination
28
shall include the source code for the parts of OpenSSL used as well
29
as that of the covered work.  */
30
31
/* This file implements the Robot Exclusion Standard (RES).
32
33
   RES is a simple protocol that enables site admins to signalize to
34
   the web crawlers that certain parts of the site should not be
35
   accessed.  All the admin needs to do is create a "robots.txt" file
36
   in the web server root, and use simple commands to allow or
37
   disallow access to certain parts of the site.
38
39
   The first specification was written by Martijn Koster in 1994, and
40
   is still available at <http://www.robotstxt.org/orig.html>.
41
   In 1996, Martijn wrote an Internet Draft specifying an improved RES
42
   specification; however, that work was apparently abandoned since
43
   the draft has expired in 1997 and hasn't been replaced since.  The
44
   draft is available at
45
   <http://www.robotstxt.org/norobots-rfc.txt>.
46
47
   This file implements RES as specified by the draft.  Note that this
48
   only handles the "robots.txt" support.  The META tag that controls
49
   whether the links should be followed is handled in `html-url.c'.
50
51
   Known deviations:
52
53
   * The end-of-line comment recognition is more in the spirit of the
54
     Bourne Shell (as specified by RES-1994).  That means that
55
     "foo#bar" is taken literally, whereas "foo #bar" is interpreted
56
     as "foo".  The Draft apparently specifies that both should be
57
     interpreted as "foo".
58
59
   * We don't recognize sole CR as the line ending.
60
61
   * We don't implement expiry mechanism for /robots.txt specs.  I
62
     consider it non-necessary for a relatively short-lived
63
     application such as Wget.  Besides, it is highly questionable
64
     whether anyone deploys the recommended expiry scheme for
65
     robots.txt.
66
67
   Entry points are functions res_parse, res_parse_from_file,
68
   res_match_path, res_register_specs, res_get_specs, and
69
   res_retrieve_file.  */
70
71
#include "wget.h"
72
73
#include <stdio.h>
74
#include <stdlib.h>
75
#include <string.h>
76
#include <errno.h>
77
#include <assert.h>
78
79
#include "utils.h"
80
#include "hash.h"
81
#include "url.h"
82
#include "retr.h"
83
#include "res.h"
84
#include "c-strcase.h"
85
86
#ifdef TESTING
87
#include "../tests/unit-tests.h"
88
#endif
89
90
/* One allow/disallow rule taken from a robots.txt record.  */
struct path_info {
  /* Heap-allocated path prefix of the rule.  add_path stores it
     without a leading slash.  */
  char *path;
  /* Verdict returned by res_match_path when this rule matches.  */
  bool allowedp;
  /* True when the rule was read under a user-agent line that named
     "wget" exactly rather than "*".  */
  bool user_agent_exact_p;
};
95
96
/* Growable array of rules collected from one robots.txt file.  */
struct robot_specs {
  /* Number of entries of PATHS that are in use.  */
  int count;
  /* Number of entries PATHS has room for.  */
  int size;
  /* The rules, in the order in which they were added.  */
  struct path_info *paths;
};
101
102
/* Parsing the robot spec. */
103
104
/* Classify the user-agent token AGENT, which is LENGTH bytes long.

   *MATCHES is set to true when the token is "*" or compares equal to
   "wget" (via BOUNDED_EQUAL_NO_CASE), and to false otherwise.
   *EXACT_MATCH is set to true only in the "wget" case.  Both outputs
   are always written.  */

static void
match_user_agent (const char *agent, int length,
                  bool *matches, bool *exact_match)
{
  bool is_wildcard = (length == 1 && *agent == '*');
  bool is_wget = false;

  /* The wildcard takes precedence, so only look for "wget" when the
     token is not "*".  */
  if (!is_wildcard)
    is_wget = BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget");

  *matches = is_wildcard || is_wget;
  *exact_match = is_wget;
}
128
129
/* Add a path specification between PATH_B and PATH_E as one of the
130
   paths in SPECS.  */
131
132
static void
133
add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
134
          bool allowedp, bool exactp)
135
11.7k
{
136
11.7k
  struct path_info pp;
137
11.7k
  if (path_b < path_e && *path_b == '/')
138
    /* Our path representation doesn't use a leading slash, so remove
139
       one from theirs. */
140
393
    ++path_b;
141
11.7k
  pp.path     = strdupdelim (path_b, path_e);
142
11.7k
  pp.allowedp = allowedp;
143
11.7k
  pp.user_agent_exact_p = exactp;
144
11.7k
  ++specs->count;
145
11.7k
  if (specs->count > specs->size)
146
1.28k
    {
147
1.28k
      if (specs->size == 0)
148
353
        specs->size = 1;
149
928
      else
150
928
        specs->size <<= 1;
151
1.28k
      specs->paths = xrealloc (specs->paths,
152
1.28k
                               specs->size * sizeof (struct path_info));
153
1.28k
    }
154
11.7k
  specs->paths[specs->count - 1] = pp;
155
11.7k
}
156
157
/* Recreate SPECS->paths with only those paths that have
158
   user_agent_exact_p set to true.  */
159
160
static void
161
prune_non_exact (struct robot_specs *specs)
162
57
{
163
57
  struct path_info *newpaths;
164
57
  int i, j, cnt;
165
57
  cnt = 0;
166
3.17k
  for (i = 0; i < specs->count; i++)
167
3.11k
    if (specs->paths[i].user_agent_exact_p)
168
2.91k
      ++cnt;
169
57
  newpaths = xnew_array (struct path_info, cnt);
170
3.17k
  for (i = 0, j = 0; i < specs->count; i++)
171
3.11k
    if (specs->paths[i].user_agent_exact_p)
172
2.91k
      newpaths[j++] = specs->paths[i];
173
194
   else
174
194
     xfree (specs->paths[i].path);
175
57
  assert (j == cnt);
176
57
  xfree (specs->paths);
177
57
  specs->paths = newpaths;
178
57
  specs->count = cnt;
179
57
  specs->size  = cnt;
180
57
}
181
182
581k
/* True once P has reached the logical end of the current line.  The
   expansion refers to a variable named `lineend' that must be in scope
   where the macro is used.  */
#define EOL(p) ((p) >= lineend)

/* Advance P over whitespace without running past the end of line.  */
#define SKIP_SPACE(p) do {                  \
  for (; !EOL (p) && c_isspace (*p); ++p)   \
    ;                                       \
} while (0)

/* Compare the current field with STRING_LITERAL, ignoring case.  The
   expansion refers to variables named `field_b' and `field_e' that
   must be in scope where the macro is used.  */
#define FIELD_IS(string_literal)        \
  BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
191
192
/* Parse textual RES specs beginning with SOURCE of length LENGTH.
193
   Return a specs object ready to be fed to res_match_path.
194
195
   The parsing itself is trivial, but creating a correct SPECS object
196
   is trickier than it seems, because RES is surprisingly byzantine if
197
   you attempt to implement it correctly.
198
199
   A "record" is a block of one or more `User-Agent' lines followed by
200
   one or more `Allow' or `Disallow' lines.  Record is accepted by
201
   Wget if one of the `User-Agent' lines was "wget", or if the user
202
   agent line was "*".
203
204
   After all the lines have been read, we examine whether an exact
205
   ("wget") user-agent field was specified.  If so, we delete all the
206
   lines read under "User-Agent: *" blocks because we have our own
207
   Wget-specific blocks.  This enables the admin to say:
208
209
       User-Agent: *
210
       Disallow: /
211
212
       User-Agent: google
213
       User-Agent: wget
214
       Disallow: /cgi-bin
215
216
   This means that to Wget and to Google, /cgi-bin is disallowed,
217
   whereas for all other crawlers, everything is disallowed.
218
   res_parse is implemented so that the order of records doesn't
219
   matter.  In the case above, the "User-Agent: *" could have come
220
   after the other one.  */
221
222
struct robot_specs *
223
res_parse (const char *source, int length)
224
1.27k
{
225
1.27k
  int line_count = 1;
226
227
1.27k
  const char *p   = source;
228
1.27k
  const char *end = source + length;
229
230
  /* true if last applicable user-agent field matches Wget. */
231
1.27k
  bool user_agent_applies = false;
232
233
  /* true if last applicable user-agent field *exactly* matches
234
     Wget.  */
235
1.27k
  bool user_agent_exact = false;
236
237
  /* whether we ever encountered exact user agent. */
238
1.27k
  bool found_exact = false;
239
240
  /* count of allow/disallow lines in the current "record", i.e. after
241
     the last `user-agent' instructions.  */
242
1.27k
  int record_count = 0;
243
244
1.27k
  struct robot_specs *specs = xnew0 (struct robot_specs);
245
246
21.9k
  while (1)
247
21.9k
    {
248
21.9k
      const char *lineend, *lineend_real;
249
21.9k
      const char *field_b, *field_e;
250
21.9k
      const char *value_b, *value_e;
251
252
21.9k
      if (p == end)
253
1.27k
        break;
254
20.6k
      lineend_real = memchr (p, '\n', end - p);
255
20.6k
      if (lineend_real)
256
19.3k
        ++lineend_real;
257
1.26k
      else
258
1.26k
        lineend_real = end;
259
20.6k
      lineend = lineend_real;
260
261
      /* Before doing anything else, check whether the line is empty
262
         or comment-only. */
263
20.6k
      SKIP_SPACE (p);
264
20.6k
      if (EOL (p) || *p == '#')
265
1.71k
        goto next;
266
267
      /* Make sure the end-of-line comments are respected by setting
268
         lineend to a location preceding the first comment.  Real line
269
         ending remains in lineend_real.  */
270
249k
      for (lineend = p; lineend < lineend_real; lineend++)
271
230k
        if ((lineend == p || c_isspace (*(lineend - 1)))
272
230k
            && *lineend == '#')
273
197
          break;
274
275
      /* Ignore trailing whitespace in the same way. */
276
37.4k
      while (lineend > p && c_isspace (*(lineend - 1)))
277
18.4k
        --lineend;
278
279
18.9k
      assert (!EOL (p));
280
281
18.9k
      field_b = p;
282
161k
      while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
283
142k
        ++p;
284
18.9k
      field_e = p;
285
286
18.9k
      SKIP_SPACE (p);
287
18.9k
      if (field_b == field_e || EOL (p) || *p != ':')
288
1.91k
        {
289
1.91k
          DEBUGP (("Ignoring malformed line %d\n", line_count));
290
1.91k
          goto next;
291
1.91k
        }
292
17.0k
      ++p;                      /* skip ':' */
293
17.0k
      SKIP_SPACE (p);
294
295
17.0k
      value_b = p;
296
60.3k
      while (!EOL (p))
297
43.3k
        ++p;
298
17.0k
      value_e = p;
299
300
      /* Finally, we have a syntactically valid line. */
301
17.0k
      if (FIELD_IS ("user-agent"))
302
2.02k
        {
303
          /* We have to support several cases:
304
305
             --previous records--
306
307
             User-Agent: foo
308
             User-Agent: Wget
309
             User-Agent: bar
310
             ... matching record ...
311
312
             User-Agent: baz
313
             User-Agent: qux
314
             ... non-matching record ...
315
316
             User-Agent: *
317
             ... matching record, but will be pruned later ...
318
319
             We have to respect `User-Agent' at the beginning of each
320
             new record simply because we don't know if we're going to
321
             encounter "Wget" among the agents or not.  Hence,
322
             match_user_agent is called when record_count != 0.
323
324
             But if record_count is 0, we have to keep calling it
325
             until it matches, and if that happens, we must not call
326
             it any more, until the next record.  Hence the other part
327
             of the condition.  */
328
2.02k
          if (record_count != 0 || user_agent_applies == false)
329
1.63k
            match_user_agent (value_b, value_e - value_b,
330
1.63k
                              &user_agent_applies, &user_agent_exact);
331
2.02k
          if (user_agent_exact)
332
443
            found_exact = true;
333
2.02k
          record_count = 0;
334
2.02k
        }
335
14.9k
      else if (FIELD_IS ("allow"))
336
12.6k
        {
337
12.6k
          if (user_agent_applies)
338
10.9k
            {
339
10.9k
              add_path (specs, value_b, value_e, true, user_agent_exact);
340
10.9k
            }
341
12.6k
          ++record_count;
342
12.6k
        }
343
2.37k
      else if (FIELD_IS ("disallow"))
344
1.01k
        {
345
1.01k
          if (user_agent_applies)
346
822
            {
347
822
              bool allowed = false;
348
822
              if (value_b == value_e)
349
                /* Empty "disallow" line means everything is *allowed*!  */
350
415
                allowed = true;
351
822
              add_path (specs, value_b, value_e, allowed, user_agent_exact);
352
822
            }
353
1.01k
          ++record_count;
354
1.01k
        }
355
1.35k
      else
356
1.35k
        {
357
1.35k
          DEBUGP (("Ignoring unknown field at line %d\n", line_count));
358
1.35k
          goto next;
359
1.35k
        }
360
361
20.6k
    next:
362
20.6k
      p = lineend_real;
363
20.6k
      ++line_count;
364
20.6k
    }
365
366
1.27k
  if (found_exact)
367
57
    {
368
      /* We've encountered an exactly matching user-agent.  Throw out
369
         all the stuff with user-agent: *.  */
370
57
      prune_non_exact (specs);
371
57
    }
372
1.22k
  else if (specs->size > specs->count)
373
67
    {
374
      /* add_path normally over-allocates specs->paths.  Reallocate it
375
         to the correct size in order to conserve some memory.  */
376
67
      specs->paths = xrealloc (specs->paths,
377
67
                               specs->count * sizeof (struct path_info));
378
67
      specs->size = specs->count;
379
67
    }
380
381
1.27k
  return specs;
382
1.27k
}
383
384
/* The same like res_parse, but first map the FILENAME into memory,
385
   and then parse it.  */
386
387
struct robot_specs *
388
res_parse_from_file (const char *filename)
389
0
{
390
0
  struct robot_specs *specs;
391
0
  struct file_memory *fm = wget_read_file (filename);
392
0
  if (!fm)
393
0
    {
394
0
      logprintf (LOG_NOTQUIET, _("Cannot open %s: %s\n"),
395
0
                 filename, strerror (errno));
396
0
      return NULL;
397
0
    }
398
0
  specs = res_parse (fm->content, fm->length);
399
0
  wget_read_file_free (fm);
400
0
  return specs;
401
0
}
402
403
static void
404
free_specs (struct robot_specs *specs)
405
1.27k
{
406
1.27k
  int i;
407
12.8k
  for (i = 0; i < specs->count; i++)
408
11.5k
    xfree (specs->paths[i].path);
409
1.27k
  xfree (specs->paths);
410
1.27k
  xfree (specs);
411
1.27k
}
412
413
/* Matching of a path according to the specs. */
414
415
/* C holds the character at *PTR.  When C is '%' and the two characters
   that follow are hexadecimal digits, compute the byte they encode.
   Unless that byte is '/', store it in C and advance PTR by two so
   that it points at the last digit of the escape.  An encoded '/' is
   deliberately left undecoded.  */

#define DECODE_MAYBE(c, ptr) do {                                       \
  if ((c) == '%' && c_isxdigit ((ptr)[1]) && c_isxdigit ((ptr)[2]))     \
    {                                                                   \
      unsigned char octet_ = X2DIGITS_TO_NUM ((ptr)[1], (ptr)[2]);      \
      if (octet_ != '/')                                                \
        {                                                               \
          (c) = octet_;                                                 \
          (ptr) += 2;                                                   \
        }                                                               \
    }                                                                   \
} while (0)
430
431
/* The inner matching engine: return true if RECORD_PATH is a prefix of
   URL_PATH, comparing character by character after DECODE_MAYBE has
   been applied to both sides.  The rules for matching are described at
   <http://www.robotstxt.org/norobots-rfc.txt>, section 3.2.2.  */

static bool
matches (const char *record_path, const char *url_path)
{
  const char *rule = record_path;
  const char *url = url_path;

  /* Running out of rule text means every rule character matched.  */
  while (*rule)
    {
      char rule_ch = *rule;
      char url_ch = *url;

      /* The URL ended before the rule did.  */
      if (!url_ch)
        return false;

      DECODE_MAYBE (rule_ch, rule);
      DECODE_MAYBE (url_ch, url);
      if (rule_ch != url_ch)
        return false;

      ++rule;
      ++url;
    }
  return true;
}
455
456
/* Iterate through all paths in SPECS.  For the first one that
457
   matches, return its allow/reject status.  If none matches,
458
   retrieval is by default allowed.  */
459
460
bool
461
res_match_path (const struct robot_specs *specs, const char *path)
462
1.27k
{
463
1.27k
  int i;
464
1.27k
  if (!specs)
465
0
    return true;
466
4.22k
  for (i = 0; i < specs->count; i++)
467
3.09k
    if (matches (specs->paths[i].path, path))
468
144
      {
469
144
        bool allowedp = specs->paths[i].allowedp;
470
144
        DEBUGP (("%s path %s because of rule %s.\n",
471
144
                 allowedp ? "Allowing" : "Rejecting",
472
144
                 path, quote (specs->paths[i].path)));
473
144
        return allowedp;
474
144
      }
475
1.13k
  return true;
476
1.27k
}
477
478
/* Registering the specs. */
479
480
/* Table of registered specs, keyed by "host:port" strings.  It is
   created on the first call to res_register_specs.  */
static struct hash_table *registered_specs = NULL;
481
482
/* Register RES specs that below to server on HOST:PORT.  They will
483
   later be retrievable using res_get_specs.  */
484
485
void
486
res_register_specs (const char *host, int port, struct robot_specs *specs)
487
1.27k
{
488
1.27k
  struct robot_specs *old;
489
1.27k
  char buf[256], *hp, *hp_old;
490
491
1.27k
  if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf))
492
0
    hp = aprintf("%s:%d", host, port);
493
1.27k
  else
494
1.27k
    hp = buf;
495
496
1.27k
  if (!registered_specs)
497
1.27k
    registered_specs = make_nocase_string_hash_table (0);
498
499
1.27k
  if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
500
0
    {
501
0
      if (hp != buf)
502
0
        xfree (hp);
503
0
      if (old)
504
0
        free_specs (old);
505
0
      hash_table_put (registered_specs, hp_old, specs);
506
0
    }
507
1.27k
  else
508
1.27k
    {
509
1.27k
      hash_table_put (registered_specs, hp == buf ? xstrdup (hp) : hp, specs);
510
1.27k
    }
511
1.27k
}
512
513
/* Get the specs that belong to HOST:PORT. */
514
515
struct robot_specs *
516
res_get_specs (const char *host, int port)
517
0
{
518
0
  char buf[256], *hp;
519
520
0
  if (!registered_specs)
521
0
    return NULL;
522
523
0
  if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf))
524
0
    hp = aprintf("%s:%d", host, port);
525
0
  else
526
0
    hp = buf;
527
528
0
  return hash_table_get (registered_specs, hp);
529
0
}
530
531
/* Loading the robots file.  */
532
533
0
#define RES_SPECS_LOCATION "/robots.txt"
534
535
/* Retrieve the robots.txt from the server root of the server that
536
   serves URL.  The file will be named according to the currently
537
   active rules, and the file name will be returned in *file.
538
539
   Return true if robots were retrieved OK, false otherwise.  */
540
541
bool
542
res_retrieve_file (const char *url, char **file, struct iri *iri)
543
0
{
544
0
  struct iri *i = iri_new ();
545
0
  uerr_t err;
546
0
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
547
0
  int saved_ts_val = opt.timestamping;
548
0
  int saved_sp_val = opt.spider, url_err;
549
0
  struct url * url_parsed;
550
551
  /* Copy server URI encoding for a possible IDNA transformation, no need to
552
     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
553
0
  set_uri_encoding (i, iri->uri_encoding, false);
554
0
  i->utf8_encode = false;
555
556
0
  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
557
0
  *file = NULL;
558
0
  opt.timestamping = false;
559
0
  opt.spider       = false;
560
561
0
  url_parsed = url_parse (robots_url, &url_err, i, true);
562
0
  if (!url_parsed)
563
0
    {
564
0
      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, url_error (url_err));
565
0
      err = URLERROR;
566
0
    }
567
0
  else
568
0
    {
569
0
      err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
570
0
                          false, i, false);
571
0
      url_free(url_parsed);
572
0
    }
573
574
0
  opt.timestamping = saved_ts_val;
575
0
  opt.spider       = saved_sp_val;
576
0
  xfree (robots_url);
577
0
  iri_free (i);
578
579
0
  if (err != RETROK && *file != NULL)
580
0
    {
581
      /* If the file is not retrieved correctly, but retrieve_url
582
         allocated the file name, deallocate is here so that the
583
         caller doesn't have to worry about it.  */
584
0
      xfree (*file);
585
0
    }
586
0
  return err == RETROK;
587
0
}
588
589
bool
590
is_robots_txt_url (const char *url)
591
0
{
592
0
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
593
0
  bool ret = are_urls_equal (url, robots_url);
594
595
0
  xfree (robots_url);
596
597
0
  return ret;
598
0
}
599
600
#if defined DEBUG_MALLOC || defined TESTING
/* Free every registered key and specs object, destroy the registry
   table and reset it to NULL.  Does nothing if nothing was ever
   registered.  */
void
res_cleanup (void)
{
  hash_table_iterator iter;

  if (!registered_specs)
    return;

  hash_table_iterate (registered_specs, &iter);
  while (hash_table_iter_next (&iter))
    {
      xfree (iter.key);
      free_specs (iter.value);
    }

  hash_table_destroy (registered_specs);
  registered_specs = NULL;
}
#endif
619
620
#ifdef TESTING

/* Unit test for is_robots_txt_url.  Returns NULL on success; on a
   mismatch mu_assert is expected to return the failure message.  */
const char *
test_is_robots_txt_url(void)
{
  static const struct {
    const char *input;
    bool want;
  } cases[] = {
    { "http://www.yoyodyne.com/robots.txt", true },
    { "http://www.yoyodyne.com/somepath/", false },
    { "http://www.yoyodyne.com/somepath/robots.txt", false },
  };
  unsigned idx = 0;

  while (idx < countof(cases))
    {
      mu_assert ("test_is_robots_txt_url: wrong result",
                 is_robots_txt_url (cases[idx].input) == cases[idx].want);
      ++idx;
    }

  return NULL;
}

#endif /* TESTING */
645
646
/*
647
 * vim: et ts=2 sw=2
648
 */