/src/wget/src/res.c

Source (jump to first uncovered line)
/* Support for Robot Exclusion Standard (RES).
   Copyright (C) 2001, 2006-2011, 2015, 2018-2023 Free Software
   Foundation, Inc.

This file is part of Wget.

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 3 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
General Public License for more details.

You should have received a copy of the GNU General Public License
along with Wget.  If not, see <http://www.gnu.org/licenses/>.

Additional permission under GNU GPL version 3 section 7

If you modify this program, or any covered work, by linking or
combining it with the OpenSSL project's OpenSSL library (or a
modified version of that library), containing parts covered by the
terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
grants you additional permission to convey the resulting work.
Corresponding Source for a non-source form of such a combination
shall include the source code for the parts of OpenSSL used as well
as that of the covered work.  */

/* This file implements the Robot Exclusion Standard (RES).

   RES is a simple protocol that enables site admins to signalize to
   the web crawlers that certain parts of the site should not be
   accessed.  All the admin needs to do is create a "robots.txt" file
   in the web server root, and use simple commands to allow or
   disallow access to certain parts of the site.

   The first specification was written by Martijn Koster in 1994, and
   is still available at <http://www.robotstxt.org/orig.html>.
   In 1996, Martijn wrote an Internet Draft specifying an improved RES
   specification; however, that work was apparently abandoned since
   the draft has expired in 1997 and hasn't been replaced since.  The
   draft is available at
   <http://www.robotstxt.org/norobots-rfc.txt>.

   This file implements RES as specified by the draft.  Note that this
   only handles the "robots.txt" support.  The META tag that controls
   whether the links should be followed is handled in `html-url.c'.

   Known deviations:

   * The end-of-line comment recognition is more in the spirit of the
     Bourne Shell (as specified by RES-1994).  That means that
     "foo#bar" is taken literally, whereas "foo #bar" is interpreted
     as "foo".  The Draft apparently specifies that both should be
     interpreted as "foo".

   * We don't recognize sole CR as the line ending.

   * We don't implement expiry mechanism for /robots.txt specs.  I
     consider it non-necessary for a relatively short-lived
     application such as Wget.  Besides, it is highly questionable
     whether anyone deploys the recommended expiry scheme for
     robots.txt.

   Entry points are functions res_parse, res_parse_from_file,
   res_match_path, res_register_specs, res_get_specs, and
   res_retrieve_file.  */

#include "wget.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <errno.h>
#include <assert.h>

#include "utils.h"
#include "hash.h"
#include "url.h"
#include "retr.h"
#include "res.h"
#include "c-strcase.h"

#ifdef TESTING
#include "../tests/unit-tests.h"
#endif

/* One Allow/Disallow rule collected by res_parse.  */
struct path_info {
  /* Rule path.  add_path stores a strdupdelim copy with one leading
     '/' dropped; it is released with xfree.  */
  char *path;
  /* Verdict res_match_path returns when this rule matches a path.  */
  bool allowedp;
  /* True when the rule was read under a User-Agent line that matched
     "wget" rather than "*"; prune_non_exact keeps only such rules.  */
  bool user_agent_exact_p;
};

/* Growable array of rules produced by res_parse and consulted, in
   order, by res_match_path.  */
struct robot_specs {
  int count;                    /* entries of PATHS currently in use */
  int size;                     /* entries allocated for PATHS */
  struct path_info *paths;      /* the rules; grown by add_path */
};

/* Parsing the robot spec. */

/* Classify the user-agent token AGENT, which is LENGTH characters
   long.  *MATCHES is set to true when the token is the single
   character "*" or compares equal to "wget" under
   BOUNDED_EQUAL_NO_CASE, and to false otherwise.  *EXACT_MATCH is set
   to true only in the "wget" case.  Both outputs are always
   written.  */

static void
match_user_agent (const char *agent, int length,
                  bool *matches, bool *exact_match)
{
  /* The length test comes first so *AGENT is only read when there is
     exactly one character to look at.  */
  bool is_wildcard = (length == 1 && *agent == '*');
  bool is_wget = !is_wildcard
    && BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget");

  *matches = is_wildcard || is_wget;
  *exact_match = is_wget;
}

/* Add a path specification between PATH_B and PATH_E as one of the
   paths in SPECS.  */

static void
add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
          bool allowedp, bool exactp)
{
  struct path_info pp;
  if (path_b < path_e && *path_b == '/')
    /* Our path representation doesn't use a leading slash, so remove
       one from theirs. */
    ++path_b;
  pp.path     = strdupdelim (path_b, path_e);
  pp.allowedp = allowedp;
  pp.user_agent_exact_p = exactp;
  ++specs->count;
  if (specs->count > specs->size)
    {
      if (specs->size == 0)
        specs->size = 1;
      else
        specs->size <<= 1;
      specs->paths = xrealloc (specs->paths,
                               specs->size * sizeof (struct path_info));
    }
  specs->paths[specs->count - 1] = pp;
}

/* Replace SPECS->paths with a freshly allocated array that holds only
   the rules whose user_agent_exact_p flag is set, in their original
   order.  The path strings of the discarded rules are freed, and
   SPECS->count and SPECS->size both become the number of rules
   kept.  */

static void
prune_non_exact (struct robot_specs *specs)
{
  struct path_info *kept;
  int total = specs->count;
  int n_exact = 0;
  int src;
  int dst = 0;

  /* First pass: find out how many rules survive.  */
  for (src = 0; src < total; src++)
    {
      if (specs->paths[src].user_agent_exact_p)
        n_exact++;
    }

  kept = xnew_array (struct path_info, n_exact);

  /* Second pass: move the survivors, release the rest.  */
  for (src = 0; src < total; src++)
    {
      if (specs->paths[src].user_agent_exact_p)
        kept[dst++] = specs->paths[src];
      else
        xfree (specs->paths[src].path);
    }
  assert (dst == n_exact);

  xfree (specs->paths);
  specs->paths = kept;
  specs->count = n_exact;
  specs->size  = n_exact;
}

/* Helper macros for res_parse.  They refer to variables that must
   exist where the macro is expanded: `lineend' for EOL and
   SKIP_SPACE, `field_b' and `field_e' for FIELD_IS.  */

/* True once P has reached or passed the logical end of the line.  */
#define EOL(p) (lineend <= (p))

/* Advance P over whitespace without running past the end of line.  */
#define SKIP_SPACE(p) do {                      \
  for (; !EOL (p) && c_isspace (*(p)); ++(p))   \
    ;                                           \
} while (0)

/* Compare the field name delimited by field_b/field_e against
   STRING_LITERAL using BOUNDED_EQUAL_NO_CASE.  */
#define FIELD_IS(string_literal)        \
  BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)

/* Parse textual RES specs beginning with SOURCE of length LENGTH.
   Return a specs objects ready to be fed to res_match_path.

   The parsing itself is trivial, but creating a correct SPECS object
   is trickier than it seems, because RES is surprisingly byzantine if
   you attempt to implement it correctly.

   A "record" is a block of one or more `User-Agent' lines followed by
   one or more `Allow' or `Disallow' lines.  Record is accepted by
   Wget if one of the `User-Agent' lines was "wget", or if the user
   agent line was "*".

   After all the lines have been read, we examine whether an exact
   ("wget") user-agent field was specified.  If so, we delete all the
   lines read under "User-Agent: *" blocks because we have our own
   Wget-specific blocks.  This enables the admin to say:

       User-Agent: *
       Disallow: /

       User-Agent: google
       User-Agent: wget
       Disallow: /cgi-bin

   This means that to Wget and to Google, /cgi-bin is disallowed,
   whereas for all other crawlers, everything is disallowed.
   res_parse is implemented so that the order of records doesn't
   matter.  In the case above, the "User-Agent: *" could have come
   after the other one.  */

struct robot_specs *
res_parse (const char *source, int length)
{
  /* 1-based number of the line being parsed; used only in debug
     messages.  */
  int line_count = 1;

  const char *p   = source;
  const char *end = source + length;

  /* true if last applicable user-agent field matches Wget. */
  bool user_agent_applies = false;

  /* true if last applicable user-agent field *exactly* matches
     Wget.  */
  bool user_agent_exact = false;

  /* whether we ever encountered exact user agent. */
  bool found_exact = false;

  /* count of allow/disallow lines in the current "record", i.e. after
     the last `user-agent' instructions.  */
  int record_count = 0;

  struct robot_specs *specs = xnew0 (struct robot_specs);

  /* Each iteration handles one physical line of the input.  */
  while (1)
    {
      /* lineend is the logical end of the line (before any trailing
         comment and whitespace) and is what the EOL and SKIP_SPACE
         macros test against; lineend_real is where the next line
         starts.  */
      const char *lineend, *lineend_real;
      const char *field_b, *field_e;
      const char *value_b, *value_e;

      /* The whole buffer has been consumed.  */
      if (p == end)
        break;
      /* The line extends through the next '\n' (inclusive), or to END
         if the last line is unterminated.  */
      lineend_real = memchr (p, '\n', end - p);
      if (lineend_real)
        ++lineend_real;
      else
        lineend_real = end;
      lineend = lineend_real;

      /* Before doing anything else, check whether the line is empty
         or comment-only. */
      SKIP_SPACE (p);
      if (EOL (p) || *p == '#')
        goto next;

      /* Make sure the end-of-line comments are respected by setting
         lineend to a location preceding the first comment.  Real line
         ending remains in lineend_real.  A '#' only starts a comment
         when it is preceded by whitespace.  */
      for (lineend = p; lineend < lineend_real; lineend++)
        if ((lineend == p || c_isspace (*(lineend - 1)))
            && *lineend == '#')
          break;

      /* Ignore trailing whitespace in the same way. */
      while (lineend > p && c_isspace (*(lineend - 1)))
        --lineend;

      /* P points at a non-space, non-'#' character here, so the
         trimmed line cannot be empty.  */
      assert (!EOL (p));

      /* The field name is a run of alphanumerics and '-'.  */
      field_b = p;
      while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
        ++p;
      field_e = p;

      /* A non-empty field name must be followed, after optional
         whitespace, by ':'.  */
      SKIP_SPACE (p);
      if (field_b == field_e || EOL (p) || *p != ':')
        {
          DEBUGP (("Ignoring malformed line %d\n", line_count));
          goto next;
        }
      ++p;                      /* skip ':' */
      SKIP_SPACE (p);

      /* The value is everything up to the logical end of line; it may
         be empty.  */
      value_b = p;
      while (!EOL (p))
        ++p;
      value_e = p;

      /* Finally, we have a syntactically valid line. */
      if (FIELD_IS ("user-agent"))
        {
          /* We have to support several cases:

             --previous records--

             User-Agent: foo
             User-Agent: Wget
             User-Agent: bar
             ... matching record ...

             User-Agent: baz
             User-Agent: qux
             ... non-matching record ...

             User-Agent: *
             ... matching record, but will be pruned later ...

             We have to respect `User-Agent' at the beginning of each
             new record simply because we don't know if we're going to
             encounter "Wget" among the agents or not.  Hence,
             match_user_agent is called when record_count != 0.

             But if record_count is 0, we have to keep calling it
             until it matches, and if that happens, we must not call
             it any more, until the next record.  Hence the other part
             of the condition.  */
          if (record_count != 0 || user_agent_applies == false)
            match_user_agent (value_b, value_e - value_b,
                              &user_agent_applies, &user_agent_exact);
          if (user_agent_exact)
            found_exact = true;
          record_count = 0;
        }
      else if (FIELD_IS ("allow"))
        {
          if (user_agent_applies)
            {
              add_path (specs, value_b, value_e, true, user_agent_exact);
            }
          /* Counted even when the rule is skipped, so that the next
             User-Agent line is treated as the start of a new
             record.  */
          ++record_count;
        }
      else if (FIELD_IS ("disallow"))
        {
          if (user_agent_applies)
            {
              bool allowed = false;
              if (value_b == value_e)
                /* Empty "disallow" line means everything is *allowed*!  */
                allowed = true;
              add_path (specs, value_b, value_e, allowed, user_agent_exact);
            }
          /* Counted even when the rule is skipped; see "allow".  */
          ++record_count;
        }
      else
        {
          DEBUGP (("Ignoring unknown field at line %d\n", line_count));
          goto next;
        }

    next:
      /* Continue with the following physical line, wherever P
         stopped within this one.  */
      p = lineend_real;
      ++line_count;
    }

  if (found_exact)
    {
      /* We've encountered an exactly matching user-agent.  Throw out
         all the stuff with user-agent: *.  */
      prune_non_exact (specs);
    }
  else if (specs->size > specs->count)
    {
      /* add_path normally over-allocates specs->paths.  Reallocate it
         to the correct size in order to conserve some memory.  */
      specs->paths = xrealloc (specs->paths,
                               specs->count * sizeof (struct path_info));
      specs->size = specs->count;
    }

  return specs;
}

/* Like res_parse, but take the input from the file FILENAME: read it
   with wget_read_file, parse its contents and release the buffer.
   If the file cannot be read, log a message and return NULL.  */

struct robot_specs *
res_parse_from_file (const char *filename)
{
  struct file_memory *fm = wget_read_file (filename);

  if (fm)
    {
      struct robot_specs *specs = res_parse (fm->content, fm->length);
      wget_read_file_free (fm);
      return specs;
    }

  logprintf (LOG_NOTQUIET, _("Cannot open %s: %s\n"),
             filename, strerror (errno));
  return NULL;
}

/* Release SPECS together with everything it owns: every rule's path
   string, the rule array, and the structure itself.  SPECS must not
   be NULL.  */
static void
free_specs (struct robot_specs *specs)
{
  int idx = 0;

  while (idx < specs->count)
    {
      xfree (specs->paths[idx].path);
      idx++;
    }
  xfree (specs->paths);
  xfree (specs);
}

/* Matching of a path according to the specs. */

/* Percent-decoding step used by the matcher.  When C is '%' and the
   two characters after *PTR are hexadecimal digits, compute the byte
   they encode.  Unless that byte is '/', store it in C and advance
   PTR by two so that it rests on the second hex digit.  In every
   other case C and PTR are left untouched.  C and PTR must be
   modifiable lvalues.  */

#define DECODE_MAYBE(c, ptr) do {                                         \
  if ((c) == '%' && c_isxdigit ((ptr)[1]) && c_isxdigit ((ptr)[2]))       \
    {                                                                     \
      unsigned char decoded_byte = X2DIGITS_TO_NUM ((ptr)[1], (ptr)[2]);  \
      if (decoded_byte != '/')                                            \
        {                                                                 \
          (c) = decoded_byte;                                             \
          (ptr) += 2;                                                     \
        }                                                                 \
    }                                                                     \
} while (0)

/* Core of the matcher: return true if RECORD_PATH is a prefix of
   URL_PATH, comparing character by character after DECODE_MAYBE has
   been applied to each side.  An empty RECORD_PATH matches every
   URL_PATH.  The matching rules come from
   <http://www.robotstxt.org/norobots-rfc.txt>, section 3.2.2.  */

static bool
matches (const char *record_path, const char *url_path)
{
  const char *rule = record_path;
  const char *url = url_path;

  /* Walk both strings until the rule is exhausted.  */
  while (*rule != '\0')
    {
      char rule_ch = *rule;
      char url_ch = *url;

      /* The URL ended before the rule did.  */
      if (url_ch == '\0')
        return false;

      DECODE_MAYBE (rule_ch, rule);
      DECODE_MAYBE (url_ch, url);
      if (rule_ch != url_ch)
        return false;

      ++rule;
      ++url;
    }

  return true;
}

/* Iterate through all paths in SPECS.  For the first one that
   matches, return its allow/reject status.  If none matches,
   retrieval is by default allowed.  */

bool
res_match_path (const struct robot_specs *specs, const char *path)
{
  int i;
  if (!specs)
    return true;
  for (i = 0; i < specs->count; i++)
    if (matches (specs->paths[i].path, path))
      {
        bool allowedp = specs->paths[i].allowedp;
        DEBUGP (("%s path %s because of rule %s.\n",
                 allowedp ? "Allowing" : "Rejecting",
                 path, quote (specs->paths[i].path)));
        return allowedp;
      }
  return true;
}

/* Registering the specs. */

/* Table mapping "HOST:PORT" key strings to their struct robot_specs.
   It stays NULL until res_register_specs creates it with
   make_nocase_string_hash_table on first use.  */
static struct hash_table *registered_specs;

/* Register RES specs that below to server on HOST:PORT.  They will
   later be retrievable using res_get_specs.  */

void
res_register_specs (const char *host, int port, struct robot_specs *specs)
{
  struct robot_specs *old;
  char buf[256], *hp, *hp_old;

  if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf))
    hp = aprintf("%s:%d", host, port);
  else
    hp = buf;

  if (!registered_specs)
    registered_specs = make_nocase_string_hash_table (0);

  if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
    {
      if (hp != buf)
        xfree (hp);
      if (old)
        free_specs (old);
      hash_table_put (registered_specs, hp_old, specs);
    }
  else
    {
      hash_table_put (registered_specs, hp == buf ? xstrdup (hp) : hp, specs);
    }
}

/* Get the specs that belong to HOST:PORT.  Returns the pointer stored
   by res_register_specs, or whatever hash_table_get yields for a key
   that is not present.  Returns NULL if nothing has been registered
   yet.  The returned object is not copied.  */

struct robot_specs *
res_get_specs (const char *host, int port)
{
  char buf[256], *hp;
  struct robot_specs *specs;

  if (!registered_specs)
    return NULL;

  /* Build the "host:port" lookup key on the stack when it fits, and
     on the heap otherwise.  */
  if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf))
    hp = aprintf("%s:%d", host, port);
  else
    hp = buf;

  specs = hash_table_get (registered_specs, hp);

  /* The key is only needed for the lookup; release the heap copy so
     that long host names do not leak memory on every call.  */
  if (hp != buf)
    xfree (hp);

  return specs;
}

/* Loading the robots file.  */

#define RES_SPECS_LOCATION "/robots.txt"

/* Download the robots.txt that sits at the root of the server serving
   URL.  The local file is named according to the currently active
   rules, and its name is stored in *FILE.

   opt.timestamping and opt.spider are switched off for the duration
   of the download and restored afterwards.

   Return true if the file was retrieved OK.  Otherwise return false;
   in that case any file name stored in *FILE is released here, so
   the caller has nothing to clean up.  */

bool
res_retrieve_file (const char *url, char **file, struct iri *iri)
{
  struct iri *robots_iri = iri_new ();
  char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
  const int prev_timestamping = opt.timestamping;
  const int prev_spider = opt.spider;
  uerr_t status = URLERROR;
  struct url *parsed;
  int parse_err;
  bool ok;

  /* Copy server URI encoding for a possible IDNA transformation, no need to
     encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
  set_uri_encoding (robots_iri, iri->uri_encoding, false);
  robots_iri->utf8_encode = false;

  logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
  *file = NULL;
  opt.timestamping = false;
  opt.spider       = false;

  parsed = url_parse (robots_url, &parse_err, robots_iri, true);
  if (parsed)
    {
      status = retrieve_url (parsed, robots_url, file, NULL, NULL, NULL,
                             false, robots_iri, false);
      url_free(parsed);
    }
  else
    {
      /* STATUS keeps its initial URLERROR value.  */
      logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, url_error (parse_err));
    }

  opt.timestamping = prev_timestamping;
  opt.spider       = prev_spider;
  xfree (robots_url);
  iri_free (robots_iri);

  ok = (status == RETROK);
  if (!ok && *file != NULL)
    {
      /* The download failed but retrieve_url still allocated a file
         name; free it on the caller's behalf.  */
      xfree (*file);
    }
  return ok;
}

/* Return true if URL compares equal (per are_urls_equal) to the
   robots.txt location obtained by merging RES_SPECS_LOCATION into
   URL.  */
bool
is_robots_txt_url (const char *url)
{
  char *expected = uri_merge (url, RES_SPECS_LOCATION);
  bool same = are_urls_equal (url, expected);

  xfree (expected);
  return same;
}

#if defined DEBUG_MALLOC || defined TESTING
/* Free every registered key and specs object, destroy the table and
   reset registered_specs to NULL.  Does nothing if no specs were ever
   registered.  */
void
res_cleanup (void)
{
  hash_table_iterator it;

  if (!registered_specs)
    return;

  hash_table_iterate (registered_specs, &it);
  while (hash_table_iter_next (&it))
    {
      xfree (it.key);
      free_specs (it.value);
    }

  hash_table_destroy (registered_specs);
  registered_specs = NULL;
}
#endif

#ifdef TESTING

/* Unit test for is_robots_txt_url: check a small table of URLs
   against the expected verdicts.  Returns NULL when every case
   passes; mu_assert handles a failing case.  */
const char *
test_is_robots_txt_url(void)
{
  static const struct {
    const char *url;
    bool expected_result;
  } cases[] = {
    { "http://www.yoyodyne.com/robots.txt", true },
    { "http://www.yoyodyne.com/somepath/", false },
    { "http://www.yoyodyne.com/somepath/robots.txt", false },
  };
  unsigned idx = 0;

  while (idx < countof(cases))
    {
      mu_assert ("test_is_robots_txt_url: wrong result",
                 is_robots_txt_url (cases[idx].url) == cases[idx].expected_result);
      ++idx;
    }

  return NULL;
}

#endif /* TESTING */

/*
 * vim: et ts=2 sw=2
 */

Coverage Report

Created: 2023-11-19 07:52

Line	Count	Source (jump to first uncovered line)
1		/* Support for Robot Exclusion Standard (RES).
2		Copyright (C) 2001, 2006-2011, 2015, 2018-2023 Free Software
3		Foundation, Inc.
4
5		This file is part of Wget.
6
7		This program is free software; you can redistribute it and/or modify
8		it under the terms of the GNU General Public License as published by
9		the Free Software Foundation; either version 3 of the License, or (at
10		your option) any later version.
11
12		This program is distributed in the hope that it will be useful, but
13		WITHOUT ANY WARRANTY; without even the implied warranty of
14		MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15		General Public License for more details.
16
17		You should have received a copy of the GNU General Public License
18		along with Wget. If not, see <http://www.gnu.org/licenses/>.
19
20		Additional permission under GNU GPL version 3 section 7
21
22		If you modify this program, or any covered work, by linking or
23		combining it with the OpenSSL project's OpenSSL library (or a
24		modified version of that library), containing parts covered by the
25		terms of the OpenSSL or SSLeay licenses, the Free Software Foundation
26		grants you additional permission to convey the resulting work.
27		Corresponding Source for a non-source form of such a combination
28		shall include the source code for the parts of OpenSSL used as well
29		as that of the covered work. */
30
31		/* This file implements the Robot Exclusion Standard (RES).
32
33		RES is a simple protocol that enables site admins to signalize to
34		the web crawlers that certain parts of the site should not be
35		accessed. All the admin needs to do is create a "robots.txt" file
36		in the web server root, and use simple commands to allow or
37		disallow access to certain parts of the site.
38
39		The first specification was written by Martijn Koster in 1994, and
40		is still available at <http://www.robotstxt.org/orig.html>.
41		In 1996, Martijn wrote an Internet Draft specifying an improved RES
42		specification; however, that work was apparently abandoned since
43		the draft has expired in 1997 and hasn't been replaced since. The
44		draft is available at
45		<http://www.robotstxt.org/norobots-rfc.txt>.
46
47		This file implements RES as specified by the draft. Note that this
48		only handles the "robots.txt" support. The META tag that controls
49		whether the links should be followed is handled in `html-url.c'.
50
51		Known deviations:
52
53		* The end-of-line comment recognition is more in the spirit of the
54		Bourne Shell (as specified by RES-1994). That means that
55		"foo#bar" is taken literally, whereas "foo #bar" is interpreted
56		as "foo". The Draft apparently specifies that both should be
57		interpreted as "foo".
58
59		* We don't recognize sole CR as the line ending.
60
61		* We don't implement expiry mechanism for /robots.txt specs. I
62		consider it non-necessary for a relatively short-lived
63		application such as Wget. Besides, it is highly questionable
64		whether anyone deploys the recommended expiry scheme for
65		robots.txt.
66
67		Entry points are functions res_parse, res_parse_from_file,
68		res_match_path, res_register_specs, res_get_specs, and
69		res_retrieve_file. */
70
71		#include "wget.h"
72
73		#include <stdio.h>
74		#include <stdlib.h>
75		#include <string.h>
76		#include <errno.h>
77		#include <assert.h>
78
79		#include "utils.h"
80		#include "hash.h"
81		#include "url.h"
82		#include "retr.h"
83		#include "res.h"
84		#include "c-strcase.h"
85
86		#ifdef TESTING
87		#include "../tests/unit-tests.h"
88		#endif
89
90		struct path_info {
91		char *path;
92		bool allowedp;
93		bool user_agent_exact_p;
94		};
95
96		struct robot_specs {
97		int count;
98		int size;
99		struct path_info *paths;
100		};
101
102		/* Parsing the robot spec. */
103
104		/* Check whether AGENT (a string of length LENGTH) equals "wget" or
105		"*". If it is either of them, *matches is set to one. If it is
106		"wget", *exact_match is set to one. */
107
108		static void
109		match_user_agent (const char *agent, int length,
110		bool *matches, bool *exact_match)
111	1.63k	{
112	1.63k	if (length == 1 && *agent == '*')
113	517	{
114	517	*matches = true;
115	517	*exact_match = false;
116	517	}
117	1.11k	else if (BOUNDED_EQUAL_NO_CASE (agent, agent + length, "wget"))
118	243	{
119	243	*matches = true;
120	243	*exact_match = true;
121	243	}
122	870	else
123	870	{
124	870	*matches = false;
125	870	*exact_match = false;
126	870	}
127	1.63k	}
128
129		/* Add a path specification between PATH_B and PATH_E as one of the
130		paths in SPECS. */
131
132		static void
133		add_path (struct robot_specs *specs, const char *path_b, const char *path_e,
134		bool allowedp, bool exactp)
135	11.7k	{
136	11.7k	struct path_info pp;
137	11.7k	if (path_b < path_e && *path_b == '/')
138		/* Our path representation doesn't use a leading slash, so remove
139		one from theirs. */
140	393	++path_b;
141	11.7k	pp.path = strdupdelim (path_b, path_e);
142	11.7k	pp.allowedp = allowedp;
143	11.7k	pp.user_agent_exact_p = exactp;
144	11.7k	++specs->count;
145	11.7k	if (specs->count > specs->size)
146	1.28k	{
147	1.28k	if (specs->size == 0)
148	353	specs->size = 1;
149	928	else
150	928	specs->size <<= 1;
151	1.28k	specs->paths = xrealloc (specs->paths,
152	1.28k	specs->size * sizeof (struct path_info));
153	1.28k	}
154	11.7k	specs->paths[specs->count - 1] = pp;
155	11.7k	}
156
157		/* Recreate SPECS->paths with only those paths that have
158		user_agent_exact_p set to true. */
159
160		static void
161		prune_non_exact (struct robot_specs *specs)
162	57	{
163	57	struct path_info *newpaths;
164	57	int i, j, cnt;
165	57	cnt = 0;
166	3.17k	for (i = 0; i < specs->count; i++)
167	3.11k	if (specs->paths[i].user_agent_exact_p)
168	2.91k	++cnt;
169	57	newpaths = xnew_array (struct path_info, cnt);
170	3.17k	for (i = 0, j = 0; i < specs->count; i++)
171	3.11k	if (specs->paths[i].user_agent_exact_p)
172	2.91k	newpaths[j++] = specs->paths[i];
173	194	else
174	194	xfree (specs->paths[i].path);
175	57	assert (j == cnt);
176	57	xfree (specs->paths);
177	57	specs->paths = newpaths;
178	57	specs->count = cnt;
179	57	specs->size = cnt;
180	57	}
181
182	581k	#define EOL(p) ((p) >= lineend)
183
184	56.5k	#define SKIP_SPACE(p) do { \
185	60.1k	while (!EOL (p) && c_isspace (*p)) \
186	56.5k	++p; \
187	56.5k	} while (0)
188
189		#define FIELD_IS(string_literal) \
190	34.3k	BOUNDED_EQUAL_NO_CASE (field_b, field_e, string_literal)
191
192		/* Parse textual RES specs beginning with SOURCE of length LENGTH.
193		Return a specs objects ready to be fed to res_match_path.
194
195		The parsing itself is trivial, but creating a correct SPECS object
196		is trickier than it seems, because RES is surprisingly byzantine if
197		you attempt to implement it correctly.
198
199		A "record" is a block of one or more `User-Agent' lines followed by
200		one or more `Allow' or `Disallow' lines. Record is accepted by
201		Wget if one of the `User-Agent' lines was "wget", or if the user
202		agent line was "*".
203
204		After all the lines have been read, we examine whether an exact
205		("wget") user-agent field was specified. If so, we delete all the
206		lines read under "User-Agent: *" blocks because we have our own
207		Wget-specific blocks. This enables the admin to say:
208
209		User-Agent: *
210		Disallow: /
211
212		User-Agent: google
213		User-Agent: wget
214		Disallow: /cgi-bin
215
216		This means that to Wget and to Google, /cgi-bin is disallowed,
217		whereas for all other crawlers, everything is disallowed.
218		res_parse is implemented so that the order of records doesn't
219		matter. In the case above, the "User-Agent: *" could have come
220		after the other one. */
221
222		struct robot_specs *
223		res_parse (const char *source, int length)
224	1.27k	{
225	1.27k	int line_count = 1;
226
227	1.27k	const char *p = source;
228	1.27k	const char *end = source + length;
229
230		/* true if last applicable user-agent field matches Wget. */
231	1.27k	bool user_agent_applies = false;
232
233		/* true if last applicable user-agent field exactly matches
234		Wget. */
235	1.27k	bool user_agent_exact = false;
236
237		/* whether we ever encountered exact user agent. */
238	1.27k	bool found_exact = false;
239
240		/* count of allow/disallow lines in the current "record", i.e. after
241		the last `user-agent' instructions. */
242	1.27k	int record_count = 0;
243
244	1.27k	struct robot_specs *specs = xnew0 (struct robot_specs);
245
246	21.9k	while (1)
247	21.9k	{
248	21.9k	const char *lineend, *lineend_real;
249	21.9k	const char *field_b, *field_e;
250	21.9k	const char *value_b, *value_e;
251
252	21.9k	if (p == end)
253	1.27k	break;
254	20.6k	lineend_real = memchr (p, '\n', end - p);
255	20.6k	if (lineend_real)
256	19.3k	++lineend_real;
257	1.26k	else
258	1.26k	lineend_real = end;
259	20.6k	lineend = lineend_real;
260
261		/* Before doing anything else, check whether the line is empty
262		or comment-only. */
263	20.6k	SKIP_SPACE (p);
264	20.6k	if (EOL (p) || *p == '#')
265	1.71k	goto next;
266
267		/* Make sure the end-of-line comments are respected by setting
268		lineend to a location preceding the first comment. Real line
269		ending remains in lineend_real. */
270	249k	for (lineend = p; lineend < lineend_real; lineend++)
271	230k	if ((lineend == p || c_isspace (*(lineend - 1)))
272	230k	&& *lineend == '#')
273	197	break;
274
275		/* Ignore trailing whitespace in the same way. */
276	37.4k	while (lineend > p && c_isspace (*(lineend - 1)))
277	18.4k	--lineend;
278
279	18.9k	assert (!EOL (p));
280
281	18.9k	field_b = p;
282	161k	while (!EOL (p) && (c_isalnum (*p) || *p == '-'))
283	142k	++p;
284	18.9k	field_e = p;
285
286	18.9k	SKIP_SPACE (p);
287	18.9k	if (field_b == field_e || EOL (p) || *p != ':')
288	1.91k	{
289	1.91k	DEBUGP (("Ignoring malformed line %d\n", line_count));
290	1.91k	goto next;
291	1.91k	}
292	17.0k	++p; /* skip ':' */
293	17.0k	SKIP_SPACE (p);
294
295	17.0k	value_b = p;
296	60.3k	while (!EOL (p))
297	43.3k	++p;
298	17.0k	value_e = p;
299
300		/* Finally, we have a syntactically valid line. */
301	17.0k	if (FIELD_IS ("user-agent"))
302	2.02k	{
303		/* We have to support several cases:
304
305		--previous records--
306
307		User-Agent: foo
308		User-Agent: Wget
309		User-Agent: bar
310		... matching record ...
311
312		User-Agent: baz
313		User-Agent: qux
314		... non-matching record ...
315
316		User-Agent: *
317		... matching record, but will be pruned later ...
318
319		We have to respect `User-Agent' at the beginning of each
320		new record simply because we don't know if we're going to
321		encounter "Wget" among the agents or not. Hence,
322		match_user_agent is called when record_count != 0.
323
324		But if record_count is 0, we have to keep calling it
325		until it matches, and if that happens, we must not call
326		it any more, until the next record. Hence the other part
327		of the condition. */
328	2.02k	if (record_count != 0 || user_agent_applies == false)
329	1.63k	match_user_agent (value_b, value_e - value_b,
330	1.63k	&user_agent_applies, &user_agent_exact);
331	2.02k	if (user_agent_exact)
332	443	found_exact = true;
333	2.02k	record_count = 0;
334	2.02k	}
335	14.9k	else if (FIELD_IS ("allow"))
336	12.6k	{
337	12.6k	if (user_agent_applies)
338	10.9k	{
339	10.9k	add_path (specs, value_b, value_e, true, user_agent_exact);
340	10.9k	}
341	12.6k	++record_count;
342	12.6k	}
343	2.37k	else if (FIELD_IS ("disallow"))
344	1.01k	{
345	1.01k	if (user_agent_applies)
346	822	{
347	822	bool allowed = false;
348	822	if (value_b == value_e)
349		/* Empty "disallow" line means everything is allowed! */
350	415	allowed = true;
351	822	add_path (specs, value_b, value_e, allowed, user_agent_exact);
352	822	}
353	1.01k	++record_count;
354	1.01k	}
355	1.35k	else
356	1.35k	{
357	1.35k	DEBUGP (("Ignoring unknown field at line %d\n", line_count));
358	1.35k	goto next;
359	1.35k	}
360
361	20.6k	next:
362	20.6k	p = lineend_real;
363	20.6k	++line_count;
364	20.6k	}
365
366	1.27k	if (found_exact)
367	57	{
368		/* We've encountered an exactly matching user-agent. Throw out
369		all the stuff with user-agent: *. */
370	57	prune_non_exact (specs);
371	57	}
372	1.22k	else if (specs->size > specs->count)
373	67	{
374		/* add_path normally over-allocates specs->paths. Reallocate it
375		to the correct size in order to conserve some memory. */
376	67	specs->paths = xrealloc (specs->paths,
377	67	specs->count * sizeof (struct path_info));
378	67	specs->size = specs->count;
379	67	}
380
381	1.27k	return specs;
382	1.27k	}
383
384		/* The same like res_parse, but first map the FILENAME into memory,
385		and then parse it. */
386
387		struct robot_specs *
388		res_parse_from_file (const char *filename)
389	0	{
390	0	struct robot_specs *specs;
391	0	struct file_memory *fm = wget_read_file (filename);
392	0	if (!fm)
393	0	{
394	0	logprintf (LOG_NOTQUIET, _("Cannot open %s: %s\n"),
395	0	filename, strerror (errno));
396	0	return NULL;
397	0	}
398	0	specs = res_parse (fm->content, fm->length);
399	0	wget_read_file_free (fm);
400	0	return specs;
401	0	}
402
403		static void
404		free_specs (struct robot_specs *specs)
405	1.27k	{
406	1.27k	int i;
407	12.8k	for (i = 0; i < specs->count; i++)
408	11.5k	xfree (specs->paths[i].path);
409	1.27k	xfree (specs->paths);
410	1.27k	xfree (specs);
411	1.27k	}
412
413		/* Matching of a path according to the specs. */
414
/* If C is '%' and PTR[1] and PTR[2] are both hexadecimal digits, and
   the value they form is not '/', store the decoded value in C and
   advance PTR by two, leaving it on the last character of the escape.
   An escaped slash is deliberately left alone, so it stays distinct
   from a literal '/'.

   C and PTR must be modifiable lvalues.  Both are evaluated more than
   once, so do not pass expressions with side effects.  */

#define DECODE_MAYBE(c, ptr) do {                                       \
  if ((c) == '%' && c_isxdigit ((ptr)[1]) && c_isxdigit ((ptr)[2]))     \
    {                                                                   \
      /* Prefixed name: must not shadow a variable of the caller.  */   \
      unsigned char dm_decoded = X2DIGITS_TO_NUM ((ptr)[1], (ptr)[2]);  \
      if (dm_decoded != '/')                                            \
        {                                                               \
          (c) = dm_decoded;                                             \
          (ptr) += 2;                                                   \
        }                                                               \
    }                                                                   \
} while (0)
430
/* The inner matching engine: return true if RECORD_PATH, taken as a
   prefix, matches URL_PATH.  Characters are compared one by one after
   DECODE_MAYBE has been applied to each side.  The rules for matching
   are described at <http://www.robotstxt.org/norobots-rfc.txt>,
   section 3.2.2.  */

static bool
matches (const char *record_path, const char *url_path)
{
  const char *rec = record_path;
  const char *url = url_path;

  /* Running off the end of the record means the whole rule matched.  */
  while (*rec)
    {
      char rec_ch = *rec;
      char url_ch = *url;

      /* The URL ended before the rule did.  */
      if (!url_ch)
        return false;

      DECODE_MAYBE (rec_ch, rec);
      DECODE_MAYBE (url_ch, url);
      if (rec_ch != url_ch)
        return false;

      ++rec;
      ++url;
    }

  return true;
}
455
456		/* Iterate through all paths in SPECS. For the first one that
457		matches, return its allow/reject status. If none matches,
458		retrieval is by default allowed. */
459
460		bool
461		res_match_path (const struct robot_specs specs, const char path)
462	1.27k	{
463	1.27k	int i;
464	1.27k	if (!specs)
465	0	return true;
466	4.22k	for (i = 0; i < specs->count; i++)
467	3.09k	if (matches (specs->paths[i].path, path))
468	144	{
469	144	bool allowedp = specs->paths[i].allowedp;
470	144	DEBUGP (("%s path %s because of rule %s.\n",
471	144	allowedp ? "Allowing" : "Rejecting",
472	144	path, quote (specs->paths[i].path)));
473	144	return allowedp;
474	144	}
475	1.13k	return true;
476	1.27k	}
477
478		/* Registering the specs. */
479
/* Table of registered robot specs, keyed by "HOST:PORT" strings.  It
   stays NULL until res_register_specs creates it (as a
   case-insensitive string table) on first use.  */
static struct hash_table *registered_specs;
481
482		/* Register RES specs that below to server on HOST:PORT. They will
483		later be retrievable using res_get_specs. */
484
485		void
486		res_register_specs (const char host, int port, struct robot_specs specs)
487	1.27k	{
488	1.27k	struct robot_specs *old;
489	1.27k	char buf[256], hp, hp_old;
490
491	1.27k	if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf))
492	0	hp = aprintf("%s:%d", host, port);
493	1.27k	else
494	1.27k	hp = buf;
495
496	1.27k	if (!registered_specs)
497	1.27k	registered_specs = make_nocase_string_hash_table (0);
498
499	1.27k	if (hash_table_get_pair (registered_specs, hp, &hp_old, &old))
500	0	{
501	0	if (hp != buf)
502	0	xfree (hp);
503	0	if (old)
504	0	free_specs (old);
505	0	hash_table_put (registered_specs, hp_old, specs);
506	0	}
507	1.27k	else
508	1.27k	{
509	1.27k	hash_table_put (registered_specs, hp == buf ? xstrdup (hp) : hp, specs);
510	1.27k	}
511	1.27k	}
512
513		/* Get the specs that belong to HOST:PORT. */
514
515		struct robot_specs *
516		res_get_specs (const char *host, int port)
517	0	{
518	0	char buf[256], *hp;
519
520	0	if (!registered_specs)
521	0	return NULL;
522
523	0	if (((unsigned) snprintf (buf, sizeof (buf), "%s:%d", host, port)) >= sizeof (buf))
524	0	hp = aprintf("%s:%d", host, port);
525	0	else
526	0	hp = buf;
527
528	0	return hash_table_get (registered_specs, hp);
529	0	}
530
531		/* Loading the robots file. */
532
/* Location of the robots file; merged onto a URL with uri_merge to
   obtain the robots.txt URL for that URL's server.  */
#define RES_SPECS_LOCATION "/robots.txt"
534
535		/* Retrieve the robots.txt from the server root of the server that
536		serves URL. The file will be named according to the currently
537		active rules, and the file name will be returned in *file.
538
539		Return true if robots were retrieved OK, false otherwise. */
540
541		bool
542		res_retrieve_file (const char url, char file, struct iri iri)
543	0	{
544	0	struct iri *i = iri_new ();
545	0	uerr_t err;
546	0	char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
547	0	int saved_ts_val = opt.timestamping;
548	0	int saved_sp_val = opt.spider, url_err;
549	0	struct url * url_parsed;
550
551		/* Copy server URI encoding for a possible IDNA transformation, no need to
552		encode the full URI in UTF-8 because "robots.txt" is plain ASCII */
553	0	set_uri_encoding (i, iri->uri_encoding, false);
554	0	i->utf8_encode = false;
555
556	0	logputs (LOG_VERBOSE, _("Loading robots.txt; please ignore errors.\n"));
557	0	*file = NULL;
558	0	opt.timestamping = false;
559	0	opt.spider = false;
560
561	0	url_parsed = url_parse (robots_url, &url_err, i, true);
562	0	if (!url_parsed)
563	0	{
564	0	logprintf (LOG_NOTQUIET, "%s: %s.\n", robots_url, url_error (url_err));
565	0	err = URLERROR;
566	0	}
567	0	else
568	0	{
569	0	err = retrieve_url (url_parsed, robots_url, file, NULL, NULL, NULL,
570	0	false, i, false);
571	0	url_free(url_parsed);
572	0	}
573
574	0	opt.timestamping = saved_ts_val;
575	0	opt.spider = saved_sp_val;
576	0	xfree (robots_url);
577	0	iri_free (i);
578
579	0	if (err != RETROK && *file != NULL)
580	0	{
581		/* If the file is not retrieved correctly, but retrieve_url
582		allocated the file name, deallocate is here so that the
583		caller doesn't have to worry about it. */
584	0	xfree (*file);
585	0	}
586	0	return err == RETROK;
587	0	}
588
589		bool
590		is_robots_txt_url (const char *url)
591	0	{
592	0	char *robots_url = uri_merge (url, RES_SPECS_LOCATION);
593	0	bool ret = are_urls_equal (url, robots_url);
594
595	0	xfree (robots_url);
596
597	0	return ret;
598	0	}
599
#if defined DEBUG_MALLOC || defined TESTING
/* Free everything held by the specs registry: each key string, each
   registered specs object, and the table itself.  Afterwards the
   registry is back in its initial, empty state.  Does nothing if no
   specs were ever registered.  */
void
res_cleanup (void)
{
  hash_table_iterator iter;

  if (!registered_specs)
    return;

  for (hash_table_iterate (registered_specs, &iter);
       hash_table_iter_next (&iter);
       )
    {
      struct robot_specs *specs = iter.value;

      xfree (iter.key);
      /* res_register_specs and res_match_path both treat NULL specs
         as possible, and free_specs dereferences its argument, so
         skip entries that hold no specs.  */
      if (specs)
        free_specs (specs);
    }

  hash_table_destroy (registered_specs);
  registered_specs = NULL;
}
#endif
619
#ifdef TESTING

/* Unit test for is_robots_txt_url: check a handful of URLs against
   the expected answer.  Returns NULL on success; on a mismatch
   mu_assert is invoked with the failure message.  */

const char *
test_is_robots_txt_url(void)
{
  static const struct {
    const char *url;
    bool expected_result;
  } cases[] = {
    { "http://www.yoyodyne.com/robots.txt", true },
    { "http://www.yoyodyne.com/somepath/", false },
    { "http://www.yoyodyne.com/somepath/robots.txt", false },
  };
  unsigned n;

  for (n = 0; n < countof(cases); ++n)
    {
      bool got = is_robots_txt_url (cases[n].url);

      mu_assert ("test_is_robots_txt_url: wrong result",
                 got == cases[n].expected_result);
    }

  return NULL;
}

#endif /* TESTING */
645
646		/*
647		* vim: et ts=2 sw=2
648		*/