/src/wget2/libwget/rss_url.c

Source
/*
 * Copyright (c) 2013 Tim Ruehsen
 * Copyright (c) 2015-2024 Free Software Foundation, Inc.
 *
 * This file is part of libwget.
 *
 * Libwget is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * Libwget is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public License
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * Extracting URLs from RSS feeds (https://cyber.harvard.edu/rss/rss.html)
 *
 * Changelog
 * 21.12.2013  Tim Ruehsen  created
 *
 */

#include <config.h>

#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <c-ctype.h>

#include <wget.h>
#include "private.h"

/* State shared between wget_rss_get_urls_inline() and the XML parser callback. */
struct rss_context {
  wget_vector *urls; // collected wget_string entries; stays NULL until the first one is added
};

static void rss_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos WGET_GCC_UNUSED)
{
  struct rss_context *ctx = context;
  wget_string * url;

  if (!val || !len)
    return;

  if ((flags & XML_FLG_ATTRIBUTE)) {
    if (!wget_strcasecmp_ascii(attr, "url") || !wget_strcasecmp_ascii(attr, "href")
      || !wget_strcasecmp_ascii(attr, "src") || !wget_strcasecmp_ascii(attr, "domain")
      || !wget_strcasecmp_ascii(attr, "xmlns") || !wget_strncasecmp_ascii(attr, "xmlns:", 6))
    {
      for (;len && c_isspace(*val); val++, len--); // skip leading spaces
      for (;len && c_isspace(val[len - 1]); len--);  // skip trailing spaces

      if (!(url = wget_malloc(sizeof(wget_string))))
        return;

      url->p = val;
      url->len = len;

      if (!ctx->urls)
        ctx->urls = wget_vector_create(32, NULL);

      wget_vector_add(ctx->urls, url);
    }
  }
  else if ((flags & XML_FLG_CONTENT)) {
    const char *elem = strrchr(dir, '/');

    if (elem) {
      elem++;

      if (!wget_strcasecmp_ascii(elem, "guid") || !wget_strcasecmp_ascii(elem, "link")
         || !wget_strcasecmp_ascii(elem, "comments") || !wget_strcasecmp_ascii(elem, "docs"))
      {
        for (;len && c_isspace(*val); val++, len--); // skip leading spaces
        for (;len && c_isspace(val[len - 1]); len--);  // skip trailing spaces

        // debug_printf("#2 %02X %s %s '%.*s' %zd\n", flags, dir, attr, (int) len, val, len);

        if (!(url = wget_malloc(sizeof(wget_string))))
          return;

        url->p = val;
        url->len = len;

        if (!ctx->urls)
          ctx->urls = wget_vector_create(32, NULL);

        wget_vector_add(ctx->urls, url);
      }
    }
  }
}

/**
 * \param[in] rss RSS document to scan, passed as-is to wget_xml_parse_buffer()
 * \param[out] urls Receives the vector of wget_string entries found in \p rss,
 *   or NULL if no entry was collected
 *
 * Runs the XML parser over \p rss and gathers the URL-like attribute values
 * and element contents recognized by rss_get_url().
 *
 * The entries store the pointers handed out by the parser without copying
 * the text - presumably these point into \p rss, so \p rss should outlive
 * the returned vector (NOTE(review): confirm against the XML parser).
 */
void wget_rss_get_urls_inline(const char *rss, wget_vector **urls)
{
  struct rss_context ctx;

  ctx.urls = NULL;

  wget_xml_parse_buffer(rss, rss_get_url, &ctx, XML_HINT_REMOVE_EMPTY_CONTENT);

  *urls = ctx.urls;
}

Line	Count	Source
1		/*
2		* Copyright (c) 2013 Tim Ruehsen
3		* Copyright (c) 2015-2024 Free Software Foundation, Inc.
4		*
5		* This file is part of libwget.
6		*
7		* Libwget is free software: you can redistribute it and/or modify
8		* it under the terms of the GNU Lesser General Public License as published by
9		* the Free Software Foundation, either version 3 of the License, or
10		* (at your option) any later version.
11		*
12		* Libwget is distributed in the hope that it will be useful,
13		* but WITHOUT ANY WARRANTY; without even the implied warranty of
14		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15		* GNU Lesser General Public License for more details.
16		*
17		* You should have received a copy of the GNU Lesser General Public License
18		* along with libwget. If not, see <https://www.gnu.org/licenses/>.
19		*
20		*
21		* Extracting URLs from RSS feeds (https://cyber.harvard.edu/rss/rss.html)
22		*
23		* Changelog
24		* 21.12.2013 Tim Ruehsen created
25		*
26		*/
27
28		#include <config.h>
29
30		#include <unistd.h>
31		#include <stdlib.h>
32		#include <string.h>
33		#include <c-ctype.h>
34
35		#include <wget.h>
36		#include "private.h"
37
38		struct rss_context {
39		wget_vector
40		*urls;
41		};
42
43		static void rss_get_url(void context, int flags, const char dir, const char attr, const char val, size_t len, size_t pos WGET_GCC_UNUSED)
44	98.0k	{
45	98.0k	struct rss_context *ctx = context;
46	98.0k	wget_string * url;
47
48	98.0k	if (!val \|\| !len)
49	78.4k	return;
50
51	19.6k	if ((flags & XML_FLG_ATTRIBUTE)) {
52	3.96k	if (!wget_strcasecmp_ascii(attr, "url") \|\| !wget_strcasecmp_ascii(attr, "href")
53	3.50k	\|\| !wget_strcasecmp_ascii(attr, "src") \|\| !wget_strcasecmp_ascii(attr, "domain")
54	2.10k	\|\| !wget_strcasecmp_ascii(attr, "xmlns") \|\| !wget_strncasecmp_ascii(attr, "xmlns:", 6))
55	2.43k	{
56	2.82k	for (;len && c_isspace(*val); val++, len--); // skip leading spaces
57	2.81k	for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces
58
59	2.43k	if (!(url = wget_malloc(sizeof(wget_string))))
60	0	return;
61
62	2.43k	url->p = val;
63	2.43k	url->len = len;
64
65	2.43k	if (!ctx->urls)
66	122	ctx->urls = wget_vector_create(32, NULL);
67
68	2.43k	wget_vector_add(ctx->urls, url);
69	2.43k	}
70	3.96k	}
71	15.6k	else if ((flags & XML_FLG_CONTENT)) {
72	15.1k	const char *elem = strrchr(dir, '/');
73
74	15.1k	if (elem) {
75	15.1k	elem++;
76
77	15.1k	if (!wget_strcasecmp_ascii(elem, "guid") \|\| !wget_strcasecmp_ascii(elem, "link")
78	14.6k	\|\| !wget_strcasecmp_ascii(elem, "comments") \|\| !wget_strcasecmp_ascii(elem, "docs"))
79	1.17k	{
80	2.06k	for (;len && c_isspace(*val); val++, len--); // skip leading spaces
81	1.93k	for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces
82
83		// debug_printf("#2 %02X %s %s '%.*s' %zd\n", flags, dir, attr, (int) len, val, len);
84
85	1.17k	if (!(url = wget_malloc(sizeof(wget_string))))
86	0	return;
87
88	1.17k	url->p = val;
89	1.17k	url->len = len;
90
91	1.17k	if (!ctx->urls)
92	86	ctx->urls = wget_vector_create(32, NULL);
93
94	1.17k	wget_vector_add(ctx->urls, url);
95	1.17k	}
96	15.1k	}
97	15.1k	}
98	19.6k	}
99
100		void wget_rss_get_urls_inline(const char rss, wget_vector *urls)
101	1.38k	{
102	1.38k	struct rss_context context = { .urls = NULL };
103
104	1.38k	wget_xml_parse_buffer(rss, rss_get_url, &context, XML_HINT_REMOVE_EMPTY_CONTENT);
105
106	1.38k	*urls = context.urls;
107	1.38k	}

Coverage Report

Created: 2026-01-31 06:23