/src/wget2/libwget/rss_url.c
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) 2013 Tim Ruehsen |
3 | | * Copyright (c) 2015-2024 Free Software Foundation, Inc. |
4 | | * |
5 | | * This file is part of libwget. |
6 | | * |
7 | | * Libwget is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as published by |
9 | | * the Free Software Foundation, either version 3 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * Libwget is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libwget. If not, see <https://www.gnu.org/licenses/>. |
19 | | * |
20 | | * |
21 | | * Extracting URLs from RSS feeds (https://cyber.harvard.edu/rss/rss.html) |
22 | | * |
23 | | * Changelog |
24 | | * 21.12.2013 Tim Ruehsen created |
25 | | * |
26 | | */ |
27 | | |
28 | | #include <config.h> |
29 | | |
30 | | #include <unistd.h> |
31 | | #include <stdlib.h> |
32 | | #include <string.h> |
33 | | #include <c-ctype.h> |
34 | | |
35 | | #include <wget.h> |
36 | | #include "private.h" |
37 | | |
38 | | struct rss_context { |
39 | | wget_vector |
40 | | *urls; |
41 | | }; |
42 | | |
43 | | static void rss_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos WGET_GCC_UNUSED) |
44 | 98.0k | { |
45 | 98.0k | struct rss_context *ctx = context; |
46 | 98.0k | wget_string * url; |
47 | | |
48 | 98.0k | if (!val || !len) |
49 | 78.4k | return; |
50 | | |
51 | 19.6k | if ((flags & XML_FLG_ATTRIBUTE)) { |
52 | 3.96k | if (!wget_strcasecmp_ascii(attr, "url") || !wget_strcasecmp_ascii(attr, "href") |
53 | 3.50k | || !wget_strcasecmp_ascii(attr, "src") || !wget_strcasecmp_ascii(attr, "domain") |
54 | 2.10k | || !wget_strcasecmp_ascii(attr, "xmlns") || !wget_strncasecmp_ascii(attr, "xmlns:", 6)) |
55 | 2.43k | { |
56 | 2.82k | for (;len && c_isspace(*val); val++, len--); // skip leading spaces |
57 | 2.81k | for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces |
58 | | |
59 | 2.43k | if (!(url = wget_malloc(sizeof(wget_string)))) |
60 | 0 | return; |
61 | | |
62 | 2.43k | url->p = val; |
63 | 2.43k | url->len = len; |
64 | | |
65 | 2.43k | if (!ctx->urls) |
66 | 122 | ctx->urls = wget_vector_create(32, NULL); |
67 | | |
68 | 2.43k | wget_vector_add(ctx->urls, url); |
69 | 2.43k | } |
70 | 3.96k | } |
71 | 15.6k | else if ((flags & XML_FLG_CONTENT)) { |
72 | 15.1k | const char *elem = strrchr(dir, '/'); |
73 | | |
74 | 15.1k | if (elem) { |
75 | 15.1k | elem++; |
76 | | |
77 | 15.1k | if (!wget_strcasecmp_ascii(elem, "guid") || !wget_strcasecmp_ascii(elem, "link") |
78 | 14.6k | || !wget_strcasecmp_ascii(elem, "comments") || !wget_strcasecmp_ascii(elem, "docs")) |
79 | 1.17k | { |
80 | 2.06k | for (;len && c_isspace(*val); val++, len--); // skip leading spaces |
81 | 1.93k | for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces |
82 | | |
83 | | // debug_printf("#2 %02X %s %s '%.*s' %zd\n", flags, dir, attr, (int) len, val, len); |
84 | | |
85 | 1.17k | if (!(url = wget_malloc(sizeof(wget_string)))) |
86 | 0 | return; |
87 | | |
88 | 1.17k | url->p = val; |
89 | 1.17k | url->len = len; |
90 | | |
91 | 1.17k | if (!ctx->urls) |
92 | 86 | ctx->urls = wget_vector_create(32, NULL); |
93 | | |
94 | 1.17k | wget_vector_add(ctx->urls, url); |
95 | 1.17k | } |
96 | 15.1k | } |
97 | 15.1k | } |
98 | 19.6k | } |
99 | | |
100 | | void wget_rss_get_urls_inline(const char *rss, wget_vector **urls) |
101 | 1.38k | { |
102 | 1.38k | struct rss_context context = { .urls = NULL }; |
103 | | |
104 | 1.38k | wget_xml_parse_buffer(rss, rss_get_url, &context, XML_HINT_REMOVE_EMPTY_CONTENT); |
105 | | |
106 | 1.38k | *urls = context.urls; |
107 | 1.38k | } |