Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | * Copyright (c) 2012 Tim Ruehsen |
3 | | * Copyright (c) 2015-2024 Free Software Foundation, Inc. |
4 | | * |
5 | | * This file is part of libwget. |
6 | | * |
7 | | * Libwget is free software: you can redistribute it and/or modify |
8 | | * it under the terms of the GNU Lesser General Public License as published by |
9 | | * the Free Software Foundation, either version 3 of the License, or |
10 | | * (at your option) any later version. |
11 | | * |
12 | | * Libwget is distributed in the hope that it will be useful, |
13 | | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
14 | | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
15 | | * GNU Lesser General Public License for more details. |
16 | | * |
17 | | * You should have received a copy of the GNU Lesser General Public License |
18 | | * along with libwget. If not, see <https://www.gnu.org/licenses/>. |
19 | | * |
20 | | * |
21 | | * xml parsing routines |
22 | | * |
23 | | * Changelog |
24 | | * 22.06.2012 Tim Ruehsen created, but needs definitely a rewrite |
25 | | * |
26 | | * This derives from an old source code that I wrote in 2001. |
27 | | * It is short, fast and has a low memory print, BUT it is a hack. |
28 | | * It has to be replaced by e.g. libxml2 or something better. |
29 | | * |
30 | | * HTML parsing is (very) different from XML parsing, see here: |
31 | | * https://html.spec.whatwg.org/multipage/syntax.html |
32 | | * It is a PITA and should be handled by a specialized, external library ! |
33 | | * |
34 | | */ |
35 | | |
36 | | #include <config.h> |
37 | | |
38 | | #include <unistd.h> |
39 | | #include <stdio.h> |
40 | | #include <string.h> |
41 | | #include <fcntl.h> |
42 | | #include <sys/stat.h> |
43 | | #ifdef HAVE_MMAP |
44 | | #include <sys/mman.h> |
45 | | #endif |
46 | | |
47 | | #include <wget.h> |
48 | | #include "private.h" |
49 | | |
50 | | typedef struct { |
51 | | const char |
52 | | *buf, // pointer to original start of buffer (0-terminated) |
53 | | *p, // pointer next char in buffer |
54 | | *token; // token buffer |
55 | | int |
56 | | hints; // XML_HINT... |
57 | | size_t |
58 | | token_size, // size of token buffer |
59 | | token_len; // used bytes of token buffer (not counting terminating 0 byte) |
60 | | void |
61 | | *user_ctx; // user context (not needed if we were using nested functions) |
62 | | wget_xml_callback |
63 | | *callback; |
64 | | } xml_context; |
65 | | |
/* \cond _hide_internal_symbols */
// Locale-independent whitespace test: a space or one of the control characters 9..13
// (\t \n \v \f \r). The argument is evaluated more than once, so it must not have
// side effects; it is fully parenthesized so that any expression can be passed.
#define ascii_isspace(c) ((c) == ' ' || ((c) >= 9 && (c) <= 13))

// Locale-independent letter test (a-z, A-Z).
// working only for consecutive alphabets, e.g. EBCDIC would not work
#define ascii_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
/* \endcond */
72 | | |
73 | | // append a char to token buffer |
74 | | |
75 | | static const char *getToken(xml_context *context) |
76 | 2.13M | { |
77 | 2.13M | int c; |
78 | 2.13M | const char *p; |
79 | | |
80 | | // skip leading whitespace |
81 | 2.15M | while ((c = *context->p) && ascii_isspace(c)) |
82 | 24.9k | context->p++; |
83 | 2.13M | if (!c) return NULL; // eof |
84 | 1.75M | context->token = context->p++; |
85 | | |
86 | | // info_printf("a c=%c\n", c); |
87 | | |
88 | 1.75M | if (ascii_isalpha(c) || c == '_') { |
89 | 863k | while ((c = *context->p) && !ascii_isspace(c) && c != '>' && c != '=') |
90 | 545k | context->p++; |
91 | 317k | if (!c) return NULL; // syntax error |
92 | | |
93 | 317k | context->token_len = context->p - context->token; |
94 | 317k | return context->token; |
95 | 317k | } |
96 | | |
97 | 1.43M | if (c == '/') { |
98 | 9.01k | if (!(c = *context->p)) return NULL; // syntax error |
99 | 9.00k | context->p++; |
100 | 9.00k | if (c == '>') { |
101 | 4.24k | context->token_len = 2; |
102 | 4.24k | return context->token; |
103 | 4.75k | } else return NULL; // syntax error |
104 | 9.00k | } |
105 | | |
106 | 1.42M | if (c == '\"' || c == '\'') { // read in quoted value |
107 | 6.28k | int quote = c; |
108 | | |
109 | 6.28k | context->token = context->p; |
110 | | |
111 | 6.28k | if (!(p = strchr(context->p, quote))) |
112 | 761 | return NULL; |
113 | 5.52k | context->p = p + 1; |
114 | | |
115 | 5.52k | context->token_len = context->p - context->token - 1; |
116 | 5.52k | return context->token; |
117 | 6.28k | } |
118 | | |
119 | 1.41M | if (c == '<') { // fetch specials, e.g. start of comments '<!--' |
120 | 649k | if (!(c = *context->p)) return NULL; // syntax error |
121 | 648k | context->p++; |
122 | 648k | if (c == '?' || c == '/') { |
123 | 14.5k | context->token_len = 2; |
124 | 14.5k | return context->token; |
125 | 14.5k | } |
126 | | |
127 | 633k | if (c == '!') { |
128 | | // left: <!--, <![CDATA[ and <!WHATEVER |
129 | 80.4k | if (!(c = *context->p)) return NULL; // syntax error |
130 | 80.4k | if (c == '-') { |
131 | 6.23k | context->p++; |
132 | 6.23k | if (!(c = *context->p)) return NULL; // syntax error |
133 | 6.22k | context->p++; |
134 | 6.22k | if (c == '-') { |
135 | 4.72k | context->token_len = 4; |
136 | 4.72k | return context->token; |
137 | 4.72k | } else { |
138 | 1.49k | context->p -= 2; |
139 | 1.49k | context->token_len = 2; |
140 | 1.49k | return context->token; |
141 | 1.49k | } |
142 | 74.1k | } else { |
143 | 74.1k | context->token_len = 2; |
144 | 74.1k | return context->token; |
145 | 74.1k | } |
146 | 553k | } else { |
147 | 553k | context->p--; |
148 | 553k | context->token_len = 1; |
149 | 553k | return context->token; |
150 | 553k | } |
151 | 633k | } |
152 | | |
153 | 767k | if (c == '>' || c == '=') { |
154 | 747k | context->token_len = 1; |
155 | 747k | return context->token; |
156 | 747k | } |
157 | | |
158 | 20.2k | if (c == '-') { // fetch specials, e.g. end of comments '-->' |
159 | 4.98k | if (!(c = *context->p)) return NULL; // syntax error |
160 | 4.96k | if (c != '-') { |
161 | 2.76k | c = '-'; //??? |
162 | 2.76k | } else { |
163 | 2.19k | context->p++; |
164 | 2.19k | if (!(c = *context->p)) return NULL; // syntax error |
165 | 2.18k | context->p++; |
166 | 2.18k | if (c != '>') { |
167 | 1.19k | context->p -= 2; |
168 | 1.19k | c = '-'; |
169 | 1.19k | } else { |
170 | 990 | context->token_len = 3; |
171 | 990 | return context->token; |
172 | 990 | } |
173 | 2.18k | } |
174 | 4.96k | } |
175 | | |
176 | 19.2k | if (c == '?') { // fetch specials, e.g. '?>' |
177 | 3.13k | if (!(c = *context->p)) return NULL; // syntax error |
178 | 3.11k | if (c != '>') { |
179 | | // c = '?'; |
180 | 2.02k | } else { |
181 | 1.08k | context->p++; |
182 | 1.08k | context->token_len = 2; |
183 | 1.08k | return context->token; |
184 | 1.08k | } |
185 | 3.11k | } |
186 | | |
187 | 1.34M | while ((c = *context->p) && !ascii_isspace(c)) |
188 | 1.32M | context->p++; |
189 | | |
190 | 18.1k | if (c) { |
191 | 16.4k | debug_printf("getToken =%.*s\n", (int)(context->p - context->token), context->token); |
192 | 16.4k | context->token_len = context->p - context->token; |
193 | 16.4k | return context->token; |
194 | 16.4k | } |
195 | | |
196 | 1.72k | return NULL; |
197 | 18.1k | } |
198 | | |
199 | | static const char *getHTMLValue(xml_context *context) |
200 | 17.4k | { |
201 | 17.4k | int c; |
202 | 17.4k | const char *p; |
203 | | |
204 | | // skip leading whitespace |
205 | 18.3k | while ((c = *context->p) && ascii_isspace(c)) |
206 | 895 | context->p++; |
207 | 17.4k | if (!c) return NULL; // eof |
208 | 17.4k | context->token = context->p++; |
209 | | |
210 | | // Check for and read in quoted value. |
211 | 17.4k | if (c == '\"' || c == '\'' || c == '`') { |
212 | 2.52k | int quote = c; |
213 | | |
214 | 2.52k | context->token = context->p; |
215 | | |
216 | 2.52k | if (!(p = strchr(context->p, quote))) |
217 | 80 | return NULL; |
218 | 2.44k | context->p = p + 1; |
219 | | |
220 | 2.44k | context->token_len = context->p - context->token - 1; |
221 | 2.44k | return context->token; |
222 | 2.52k | } |
223 | | |
224 | | // Read in unquoted value. |
225 | 296k | while ((c = *context->p) && !ascii_isspace(c) && c != '<' && c != '>' && !(c == '/' && *context->p == '>')) |
226 | 281k | context->p++; |
227 | 14.9k | if (c) { |
228 | 14.6k | debug_printf("getHTMLValue =%.*s\n", (int)(context->p - context->token), context->token); |
229 | 14.6k | context->token_len = context->p - context->token; |
230 | 14.6k | return context->token; |
231 | 14.6k | } |
232 | | |
233 | 246 | return NULL; |
234 | 14.9k | } |
235 | | |
236 | | static int getValue(xml_context *context) |
237 | 79.9k | { |
238 | 79.9k | int c; |
239 | | |
240 | 79.9k | context->token_len = 0; |
241 | 79.9k | context->token = context->p; |
242 | | |
243 | | // remove leading spaces |
244 | 112k | while ((c = *context->p) && ascii_isspace(c)) |
245 | 32.6k | context->p++; |
246 | 79.9k | if (!c) return EOF; |
247 | | |
248 | 78.7k | if (c == '=') { |
249 | 31.9k | context->p++; |
250 | | |
251 | 31.9k | if (context->hints&XML_HINT_HTML) { |
252 | 17.4k | if (!getHTMLValue(context)) |
253 | 385 | return EOF; // syntax error |
254 | 17.0k | else |
255 | 17.0k | return 1; // token valid |
256 | 17.4k | } |
257 | | |
258 | 14.4k | if (!getToken(context)) |
259 | 1.43k | return EOF; // syntax error |
260 | 13.0k | else |
261 | 13.0k | return 1; // token valid |
262 | 14.4k | } |
263 | | |
264 | | // attribute without value |
265 | 46.8k | context->token = context->p; |
266 | 46.8k | return 1; |
267 | 78.7k | } |
268 | | |
269 | | // special HTML <script> content parsing |
270 | | // see https://html.spec.whatwg.org/multipage/scripting.html#the-script-element |
271 | | // see https://html.spec.whatwg.org/multipage/scripting.html#restrictions-for-contents-of-script-elements |
272 | | |
273 | | static const char *getScriptContent(xml_context *context) |
274 | 2.61k | { |
275 | 2.61k | int comment = 0, length_valid = 0; |
276 | 2.61k | const char *p; |
277 | | |
278 | 68.1k | for (p = context->token = context->p; *p; p++) { |
279 | 67.6k | if (comment) { |
280 | 12.7k | if (*p == '-' && !strncmp(p, "-->", 3)) { |
281 | 1.37k | p += 3 - 1; |
282 | 1.37k | comment = 0; |
283 | 1.37k | } |
284 | 54.8k | } else { |
285 | 54.8k | if (*p == '<' && !strncmp(p, "<!--", 4)) { |
286 | 1.52k | p += 4 - 1; |
287 | 1.52k | comment = 1; |
288 | 53.2k | } else if (*p == '<' && !wget_strncasecmp_ascii(p, "</script", 8)) { |
289 | 2.91k | context->token_len = p - context->token; |
290 | 2.91k | length_valid = 1; |
291 | 4.06k | for (p += 8; ascii_isspace(*p); p++); |
292 | 2.91k | if (*p == '>') { |
293 | 2.00k | p++; |
294 | 2.00k | break; // found end of <script> |
295 | 2.00k | } else if (!*p) |
296 | 80 | break; // end of input |
297 | 2.91k | } |
298 | 54.8k | } |
299 | 67.6k | } |
300 | 2.61k | context->p = p; |
301 | | |
302 | 2.61k | if (!length_valid) |
303 | 439 | context->token_len = p - context->token; |
304 | | |
305 | 2.61k | if (!*p && !context->token_len) |
306 | 170 | return NULL; |
307 | | |
308 | 2.44k | if (context->callback) |
309 | 2.44k | context->callback(context->user_ctx, XML_FLG_CONTENT | XML_FLG_END, "script", NULL, context->token, context->token_len, context->token - context->buf); |
310 | | |
311 | 2.44k | return context->token; |
312 | 2.61k | } |
313 | | |
314 | | // special HTML <style> content parsing |
315 | | // see https://html.spec.whatwg.org/multipage/semantics.html#the-style-element |
316 | | static const char *getStyleContent(xml_context *context) |
317 | 3.41k | { |
318 | 3.41k | int comment = 0, length_valid = 0; |
319 | 3.41k | const char *p; |
320 | | |
321 | 199k | for (p = context->token = context->p; *p; p++) { |
322 | 198k | if (comment) { |
323 | 20.9k | if (p[0] == '*' && p[1] == '/') { |
324 | 1.14k | p += 2 - 1; |
325 | 1.14k | comment = 0; |
326 | 1.14k | } |
327 | 177k | } else { |
328 | 177k | if (p[0] == '/' && p[1] == '*') { |
329 | 1.26k | p += 2 - 1; |
330 | 1.26k | comment = 1; |
331 | 175k | } else if (*p == '<' && !wget_strncasecmp_ascii(p, "</style", 7)) { |
332 | 3.01k | context->token_len = p - context->token; |
333 | 3.01k | length_valid = 1; |
334 | 4.04k | for (p += 7; ascii_isspace(*p); p++); |
335 | 3.01k | if (*p == '>') { |
336 | 2.24k | p++; |
337 | 2.24k | break; // found end of <style> |
338 | 2.24k | } else if (!*p) |
339 | 85 | break; // end of input |
340 | 3.01k | } |
341 | 177k | } |
342 | 198k | } |
343 | 3.41k | context->p = p; |
344 | | |
345 | 3.41k | if (!length_valid) |
346 | 1.01k | context->token_len = p - context->token; |
347 | | |
348 | 3.41k | if (!*p && !context->token_len) |
349 | 159 | return NULL; |
350 | | |
351 | 3.25k | if (context->callback) |
352 | 3.25k | context->callback(context->user_ctx, XML_FLG_CONTENT | XML_FLG_END, "style", NULL, context->token, context->token_len, context->token - context->buf); |
353 | | |
354 | 3.25k | return context->token; |
355 | 3.41k | } |
356 | | |
357 | | static const char *getUnparsed(xml_context *context, int flags, const char *end, size_t len, const char *directory) |
358 | 81.8k | { |
359 | 81.8k | int c; |
360 | | |
361 | 81.8k | if (len == 1) { |
362 | 235k | for (context->token = context->p; (c = *context->p) && c != *end; context->p++); |
363 | 74.6k | } else { |
364 | 334k | for (context->token = context->p; (c = *context->p); context->p++) { |
365 | 332k | if (c == *end && context->p[1] == end[1] && (len == 2 || context->p[2] == end[2])) { |
366 | 5.92k | break; |
367 | 5.92k | } |
368 | 332k | } |
369 | 7.24k | } |
370 | | |
371 | 81.8k | context->token_len = context->p - context->token; |
372 | 81.8k | if (c) context->p += len; |
373 | | |
374 | 81.8k | if (!c && !context->token_len) |
375 | 133 | return NULL; |
376 | | /* |
377 | | if (context->token && context->token_len && context->hints & XML_HINT_REMOVE_EMPTY_CONTENT) { |
378 | | int notempty = 0; |
379 | | char *p; |
380 | | |
381 | | for (p = context->token; *p; p++) { |
382 | | if (!ascii_isspace(*p)) { |
383 | | notempty = 1; |
384 | | break; |
385 | | } |
386 | | } |
387 | | |
388 | | if (notempty) { |
389 | | if (context->callback) |
390 | | context->callback(context->user_ctx, flags, directory, NULL, context->token, context->token_len, context->token - context->buf); |
391 | | } else { |
392 | | // ignore empty content |
393 | | context->token_len = 0; |
394 | | context->token[0] = 0; |
395 | | } |
396 | | } else { |
397 | | */ |
398 | 81.7k | if (context->callback) |
399 | 80.2k | context->callback(context->user_ctx, flags, directory, NULL, context->token, context->token_len, context->token - context->buf); |
400 | | |
401 | | // } |
402 | | |
403 | 81.7k | return context->token; |
404 | 81.8k | } |
405 | | |
406 | | static const char *getComment(xml_context *context) |
407 | 3.75k | { |
408 | 3.75k | return getUnparsed(context, XML_FLG_COMMENT, "-->", 3, NULL); |
409 | 3.75k | } |
410 | | |
411 | | static const char *getProcessing(xml_context *context) |
412 | 3.49k | { |
413 | 3.49k | return getUnparsed(context, XML_FLG_PROCESSING, "?>", 2, NULL); |
414 | 3.49k | } |
415 | | |
416 | | static const char *getSpecial(xml_context *context) |
417 | 74.6k | { |
418 | 74.6k | return getUnparsed(context, XML_FLG_SPECIAL, ">", 1, NULL); |
419 | 74.6k | } |
420 | | |
421 | | static const char *getContent(xml_context *context, const char *directory) |
422 | 989k | { |
423 | 989k | int c; |
424 | | |
425 | 1.73M | for (context->token = context->p; (c = *context->p) && c != '<'; context->p++); |
426 | | |
427 | 989k | context->token_len = context->p - context->token; |
428 | | |
429 | 989k | if (!c && !context->token_len) |
430 | 374k | return NULL; |
431 | | |
432 | | // debug_printf("content=%.*s\n", (int)context->token_len, context->token); |
433 | 615k | if (context->callback && context->token_len) |
434 | 80.6k | context->callback(context->user_ctx, XML_FLG_CONTENT, directory, NULL, context->token, context->token_len, context->token - context->buf); |
435 | | |
436 | 615k | return context->token; |
437 | 989k | } |
438 | | |
439 | | static int parseXML(const char *dir, xml_context *context) |
440 | 398k | { |
441 | 398k | const char *tok; |
442 | 398k | char directory[256] = ""; |
443 | 398k | size_t pos = 0; |
444 | | |
445 | 398k | if (!(context->hints & XML_HINT_HTML)) { |
446 | 391k | pos = wget_strlcpy(directory, dir, sizeof(directory)); |
447 | 391k | if (pos >= sizeof(directory)) pos = sizeof(directory) - 1; |
448 | 391k | } |
449 | | |
450 | 989k | do { |
451 | 989k | getContent(context, directory); |
452 | 989k | if (context->token_len) |
453 | 81.6k | debug_printf("%s='%.*s'\n", directory, (int)context->token_len, context->token); |
454 | | |
455 | 989k | if (!(tok = getToken(context))) return WGET_E_SUCCESS; //eof |
456 | | // debug_printf("A Token '%.*s' len=%zu tok='%s'\n", (int)context->token_len, context->token, context->token_len, tok); |
457 | | |
458 | 609k | if (context->token_len == 1 && *tok == '<') { |
459 | | // get element name and add it to directory |
460 | 519k | int flags = XML_FLG_BEGIN; |
461 | | |
462 | 519k | if (!(tok = getToken(context))) return WGET_E_XML_PARSE_ERR; // syntax error |
463 | | |
464 | | // debug_printf("A2 Token '%.*s'\n", (int)context->token_len, context->token); |
465 | | |
466 | 516k | if (!(context->hints & XML_HINT_HTML)) { |
467 | 387k | if (!pos || directory[pos - 1] != '/') |
468 | 96.7k | wget_snprintf(&directory[pos], sizeof(directory) - pos, "/%.*s", (int)context->token_len, tok); |
469 | 290k | else |
470 | 290k | wget_snprintf(&directory[pos], sizeof(directory) - pos, "%.*s", (int)context->token_len, tok); |
471 | 387k | } else { |
472 | | // wget_snprintf(directory, sizeof(directory), "%.*s", (int)context->token_len, tok); |
473 | 129k | size_t dirlen = context->token_len >= sizeof(directory) ? sizeof(directory) - 1 : context->token_len; |
474 | | |
475 | 129k | memcpy(directory, tok, dirlen); |
476 | 129k | directory[dirlen] = 0; |
477 | 129k | } |
478 | | |
479 | 593k | while ((tok = getToken(context))) { |
480 | | // debug_printf("C Token %.*s %zu %p %p dir=%s tok=%s\n", (int)context->token_len, context->token, context->token_len, context->token, context->p, directory, tok); |
481 | 587k | if (context->token_len == 2 && !strncmp(tok, "/>", 2)) { |
482 | 3.58k | if (context->callback) |
483 | 3.16k | context->callback(context->user_ctx, flags | XML_FLG_END, directory, NULL, NULL, 0, 0); |
484 | 3.58k | break; // stay in this level |
485 | 583k | } else if (context->token_len == 1 && *tok == '>') { |
486 | 504k | if (context->callback) |
487 | 442k | context->callback(context->user_ctx, flags | XML_FLG_CLOSE, directory, NULL, NULL, 0, 0); |
488 | 504k | if (context->hints & XML_HINT_HTML) { |
489 | 125k | if (!wget_strcasecmp_ascii(directory, "script")) { |
490 | | // special HTML <script> content parsing |
491 | | // see https://html.spec.whatwg.org/multipage/scripting.html#the-script-element |
492 | | // 4.3.1.2 Restrictions for contents of script elements |
493 | 2.61k | debug_printf("*** need special <script> handling\n"); |
494 | 2.61k | getScriptContent(context); |
495 | 2.61k | if (context->token_len) |
496 | 840 | debug_printf("%s=%.*s\n", directory, (int)context->token_len, context->token); |
497 | 2.61k | } |
498 | 122k | else if (!wget_strcasecmp_ascii(directory, "style")) { |
499 | 3.41k | getStyleContent(context); |
500 | 3.41k | if (context->token_len) |
501 | 1.46k | debug_printf("%s=%.*s\n", directory, (int)context->token_len, context->token); |
502 | 3.41k | } |
503 | 125k | } else |
504 | 378k | parseXML(directory, context); // descend one level |
505 | 504k | break; |
506 | 504k | } else { |
507 | 79.9k | char attribute[256]; |
508 | 79.9k | size_t attrlen = context->token_len >= sizeof(attribute) ? sizeof(attribute) - 1 : context->token_len; |
509 | | |
510 | 79.9k | memcpy(attribute, tok, attrlen); |
511 | 79.9k | attribute[attrlen] = 0; |
512 | | |
513 | 79.9k | if (getValue(context) == EOF) return WGET_E_XML_PARSE_ERR; // syntax error |
514 | | |
515 | 76.9k | if (context->token_len) { |
516 | 28.2k | debug_printf("%s/@%s=%.*s\n", directory, attribute, (int)context->token_len, context->token); |
517 | 28.2k | if (context->callback) |
518 | 26.6k | context->callback(context->user_ctx, flags | XML_FLG_ATTRIBUTE, directory, attribute, context->token, context->token_len, context->token - context->buf); |
519 | 48.7k | } else { |
520 | 48.7k | debug_printf("%s/@%s\n", directory, attribute); |
521 | 48.7k | if (context->callback) |
522 | 43.4k | context->callback(context->user_ctx, flags | XML_FLG_ATTRIBUTE, directory, attribute, NULL, 0, 0); |
523 | 48.7k | } |
524 | 76.9k | flags = 0; |
525 | 76.9k | } |
526 | 587k | } |
527 | 513k | directory[pos] = 0; |
528 | 513k | } else if (context->token_len == 2) { |
529 | 86.8k | if (!strncmp(tok, "</", 2)) { |
530 | | // ascend one level |
531 | | // cleanup - get name and '>' |
532 | 8.73k | if (!(tok = getToken(context))) return WGET_E_XML_PARSE_ERR; |
533 | | // debug_printf("X Token %s\n",tok); |
534 | 7.49k | if (context->callback) { |
535 | 5.73k | if (!(context->hints & XML_HINT_HTML)) |
536 | 3.96k | context->callback(context->user_ctx, XML_FLG_END, directory, NULL, NULL, 0, 0); |
537 | 1.77k | else { |
538 | 1.77k | char tmp[128], *tag = tmp; // we need to \0 terminate tok |
539 | 1.77k | if (context->token_len >= sizeof(tmp)) |
540 | 288 | tag = wget_malloc(context->token_len + 1); |
541 | 1.77k | if (tag) { |
542 | 1.77k | memcpy(tag, tok, context->token_len); |
543 | 1.77k | tag[context->token_len] = 0; |
544 | 1.77k | context->callback(context->user_ctx, XML_FLG_END, tag, NULL, NULL, 0, 0); |
545 | 1.77k | if (tag != tmp) |
546 | 288 | xfree(tag); |
547 | 1.77k | } |
548 | 1.77k | } |
549 | 5.73k | } |
550 | 7.49k | if (!(tok = getToken(context))) return WGET_E_XML_PARSE_ERR; |
551 | | // debug_printf("Y Token %s\n",tok); |
552 | 6.53k | if (!(context->hints & XML_HINT_HTML)) |
553 | 4.84k | return WGET_E_SUCCESS; |
554 | 1.69k | else |
555 | 1.69k | continue; |
556 | 78.1k | } else if (!strncmp(tok, "<?", 2)) { // special info - ignore |
557 | 3.49k | getProcessing(context); |
558 | 3.49k | debug_printf("%s=<?%.*s?>\n", directory, (int)context->token_len, context->token); |
559 | 3.49k | continue; |
560 | 74.6k | } else if (!strncmp(tok, "<!", 2)) { |
561 | 74.6k | getSpecial(context); |
562 | 74.6k | debug_printf("%s=<!%.*s>\n", directory, (int)context->token_len, context->token); |
563 | 74.6k | } |
564 | 86.8k | } else if (context->token_len == 4 && !strncmp(tok, "<!--", 4)) { // comment - ignore |
565 | 3.75k | getComment(context); |
566 | 3.75k | debug_printf("%s=<!--%.*s-->\n", directory, (int)context->token_len, context->token); |
567 | 3.75k | continue; |
568 | 3.75k | } |
569 | 609k | } while (tok); |
570 | 5.89k | return WGET_E_SUCCESS; |
571 | 398k | } |
572 | | |
573 | | /** |
574 | | * \file |
575 | | * \brief XML parsing functions |
576 | | * \defgroup libwget-xml XML parsing functions |
577 | | * @{ |
578 | | */ |
579 | | |
580 | | /** |
581 | | * \param[in] buf Zero-terminated XML or HTML input data |
582 | | * \param[in] callback Function called for each token scan result |
583 | | * \param[in] user_ctx User-defined context variable, handed to \p callback |
584 | | * \param[in] hints Flags to influence parsing |
585 | | * |
586 | | * This function scans the XML input from \p buf and calls \p callback for each token |
587 | | * found. \p user_ctx is a user-defined context variable and given to each call of \p callback. |
588 | | * |
589 | | * \p hints may be 0 or any combination of %XML_HINT_REMOVE_EMPTY_CONTENT and %XML_HINT_HTML. |
590 | | * |
591 | | * %XML_HINT_REMOVE_EMPTY_CONTENT reduces the number of calls to \p callback by ignoring |
592 | | * empty content and superfluous spaces. |
593 | | * |
594 | | * %XML_HINT_HTML turns on HTML scanning. |
595 | | */ |
596 | | int wget_xml_parse_buffer( |
597 | | const char *buf, |
598 | | wget_xml_callback *callback, |
599 | | void *user_ctx, |
600 | | int hints) |
601 | 20.2k | { |
602 | 20.2k | xml_context context; |
603 | | |
604 | 20.2k | context.token = NULL; |
605 | 20.2k | context.token_size = 0; |
606 | 20.2k | context.token_len = 0; |
607 | 20.2k | context.buf = buf; |
608 | 20.2k | context.p = buf; |
609 | 20.2k | context.user_ctx = user_ctx; |
610 | 20.2k | context.callback = callback; |
611 | 20.2k | context.hints = hints; |
612 | | |
613 | 20.2k | return parseXML ("/", &context); |
614 | 20.2k | } |
615 | | |
616 | | /** |
617 | | * \param[in] buf Zero-terminated HTML input data |
618 | | * \param[in] callback Function called for each token scan result |
619 | | * \param[in] user_ctx User-defined context variable, handed to \p callback |
620 | | * \param[in] hints Flags to influence parsing |
621 | | * |
622 | | * Convenience function that calls wget_xml_parse_buffer() with HTML parsing turned on. |
623 | | */ |
624 | | void wget_html_parse_buffer( |
625 | | const char *buf, |
626 | | wget_xml_callback *callback, |
627 | | void *user_ctx, |
628 | | int hints) |
629 | 6.90k | { |
630 | 6.90k | wget_xml_parse_buffer(buf, callback, user_ctx, hints | XML_HINT_HTML); |
631 | 6.90k | } |
632 | | |
633 | | /** |
634 | | * \param[in] fname Name of XML or HTML input file |
635 | | * \param[in] callback Function called for each token scan result |
636 | | * \param[in] user_ctx User-defined context variable, handed to \p callback |
637 | | * \param[in] hints Flags to influence parsing |
638 | | * |
639 | | * Convenience function that calls wget_xml_parse_buffer() with the file content. |
640 | | * |
641 | | * If \p fname is `-`, the data is read from stdin. |
642 | | */ |
643 | | void wget_xml_parse_file( |
644 | | const char *fname, |
645 | | wget_xml_callback *callback, |
646 | | void *user_ctx, |
647 | | int hints) |
648 | 3.25k | { |
649 | 3.25k | if (strcmp(fname,"-")) { |
650 | 1.62k | int fd; |
651 | | |
652 | 1.62k | if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) { |
653 | 1.62k | struct stat st; |
654 | 1.62k | if (fstat(fd, &st) == 0) { |
655 | 1.62k | #ifdef HAVE_MMAP |
656 | 1.62k | size_t nread = st.st_size; |
657 | 1.62k | char *buf = mmap(NULL, nread + 1, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); |
658 | | #else |
659 | | char *buf = wget_malloc(st.st_size + 1); |
660 | | if (!buf) |
661 | | error_printf(_("Failed to allocate %zu bytes for XML parse buffer\n"), st.st_size + 1); |
662 | | size_t nread = buf ? read(fd, buf, st.st_size) : -1; |
663 | | #endif |
664 | | |
665 | 1.62k | if (nread > 0) { |
666 | 0 | buf[nread] = 0; // PROT_WRITE allows this write, MAP_PRIVATE prevents changes in underlying file system |
667 | 0 | wget_xml_parse_buffer(buf, callback, user_ctx, hints); |
668 | 0 | } |
669 | | |
670 | 1.62k | #ifdef HAVE_MMAP |
671 | 1.62k | munmap(buf, nread); |
672 | | #else |
673 | | xfree(buf); |
674 | | #endif |
675 | 1.62k | } |
676 | 1.62k | close(fd); |
677 | 1.62k | } else |
678 | 0 | error_printf(_("Failed to open %s\n"), fname); |
679 | 1.62k | } else { |
680 | | // read data from STDIN. |
681 | | // maybe should use yy_scan_bytes instead of buffering into memory. |
682 | 1.62k | char tmp[4096]; |
683 | 1.62k | ssize_t nbytes; |
684 | 1.62k | wget_buffer buf; |
685 | | |
686 | 1.62k | wget_buffer_init(&buf, NULL, 4096); |
687 | | |
688 | 1.62k | while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) { |
689 | 0 | wget_buffer_memcat(&buf, tmp, nbytes); |
690 | 0 | } |
691 | | |
692 | 1.62k | if (buf.length) |
693 | 0 | wget_xml_parse_buffer(buf.data, callback, user_ctx, hints); |
694 | | |
695 | 1.62k | wget_buffer_deinit(&buf); |
696 | 1.62k | } |
697 | 3.25k | } |
698 | | |
699 | | /** |
700 | | * \param[in] fname Name of XML or HTML input file |
701 | | * \param[in] callback Function called for each token scan result |
702 | | * \param[in] user_ctx User-defined context variable, handed to \p callback |
703 | | * \param[in] hints Flags to influence parsing |
704 | | * |
705 | | * Convenience function that calls wget_xml_parse_file() with HTML parsing turned on. |
706 | | * |
707 | | * If \p fname is `-`, the data is read from stdin. |
708 | | */ |
709 | | void wget_html_parse_file( |
710 | | const char *fname, |
711 | | wget_xml_callback *callback, |
712 | | void *user_ctx, |
713 | | int hints) |
714 | 3.25k | { |
715 | 3.25k | wget_xml_parse_file(fname, callback, user_ctx, hints | XML_HINT_HTML); |
716 | 3.25k | } |
717 | | |
/**
 * \param[in] src A string
 * \return \p src if at least one XML entity has been converted, else NULL
 *
 * Decode XML entities from \p src.
 *
 * **The transformation is done inline**, so `src` will be modified after this function returns.
 * If no XML entities have been found, the content of \p src does not change.
 *
 * Only a small subset of available XML entities is currently recognized:
 * `&amp;`, `&gt;`, `&lt;`, `&quot;`, `&apos;` and numeric references (`&#NNN;`, `&#xHH;`)
 * with a value from 1 to 255. A decoded space is stored as `+`.
 *
 * Numeric references without digits or with a value outside of 1..255 are kept literally.
 * So the result never contains an unexpected 0 byte or a truncated code point.
 */
char *wget_xml_decode_entities_inline(char *src)
{
	char *ret = NULL;
	unsigned char *s = (unsigned char *)src; // just a helper to avoid casting a lot
	unsigned char *d = s;

	while (*s) {
		if (*s == '&') {
			// entities are case sensitive (RFC1866, 3.2.3)
			if (s[1] == '#') {
				const char *digits = (const char *) s + 2;
				int base = 10, valid;

				if (*digits == 'x') {
					base = 16;
					digits++;
				}

				// strtol() would also accept leading spaces and a sign, so check the first char here
				valid = (*digits >= '0' && *digits <= '9')
					|| (base == 16 && (*digits | 0x20) >= 'a' && (*digits | 0x20) <= 'f');

				if (valid) {
					char *end;
					long value = strtol(digits, &end, base);

					if (value > 0 && value <= 255) {
						*d++ = (value == ' ') ? '+' : (unsigned char) value; // hack
						s = (unsigned char *) end;
						if (*s == ';') s++;
						ret = src;
						continue;
					}
				}
				// not decodable: copy it as ordinary text below
			} else if (!strncmp((char *) s + 1, "amp;", 4)) {
				*d++ = '&';
				s += 5;
				ret = src;
				continue;
			} else if (!strncmp((char *) s + 1, "gt;", 3)) {
				*d++ = '>';
				s += 4;
				ret = src;
				continue;
			} else if (!strncmp((char *) s + 1, "lt;", 3)) {
				*d++ = '<';
				s += 4;
				ret = src;
				continue;
			} else if (!strncmp((char *) s + 1, "quot;", 5)) {
				*d++ = '\"';
				s += 6;
				ret = src;
				continue;
			} else if (!strncmp((char *) s + 1, "apos;", 5)) {
				*d++ = '\'';
				s += 6;
				ret = src;
				continue;
			}
		}

		*d++ = *s++;
	}
	*d = 0;

	return ret;
}
782 | | |
783 | | |
784 | | /** @} */ |