Coverage Report

Created: 2025-12-31 07:08

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/wget2/libwget/xml.c
Line
Count
Source
1
/*
2
 * Copyright (c) 2012 Tim Ruehsen
3
 * Copyright (c) 2015-2024 Free Software Foundation, Inc.
4
 *
5
 * This file is part of libwget.
6
 *
7
 * Libwget is free software: you can redistribute it and/or modify
8
 * it under the terms of the GNU Lesser General Public License as published by
9
 * the Free Software Foundation, either version 3 of the License, or
10
 * (at your option) any later version.
11
 *
12
 * Libwget is distributed in the hope that it will be useful,
13
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15
 * GNU Lesser General Public License for more details.
16
 *
17
 * You should have received a copy of the GNU Lesser General Public License
18
 * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19
 *
20
 *
21
 * xml parsing routines
22
 *
23
 * Changelog
24
 * 22.06.2012  Tim Ruehsen  created, but needs definitely a rewrite
25
 *
26
 * This derives from an old source code that I wrote in 2001.
27
 * It is short, fast and has a low memory footprint, BUT it is a hack.
28
 * It has to be replaced by e.g. libxml2 or something better.
29
 *
30
 * HTML parsing is (very) different from XML parsing, see here:
31
 * https://html.spec.whatwg.org/multipage/syntax.html
32
 * It is a PITA and should be handled by a specialized, external library !
33
 *
34
 */
35
36
#include <config.h>
37
38
#include <unistd.h>
39
#include <stdio.h>
40
#include <string.h>
41
#include <fcntl.h>
42
#include <sys/stat.h>
43
#ifdef HAVE_MMAP
44
#include <sys/mman.h>
45
#endif
46
47
#include <wget.h>
48
#include "private.h"
49
50
typedef struct {
51
  const char
52
    *buf, //!< pointer to original start of buffer (0-terminated)
53
    *p, //!< pointer next char in buffer
54
    *token; //!< token buffer
55
  int
56
    hints; //!< XML_HINT...
57
  size_t
58
    token_size, //!< size of token buffer
59
    token_len; //!< used bytes of token buffer (not counting terminating 0 byte)
60
  void
61
    *user_ctx; //!< user context (not needed if we were using nested functions)
62
  wget_xml_callback
63
    *callback; //!< callback function for tokens
64
} xml_context;
65
66
/* \cond _hide_internal_symbols */
// Locale-independent whitespace test: space, \t, \n, \v, \f, \r.
// The argument is fully parenthesized so that expressions such as `x & 0x7f`
// expand correctly. Note that it is still evaluated up to three times,
// so do not pass expressions with side effects.
#define ascii_isspace(c) ((c) == ' ' || ((c) >= 9 && (c) <= 13))

// Locale-independent letter test.
// working only for consecutive alphabets, e.g. EBCDIC would not work
// The argument is evaluated up to four times - no side effects, please.
#define ascii_isalpha(c) (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
/* \endcond */
72
73
// append a char to token buffer
74
75
// Scan the next markup token starting at context->p.
//
// On success, context->token / context->token_len describe the token (which is
// NOT 0-terminated), context->p is advanced past it and context->token is returned.
// Returns NULL at end of input or on a syntax error; context->p may already have
// been advanced in that case and context->token_len is left as it was.
//
// Recognized tokens: names ([A-Za-z_]...), "/>", quoted strings (quotes stripped),
// "<", "<?", "</", "<!", "<!--", ">", "=", "-->", "?>" and, as a fallback,
// any run of non-whitespace characters.
static const char *getToken(xml_context *context)
{
  int ch;

  // skip leading whitespace, bail out at end of input
  for (;;) {
    ch = *context->p;
    if (!ch)
      return NULL; // eof
    if (!ascii_isspace(ch))
      break;
    context->p++;
  }

  // the token starts here; p now points at the character after the first one
  context->token = context->p++;

  switch (ch) {
  case '/': {
    // only "/>" is valid here
    int next = *context->p;

    if (!next)
      return NULL; // syntax error
    context->p++;
    if (next != '>')
      return NULL; // syntax error
    context->token_len = 2;
    return context->token;
  }

  case '\"':
  case '\'': {
    // quoted value: the token is the text between the quotes
    const char *closing;

    context->token = context->p;
    closing = strchr(context->p, ch);
    if (!closing)
      return NULL;
    context->p = closing + 1;
    context->token_len = closing - context->token;
    return context->token;
  }

  case '<': {
    // "<", "<?", "</", "<!" or "<!--"
    int next = *context->p;

    if (!next)
      return NULL; // syntax error

    if (next != '?' && next != '/' && next != '!') {
      // plain '<': the following character belongs to the next token
      context->token_len = 1;
      return context->token;
    }

    context->p++;
    if (next != '!') {
      context->token_len = 2; // "<?" or "</"
      return context->token;
    }

    // left: <!--, <![CDATA[ and <!WHATEVER
    next = *context->p;
    if (!next)
      return NULL; // syntax error
    if (next != '-') {
      context->token_len = 2;
      return context->token;
    }

    context->p++; // step over the first '-'
    next = *context->p;
    if (!next)
      return NULL; // syntax error
    if (next == '-') {
      context->p++;
      context->token_len = 4; // "<!--"
      return context->token;
    }

    // "<!-" followed by something else: token is "<!", re-read from the '-'
    context->p--;
    context->token_len = 2;
    return context->token;
  }

  case '>':
  case '=':
    context->token_len = 1;
    return context->token;

  case '-': {
    // possibly the end of a comment "-->"
    int next = *context->p;

    if (!next)
      return NULL; // syntax error
    if (next == '-') {
      context->p++;
      next = *context->p;
      if (!next)
        return NULL; // syntax error
      if (next == '>') {
        context->p++;
        context->token_len = 3;
        return context->token;
      }
      // not "-->": go back to just behind the first '-'
      context->p--;
    }
    break; // handled as a generic token below
  }

  case '?':
    // possibly the end of a processing instruction "?>"
    if (!*context->p)
      return NULL; // syntax error
    if (*context->p == '>') {
      context->p++;
      context->token_len = 2;
      return context->token;
    }
    break; // handled as a generic token below

  default:
    if (ascii_isalpha(ch) || ch == '_') {
      // a name: ends at whitespace, '>' or '='
      while ((ch = *context->p) && !ascii_isspace(ch) && ch != '>' && ch != '=')
        context->p++;
      if (!ch)
        return NULL; // syntax error

      context->token_len = context->p - context->token;
      return context->token;
    }
    break;
  }

  // generic token: everything up to the next whitespace
  while ((ch = *context->p) && !ascii_isspace(ch))
    context->p++;

  if (!ch)
    return NULL; // input ended inside the token

  debug_printf("getToken =%.*s\n", (int)(context->p - context->token), context->token);
  context->token_len = context->p - context->token;
  return context->token;
}
198
199
// Scan an HTML attribute value (the part after '=').
//
// Quoted values may use ", ' or ` and are returned without the quotes.
// Unquoted values start with the first non-whitespace character (whatever it is)
// and end before the next whitespace, '<' or '>'.
// Returns context->token, or NULL if the input ends before the value is complete.
static const char *getHTMLValue(xml_context *context)
{
  int ch;

  // skip leading whitespace
  while ((ch = *context->p) && ascii_isspace(ch))
    context->p++;
  if (!ch)
    return NULL; // eof

  context->token = context->p++;

  // Check for and read in quoted value.
  if (ch == '\"' || ch == '\'' || ch == '`') {
    const char *closing;

    context->token = context->p;
    closing = strchr(context->p, ch);
    if (!closing)
      return NULL;

    context->p = closing + 1;
    context->token_len = closing - context->token;
    return context->token;
  }

  // Read in unquoted value.
  // NOTE(review): the previous loop condition also contained
  // `!(c == '/' && *context->p == '>')`, which could never be false (c is *context->p).
  // So a '/' directly before '>' has always been part of the value; that is kept here.
  for (; (ch = *context->p); context->p++) {
    if (ascii_isspace(ch) || ch == '<' || ch == '>')
      break;
  }

  if (!ch)
    return NULL; // input ended inside the value

  debug_printf("getHTMLValue =%.*s\n", (int)(context->p - context->token), context->token);
  context->token_len = context->p - context->token;
  return context->token;
}
235
236
// Read the (optional) value of the attribute whose name has just been scanned.
//
// Returns EOF at end of input or on a syntax error, else 1.
// On return 1, context->token_len is 0 if the attribute has no value,
// otherwise context->token / context->token_len describe the value.
static int getValue(xml_context *context)
{
  int ch;
  const char *value;

  context->token_len = 0;
  context->token = context->p;

  // remove leading spaces
  while ((ch = *context->p) && ascii_isspace(ch))
    context->p++;
  if (!ch)
    return EOF;

  if (ch != '=') {
    // attribute without value
    context->token = context->p;
    return 1;
  }

  context->p++; // step over '='

  // HTML has its own rules for attribute values
  if (context->hints & XML_HINT_HTML)
    value = getHTMLValue(context);
  else
    value = getToken(context);

  return value ? 1 : EOF; // token valid : syntax error
}
268
269
// special HTML <script> content parsing
270
// see https://html.spec.whatwg.org/multipage/scripting.html#the-script-element
271
// see https://html.spec.whatwg.org/multipage/scripting.html#restrictions-for-contents-of-script-elements
272
273
// Consume the content of an HTML <script> element, starting at context->p.
//
// "<!--" ... "-->" sections are skipped, so a "</script" inside them does not
// end the element. The content ends at "</script" (ASCII case-insensitive),
// optionally followed by whitespace, then '>'.
// The callback (if any) is called with XML_FLG_CONTENT | XML_FLG_END and the content.
// Returns NULL if the input ended and the recorded content length is 0,
// else context->token.
static const char *getScriptContent(xml_context *context)
{
  int in_comment = 0, have_len = 0;
  const char *cur;

  context->token = context->p;

  for (cur = context->p; *cur; cur++) {
    if (in_comment) {
      if (!strncmp(cur, "-->", 3)) {
        cur += 3 - 1; // the loop increment steps over the last character
        in_comment = 0;
      }
      continue;
    }

    if (*cur != '<')
      continue;

    if (!strncmp(cur, "<!--", 4)) {
      cur += 4 - 1;
      in_comment = 1;
      continue;
    }

    if (wget_strncasecmp_ascii(cur, "</script", 8))
      continue;

    // candidate end tag: remember the content length up to here
    context->token_len = cur - context->token;
    have_len = 1;

    cur += 8;
    while (ascii_isspace(*cur))
      cur++;

    if (*cur == '>') {
      cur++;
      break; // found end of <script>
    }
    if (!*cur)
      break; // end of input
    // anything else: keep scanning (the character at cur is skipped by the loop increment)
  }
  context->p = cur;

  if (!have_len)
    context->token_len = cur - context->token;

  if (!*cur && !context->token_len)
    return NULL;

  if (context->callback)
    context->callback(context->user_ctx, XML_FLG_CONTENT | XML_FLG_END, "script", NULL, context->token, context->token_len, context->token - context->buf);

  return context->token;
}
313
314
// special HTML <style> content parsing
315
// see https://html.spec.whatwg.org/multipage/semantics.html#the-style-element
316
// Consume the content of an HTML <style> element, starting at context->p.
//
// CSS comments (slash-star ... star-slash) are skipped, so a "</style" inside
// them does not end the element. The content ends at "</style"
// (ASCII case-insensitive), optionally followed by whitespace, then '>'.
// The callback (if any) is called with XML_FLG_CONTENT | XML_FLG_END and the content.
// Returns NULL if the input ended and the recorded content length is 0,
// else context->token.
static const char *getStyleContent(xml_context *context)
{
  int in_comment = 0, have_len = 0;
  const char *cur;

  context->token = context->p;

  for (cur = context->p; *cur; cur++) {
    if (in_comment) {
      if (cur[0] == '*' && cur[1] == '/') {
        cur += 2 - 1; // the loop increment steps over the '/'
        in_comment = 0;
      }
      continue;
    }

    if (cur[0] == '/' && cur[1] == '*') {
      cur += 2 - 1;
      in_comment = 1;
      continue;
    }

    if (*cur != '<' || wget_strncasecmp_ascii(cur, "</style", 7))
      continue;

    // candidate end tag: remember the content length up to here
    context->token_len = cur - context->token;
    have_len = 1;

    cur += 7;
    while (ascii_isspace(*cur))
      cur++;

    if (*cur == '>') {
      cur++;
      break; // found end of <style>
    }
    if (!*cur)
      break; // end of input
    // anything else: keep scanning (the character at cur is skipped by the loop increment)
  }
  context->p = cur;

  if (!have_len)
    context->token_len = cur - context->token;

  if (!*cur && !context->token_len)
    return NULL;

  if (context->callback)
    context->callback(context->user_ctx, XML_FLG_CONTENT | XML_FLG_END, "style", NULL, context->token, context->token_len, context->token - context->buf);

  return context->token;
}
356
357
// Consume raw text up to the terminator `end` and report it via the callback.
//
// `end` is the terminator string and `len` its length; only lengths 1, 2 and 3
// are handled (at most the first three characters of `end` are compared).
// On return, context->token / context->token_len describe the text before the
// terminator and context->p points behind the terminator (or at the end of input
// if the terminator was not found).
// `flags` and `directory` are handed to the callback unchanged.
// Returns NULL if the input ended without any text, else context->token.
//
// Note: XML_HINT_REMOVE_EMPTY_CONTENT is not evaluated here.
static const char *getUnparsed(xml_context *context, int flags, const char *end, size_t len, const char *directory)
{
  int ch;

  context->token = context->p;

  for (; (ch = *context->p); context->p++) {
    if (ch != end[0])
      continue;
    if (len == 1)
      break;
    if (context->p[1] != end[1])
      continue;
    if (len == 2 || context->p[2] == end[2])
      break;
  }

  context->token_len = context->p - context->token;

  if (ch)
    context->p += len; // step over the terminator
  else if (!context->token_len)
    return NULL; // end of input, nothing collected

  if (context->callback)
    context->callback(context->user_ctx, flags, directory, NULL, context->token, context->token_len, context->token - context->buf);

  return context->token;
}
405
406
// Consume a comment body up to and including "-->" (the "<!--" has already been read)
// and report it with XML_FLG_COMMENT.
static const char *getComment(xml_context *context)
{
  static const char terminator[] = "-->";

  return getUnparsed(context, XML_FLG_COMMENT, terminator, sizeof(terminator) - 1, NULL);
}
410
411
// Consume a processing instruction up to and including "?>" (the "<?" has already been read)
// and report it with XML_FLG_PROCESSING.
static const char *getProcessing(xml_context *context)
{
  static const char terminator[] = "?>";

  return getUnparsed(context, XML_FLG_PROCESSING, terminator, sizeof(terminator) - 1, NULL);
}
415
416
// Consume a "<!..." construct up to and including the next '>' (the "<!" has already been read)
// and report it with XML_FLG_SPECIAL.
static const char *getSpecial(xml_context *context)
{
  static const char terminator[] = ">";

  return getUnparsed(context, XML_FLG_SPECIAL, terminator, sizeof(terminator) - 1, NULL);
}
420
421
// Consume text content up to (not including) the next '<' or the end of input.
//
// Non-empty content is reported via the callback with XML_FLG_CONTENT and `directory`.
// Returns NULL if the input ended without any content, else context->token
// (context->token_len may be 0 when a '<' follows immediately).
static const char *getContent(xml_context *context, const char *directory)
{
  context->token = context->p;

  // advance to the next '<' or to the terminating 0 byte
  context->p += strcspn(context->p, "<");
  context->token_len = context->p - context->token;

  if (!context->token_len) {
    if (!*context->p)
      return NULL; // end of input
    return context->token; // empty content, nothing to report
  }

  if (context->callback)
    context->callback(context->user_ctx, XML_FLG_CONTENT, directory, NULL, context->token, context->token_len, context->token - context->buf);

  return context->token;
}
438
439
// Parse one nesting level of the input and report everything found via context->callback.
//
// In XML mode `dir` is the path of the enclosing element; each child element name
// is appended to it to form the `directory` argument of the callbacks, and the
// function calls itself for the content of each child element.
// In HTML mode (XML_HINT_HTML) there is no recursion: the directory is just the
// current tag name, and <script> / <style> contents are consumed by special scanners.
//
// Returns WGET_E_SUCCESS at end of input or (XML mode) after the closing tag of this
// level, WGET_E_XML_PARSE_ERR on a syntax error detected at this level.
// Note that the result of a nested call is not evaluated.
static int parseXML(const char *dir, xml_context *context)
{
  const int html = context->hints & XML_HINT_HTML;
  const char *tok;
  char path[256] = "";
  size_t base_len = 0; // length of the parent path within `path` (0 in HTML mode)

  if (!html) {
    base_len = wget_strlcpy(path, dir, sizeof(path));
    if (base_len >= sizeof(path))
      base_len = sizeof(path) - 1; // parent path was truncated
  }

  do {
    getContent(context, path);
    if (context->token_len)
      debug_printf("%s='%.*s'\n", path, (int)context->token_len, context->token);

    tok = getToken(context);
    if (!tok)
      return WGET_E_SUCCESS; //eof

    if (context->token_len == 1 && *tok == '<') {
      // start tag: get element name and add it to the path
      int flags = XML_FLG_BEGIN;

      tok = getToken(context);
      if (!tok)
        return WGET_E_XML_PARSE_ERR; // syntax error

      if (html) {
        size_t namelen = context->token_len;

        if (namelen >= sizeof(path))
          namelen = sizeof(path) - 1;
        memcpy(path, tok, namelen);
        path[namelen] = 0;
      } else if (base_len && path[base_len - 1] == '/') {
        // parent path already ends with a separator
        wget_snprintf(&path[base_len], sizeof(path) - base_len, "%.*s", (int)context->token_len, tok);
      } else {
        wget_snprintf(&path[base_len], sizeof(path) - base_len, "/%.*s", (int)context->token_len, tok);
      }

      // attributes, up to the end of the start tag
      while ((tok = getToken(context))) {
        char attribute[256];
        size_t attrlen;

        if (context->token_len == 2 && !strncmp(tok, "/>", 2)) {
          // empty element
          if (context->callback)
            context->callback(context->user_ctx, flags | XML_FLG_END, path, NULL, NULL, 0, 0);
          break; // stay in this level
        }

        if (context->token_len == 1 && *tok == '>') {
          if (context->callback)
            context->callback(context->user_ctx, flags | XML_FLG_CLOSE, path, NULL, NULL, 0, 0);

          if (!html) {
            parseXML(path, context); // descend one level
          } else if (!wget_strcasecmp_ascii(path, "script")) {
            // special HTML <script> content parsing
            // see https://html.spec.whatwg.org/multipage/scripting.html#the-script-element
            // 4.3.1.2 Restrictions for contents of script elements
            debug_printf("*** need special <script> handling\n");
            getScriptContent(context);
            if (context->token_len)
              debug_printf("%s=%.*s\n", path, (int)context->token_len, context->token);
          } else if (!wget_strcasecmp_ascii(path, "style")) {
            getStyleContent(context);
            if (context->token_len)
              debug_printf("%s=%.*s\n", path, (int)context->token_len, context->token);
          }
          break;
        }

        // anything else is an attribute name; copy it, as the token is not 0-terminated
        attrlen = context->token_len;
        if (attrlen >= sizeof(attribute))
          attrlen = sizeof(attribute) - 1;
        memcpy(attribute, tok, attrlen);
        attribute[attrlen] = 0;

        if (getValue(context) == EOF)
          return WGET_E_XML_PARSE_ERR; // syntax error

        if (context->token_len) {
          debug_printf("%s/@%s=%.*s\n", path, attribute, (int)context->token_len, context->token);
          if (context->callback)
            context->callback(context->user_ctx, flags | XML_FLG_ATTRIBUTE, path, attribute, context->token, context->token_len, context->token - context->buf);
        } else {
          debug_printf("%s/@%s\n", path, attribute);
          if (context->callback)
            context->callback(context->user_ctx, flags | XML_FLG_ATTRIBUTE, path, attribute, NULL, 0, 0);
        }

        flags = 0; // XML_FLG_BEGIN is only reported once per element
      }

      // back to the parent path; if the input ended inside the tag,
      // tok is NULL here and the loop condition below ends the parsing
      path[base_len] = 0;
    } else if (context->token_len == 2) {
      if (!strncmp(tok, "</", 2)) {
        // ascend one level
        // cleanup - get name and '>'
        tok = getToken(context);
        if (!tok)
          return WGET_E_XML_PARSE_ERR;

        if (context->callback) {
          if (!html) {
            context->callback(context->user_ctx, XML_FLG_END, path, NULL, NULL, 0, 0);
          } else {
            // HTML: report the tag name itself, we need to \0 terminate tok
            char tmp[128], *tag = tmp;

            if (context->token_len >= sizeof(tmp))
              tag = wget_malloc(context->token_len + 1);

            if (tag) {
              memcpy(tag, tok, context->token_len);
              tag[context->token_len] = 0;
              context->callback(context->user_ctx, XML_FLG_END, tag, NULL, NULL, 0, 0);
              if (tag != tmp)
                xfree(tag);
            }
          }
        }

        tok = getToken(context);
        if (!tok)
          return WGET_E_XML_PARSE_ERR;

        if (!html)
          return WGET_E_SUCCESS; // this level is done
        continue;
      } else if (!strncmp(tok, "<?", 2)) { // special info - ignore
        getProcessing(context);
        debug_printf("%s=<?%.*s?>\n", path, (int)context->token_len, context->token);
        continue;
      } else if (!strncmp(tok, "<!", 2)) {
        getSpecial(context);
        debug_printf("%s=<!%.*s>\n", path, (int)context->token_len, context->token);
      }
    } else if (context->token_len == 4 && !strncmp(tok, "<!--", 4)) { // comment - ignore
      getComment(context);
      debug_printf("%s=<!--%.*s-->\n", path, (int)context->token_len, context->token);
      continue;
    }
  } while (tok);

  return WGET_E_SUCCESS;
}
572
573
/**
574
 * \file
575
 * \brief XML parsing functions
576
 * \defgroup libwget-xml XML parsing functions
577
 * @{
578
 */
579
580
/**
581
 * \param[in] buf Zero-terminated XML or HTML input data
582
 * \param[in] callback Function called for each token scan result
583
 * \param[in] user_ctx User-defined context variable, handed to \p callback
584
 * \param[in] hints Flags to influence parsing
585
 *
586
 * This function scans the XML input from \p buf and calls \p callback for each token
587
 * found. \p user_ctx is a user-defined context variable and given to each call of \p callback.
588
 *
589
 * \p hints may be 0 or any combination of %XML_HINT_REMOVE_EMPTY_CONTENT and %XML_HINT_HTML.
590
 *
591
 * %XML_HINT_REMOVE_EMPTY_CONTENT reduces the number of calls to \p callback by ignoring
592
 * empty content and superfluous spaces.
593
 *
594
 * %XML_HINT_HTML turns on HTML scanning.
595
 */
596
int wget_xml_parse_buffer(
597
  const char *buf,
598
  wget_xml_callback *callback,
599
  void *user_ctx,
600
  int hints)
601
21.4k
{
602
21.4k
  xml_context context;
603
604
21.4k
  context.token = NULL;
605
21.4k
  context.token_size = 0;
606
21.4k
  context.token_len = 0;
607
21.4k
  context.buf = buf;
608
21.4k
  context.p = buf;
609
21.4k
  context.user_ctx = user_ctx;
610
21.4k
  context.callback = callback;
611
21.4k
  context.hints = hints;
612
613
21.4k
  return parseXML ("/", &context);
614
21.4k
}
615
616
/**
617
 * \param[in] buf Zero-terminated HTML input data
618
 * \param[in] callback Function called for each token scan result
619
 * \param[in] user_ctx User-defined context variable, handed to \p callback
620
 * \param[in] hints Flags to influence parsing
621
 *
622
 * Convenience function that calls wget_xml_parse_buffer() with HTML parsing turned on.
623
 */
624
void wget_html_parse_buffer(
625
  const char *buf,
626
  wget_xml_callback *callback,
627
  void *user_ctx,
628
  int hints)
629
7.07k
{
630
7.07k
  wget_xml_parse_buffer(buf, callback, user_ctx, hints | XML_HINT_HTML);
631
7.07k
}
632
633
/**
634
 * \param[in] fname Name of XML or HTML input file
635
 * \param[in] callback Function called for each token scan result
636
 * \param[in] user_ctx User-defined context variable, handed to \p callback
637
 * \param[in] hints Flags to influence parsing
638
 *
639
 * Convenience function that calls wget_xml_parse_buffer() with the file content.
640
 *
641
 * If \p fname is `-`, the data is read from stdin.
642
 */
643
void wget_xml_parse_file(
644
  const char *fname,
645
  wget_xml_callback *callback,
646
  void *user_ctx,
647
  int hints)
648
3.35k
{
649
3.35k
  if (strcmp(fname,"-")) {
650
1.67k
    int fd;
651
652
1.67k
    if ((fd = open(fname, O_RDONLY|O_BINARY)) != -1) {
653
1.67k
      struct stat st;
654
1.67k
      if (fstat(fd, &st) == 0) {
655
1.67k
#ifdef HAVE_MMAP
656
1.67k
        size_t nread = st.st_size;
657
1.67k
        char *buf = mmap(NULL, nread + 1, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
658
#else
659
        char *buf = wget_malloc(st.st_size + 1);
660
        if (!buf)
661
          error_printf(_("Failed to allocate %zu bytes for XML parse buffer\n"), st.st_size + 1);
662
        size_t nread = buf ? read(fd, buf, st.st_size) : -1;
663
#endif
664
665
1.67k
        if (nread > 0) {
666
0
          buf[nread] = 0; // PROT_WRITE allows this write, MAP_PRIVATE prevents changes in underlying file system
667
0
          wget_xml_parse_buffer(buf, callback, user_ctx, hints);
668
0
        }
669
670
1.67k
#ifdef HAVE_MMAP
671
1.67k
        munmap(buf, nread);
672
#else
673
        xfree(buf);
674
#endif
675
1.67k
      }
676
1.67k
      close(fd);
677
1.67k
    } else
678
0
      error_printf(_("Failed to open %s\n"), fname);
679
1.67k
  } else {
680
    // read data from STDIN.
681
    // maybe should use yy_scan_bytes instead of buffering into memory.
682
1.67k
    char tmp[4096];
683
1.67k
    ssize_t nbytes;
684
1.67k
    wget_buffer buf;
685
686
1.67k
    wget_buffer_init(&buf, NULL, 4096);
687
688
1.67k
    while ((nbytes = read(STDIN_FILENO, tmp, sizeof(tmp))) > 0) {
689
0
      wget_buffer_memcat(&buf, tmp, nbytes);
690
0
    }
691
692
1.67k
    if (buf.length)
693
0
      wget_xml_parse_buffer(buf.data, callback, user_ctx, hints);
694
695
1.67k
    wget_buffer_deinit(&buf);
696
1.67k
  }
697
3.35k
}
698
699
/**
700
 * \param[in] fname Name of XML or HTML input file
701
 * \param[in] callback Function called for each token scan result
702
 * \param[in] user_ctx User-defined context variable, handed to \p callback
703
 * \param[in] hints Flags to influence parsing
704
 *
705
 * Convenience function that calls wget_xml_parse_file() with HTML parsing turned on.
706
 *
707
 * If \p fname is `-`, the data is read from stdin.
708
 */
709
void wget_html_parse_file(
710
  const char *fname,
711
  wget_xml_callback *callback,
712
  void *user_ctx,
713
  int hints)
714
3.35k
{
715
3.35k
  wget_xml_parse_file(fname, callback, user_ctx, hints | XML_HINT_HTML);
716
3.35k
}
717
718
/**
 * \param[in] src A string
 * \return A pointer to \p src, after the XML entities have been converted
 *
 * Decode XML entities from \p src.
 *
 * **The transformation is done inline**, so `src` will be modified after this function returns.
 * If no XML entities have been found, \p src is left untouched and NULL is returned.
 *
 * Only a small subset of available XML entities is currently recognized:
 * `&amp;`, `&gt;`, `&lt;`, `&quot;`, `&apos;` and numeric character references
 * (`&#NNN;` and `&#xHH;`) with a value from 1 to 255. The trailing `;` of a numeric
 * reference is optional. A decoded space character is written as `+`.
 *
 * Numeric references without digits or with a value outside of 1 to 255 can't be
 * represented as a single non-zero byte. They are copied unchanged.
 */
char *wget_xml_decode_entities_inline(char *src)
{
  // entities are case sensitive (RFC1866, 3.2.3)
  static const struct {
    const char *name; // entity name without the leading '&', including the ';'
    size_t len;       // length of name
    char replacement;
  } named[] = {
    { "amp;",  4, '&'  },
    { "gt;",   3, '>'  },
    { "lt;",   3, '<'  },
    { "quot;", 5, '\"' },
    { "apos;", 5, '\'' },
  };

  char *ret = NULL;
  unsigned char *s = (unsigned char *)src; // read position
  unsigned char *d = s; // write position, never ahead of s

  while (*s) {
    if (*s == '&') {
      if (s[1] == '#') {
        // numeric character reference, decimal or (with 'x') hexadecimal
        const char *digits = (const char *) s + 2;
        int base = 10;
        char *end;
        long value;

        if (*digits == 'x') {
          digits++;
          base = 16;
        }

        value = strtol(digits, &end, base);

        // Only accept what fits into one byte. Especially never write a 0 byte,
        // that would silently cut off the rest of the string.
        if (end != digits && value > 0 && value <= 255) {
          *d++ = (value == ' ') ? '+' : (unsigned char) value; // hack
          s = (unsigned char *) end;
          if (*s == ';')
            s++;
          ret = src;
          continue;
        }
        // not decodable: fall through and copy the '&' literally
      } else {
        size_t i;
        int replaced = 0;

        for (i = 0; i < sizeof(named) / sizeof(named[0]); i++) {
          if (!strncmp((const char *) s + 1, named[i].name, named[i].len)) {
            *d++ = (unsigned char) named[i].replacement;
            s += 1 + named[i].len;
            ret = src;
            replaced = 1;
            break;
          }
        }

        if (replaced)
          continue;
      }
    }

    *d++ = *s++;
  }
  *d = 0;

  return ret;
}
782
783
784
/** @} */