Coverage Report

Created: 2026-03-12 06:42

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libxml2-2.9.7/parserInternals.c
Line
Count
Source
1
/*
2
 * parserInternals.c : Internal routines (and obsolete ones) needed for the
3
 *                     XML and HTML parsers.
4
 *
5
 * See Copyright for the status of this software.
6
 *
7
 * daniel@veillard.com
8
 */
9
10
#define IN_LIBXML
11
#include "libxml.h"
12
13
#if defined(_WIN32) && !defined (__CYGWIN__)
14
#define XML_DIR_SEP '\\'
15
#else
16
#define XML_DIR_SEP '/'
17
#endif
18
19
#include <string.h>
20
#ifdef HAVE_CTYPE_H
21
#include <ctype.h>
22
#endif
23
#ifdef HAVE_STDLIB_H
24
#include <stdlib.h>
25
#endif
26
#ifdef HAVE_SYS_STAT_H
27
#include <sys/stat.h>
28
#endif
29
#ifdef HAVE_FCNTL_H
30
#include <fcntl.h>
31
#endif
32
#ifdef HAVE_UNISTD_H
33
#include <unistd.h>
34
#endif
35
#ifdef HAVE_ZLIB_H
36
#include <zlib.h>
37
#endif
38
39
#include <libxml/xmlmemory.h>
40
#include <libxml/tree.h>
41
#include <libxml/parser.h>
42
#include <libxml/parserInternals.h>
43
#include <libxml/valid.h>
44
#include <libxml/entities.h>
45
#include <libxml/xmlerror.h>
46
#include <libxml/encoding.h>
47
#include <libxml/valid.h>
48
#include <libxml/xmlIO.h>
49
#include <libxml/uri.h>
50
#include <libxml/dict.h>
51
#include <libxml/SAX.h>
52
#ifdef LIBXML_CATALOG_ENABLED
53
#include <libxml/catalog.h>
54
#endif
55
#include <libxml/globals.h>
56
#include <libxml/chvalid.h>
57
58
49.5M
/*
 * Shorthands for the read position and the end of the current input of a
 * parser context, plus a sanity check that the read position has not run
 * past the end. Arguments and expansions are fully parenthesized so the
 * macros stay correct when given an expression or used inside one.
 */
#define CUR(ctxt) ((ctxt)->input->cur)
#define END(ctxt) ((ctxt)->input->end)
#define VALID_CTXT(ctxt) (CUR(ctxt) <= END(ctxt))
61
62
#include "buf.h"
63
#include "enc.h"
64
65
/*
66
 * Various global defaults for parsing
67
 */
68
69
/**
70
 * xmlCheckVersion:
71
 * @version: the include version number
72
 *
73
 * check the compiled lib version against the include one.
74
 * This can warn or immediately kill the application
75
 */
76
void
77
0
xmlCheckVersion(int version) {
78
0
    int myversion = (int) LIBXML_VERSION;
79
80
0
    xmlInitParser();
81
82
0
    if ((myversion / 10000) != (version / 10000)) {
83
0
  xmlGenericError(xmlGenericErrorContext,
84
0
    "Fatal: program compiled against libxml %d using libxml %d\n",
85
0
    (version / 10000), (myversion / 10000));
86
0
  fprintf(stderr,
87
0
    "Fatal: program compiled against libxml %d using libxml %d\n",
88
0
    (version / 10000), (myversion / 10000));
89
0
    }
90
0
    if ((myversion / 100) < (version / 100)) {
91
0
  xmlGenericError(xmlGenericErrorContext,
92
0
    "Warning: program compiled against libxml %d using older %d\n",
93
0
    (version / 100), (myversion / 100));
94
0
    }
95
0
}
96
97
98
/************************************************************************
99
 *                  *
100
 *    Some factorized error routines        *
101
 *                  *
102
 ************************************************************************/
103
104
105
/**
106
 * xmlErrMemory:
107
 * @ctxt:  an XML parser context
108
 * @extra:  extra informations
109
 *
110
 * Handle a redefinition of attribute error
111
 */
112
void
113
xmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
114
7
{
115
7
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
116
3
        (ctxt->instate == XML_PARSER_EOF))
117
0
  return;
118
7
    if (ctxt != NULL) {
119
7
        ctxt->errNo = XML_ERR_NO_MEMORY;
120
7
        ctxt->instate = XML_PARSER_EOF;
121
7
        ctxt->disableSAX = 1;
122
7
    }
123
7
    if (extra)
124
1
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
125
1
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
126
1
                        NULL, NULL, 0, 0,
127
1
                        "Memory allocation failed : %s\n", extra);
128
6
    else
129
6
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
130
6
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
131
6
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
132
7
}
133
134
/**
135
 * __xmlErrEncoding:
136
 * @ctxt:  an XML parser context
137
 * @xmlerr:  the error number
138
 * @msg:  the error message
139
 * @str1:  an string info
140
 * @str2:  an string info
141
 *
142
 * Handle an encoding error
143
 */
144
void
145
__xmlErrEncoding(xmlParserCtxtPtr ctxt, xmlParserErrors xmlerr,
146
                 const char *msg, const xmlChar * str1, const xmlChar * str2)
147
65.2k
{
148
65.2k
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
149
0
        (ctxt->instate == XML_PARSER_EOF))
150
0
  return;
151
65.2k
    if (ctxt != NULL)
152
65.2k
        ctxt->errNo = xmlerr;
153
65.2k
    __xmlRaiseError(NULL, NULL, NULL,
154
65.2k
                    ctxt, NULL, XML_FROM_PARSER, xmlerr, XML_ERR_FATAL,
155
65.2k
                    NULL, 0, (const char *) str1, (const char *) str2,
156
65.2k
                    NULL, 0, 0, msg, str1, str2);
157
65.2k
    if (ctxt != NULL) {
158
65.2k
        ctxt->wellFormed = 0;
159
65.2k
        if (ctxt->recovery == 0)
160
41
            ctxt->disableSAX = 1;
161
65.2k
    }
162
65.2k
}
163
164
/**
165
 * xmlErrInternal:
166
 * @ctxt:  an XML parser context
167
 * @msg:  the error message
168
 * @str:  error informations
169
 *
170
 * Handle an internal error
171
 */
172
static void LIBXML_ATTR_FORMAT(2,0)
173
xmlErrInternal(xmlParserCtxtPtr ctxt, const char *msg, const xmlChar * str)
174
57
{
175
57
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
176
0
        (ctxt->instate == XML_PARSER_EOF))
177
0
  return;
178
57
    if (ctxt != NULL)
179
57
        ctxt->errNo = XML_ERR_INTERNAL_ERROR;
180
57
    __xmlRaiseError(NULL, NULL, NULL,
181
57
                    ctxt, NULL, XML_FROM_PARSER, XML_ERR_INTERNAL_ERROR,
182
57
                    XML_ERR_FATAL, NULL, 0, (const char *) str, NULL, NULL,
183
57
                    0, 0, msg, str);
184
57
    if (ctxt != NULL) {
185
57
        ctxt->wellFormed = 0;
186
57
        if (ctxt->recovery == 0)
187
0
            ctxt->disableSAX = 1;
188
57
    }
189
57
}
190
191
/**
192
 * xmlErrEncodingInt:
193
 * @ctxt:  an XML parser context
194
 * @error:  the error number
195
 * @msg:  the error message
196
 * @val:  an integer value
197
 *
198
 * n encoding error
199
 */
200
static void LIBXML_ATTR_FORMAT(3,0)
201
xmlErrEncodingInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
202
                  const char *msg, int val)
203
362k
{
204
362k
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
205
0
        (ctxt->instate == XML_PARSER_EOF))
206
0
  return;
207
362k
    if (ctxt != NULL)
208
343k
        ctxt->errNo = error;
209
362k
    __xmlRaiseError(NULL, NULL, NULL,
210
362k
                    ctxt, NULL, XML_FROM_PARSER, error, XML_ERR_FATAL,
211
362k
                    NULL, 0, NULL, NULL, NULL, val, 0, msg, val);
212
362k
    if (ctxt != NULL) {
213
343k
        ctxt->wellFormed = 0;
214
343k
        if (ctxt->recovery == 0)
215
0
            ctxt->disableSAX = 1;
216
343k
    }
217
362k
}
218
219
/**
 * xmlIsLetter:
 * @c:  a Unicode character (int)
 *
 * Check whether the character is allowed by the production
 * [84] Letter ::= BaseChar | Ideographic
 *
 * Returns 0 if not, non-zero otherwise
 */
int
xmlIsLetter(int c) {
    if (IS_BASECHAR(c))
        return(1);
    return(IS_IDEOGRAPHIC(c) ? 1 : 0);
}
232
233
/************************************************************************
234
 *                  *
235
 *    Input handling functions for progressive parsing  *
236
 *                  *
237
 ************************************************************************/
238
239
/* #define DEBUG_INPUT */
240
/* #define DEBUG_STACK */
241
/* #define DEBUG_PUSH */
242
243
244
/* we need to keep enough input to show errors in context */
245
183k
#define LINE_LEN        80
246
247
#ifdef DEBUG_INPUT
#define CHECK_BUFFER(in) check_buffer(in)

/*
 * check_buffer:
 * @in:  an XML parser input
 *
 * Debug helper, only compiled with DEBUG_INPUT. Reports through the
 * generic error handler when the input pointers disagree with the
 * underlying buffer (base differs from the buffer content, cur before
 * base, or cur beyond the used part), then dumps the current state.
 *
 * Pointers are printed with %p and the offset/size with long-sized
 * conversions: casting a pointer to int and printing a ptrdiff_t or a
 * size_t with %d is not portable to platforms where those are wider
 * than int.
 */
static
void check_buffer(xmlParserInputPtr in) {
    if (in->base != xmlBufContent(in->buf->buffer)) {
        xmlGenericError(xmlGenericErrorContext,
		"xmlParserInput: base mismatch problem\n");
    }
    if (in->cur < in->base) {
        xmlGenericError(xmlGenericErrorContext,
		"xmlParserInput: cur < base problem\n");
    }
    if (in->cur > in->base + xmlBufUse(in->buf->buffer)) {
        xmlGenericError(xmlGenericErrorContext,
		"xmlParserInput: cur > base + use problem\n");
    }
    xmlGenericError(xmlGenericErrorContext,
            "buffer %p : content %p, cur %ld, use %lu\n",
            (void *) in, (void *) xmlBufContent(in->buf->buffer),
            (long) (in->cur - in->base),
            (unsigned long) xmlBufUse(in->buf->buffer));
}

#else
#define CHECK_BUFFER(in)
#endif
272
273
274
/**
275
 * xmlParserInputRead:
276
 * @in:  an XML parser input
277
 * @len:  an indicative size for the lookahead
278
 *
279
 * This function was internal and is deprecated.
280
 *
281
 * Returns -1 as this is an error to use it.
282
 */
283
int
284
0
xmlParserInputRead(xmlParserInputPtr in ATTRIBUTE_UNUSED, int len ATTRIBUTE_UNUSED) {
285
0
    return(-1);
286
0
}
287
288
/**
289
 * xmlParserInputGrow:
290
 * @in:  an XML parser input
291
 * @len:  an indicative size for the lookahead
292
 *
293
 * This function increase the input for the parser. It tries to
294
 * preserve pointers to the input buffer, and keep already read data
295
 *
296
 * Returns the amount of char read, or -1 in case of error, 0 indicate the
297
 * end of this entity
298
 */
299
int
300
22.0M
xmlParserInputGrow(xmlParserInputPtr in, int len) {
301
22.0M
    int ret;
302
22.0M
    size_t indx;
303
22.0M
    const xmlChar *content;
304
305
22.0M
    if ((in == NULL) || (len < 0)) return(-1);
306
#ifdef DEBUG_INPUT
307
    xmlGenericError(xmlGenericErrorContext, "Grow\n");
308
#endif
309
22.0M
    if (in->buf == NULL) return(-1);
310
14.7M
    if (in->base == NULL) return(-1);
311
14.7M
    if (in->cur == NULL) return(-1);
312
14.7M
    if (in->buf->buffer == NULL) return(-1);
313
314
14.7M
    CHECK_BUFFER(in);
315
316
14.7M
    indx = in->cur - in->base;
317
14.7M
    if (xmlBufUse(in->buf->buffer) > (unsigned int) indx + INPUT_CHUNK) {
318
319
621k
  CHECK_BUFFER(in);
320
321
621k
        return(0);
322
621k
    }
323
14.0M
    if (in->buf->readcallback != NULL) {
324
10.5M
  ret = xmlParserInputBufferGrow(in->buf, len);
325
10.5M
    } else
326
3.54M
        return(0);
327
328
    /*
329
     * NOTE : in->base may be a "dangling" i.e. freed pointer in this
330
     *        block, but we use it really as an integer to do some
331
     *        pointer arithmetic. Insure will raise it as a bug but in
332
     *        that specific case, that's not !
333
     */
334
335
10.5M
    content = xmlBufContent(in->buf->buffer);
336
10.5M
    if (in->base != content) {
337
        /*
338
   * the buffer has been reallocated
339
   */
340
3.37k
  indx = in->cur - in->base;
341
3.37k
  in->base = content;
342
3.37k
  in->cur = &content[indx];
343
3.37k
    }
344
10.5M
    in->end = xmlBufEnd(in->buf->buffer);
345
346
10.5M
    CHECK_BUFFER(in);
347
348
10.5M
    return(ret);
349
14.0M
}
350
351
/**
352
 * xmlParserInputShrink:
353
 * @in:  an XML parser input
354
 *
355
 * This function removes used input for the parser.
356
 */
357
void
358
264k
xmlParserInputShrink(xmlParserInputPtr in) {
359
264k
    size_t used;
360
264k
    size_t ret;
361
264k
    size_t indx;
362
264k
    const xmlChar *content;
363
364
#ifdef DEBUG_INPUT
365
    xmlGenericError(xmlGenericErrorContext, "Shrink\n");
366
#endif
367
264k
    if (in == NULL) return;
368
264k
    if (in->buf == NULL) return;
369
183k
    if (in->base == NULL) return;
370
183k
    if (in->cur == NULL) return;
371
183k
    if (in->buf->buffer == NULL) return;
372
373
183k
    CHECK_BUFFER(in);
374
375
183k
    used = in->cur - xmlBufContent(in->buf->buffer);
376
    /*
377
     * Do not shrink on large buffers whose only a tiny fraction
378
     * was consumed
379
     */
380
183k
    if (used > INPUT_CHUNK) {
381
183k
  ret = xmlBufShrink(in->buf->buffer, used - LINE_LEN);
382
183k
  if (ret > 0) {
383
183k
      in->cur -= ret;
384
183k
      in->consumed += ret;
385
183k
  }
386
183k
  in->end = xmlBufEnd(in->buf->buffer);
387
183k
    }
388
389
183k
    CHECK_BUFFER(in);
390
391
183k
    if (xmlBufUse(in->buf->buffer) > INPUT_CHUNK) {
392
141k
        return;
393
141k
    }
394
41.4k
    xmlParserInputBufferRead(in->buf, 2 * INPUT_CHUNK);
395
41.4k
    content = xmlBufContent(in->buf->buffer);
396
41.4k
    if (in->base != content) {
397
        /*
398
   * the buffer has been reallocated
399
   */
400
0
  indx = in->cur - in->base;
401
0
  in->base = content;
402
0
  in->cur = &content[indx];
403
0
    }
404
41.4k
    in->end = xmlBufEnd(in->buf->buffer);
405
406
41.4k
    CHECK_BUFFER(in);
407
41.4k
}
408
409
/************************************************************************
410
 *                  *
411
 *    UTF8 character input and related functions    *
412
 *                  *
413
 ************************************************************************/
414
415
/**
416
 * xmlNextChar:
417
 * @ctxt:  the XML parser context
418
 *
419
 * Skip to the next char input char.
420
 */
421
422
void
423
xmlNextChar(xmlParserCtxtPtr ctxt)
424
49.5M
{
425
49.5M
    if ((ctxt == NULL) || (ctxt->instate == XML_PARSER_EOF) ||
426
49.5M
        (ctxt->input == NULL))
427
0
        return;
428
429
49.5M
    if (!(VALID_CTXT(ctxt))) {
430
0
        xmlErrInternal(ctxt, "Parser input data memory error\n", NULL);
431
0
  ctxt->errNo = XML_ERR_INTERNAL_ERROR;
432
0
        xmlStopParser(ctxt);
433
0
  return;
434
0
    }
435
436
49.5M
    if ((*ctxt->input->cur == 0) &&
437
2.81k
        (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
438
2.81k
        return;
439
2.81k
    }
440
441
49.5M
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
442
23.4M
        const unsigned char *cur;
443
23.4M
        unsigned char c;
444
445
        /*
446
         *   2.11 End-of-Line Handling
447
         *   the literal two-character sequence "#xD#xA" or a standalone
448
         *   literal #xD, an XML processor must pass to the application
449
         *   the single character #xA.
450
         */
451
23.4M
        if (*(ctxt->input->cur) == '\n') {
452
117k
            ctxt->input->line++; ctxt->input->col = 1;
453
117k
        } else
454
23.2M
            ctxt->input->col++;
455
456
        /*
457
         * We are supposed to handle UTF8, check it's valid
458
         * From rfc2044: encoding of the Unicode values on UTF-8:
459
         *
460
         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
461
         * 0000 0000-0000 007F   0xxxxxxx
462
         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
463
         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
464
         *
465
         * Check for the 0x110000 limit too
466
         */
467
23.4M
        cur = ctxt->input->cur;
468
469
23.4M
        c = *cur;
470
23.4M
        if (c & 0x80) {
471
37.5k
            if (c == 0xC0)
472
20
          goto encoding_error;
473
37.5k
            if (cur[1] == 0) {
474
66
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
475
66
                cur = ctxt->input->cur;
476
66
            }
477
37.5k
            if ((cur[1] & 0xc0) != 0x80)
478
287
                goto encoding_error;
479
37.2k
            if ((c & 0xe0) == 0xe0) {
480
21.0k
                unsigned int val;
481
482
21.0k
                if (cur[2] == 0) {
483
12
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
484
12
                    cur = ctxt->input->cur;
485
12
                }
486
21.0k
                if ((cur[2] & 0xc0) != 0x80)
487
46
                    goto encoding_error;
488
21.0k
                if ((c & 0xf0) == 0xf0) {
489
4.48k
                    if (cur[3] == 0) {
490
17
                        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
491
17
                        cur = ctxt->input->cur;
492
17
                    }
493
4.48k
                    if (((c & 0xf8) != 0xf0) ||
494
4.46k
                        ((cur[3] & 0xc0) != 0x80))
495
33
                        goto encoding_error;
496
                    /* 4-byte code */
497
4.44k
                    ctxt->input->cur += 4;
498
4.44k
                    val = (cur[0] & 0x7) << 18;
499
4.44k
                    val |= (cur[1] & 0x3f) << 12;
500
4.44k
                    val |= (cur[2] & 0x3f) << 6;
501
4.44k
                    val |= cur[3] & 0x3f;
502
16.5k
                } else {
503
                    /* 3-byte code */
504
16.5k
                    ctxt->input->cur += 3;
505
16.5k
                    val = (cur[0] & 0xf) << 12;
506
16.5k
                    val |= (cur[1] & 0x3f) << 6;
507
16.5k
                    val |= cur[2] & 0x3f;
508
16.5k
                }
509
21.0k
                if (((val > 0xd7ff) && (val < 0xe000)) ||
510
17.9k
                    ((val > 0xfffd) && (val < 0x10000)) ||
511
15.5k
                    (val >= 0x110000)) {
512
7.35k
    xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
513
7.35k
          "Char 0x%X out of allowed range\n",
514
7.35k
          val);
515
7.35k
                }
516
21.0k
            } else
517
                /* 2-byte code */
518
16.1k
                ctxt->input->cur += 2;
519
37.2k
        } else
520
            /* 1-byte code */
521
23.3M
            ctxt->input->cur++;
522
523
23.4M
        ctxt->nbChars++;
524
26.1M
    } else {
525
        /*
526
         * Assume it's a fixed length encoding (1) with
527
         * a compatible encoding for the ASCII set, since
528
         * XML constructs only use < 128 chars
529
         */
530
531
26.1M
        if (*(ctxt->input->cur) == '\n') {
532
1.90M
            ctxt->input->line++; ctxt->input->col = 1;
533
1.90M
        } else
534
24.2M
            ctxt->input->col++;
535
26.1M
        ctxt->input->cur++;
536
26.1M
        ctxt->nbChars++;
537
26.1M
    }
538
49.5M
    if (*ctxt->input->cur == 0)
539
46.6k
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
540
49.5M
    return;
541
386
encoding_error:
542
    /*
543
     * If we detect an UTF8 error that probably mean that the
544
     * input encoding didn't get properly advertised in the
545
     * declaration header. Report the error and switch the encoding
546
     * to ISO-Latin-1 (if you don't like this policy, just declare the
547
     * encoding !)
548
     */
549
386
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
550
386
        (ctxt->input->end - ctxt->input->cur < 4)) {
551
160
  __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
552
160
         "Input is not proper UTF-8, indicate encoding !\n",
553
160
         NULL, NULL);
554
226
    } else {
555
226
        char buffer[150];
556
557
226
  snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
558
226
      ctxt->input->cur[0], ctxt->input->cur[1],
559
226
      ctxt->input->cur[2], ctxt->input->cur[3]);
560
226
  __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
561
226
         "Input is not proper UTF-8, indicate encoding !\n%s",
562
226
         BAD_CAST buffer, NULL);
563
226
    }
564
386
    ctxt->charset = XML_CHAR_ENCODING_8859_1;
565
386
    ctxt->input->cur++;
566
386
    return;
567
49.5M
}
568
569
/**
570
 * xmlCurrentChar:
571
 * @ctxt:  the XML parser context
572
 * @len:  pointer to the length of the char read
573
 *
574
 * The current char value, if using UTF-8 this may actually span multiple
575
 * bytes in the input buffer. Implement the end of line normalization:
576
 * 2.11 End-of-Line Handling
577
 * Wherever an external parsed entity or the literal entity value
578
 * of an internal parsed entity contains either the literal two-character
579
 * sequence "#xD#xA" or a standalone literal #xD, an XML processor
580
 * must pass to the application the single character #xA.
581
 * This behavior can conveniently be produced by normalizing all
582
 * line breaks to #xA on input, before parsing.)
583
 *
584
 * Returns the current char value and its length
585
 */
586
587
int
588
5.24G
xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
589
5.24G
    if ((ctxt == NULL) || (len == NULL) || (ctxt->input == NULL)) return(0);
590
5.24G
    if (ctxt->instate == XML_PARSER_EOF)
591
18
  return(0);
592
593
5.24G
    if ((*ctxt->input->cur >= 0x20) && (*ctxt->input->cur <= 0x7F)) {
594
263M
      *len = 1;
595
263M
      return((int) *ctxt->input->cur);
596
263M
    }
597
4.98G
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
598
  /*
599
   * We are supposed to handle UTF8, check it's valid
600
   * From rfc2044: encoding of the Unicode values on UTF-8:
601
   *
602
   * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
603
   * 0000 0000-0000 007F   0xxxxxxx
604
   * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
605
   * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
606
   *
607
   * Check for the 0x110000 limit too
608
   */
609
4.84G
  const unsigned char *cur = ctxt->input->cur;
610
4.84G
  unsigned char c;
611
4.84G
  unsigned int val;
612
613
4.84G
  c = *cur;
614
4.84G
  if (c & 0x80) {
615
4.82G
      if (((c & 0x40) == 0) || (c == 0xC0))
616
23.5k
    goto encoding_error;
617
4.82G
      if (cur[1] == 0) {
618
2.30k
    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
619
2.30k
                cur = ctxt->input->cur;
620
2.30k
            }
621
4.82G
      if ((cur[1] & 0xc0) != 0x80)
622
44.6k
    goto encoding_error;
623
4.82G
      if ((c & 0xe0) == 0xe0) {
624
4.81G
    if (cur[2] == 0) {
625
1.09k
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
626
1.09k
                    cur = ctxt->input->cur;
627
1.09k
                }
628
4.81G
    if ((cur[2] & 0xc0) != 0x80)
629
1.89k
        goto encoding_error;
630
4.81G
    if ((c & 0xf0) == 0xf0) {
631
481k
        if (cur[3] == 0) {
632
1.02k
      xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
633
1.02k
                        cur = ctxt->input->cur;
634
1.02k
                    }
635
481k
        if (((c & 0xf8) != 0xf0) ||
636
481k
      ((cur[3] & 0xc0) != 0x80))
637
1.33k
      goto encoding_error;
638
        /* 4-byte code */
639
480k
        *len = 4;
640
480k
        val = (cur[0] & 0x7) << 18;
641
480k
        val |= (cur[1] & 0x3f) << 12;
642
480k
        val |= (cur[2] & 0x3f) << 6;
643
480k
        val |= cur[3] & 0x3f;
644
480k
        if (val < 0x10000)
645
57
      goto encoding_error;
646
4.81G
    } else {
647
      /* 3-byte code */
648
4.81G
        *len = 3;
649
4.81G
        val = (cur[0] & 0xf) << 12;
650
4.81G
        val |= (cur[1] & 0x3f) << 6;
651
4.81G
        val |= cur[2] & 0x3f;
652
4.81G
        if (val < 0x800)
653
119
      goto encoding_error;
654
4.81G
    }
655
4.81G
      } else {
656
        /* 2-byte code */
657
10.3M
    *len = 2;
658
10.3M
    val = (cur[0] & 0x1f) << 6;
659
10.3M
    val |= cur[1] & 0x3f;
660
10.3M
    if (val < 0x80)
661
205
        goto encoding_error;
662
10.3M
      }
663
4.82G
      if (!IS_CHAR(val)) {
664
328k
          xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
665
328k
          "Char 0x%X out of allowed range\n", val);
666
328k
      }
667
4.82G
      return(val);
668
4.82G
  } else {
669
      /* 1-byte code */
670
20.2M
      *len = 1;
671
20.2M
      if (*ctxt->input->cur == 0)
672
323k
    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
673
20.2M
      if ((*ctxt->input->cur == 0) &&
674
323k
          (ctxt->input->end > ctxt->input->cur)) {
675
7.92k
          xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
676
7.92k
          "Char 0x0 out of allowed range\n", 0);
677
7.92k
      }
678
20.2M
      if (*ctxt->input->cur == 0xD) {
679
1.16M
    if (ctxt->input->cur[1] == 0xA) {
680
34.5k
        ctxt->nbChars++;
681
34.5k
        ctxt->input->cur++;
682
34.5k
    }
683
1.16M
    return(0xA);
684
1.16M
      }
685
19.0M
      return((int) *ctxt->input->cur);
686
20.2M
  }
687
4.84G
    }
688
    /*
689
     * Assume it's a fixed length encoding (1) with
690
     * a compatible encoding for the ASCII set, since
691
     * XML constructs only use < 128 chars
692
     */
693
139M
    *len = 1;
694
139M
    if (*ctxt->input->cur == 0xD) {
695
2.82M
  if (ctxt->input->cur[1] == 0xA) {
696
283k
      ctxt->nbChars++;
697
283k
      ctxt->input->cur++;
698
283k
  }
699
2.82M
  return(0xA);
700
2.82M
    }
701
136M
    return((int) *ctxt->input->cur);
702
71.7k
encoding_error:
703
    /*
704
     * An encoding problem may arise from a truncated input buffer
705
     * splitting a character in the middle. In that case do not raise
706
     * an error but return 0 to endicate an end of stream problem
707
     */
708
71.7k
    if (ctxt->input->end - ctxt->input->cur < 4) {
709
8.19k
  *len = 0;
710
8.19k
  return(0);
711
8.19k
    }
712
713
    /*
714
     * If we detect an UTF8 error that probably mean that the
715
     * input encoding didn't get properly advertised in the
716
     * declaration header. Report the error and switch the encoding
717
     * to ISO-Latin-1 (if you don't like this policy, just declare the
718
     * encoding !)
719
     */
720
63.5k
    {
721
63.5k
        char buffer[150];
722
723
63.5k
  snprintf(&buffer[0], 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
724
63.5k
      ctxt->input->cur[0], ctxt->input->cur[1],
725
63.5k
      ctxt->input->cur[2], ctxt->input->cur[3]);
726
63.5k
  __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
727
63.5k
         "Input is not proper UTF-8, indicate encoding !\n%s",
728
63.5k
         BAD_CAST buffer, NULL);
729
63.5k
    }
730
63.5k
    ctxt->charset = XML_CHAR_ENCODING_8859_1;
731
63.5k
    *len = 1;
732
63.5k
    return((int) *ctxt->input->cur);
733
71.7k
}
734
735
/**
736
 * xmlStringCurrentChar:
737
 * @ctxt:  the XML parser context
738
 * @cur:  pointer to the beginning of the char
739
 * @len:  pointer to the length of the char read
740
 *
741
 * The current char value, if using UTF-8 this may actually span multiple
742
 * bytes in the input buffer.
743
 *
744
 * Returns the current char value and its length
745
 */
746
747
int
748
xmlStringCurrentChar(xmlParserCtxtPtr ctxt, const xmlChar * cur, int *len)
749
1.96G
{
750
1.96G
    if ((len == NULL) || (cur == NULL)) return(0);
751
1.96G
    if ((ctxt == NULL) || (ctxt->charset == XML_CHAR_ENCODING_UTF8)) {
752
        /*
753
         * We are supposed to handle UTF8, check it's valid
754
         * From rfc2044: encoding of the Unicode values on UTF-8:
755
         *
756
         * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
757
         * 0000 0000-0000 007F   0xxxxxxx
758
         * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
759
         * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
760
         *
761
         * Check for the 0x110000 limit too
762
         */
763
1.94G
        unsigned char c;
764
1.94G
        unsigned int val;
765
766
1.94G
        c = *cur;
767
1.94G
        if (c & 0x80) {
768
1.91G
            if ((cur[1] & 0xc0) != 0x80)
769
51.0k
                goto encoding_error;
770
1.91G
            if ((c & 0xe0) == 0xe0) {
771
772
1.91G
                if ((cur[2] & 0xc0) != 0x80)
773
6.71k
                    goto encoding_error;
774
1.91G
                if ((c & 0xf0) == 0xf0) {
775
41.4k
                    if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80))
776
4.20k
                        goto encoding_error;
777
                    /* 4-byte code */
778
37.2k
                    *len = 4;
779
37.2k
                    val = (cur[0] & 0x7) << 18;
780
37.2k
                    val |= (cur[1] & 0x3f) << 12;
781
37.2k
                    val |= (cur[2] & 0x3f) << 6;
782
37.2k
                    val |= cur[3] & 0x3f;
783
1.91G
                } else {
784
                    /* 3-byte code */
785
1.91G
                    *len = 3;
786
1.91G
                    val = (cur[0] & 0xf) << 12;
787
1.91G
                    val |= (cur[1] & 0x3f) << 6;
788
1.91G
                    val |= cur[2] & 0x3f;
789
1.91G
                }
790
1.91G
            } else {
791
                /* 2-byte code */
792
905k
                *len = 2;
793
905k
                val = (cur[0] & 0x1f) << 6;
794
905k
                val |= cur[1] & 0x3f;
795
905k
            }
796
1.91G
            if (!IS_CHAR(val)) {
797
16.6k
          xmlErrEncodingInt(ctxt, XML_ERR_INVALID_CHAR,
798
16.6k
          "Char 0x%X out of allowed range\n", val);
799
16.6k
            }
800
1.91G
            return (val);
801
1.91G
        } else {
802
            /* 1-byte code */
803
27.2M
            *len = 1;
804
27.2M
            return ((int) *cur);
805
27.2M
        }
806
1.94G
    }
807
    /*
808
     * Assume it's a fixed length encoding (1) with
809
     * a compatible encoding for the ASCII set, since
810
     * XML constructs only use < 128 chars
811
     */
812
28.2M
    *len = 1;
813
28.2M
    return ((int) *cur);
814
61.9k
encoding_error:
815
816
    /*
817
     * An encoding problem may arise from a truncated input buffer
818
     * splitting a character in the middle. In that case do not raise
819
     * an error but return 0 to endicate an end of stream problem
820
     */
821
61.9k
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
822
61.9k
        (ctxt->input->end - ctxt->input->cur < 4)) {
823
61.9k
  *len = 0;
824
61.9k
  return(0);
825
61.9k
    }
826
    /*
827
     * If we detect an UTF8 error that probably mean that the
828
     * input encoding didn't get properly advertised in the
829
     * declaration header. Report the error and switch the encoding
830
     * to ISO-Latin-1 (if you don't like this policy, just declare the
831
     * encoding !)
832
     */
833
0
    {
834
0
        char buffer[150];
835
836
0
  snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
837
0
      ctxt->input->cur[0], ctxt->input->cur[1],
838
0
      ctxt->input->cur[2], ctxt->input->cur[3]);
839
0
  __xmlErrEncoding(ctxt, XML_ERR_INVALID_CHAR,
840
0
         "Input is not proper UTF-8, indicate encoding !\n%s",
841
0
         BAD_CAST buffer, NULL);
842
0
    }
843
0
    *len = 1;
844
0
    return ((int) *cur);
845
61.9k
}
846
847
/**
848
 * xmlCopyCharMultiByte:
849
 * @out:  pointer to an array of xmlChar
850
 * @val:  the char value
851
 *
852
 * append the char value in the array
853
 *
854
 * Returns the number of xmlChar written
855
 */
856
int
857
6.54G
xmlCopyCharMultiByte(xmlChar *out, int val) {
858
6.54G
    if (out == NULL) return(0);
859
    /*
860
     * We are supposed to handle UTF8, check it's valid
861
     * From rfc2044: encoding of the Unicode values on UTF-8:
862
     *
863
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
864
     * 0000 0000-0000 007F   0xxxxxxx
865
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
866
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
867
     */
868
6.54G
    if  (val >= 0x80) {
869
6.54G
  xmlChar *savedout = out;
870
6.54G
  int bits;
871
6.54G
  if (val <   0x800) { *out++= (val >>  6) | 0xC0;  bits=  0; }
872
6.53G
  else if (val < 0x10000) { *out++= (val >> 12) | 0xE0;  bits=  6;}
873
485k
  else if (val < 0x110000)  { *out++= (val >> 18) | 0xF0;  bits=  12; }
874
2.17k
  else {
875
2.17k
      xmlErrEncodingInt(NULL, XML_ERR_INVALID_CHAR,
876
2.17k
        "Internal error, xmlCopyCharMultiByte 0x%X out of bound\n",
877
2.17k
            val);
878
2.17k
      return(0);
879
2.17k
  }
880
19.6G
  for ( ; bits >= 0; bits-= 6)
881
13.0G
      *out++= ((val >> bits) & 0x3F) | 0x80 ;
882
6.54G
  return (out - savedout);
883
6.54G
    }
884
340k
    *out = (xmlChar) val;
885
340k
    return 1;
886
6.54G
}
887
888
/**
889
 * xmlCopyChar:
890
 * @len:  Ignored, compatibility
891
 * @out:  pointer to an array of xmlChar
892
 * @val:  the char value
893
 *
894
 * append the char value in the array
895
 *
896
 * Returns the number of xmlChar written
897
 */
898
899
int
900
209k
xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) {
901
209k
    if (out == NULL) return(0);
902
    /* the len parameter is ignored */
903
209k
    if  (val >= 0x80) {
904
70.7k
  return(xmlCopyCharMultiByte (out, val));
905
70.7k
    }
906
138k
    *out = (xmlChar) val;
907
138k
    return 1;
908
209k
}
909
910
/************************************************************************
911
 *                  *
912
 *    Commodity functions to switch encodings     *
913
 *                  *
914
 ************************************************************************/
915
916
static int
917
xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
918
                       xmlCharEncodingHandlerPtr handler, int len);
919
static int
920
xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
921
                          xmlCharEncodingHandlerPtr handler, int len);
922
/**
923
 * xmlSwitchEncoding:
924
 * @ctxt:  the parser context
925
 * @enc:  the encoding value (number)
926
 *
927
 * change the input functions when discovering the character encoding
928
 * of a given entity.
929
 *
930
 * Returns 0 in case of success, -1 otherwise
931
 */
932
int
933
xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc)
934
76.5k
{
935
76.5k
    xmlCharEncodingHandlerPtr handler;
936
76.5k
    int len = -1;
937
76.5k
    int ret;
938
939
76.5k
    if (ctxt == NULL) return(-1);
940
76.5k
    switch (enc) {
941
0
  case XML_CHAR_ENCODING_ERROR:
942
0
      __xmlErrEncoding(ctxt, XML_ERR_UNKNOWN_ENCODING,
943
0
                     "encoding unknown\n", NULL, NULL);
944
0
      return(-1);
945
0
  case XML_CHAR_ENCODING_NONE:
946
      /* let's assume it's UTF-8 without the XML decl */
947
0
      ctxt->charset = XML_CHAR_ENCODING_UTF8;
948
0
      return(0);
949
72.4k
  case XML_CHAR_ENCODING_UTF8:
950
      /* default encoding, no conversion should be needed */
951
72.4k
      ctxt->charset = XML_CHAR_ENCODING_UTF8;
952
953
      /*
954
       * Errata on XML-1.0 June 20 2001
955
       * Specific handling of the Byte Order Mark for
956
       * UTF-8
957
       */
958
72.4k
      if ((ctxt->input != NULL) &&
959
72.4k
    (ctxt->input->cur[0] == 0xEF) &&
960
1.88k
    (ctxt->input->cur[1] == 0xBB) &&
961
1.88k
    (ctxt->input->cur[2] == 0xBF)) {
962
1.88k
    ctxt->input->cur += 3;
963
1.88k
      }
964
72.4k
      return(0);
965
2.05k
    case XML_CHAR_ENCODING_UTF16LE:
966
3.67k
    case XML_CHAR_ENCODING_UTF16BE:
967
        /*The raw input characters are encoded
968
         *in UTF-16. As we expect this function
969
         *to be called after xmlCharEncInFunc, we expect
970
         *ctxt->input->cur to contain UTF-8 encoded characters.
971
         *So the raw UTF16 Byte Order Mark
972
         *has also been converted into
973
         *an UTF-8 BOM. Let's skip that BOM.
974
         */
975
3.67k
        if ((ctxt->input != NULL) && (ctxt->input->cur != NULL) &&
976
3.67k
            (ctxt->input->cur[0] == 0xEF) &&
977
2.24k
            (ctxt->input->cur[1] == 0xBB) &&
978
2.24k
            (ctxt->input->cur[2] == 0xBF)) {
979
2.24k
            ctxt->input->cur += 3;
980
2.24k
        }
981
3.67k
        len = 90;
982
3.67k
  break;
983
0
    case XML_CHAR_ENCODING_UCS2:
984
0
        len = 90;
985
0
  break;
986
151
    case XML_CHAR_ENCODING_UCS4BE:
987
171
    case XML_CHAR_ENCODING_UCS4LE:
988
178
    case XML_CHAR_ENCODING_UCS4_2143:
989
185
    case XML_CHAR_ENCODING_UCS4_3412:
990
185
        len = 180;
991
185
  break;
992
221
    case XML_CHAR_ENCODING_EBCDIC:
993
221
    case XML_CHAR_ENCODING_8859_1:
994
221
    case XML_CHAR_ENCODING_8859_2:
995
221
    case XML_CHAR_ENCODING_8859_3:
996
221
    case XML_CHAR_ENCODING_8859_4:
997
221
    case XML_CHAR_ENCODING_8859_5:
998
221
    case XML_CHAR_ENCODING_8859_6:
999
221
    case XML_CHAR_ENCODING_8859_7:
1000
221
    case XML_CHAR_ENCODING_8859_8:
1001
221
    case XML_CHAR_ENCODING_8859_9:
1002
221
    case XML_CHAR_ENCODING_ASCII:
1003
221
    case XML_CHAR_ENCODING_2022_JP:
1004
221
    case XML_CHAR_ENCODING_SHIFT_JIS:
1005
221
    case XML_CHAR_ENCODING_EUC_JP:
1006
221
        len = 45;
1007
221
  break;
1008
76.5k
    }
1009
4.07k
    handler = xmlGetCharEncodingHandler(enc);
1010
4.07k
    if (handler == NULL) {
1011
  /*
1012
   * Default handlers.
1013
   */
1014
14
  switch (enc) {
1015
0
      case XML_CHAR_ENCODING_ASCII:
1016
    /* default encoding, no conversion should be needed */
1017
0
    ctxt->charset = XML_CHAR_ENCODING_UTF8;
1018
0
    return(0);
1019
0
      case XML_CHAR_ENCODING_UTF16LE:
1020
0
    break;
1021
0
      case XML_CHAR_ENCODING_UTF16BE:
1022
0
    break;
1023
0
      case XML_CHAR_ENCODING_UCS4LE:
1024
0
    __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1025
0
             "encoding not supported %s\n",
1026
0
             BAD_CAST "USC4 little endian", NULL);
1027
0
    break;
1028
0
      case XML_CHAR_ENCODING_UCS4BE:
1029
0
    __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1030
0
             "encoding not supported %s\n",
1031
0
             BAD_CAST "USC4 big endian", NULL);
1032
0
    break;
1033
0
      case XML_CHAR_ENCODING_EBCDIC:
1034
0
    __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1035
0
             "encoding not supported %s\n",
1036
0
             BAD_CAST "EBCDIC", NULL);
1037
0
    break;
1038
7
      case XML_CHAR_ENCODING_UCS4_2143:
1039
7
    __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1040
7
             "encoding not supported %s\n",
1041
7
             BAD_CAST "UCS4 2143", NULL);
1042
7
    break;
1043
7
      case XML_CHAR_ENCODING_UCS4_3412:
1044
7
    __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1045
7
             "encoding not supported %s\n",
1046
7
             BAD_CAST "UCS4 3412", NULL);
1047
7
    break;
1048
0
      case XML_CHAR_ENCODING_UCS2:
1049
0
    __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1050
0
             "encoding not supported %s\n",
1051
0
             BAD_CAST "UCS2", NULL);
1052
0
    break;
1053
0
      case XML_CHAR_ENCODING_8859_1:
1054
0
      case XML_CHAR_ENCODING_8859_2:
1055
0
      case XML_CHAR_ENCODING_8859_3:
1056
0
      case XML_CHAR_ENCODING_8859_4:
1057
0
      case XML_CHAR_ENCODING_8859_5:
1058
0
      case XML_CHAR_ENCODING_8859_6:
1059
0
      case XML_CHAR_ENCODING_8859_7:
1060
0
      case XML_CHAR_ENCODING_8859_8:
1061
0
      case XML_CHAR_ENCODING_8859_9:
1062
    /*
1063
     * We used to keep the internal content in the
1064
     * document encoding however this turns being unmaintainable
1065
     * So xmlGetCharEncodingHandler() will return non-null
1066
     * values for this now.
1067
     */
1068
0
    if ((ctxt->inputNr == 1) &&
1069
0
        (ctxt->encoding == NULL) &&
1070
0
        (ctxt->input != NULL) &&
1071
0
        (ctxt->input->encoding != NULL)) {
1072
0
        ctxt->encoding = xmlStrdup(ctxt->input->encoding);
1073
0
    }
1074
0
    ctxt->charset = enc;
1075
0
    return(0);
1076
0
      case XML_CHAR_ENCODING_2022_JP:
1077
0
    __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1078
0
             "encoding not supported %s\n",
1079
0
             BAD_CAST "ISO-2022-JP", NULL);
1080
0
    break;
1081
0
      case XML_CHAR_ENCODING_SHIFT_JIS:
1082
0
    __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1083
0
             "encoding not supported %s\n",
1084
0
             BAD_CAST "Shift_JIS", NULL);
1085
0
    break;
1086
0
      case XML_CHAR_ENCODING_EUC_JP:
1087
0
    __xmlErrEncoding(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
1088
0
             "encoding not supported %s\n",
1089
0
             BAD_CAST "EUC-JP", NULL);
1090
0
    break;
1091
0
      default:
1092
0
          break;
1093
14
  }
1094
14
    }
1095
    /*
1096
     * TODO: We could recover from errors in external entites if we
1097
     * didn't stop the parser. But most callers of this function don't
1098
     * check the return value.
1099
     */
1100
4.07k
    if (handler == NULL) {
1101
14
        xmlStopParser(ctxt);
1102
14
  return(-1);
1103
14
    }
1104
4.06k
    ctxt->charset = XML_CHAR_ENCODING_UTF8;
1105
4.06k
    ret = xmlSwitchToEncodingInt(ctxt, handler, len);
1106
4.06k
    if ((ret < 0) || (ctxt->errNo == XML_I18N_CONV_FAILED)) {
1107
        /*
1108
   * on encoding conversion errors, stop the parser
1109
   */
1110
0
        xmlStopParser(ctxt);
1111
0
  ctxt->errNo = XML_I18N_CONV_FAILED;
1112
0
    }
1113
4.06k
    return(ret);
1114
4.07k
}
1115
1116
/**
1117
 * xmlSwitchInputEncoding:
1118
 * @ctxt:  the parser context
1119
 * @input:  the input stream
1120
 * @handler:  the encoding handler
1121
 * @len:  the number of bytes to convert for the first line or -1
1122
 *
1123
 * change the input functions when discovering the character encoding
1124
 * of a given entity.
1125
 *
1126
 * Returns 0 in case of success, -1 otherwise
1127
 */
1128
static int
1129
xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
1130
                          xmlCharEncodingHandlerPtr handler, int len)
1131
30.7k
{
1132
30.7k
    int nbchars;
1133
1134
30.7k
    if (handler == NULL)
1135
0
        return (-1);
1136
30.7k
    if (input == NULL)
1137
0
        return (-1);
1138
30.7k
    if (input->buf != NULL) {
1139
30.7k
        if (input->buf->encoder != NULL) {
1140
            /*
1141
             * Check in case the auto encoding detetection triggered
1142
             * in already.
1143
             */
1144
4.06k
            if (input->buf->encoder == handler)
1145
3.67k
                return (0);
1146
1147
            /*
1148
             * "UTF-16" can be used for both LE and BE
1149
             if ((!xmlStrncmp(BAD_CAST input->buf->encoder->name,
1150
             BAD_CAST "UTF-16", 6)) &&
1151
             (!xmlStrncmp(BAD_CAST handler->name,
1152
             BAD_CAST "UTF-16", 6))) {
1153
             return(0);
1154
             }
1155
             */
1156
1157
            /*
1158
             * Note: this is a bit dangerous, but that's what it
1159
             * takes to use nearly compatible signature for different
1160
             * encodings.
1161
             */
1162
395
            xmlCharEncCloseFunc(input->buf->encoder);
1163
395
            input->buf->encoder = handler;
1164
395
            return (0);
1165
4.06k
        }
1166
26.6k
        input->buf->encoder = handler;
1167
1168
        /*
1169
         * Is there already some content down the pipe to convert ?
1170
         */
1171
26.6k
        if (xmlBufIsEmpty(input->buf->buffer) == 0) {
1172
26.6k
            int processed;
1173
26.6k
      unsigned int use;
1174
1175
            /*
1176
             * Specific handling of the Byte Order Mark for
1177
             * UTF-16
1178
             */
1179
26.6k
            if ((handler->name != NULL) &&
1180
26.6k
                (!strcmp(handler->name, "UTF-16LE") ||
1181
26.5k
                 !strcmp(handler->name, "UTF-16")) &&
1182
123
                (input->cur[0] == 0xFF) && (input->cur[1] == 0xFE)) {
1183
9
                input->cur += 2;
1184
9
            }
1185
26.6k
            if ((handler->name != NULL) &&
1186
26.6k
                (!strcmp(handler->name, "UTF-16BE")) &&
1187
120
                (input->cur[0] == 0xFE) && (input->cur[1] == 0xFF)) {
1188
10
                input->cur += 2;
1189
10
            }
1190
            /*
1191
             * Errata on XML-1.0 June 20 2001
1192
             * Specific handling of the Byte Order Mark for
1193
             * UTF-8
1194
             */
1195
26.6k
            if ((handler->name != NULL) &&
1196
26.6k
                (!strcmp(handler->name, "UTF-8")) &&
1197
0
                (input->cur[0] == 0xEF) &&
1198
0
                (input->cur[1] == 0xBB) && (input->cur[2] == 0xBF)) {
1199
0
                input->cur += 3;
1200
0
            }
1201
1202
            /*
1203
             * Shrink the current input buffer.
1204
             * Move it as the raw buffer and create a new input buffer
1205
             */
1206
26.6k
            processed = input->cur - input->base;
1207
26.6k
            xmlBufShrink(input->buf->buffer, processed);
1208
26.6k
            input->buf->raw = input->buf->buffer;
1209
26.6k
            input->buf->buffer = xmlBufCreate();
1210
26.6k
      input->buf->rawconsumed = processed;
1211
26.6k
      use = xmlBufUse(input->buf->raw);
1212
1213
26.6k
            if (ctxt->html) {
1214
                /*
1215
                 * convert as much as possible of the buffer
1216
                 */
1217
0
                nbchars = xmlCharEncInput(input->buf, 1);
1218
26.6k
            } else {
1219
                /*
1220
                 * convert just enough to get
1221
                 * '<?xml version="1.0" encoding="xxx"?>'
1222
                 * parsed with the autodetected encoding
1223
                 * into the parser reading buffer.
1224
                 */
1225
26.6k
                nbchars = xmlCharEncFirstLineInput(input->buf, len);
1226
26.6k
            }
1227
26.6k
            xmlBufResetInput(input->buf->buffer, input);
1228
26.6k
            if (nbchars < 0) {
1229
57
                xmlErrInternal(ctxt,
1230
57
                               "switching encoding: encoder error\n",
1231
57
                               NULL);
1232
57
                return (-1);
1233
57
            }
1234
26.6k
      input->buf->rawconsumed += use - xmlBufUse(input->buf->raw);
1235
26.6k
        }
1236
26.6k
        return (0);
1237
26.6k
    } else if (input->length == 0) {
1238
  /*
1239
   * When parsing a static memory array one must know the
1240
   * size to be able to convert the buffer.
1241
   */
1242
0
  xmlErrInternal(ctxt, "switching encoding : no input\n", NULL);
1243
0
  return (-1);
1244
0
    }
1245
0
    return (0);
1246
30.7k
}
1247
1248
/**
1249
 * xmlSwitchInputEncoding:
1250
 * @ctxt:  the parser context
1251
 * @input:  the input stream
1252
 * @handler:  the encoding handler
1253
 *
1254
 * change the input functions when discovering the character encoding
1255
 * of a given entity.
1256
 *
1257
 * Returns 0 in case of success, -1 otherwise
1258
 */
1259
int
1260
xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input,
1261
0
                          xmlCharEncodingHandlerPtr handler) {
1262
0
    return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1));
1263
0
}
1264
1265
/**
1266
 * xmlSwitchToEncodingInt:
1267
 * @ctxt:  the parser context
1268
 * @handler:  the encoding handler
1269
 * @len: the length to convert or -1
1270
 *
1271
 * change the input functions when discovering the character encoding
1272
 * of a given entity, and convert only @len bytes of the output, this
1273
 * is needed on auto detect to allows any declared encoding later to
1274
 * convert the actual content after the xmlDecl
1275
 *
1276
 * Returns 0 in case of success, -1 otherwise
1277
 */
1278
static int
1279
xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt,
1280
30.7k
                       xmlCharEncodingHandlerPtr handler, int len) {
1281
30.7k
    int ret = 0;
1282
1283
30.7k
    if (handler != NULL) {
1284
30.7k
        if (ctxt->input != NULL) {
1285
30.7k
      ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len);
1286
30.7k
  } else {
1287
0
      xmlErrInternal(ctxt, "xmlSwitchToEncoding : no input\n",
1288
0
                     NULL);
1289
0
      return(-1);
1290
0
  }
1291
  /*
1292
   * The parsing is now done in UTF8 natively
1293
   */
1294
30.7k
  ctxt->charset = XML_CHAR_ENCODING_UTF8;
1295
30.7k
    } else
1296
0
  return(-1);
1297
30.7k
    return(ret);
1298
30.7k
}
1299
1300
/**
1301
 * xmlSwitchToEncoding:
1302
 * @ctxt:  the parser context
1303
 * @handler:  the encoding handler
1304
 *
1305
 * change the input functions when discovering the character encoding
1306
 * of a given entity.
1307
 *
1308
 * Returns 0 in case of success, -1 otherwise
1309
 */
1310
int
1311
xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler)
1312
26.6k
{
1313
26.6k
    return (xmlSwitchToEncodingInt(ctxt, handler, -1));
1314
26.6k
}
1315
1316
/************************************************************************
1317
 *                  *
1318
 *  Commodity functions to handle entities processing   *
1319
 *                  *
1320
 ************************************************************************/
1321
1322
/**
1323
 * xmlFreeInputStream:
1324
 * @input:  an xmlParserInputPtr
1325
 *
1326
 * Free up an input stream.
1327
 */
1328
void
1329
821k
xmlFreeInputStream(xmlParserInputPtr input) {
1330
821k
    if (input == NULL) return;
1331
1332
821k
    if (input->filename != NULL) xmlFree((char *) input->filename);
1333
821k
    if (input->directory != NULL) xmlFree((char *) input->directory);
1334
821k
    if (input->encoding != NULL) xmlFree((char *) input->encoding);
1335
821k
    if (input->version != NULL) xmlFree((char *) input->version);
1336
821k
    if ((input->free != NULL) && (input->base != NULL))
1337
0
        input->free((xmlChar *) input->base);
1338
821k
    if (input->buf != NULL)
1339
396k
        xmlFreeParserInputBuffer(input->buf);
1340
821k
    xmlFree(input);
1341
821k
}
1342
1343
/**
1344
 * xmlNewInputStream:
1345
 * @ctxt:  an XML parser context
1346
 *
1347
 * Create a new input stream structure.
1348
 *
1349
 * Returns the new input stream or NULL
1350
 */
1351
xmlParserInputPtr
1352
821k
xmlNewInputStream(xmlParserCtxtPtr ctxt) {
1353
821k
    xmlParserInputPtr input;
1354
1355
821k
    input = (xmlParserInputPtr) xmlMalloc(sizeof(xmlParserInput));
1356
821k
    if (input == NULL) {
1357
0
        xmlErrMemory(ctxt,  "couldn't allocate a new input stream\n");
1358
0
  return(NULL);
1359
0
    }
1360
821k
    memset(input, 0, sizeof(xmlParserInput));
1361
821k
    input->line = 1;
1362
821k
    input->col = 1;
1363
821k
    input->standalone = -1;
1364
1365
    /*
1366
     * If the context is NULL the id cannot be initialized, but that
1367
     * should not happen while parsing which is the situation where
1368
     * the id is actually needed.
1369
     */
1370
821k
    if (ctxt != NULL)
1371
821k
        input->id = ctxt->input_id++;
1372
1373
821k
    return(input);
1374
821k
}
1375
1376
/**
1377
 * xmlNewIOInputStream:
1378
 * @ctxt:  an XML parser context
1379
 * @input:  an I/O Input
1380
 * @enc:  the charset encoding if known
1381
 *
1382
 * Create a new input stream structure encapsulating the @input into
1383
 * a stream suitable for the parser.
1384
 *
1385
 * Returns the new input stream or NULL
1386
 */
1387
xmlParserInputPtr
1388
xmlNewIOInputStream(xmlParserCtxtPtr ctxt, xmlParserInputBufferPtr input,
1389
0
              xmlCharEncoding enc) {
1390
0
    xmlParserInputPtr inputStream;
1391
1392
0
    if (input == NULL) return(NULL);
1393
0
    if (xmlParserDebugEntities)
1394
0
  xmlGenericError(xmlGenericErrorContext, "new input from I/O\n");
1395
0
    inputStream = xmlNewInputStream(ctxt);
1396
0
    if (inputStream == NULL) {
1397
0
  return(NULL);
1398
0
    }
1399
0
    inputStream->filename = NULL;
1400
0
    inputStream->buf = input;
1401
0
    xmlBufResetInput(inputStream->buf->buffer, inputStream);
1402
1403
0
    if (enc != XML_CHAR_ENCODING_NONE) {
1404
0
        xmlSwitchEncoding(ctxt, enc);
1405
0
    }
1406
1407
0
    return(inputStream);
1408
0
}
1409
1410
/**
1411
 * xmlNewEntityInputStream:
1412
 * @ctxt:  an XML parser context
1413
 * @entity:  an Entity pointer
1414
 *
1415
 * Create a new input stream based on an xmlEntityPtr
1416
 *
1417
 * Returns the new input stream or NULL
1418
 */
1419
xmlParserInputPtr
1420
425k
xmlNewEntityInputStream(xmlParserCtxtPtr ctxt, xmlEntityPtr entity) {
1421
425k
    xmlParserInputPtr input;
1422
1423
425k
    if (entity == NULL) {
1424
0
        xmlErrInternal(ctxt, "xmlNewEntityInputStream entity = NULL\n",
1425
0
                 NULL);
1426
0
  return(NULL);
1427
0
    }
1428
425k
    if (xmlParserDebugEntities)
1429
0
  xmlGenericError(xmlGenericErrorContext,
1430
0
    "new input from entity: %s\n", entity->name);
1431
425k
    if (entity->content == NULL) {
1432
0
  switch (entity->etype) {
1433
0
            case XML_EXTERNAL_GENERAL_UNPARSED_ENTITY:
1434
0
          xmlErrInternal(ctxt, "Cannot parse entity %s\n",
1435
0
                   entity->name);
1436
0
                break;
1437
0
            case XML_EXTERNAL_GENERAL_PARSED_ENTITY:
1438
0
            case XML_EXTERNAL_PARAMETER_ENTITY:
1439
0
    return(xmlLoadExternalEntity((char *) entity->URI,
1440
0
           (char *) entity->ExternalID, ctxt));
1441
0
            case XML_INTERNAL_GENERAL_ENTITY:
1442
0
          xmlErrInternal(ctxt,
1443
0
          "Internal entity %s without content !\n",
1444
0
                   entity->name);
1445
0
                break;
1446
0
            case XML_INTERNAL_PARAMETER_ENTITY:
1447
0
          xmlErrInternal(ctxt,
1448
0
          "Internal parameter entity %s without content !\n",
1449
0
                   entity->name);
1450
0
                break;
1451
0
            case XML_INTERNAL_PREDEFINED_ENTITY:
1452
0
          xmlErrInternal(ctxt,
1453
0
          "Predefined entity %s without content !\n",
1454
0
                   entity->name);
1455
0
                break;
1456
0
  }
1457
0
  return(NULL);
1458
0
    }
1459
425k
    input = xmlNewInputStream(ctxt);
1460
425k
    if (input == NULL) {
1461
0
  return(NULL);
1462
0
    }
1463
425k
    if (entity->URI != NULL)
1464
0
  input->filename = (char *) xmlStrdup((xmlChar *) entity->URI);
1465
425k
    input->base = entity->content;
1466
425k
    if (entity->length == 0)
1467
1.62k
        entity->length = xmlStrlen(entity->content);
1468
425k
    input->cur = entity->content;
1469
425k
    input->length = entity->length;
1470
425k
    input->end = &entity->content[input->length];
1471
425k
    return(input);
1472
425k
}
1473
1474
/**
1475
 * xmlNewStringInputStream:
1476
 * @ctxt:  an XML parser context
1477
 * @buffer:  an memory buffer
1478
 *
1479
 * Create a new input stream based on a memory buffer.
1480
 * Returns the new input stream
1481
 */
1482
xmlParserInputPtr
1483
0
xmlNewStringInputStream(xmlParserCtxtPtr ctxt, const xmlChar *buffer) {
1484
0
    xmlParserInputPtr input;
1485
1486
0
    if (buffer == NULL) {
1487
0
        xmlErrInternal(ctxt, "xmlNewStringInputStream string = NULL\n",
1488
0
                 NULL);
1489
0
  return(NULL);
1490
0
    }
1491
0
    if (xmlParserDebugEntities)
1492
0
  xmlGenericError(xmlGenericErrorContext,
1493
0
    "new fixed input: %.30s\n", buffer);
1494
0
    input = xmlNewInputStream(ctxt);
1495
0
    if (input == NULL) {
1496
0
        xmlErrMemory(ctxt,  "couldn't allocate a new input stream\n");
1497
0
  return(NULL);
1498
0
    }
1499
0
    input->base = buffer;
1500
0
    input->cur = buffer;
1501
0
    input->length = xmlStrlen(buffer);
1502
0
    input->end = &buffer[input->length];
1503
0
    return(input);
1504
0
}
1505
1506
/**
1507
 * xmlNewInputFromFile:
1508
 * @ctxt:  an XML parser context
1509
 * @filename:  the filename to use as entity
1510
 *
1511
 * Create a new input stream based on a file or an URL.
1512
 *
1513
 * Returns the new input stream or NULL in case of error
1514
 */
1515
xmlParserInputPtr
1516
0
xmlNewInputFromFile(xmlParserCtxtPtr ctxt, const char *filename) {
1517
0
    xmlParserInputBufferPtr buf;
1518
0
    xmlParserInputPtr inputStream;
1519
0
    char *directory = NULL;
1520
0
    xmlChar *URI = NULL;
1521
1522
0
    if (xmlParserDebugEntities)
1523
0
  xmlGenericError(xmlGenericErrorContext,
1524
0
    "new input from file: %s\n", filename);
1525
0
    if (ctxt == NULL) return(NULL);
1526
0
    buf = xmlParserInputBufferCreateFilename(filename, XML_CHAR_ENCODING_NONE);
1527
0
    if (buf == NULL) {
1528
0
  if (filename == NULL)
1529
0
      __xmlLoaderErr(ctxt,
1530
0
                     "failed to load external entity: NULL filename \n",
1531
0
         NULL);
1532
0
  else
1533
0
      __xmlLoaderErr(ctxt, "failed to load external entity \"%s\"\n",
1534
0
         (const char *) filename);
1535
0
  return(NULL);
1536
0
    }
1537
1538
0
    inputStream = xmlNewInputStream(ctxt);
1539
0
    if (inputStream == NULL)
1540
0
  return(NULL);
1541
1542
0
    inputStream->buf = buf;
1543
0
    inputStream = xmlCheckHTTPInput(ctxt, inputStream);
1544
0
    if (inputStream == NULL)
1545
0
        return(NULL);
1546
1547
0
    if (inputStream->filename == NULL)
1548
0
  URI = xmlStrdup((xmlChar *) filename);
1549
0
    else
1550
0
  URI = xmlStrdup((xmlChar *) inputStream->filename);
1551
0
    directory = xmlParserGetDirectory((const char *) URI);
1552
0
    if (inputStream->filename != NULL) xmlFree((char *)inputStream->filename);
1553
0
    inputStream->filename = (char *) xmlCanonicPath((const xmlChar *) URI);
1554
0
    if (URI != NULL) xmlFree((char *) URI);
1555
0
    inputStream->directory = directory;
1556
1557
0
    xmlBufResetInput(inputStream->buf->buffer, inputStream);
1558
0
    if ((ctxt->directory == NULL) && (directory != NULL))
1559
0
        ctxt->directory = (char *) xmlStrdup((const xmlChar *) directory);
1560
0
    return(inputStream);
1561
0
}
1562
1563
/************************************************************************
1564
 *                  *
1565
 *    Commodity functions to handle parser contexts   *
1566
 *                  *
1567
 ************************************************************************/
1568
1569
/**
1570
 * xmlInitParserCtxt:
1571
 * @ctxt:  an XML parser context
1572
 *
1573
 * Initialize a parser context
1574
 *
1575
 * Returns 0 in case of success and -1 in case of error
1576
 */
1577
1578
int
1579
xmlInitParserCtxt(xmlParserCtxtPtr ctxt)
1580
396k
{
1581
396k
    xmlParserInputPtr input;
1582
1583
396k
    if(ctxt==NULL) {
1584
0
        xmlErrInternal(NULL, "Got NULL parser context\n", NULL);
1585
0
        return(-1);
1586
0
    }
1587
1588
396k
    xmlDefaultSAXHandlerInit();
1589
1590
396k
    if (ctxt->dict == NULL)
1591
396k
  ctxt->dict = xmlDictCreate();
1592
396k
    if (ctxt->dict == NULL) {
1593
0
        xmlErrMemory(NULL, "cannot initialize parser context\n");
1594
0
  return(-1);
1595
0
    }
1596
396k
    xmlDictSetLimit(ctxt->dict, XML_MAX_DICTIONARY_LIMIT);
1597
1598
396k
    if (ctxt->sax == NULL)
1599
396k
  ctxt->sax = (xmlSAXHandler *) xmlMalloc(sizeof(xmlSAXHandler));
1600
396k
    if (ctxt->sax == NULL) {
1601
0
        xmlErrMemory(NULL, "cannot initialize parser context\n");
1602
0
  return(-1);
1603
0
    }
1604
396k
    else
1605
396k
        xmlSAXVersion(ctxt->sax, 2);
1606
1607
396k
    ctxt->maxatts = 0;
1608
396k
    ctxt->atts = NULL;
1609
    /* Allocate the Input stack */
1610
396k
    if (ctxt->inputTab == NULL) {
1611
396k
  ctxt->inputTab = (xmlParserInputPtr *)
1612
396k
        xmlMalloc(5 * sizeof(xmlParserInputPtr));
1613
396k
  ctxt->inputMax = 5;
1614
396k
    }
1615
396k
    if (ctxt->inputTab == NULL) {
1616
0
        xmlErrMemory(NULL, "cannot initialize parser context\n");
1617
0
  ctxt->inputNr = 0;
1618
0
  ctxt->inputMax = 0;
1619
0
  ctxt->input = NULL;
1620
0
  return(-1);
1621
0
    }
1622
396k
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
1623
0
        xmlFreeInputStream(input);
1624
0
    }
1625
396k
    ctxt->inputNr = 0;
1626
396k
    ctxt->input = NULL;
1627
1628
396k
    ctxt->version = NULL;
1629
396k
    ctxt->encoding = NULL;
1630
396k
    ctxt->standalone = -1;
1631
396k
    ctxt->hasExternalSubset = 0;
1632
396k
    ctxt->hasPErefs = 0;
1633
396k
    ctxt->html = 0;
1634
396k
    ctxt->external = 0;
1635
396k
    ctxt->instate = XML_PARSER_START;
1636
396k
    ctxt->token = 0;
1637
396k
    ctxt->directory = NULL;
1638
1639
    /* Allocate the Node stack */
1640
396k
    if (ctxt->nodeTab == NULL) {
1641
396k
  ctxt->nodeTab = (xmlNodePtr *) xmlMalloc(10 * sizeof(xmlNodePtr));
1642
396k
  ctxt->nodeMax = 10;
1643
396k
    }
1644
396k
    if (ctxt->nodeTab == NULL) {
1645
0
        xmlErrMemory(NULL, "cannot initialize parser context\n");
1646
0
  ctxt->nodeNr = 0;
1647
0
  ctxt->nodeMax = 0;
1648
0
  ctxt->node = NULL;
1649
0
  ctxt->inputNr = 0;
1650
0
  ctxt->inputMax = 0;
1651
0
  ctxt->input = NULL;
1652
0
  return(-1);
1653
0
    }
1654
396k
    ctxt->nodeNr = 0;
1655
396k
    ctxt->node = NULL;
1656
1657
    /* Allocate the Name stack */
1658
396k
    if (ctxt->nameTab == NULL) {
1659
396k
  ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
1660
396k
  ctxt->nameMax = 10;
1661
396k
    }
1662
396k
    if (ctxt->nameTab == NULL) {
1663
0
        xmlErrMemory(NULL, "cannot initialize parser context\n");
1664
0
  ctxt->nodeNr = 0;
1665
0
  ctxt->nodeMax = 0;
1666
0
  ctxt->node = NULL;
1667
0
  ctxt->inputNr = 0;
1668
0
  ctxt->inputMax = 0;
1669
0
  ctxt->input = NULL;
1670
0
  ctxt->nameNr = 0;
1671
0
  ctxt->nameMax = 0;
1672
0
  ctxt->name = NULL;
1673
0
  return(-1);
1674
0
    }
1675
396k
    ctxt->nameNr = 0;
1676
396k
    ctxt->name = NULL;
1677
1678
    /* Allocate the space stack */
1679
396k
    if (ctxt->spaceTab == NULL) {
1680
396k
  ctxt->spaceTab = (int *) xmlMalloc(10 * sizeof(int));
1681
396k
  ctxt->spaceMax = 10;
1682
396k
    }
1683
396k
    if (ctxt->spaceTab == NULL) {
1684
0
        xmlErrMemory(NULL, "cannot initialize parser context\n");
1685
0
  ctxt->nodeNr = 0;
1686
0
  ctxt->nodeMax = 0;
1687
0
  ctxt->node = NULL;
1688
0
  ctxt->inputNr = 0;
1689
0
  ctxt->inputMax = 0;
1690
0
  ctxt->input = NULL;
1691
0
  ctxt->nameNr = 0;
1692
0
  ctxt->nameMax = 0;
1693
0
  ctxt->name = NULL;
1694
0
  ctxt->spaceNr = 0;
1695
0
  ctxt->spaceMax = 0;
1696
0
  ctxt->space = NULL;
1697
0
  return(-1);
1698
0
    }
1699
396k
    ctxt->spaceNr = 1;
1700
396k
    ctxt->spaceMax = 10;
1701
396k
    ctxt->spaceTab[0] = -1;
1702
396k
    ctxt->space = &ctxt->spaceTab[0];
1703
396k
    ctxt->userData = ctxt;
1704
396k
    ctxt->myDoc = NULL;
1705
396k
    ctxt->wellFormed = 1;
1706
396k
    ctxt->nsWellFormed = 1;
1707
396k
    ctxt->valid = 1;
1708
396k
    ctxt->loadsubset = xmlLoadExtDtdDefaultValue;
1709
396k
    if (ctxt->loadsubset) {
1710
0
        ctxt->options |= XML_PARSE_DTDLOAD;
1711
0
    }
1712
396k
    ctxt->validate = xmlDoValidityCheckingDefaultValue;
1713
396k
    ctxt->pedantic = xmlPedanticParserDefaultValue;
1714
396k
    if (ctxt->pedantic) {
1715
0
        ctxt->options |= XML_PARSE_PEDANTIC;
1716
0
    }
1717
396k
    ctxt->linenumbers = xmlLineNumbersDefaultValue;
1718
396k
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
1719
396k
    if (ctxt->keepBlanks == 0) {
1720
0
  ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
1721
0
  ctxt->options |= XML_PARSE_NOBLANKS;
1722
0
    }
1723
1724
396k
    ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
1725
396k
    ctxt->vctxt.userData = ctxt;
1726
396k
    ctxt->vctxt.error = xmlParserValidityError;
1727
396k
    ctxt->vctxt.warning = xmlParserValidityWarning;
1728
396k
    if (ctxt->validate) {
1729
0
  if (xmlGetWarningsDefaultValue == 0)
1730
0
      ctxt->vctxt.warning = NULL;
1731
0
  else
1732
0
      ctxt->vctxt.warning = xmlParserValidityWarning;
1733
0
  ctxt->vctxt.nodeMax = 0;
1734
0
        ctxt->options |= XML_PARSE_DTDVALID;
1735
0
    }
1736
396k
    ctxt->replaceEntities = xmlSubstituteEntitiesDefaultValue;
1737
396k
    if (ctxt->replaceEntities) {
1738
0
        ctxt->options |= XML_PARSE_NOENT;
1739
0
    }
1740
396k
    ctxt->record_info = 0;
1741
396k
    ctxt->nbChars = 0;
1742
396k
    ctxt->checkIndex = 0;
1743
396k
    ctxt->inSubset = 0;
1744
396k
    ctxt->errNo = XML_ERR_OK;
1745
396k
    ctxt->depth = 0;
1746
396k
    ctxt->charset = XML_CHAR_ENCODING_UTF8;
1747
396k
    ctxt->catalogs = NULL;
1748
396k
    ctxt->nbentities = 0;
1749
396k
    ctxt->sizeentities = 0;
1750
396k
    ctxt->sizeentcopy = 0;
1751
396k
    ctxt->input_id = 1;
1752
396k
    xmlInitNodeInfoSeq(&ctxt->node_seq);
1753
396k
    return(0);
1754
396k
}
1755
1756
/**
1757
 * xmlFreeParserCtxt:
1758
 * @ctxt:  an XML parser context
1759
 *
1760
 * Free all the memory used by a parser context. However the parsed
1761
 * document in ctxt->myDoc is not freed.
1762
 */
1763
1764
void
1765
xmlFreeParserCtxt(xmlParserCtxtPtr ctxt)
1766
396k
{
1767
396k
    xmlParserInputPtr input;
1768
1769
396k
    if (ctxt == NULL) return;
1770
1771
793k
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
1772
396k
        xmlFreeInputStream(input);
1773
396k
    }
1774
396k
    if (ctxt->spaceTab != NULL) xmlFree(ctxt->spaceTab);
1775
396k
    if (ctxt->nameTab != NULL) xmlFree((xmlChar * *)ctxt->nameTab);
1776
396k
    if (ctxt->nodeTab != NULL) xmlFree(ctxt->nodeTab);
1777
396k
    if (ctxt->nodeInfoTab != NULL) xmlFree(ctxt->nodeInfoTab);
1778
396k
    if (ctxt->inputTab != NULL) xmlFree(ctxt->inputTab);
1779
396k
    if (ctxt->version != NULL) xmlFree((char *) ctxt->version);
1780
396k
    if (ctxt->encoding != NULL) xmlFree((char *) ctxt->encoding);
1781
396k
    if (ctxt->extSubURI != NULL) xmlFree((char *) ctxt->extSubURI);
1782
396k
    if (ctxt->extSubSystem != NULL) xmlFree((char *) ctxt->extSubSystem);
1783
396k
#ifdef LIBXML_SAX1_ENABLED
1784
396k
    if ((ctxt->sax != NULL) &&
1785
396k
        (ctxt->sax != (xmlSAXHandlerPtr) &xmlDefaultSAXHandler))
1786
#else
1787
    if (ctxt->sax != NULL)
1788
#endif /* LIBXML_SAX1_ENABLED */
1789
396k
        xmlFree(ctxt->sax);
1790
396k
    if (ctxt->directory != NULL) xmlFree((char *) ctxt->directory);
1791
396k
    if (ctxt->vctxt.nodeTab != NULL) xmlFree(ctxt->vctxt.nodeTab);
1792
396k
    if (ctxt->atts != NULL) xmlFree((xmlChar * *)ctxt->atts);
1793
396k
    if (ctxt->dict != NULL) xmlDictFree(ctxt->dict);
1794
396k
    if (ctxt->nsTab != NULL) xmlFree((char *) ctxt->nsTab);
1795
396k
    if (ctxt->pushTab != NULL) xmlFree(ctxt->pushTab);
1796
396k
    if (ctxt->attallocs != NULL) xmlFree(ctxt->attallocs);
1797
396k
    if (ctxt->attsDefault != NULL)
1798
12.0k
        xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
1799
396k
    if (ctxt->attsSpecial != NULL)
1800
12.7k
        xmlHashFree(ctxt->attsSpecial, NULL);
1801
396k
    if (ctxt->freeElems != NULL) {
1802
159k
        xmlNodePtr cur, next;
1803
1804
159k
  cur = ctxt->freeElems;
1805
4.05M
  while (cur != NULL) {
1806
3.89M
      next = cur->next;
1807
3.89M
      xmlFree(cur);
1808
3.89M
      cur = next;
1809
3.89M
  }
1810
159k
    }
1811
396k
    if (ctxt->freeAttrs != NULL) {
1812
84.8k
        xmlAttrPtr cur, next;
1813
1814
84.8k
  cur = ctxt->freeAttrs;
1815
1.26M
  while (cur != NULL) {
1816
1.18M
      next = cur->next;
1817
1.18M
      xmlFree(cur);
1818
1.18M
      cur = next;
1819
1.18M
  }
1820
84.8k
    }
1821
    /*
1822
     * cleanup the error strings
1823
     */
1824
396k
    if (ctxt->lastError.message != NULL)
1825
341k
        xmlFree(ctxt->lastError.message);
1826
396k
    if (ctxt->lastError.file != NULL)
1827
86.5k
        xmlFree(ctxt->lastError.file);
1828
396k
    if (ctxt->lastError.str1 != NULL)
1829
91.3k
        xmlFree(ctxt->lastError.str1);
1830
396k
    if (ctxt->lastError.str2 != NULL)
1831
13.3k
        xmlFree(ctxt->lastError.str2);
1832
396k
    if (ctxt->lastError.str3 != NULL)
1833
8.06k
        xmlFree(ctxt->lastError.str3);
1834
1835
396k
#ifdef LIBXML_CATALOG_ENABLED
1836
396k
    if (ctxt->catalogs != NULL)
1837
372
  xmlCatalogFreeLocal(ctxt->catalogs);
1838
396k
#endif
1839
396k
    xmlFree(ctxt);
1840
396k
}
1841
1842
/**
1843
 * xmlNewParserCtxt:
1844
 *
1845
 * Allocate and initialize a new parser context.
1846
 *
1847
 * Returns the xmlParserCtxtPtr or NULL
1848
 */
1849
1850
xmlParserCtxtPtr
1851
xmlNewParserCtxt(void)
1852
396k
{
1853
396k
    xmlParserCtxtPtr ctxt;
1854
1855
396k
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
1856
396k
    if (ctxt == NULL) {
1857
0
  xmlErrMemory(NULL, "cannot allocate parser context\n");
1858
0
  return(NULL);
1859
0
    }
1860
396k
    memset(ctxt, 0, sizeof(xmlParserCtxt));
1861
396k
    if (xmlInitParserCtxt(ctxt) < 0) {
1862
0
        xmlFreeParserCtxt(ctxt);
1863
0
  return(NULL);
1864
0
    }
1865
396k
    return(ctxt);
1866
396k
}
1867
1868
/************************************************************************
1869
 *                  *
1870
 *    Handling of node information        *
1871
 *                  *
1872
 ************************************************************************/
1873
1874
/**
1875
 * xmlClearParserCtxt:
1876
 * @ctxt:  an XML parser context
1877
 *
1878
 * Clear (release owned resources) and reinitialize a parser context
1879
 */
1880
1881
void
1882
xmlClearParserCtxt(xmlParserCtxtPtr ctxt)
1883
0
{
1884
0
  if (ctxt==NULL)
1885
0
    return;
1886
0
  xmlClearNodeInfoSeq(&ctxt->node_seq);
1887
0
  xmlCtxtReset(ctxt);
1888
0
}
1889
1890
1891
/**
1892
 * xmlParserFindNodeInfo:
1893
 * @ctx:  an XML parser context
1894
 * @node:  an XML node within the tree
1895
 *
1896
 * Find the parser node info struct for a given node
1897
 *
1898
 * Returns an xmlParserNodeInfo block pointer or NULL
1899
 */
1900
const xmlParserNodeInfo *
1901
xmlParserFindNodeInfo(const xmlParserCtxtPtr ctx, const xmlNodePtr node)
1902
0
{
1903
0
    unsigned long pos;
1904
1905
0
    if ((ctx == NULL) || (node == NULL))
1906
0
        return (NULL);
1907
    /* Find position where node should be at */
1908
0
    pos = xmlParserFindNodeInfoIndex(&ctx->node_seq, node);
1909
0
    if (pos < ctx->node_seq.length
1910
0
        && ctx->node_seq.buffer[pos].node == node)
1911
0
        return &ctx->node_seq.buffer[pos];
1912
0
    else
1913
0
        return NULL;
1914
0
}
1915
1916
1917
/**
1918
 * xmlInitNodeInfoSeq:
1919
 * @seq:  a node info sequence pointer
1920
 *
1921
 * -- Initialize (set to initial state) node info sequence
1922
 */
1923
void
1924
xmlInitNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
1925
396k
{
1926
396k
    if (seq == NULL)
1927
0
        return;
1928
396k
    seq->length = 0;
1929
396k
    seq->maximum = 0;
1930
396k
    seq->buffer = NULL;
1931
396k
}
1932
1933
/**
1934
 * xmlClearNodeInfoSeq:
1935
 * @seq:  a node info sequence pointer
1936
 *
1937
 * -- Clear (release memory and reinitialize) node
1938
 *   info sequence
1939
 */
1940
void
1941
xmlClearNodeInfoSeq(xmlParserNodeInfoSeqPtr seq)
1942
0
{
1943
0
    if (seq == NULL)
1944
0
        return;
1945
0
    if (seq->buffer != NULL)
1946
0
        xmlFree(seq->buffer);
1947
0
    xmlInitNodeInfoSeq(seq);
1948
0
}
1949
1950
/**
1951
 * xmlParserFindNodeInfoIndex:
1952
 * @seq:  a node info sequence pointer
1953
 * @node:  an XML node pointer
1954
 *
1955
 *
1956
 * xmlParserFindNodeInfoIndex : Find the index that the info record for
1957
 *   the given node is or should be at in a sorted sequence
1958
 *
1959
 * Returns a long indicating the position of the record
1960
 */
1961
unsigned long
1962
xmlParserFindNodeInfoIndex(const xmlParserNodeInfoSeqPtr seq,
1963
                           const xmlNodePtr node)
1964
0
{
1965
0
    unsigned long upper, lower, middle;
1966
0
    int found = 0;
1967
1968
0
    if ((seq == NULL) || (node == NULL))
1969
0
        return ((unsigned long) -1);
1970
1971
    /* Do a binary search for the key */
1972
0
    lower = 1;
1973
0
    upper = seq->length;
1974
0
    middle = 0;
1975
0
    while (lower <= upper && !found) {
1976
0
        middle = lower + (upper - lower) / 2;
1977
0
        if (node == seq->buffer[middle - 1].node)
1978
0
            found = 1;
1979
0
        else if (node < seq->buffer[middle - 1].node)
1980
0
            upper = middle - 1;
1981
0
        else
1982
0
            lower = middle + 1;
1983
0
    }
1984
1985
    /* Return position */
1986
0
    if (middle == 0 || seq->buffer[middle - 1].node < node)
1987
0
        return middle;
1988
0
    else
1989
0
        return middle - 1;
1990
0
}
1991
1992
1993
/**
1994
 * xmlParserAddNodeInfo:
1995
 * @ctxt:  an XML parser context
1996
 * @info:  a node info sequence pointer
1997
 *
1998
 * Insert node info record into the sorted sequence
1999
 */
2000
void
2001
xmlParserAddNodeInfo(xmlParserCtxtPtr ctxt,
2002
                     const xmlParserNodeInfoPtr info)
2003
0
{
2004
0
    unsigned long pos;
2005
2006
0
    if ((ctxt == NULL) || (info == NULL)) return;
2007
2008
    /* Find pos and check to see if node is already in the sequence */
2009
0
    pos = xmlParserFindNodeInfoIndex(&ctxt->node_seq, (xmlNodePtr)
2010
0
                                     info->node);
2011
2012
0
    if ((pos < ctxt->node_seq.length) &&
2013
0
        (ctxt->node_seq.buffer != NULL) &&
2014
0
        (ctxt->node_seq.buffer[pos].node == info->node)) {
2015
0
        ctxt->node_seq.buffer[pos] = *info;
2016
0
    }
2017
2018
    /* Otherwise, we need to add new node to buffer */
2019
0
    else {
2020
0
        if ((ctxt->node_seq.length + 1 > ctxt->node_seq.maximum) ||
2021
0
      (ctxt->node_seq.buffer == NULL)) {
2022
0
            xmlParserNodeInfo *tmp_buffer;
2023
0
            unsigned int byte_size;
2024
2025
0
            if (ctxt->node_seq.maximum == 0)
2026
0
                ctxt->node_seq.maximum = 2;
2027
0
            byte_size = (sizeof(*ctxt->node_seq.buffer) *
2028
0
      (2 * ctxt->node_seq.maximum));
2029
2030
0
            if (ctxt->node_seq.buffer == NULL)
2031
0
                tmp_buffer = (xmlParserNodeInfo *) xmlMalloc(byte_size);
2032
0
            else
2033
0
                tmp_buffer =
2034
0
                    (xmlParserNodeInfo *) xmlRealloc(ctxt->node_seq.buffer,
2035
0
                                                     byte_size);
2036
2037
0
            if (tmp_buffer == NULL) {
2038
0
    xmlErrMemory(ctxt, "failed to allocate buffer\n");
2039
0
                return;
2040
0
            }
2041
0
            ctxt->node_seq.buffer = tmp_buffer;
2042
0
            ctxt->node_seq.maximum *= 2;
2043
0
        }
2044
2045
        /* If position is not at end, move elements out of the way */
2046
0
        if (pos != ctxt->node_seq.length) {
2047
0
            unsigned long i;
2048
2049
0
            for (i = ctxt->node_seq.length; i > pos; i--)
2050
0
                ctxt->node_seq.buffer[i] = ctxt->node_seq.buffer[i - 1];
2051
0
        }
2052
2053
        /* Copy element and increase length */
2054
0
        ctxt->node_seq.buffer[pos] = *info;
2055
0
        ctxt->node_seq.length++;
2056
0
    }
2057
0
}
2058
2059
/************************************************************************
2060
 *                  *
2061
 *    Default settings          *
2062
 *                  *
2063
 ************************************************************************/
2064
/**
2065
 * xmlPedanticParserDefault:
2066
 * @val:  int 0 or 1
2067
 *
2068
 * Set and return the previous value for enabling pedantic warnings.
2069
 *
2070
 * Returns the last value for 0 for no substitution, 1 for substitution.
2071
 */
2072
2073
int
2074
0
xmlPedanticParserDefault(int val) {
2075
0
    int old = xmlPedanticParserDefaultValue;
2076
2077
0
    xmlPedanticParserDefaultValue = val;
2078
0
    return(old);
2079
0
}
2080
2081
/**
2082
 * xmlLineNumbersDefault:
2083
 * @val:  int 0 or 1
2084
 *
2085
 * Set and return the previous value for enabling line numbers in elements
2086
 * contents. This may break on old application and is turned off by default.
2087
 *
2088
 * Returns the last value for 0 for no substitution, 1 for substitution.
2089
 */
2090
2091
int
2092
0
xmlLineNumbersDefault(int val) {
2093
0
    int old = xmlLineNumbersDefaultValue;
2094
2095
0
    xmlLineNumbersDefaultValue = val;
2096
0
    return(old);
2097
0
}
2098
2099
/**
2100
 * xmlSubstituteEntitiesDefault:
2101
 * @val:  int 0 or 1
2102
 *
2103
 * Set and return the previous value for default entity support.
2104
 * Initially the parser always keep entity references instead of substituting
2105
 * entity values in the output. This function has to be used to change the
2106
 * default parser behavior
2107
 * SAX::substituteEntities() has to be used for changing that on a file by
2108
 * file basis.
2109
 *
2110
 * Returns the last value for 0 for no substitution, 1 for substitution.
2111
 */
2112
2113
int
2114
0
xmlSubstituteEntitiesDefault(int val) {
2115
0
    int old = xmlSubstituteEntitiesDefaultValue;
2116
2117
0
    xmlSubstituteEntitiesDefaultValue = val;
2118
0
    return(old);
2119
0
}
2120
2121
/**
2122
 * xmlKeepBlanksDefault:
2123
 * @val:  int 0 or 1
2124
 *
2125
 * Set and return the previous value for default blanks text nodes support.
2126
 * The 1.x version of the parser used an heuristic to try to detect
2127
 * ignorable white spaces. As a result the SAX callback was generating
2128
 * xmlSAX2IgnorableWhitespace() callbacks instead of characters() one, and when
2129
 * using the DOM output text nodes containing those blanks were not generated.
2130
 * The 2.x and later version will switch to the XML standard way and
2131
 * ignorableWhitespace() are only generated when running the parser in
2132
 * validating mode and when the current element doesn't allow CDATA or
2133
 * mixed content.
2134
 * This function is provided as a way to force the standard behavior
2135
 * on 1.X libs and to switch back to the old mode for compatibility when
2136
 * running 1.X client code on 2.X . Upgrade of 1.X code should be done
2137
 * by using xmlIsBlankNode() commodity function to detect the "empty"
2138
 * nodes generated.
2139
 * This value also affect autogeneration of indentation when saving code
2140
 * if blanks sections are kept, indentation is not generated.
2141
 *
2142
 * Returns the last value for 0 for no substitution, 1 for substitution.
2143
 */
2144
2145
int
2146
0
xmlKeepBlanksDefault(int val) {
2147
0
    int old = xmlKeepBlanksDefaultValue;
2148
2149
0
    xmlKeepBlanksDefaultValue = val;
2150
0
    if (!val) xmlIndentTreeOutput = 1;
2151
0
    return(old);
2152
0
}
2153
2154
#define bottom_parserInternals
2155
#include "elfgcchack.h"