Coverage Report

Created: 2026-03-06 07:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/libxml2/HTMLparser.c
Line
Count
Source
1
/*
2
 * HTMLparser.c : an HTML parser
3
 *
4
 * References:
5
 *   HTML Living Standard
6
 *     https://html.spec.whatwg.org/multipage/parsing.html
7
 *
8
 * Tokenization now conforms to HTML5. Tree construction still follows
9
 * a custom, non-standard implementation. See:
10
 *
11
 *     https://gitlab.gnome.org/GNOME/libxml2/-/issues/211
12
 *
13
 * See Copyright for the status of this software.
14
 *
15
 * Author: Daniel Veillard
16
 */
17
18
#define IN_LIBXML
19
#include "libxml.h"
20
#ifdef LIBXML_HTML_ENABLED
21
22
#include <string.h>
23
#include <ctype.h>
24
#include <stdlib.h>
25
26
#include <libxml/HTMLparser.h>
27
#include <libxml/xmlmemory.h>
28
#include <libxml/tree.h>
29
#include <libxml/parser.h>
30
#include <libxml/parserInternals.h>
31
#include <libxml/xmlerror.h>
32
#include <libxml/HTMLtree.h>
33
#include <libxml/entities.h>
34
#include <libxml/encoding.h>
35
#include <libxml/xmlIO.h>
36
#include <libxml/uri.h>
37
38
#include "private/buf.h"
39
#include "private/dict.h"
40
#include "private/enc.h"
41
#include "private/error.h"
42
#include "private/html.h"
43
#include "private/io.h"
44
#include "private/memory.h"
45
#include "private/parser.h"
46
#include "private/tree.h"
47
48
/*
 * Numeric limits and buffer sizes used by the HTML parser.
 * NOTE(review): the places where these are consumed are outside this
 * part of the file; the names are the only hint to their purpose.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_MAX_ATTRS 100000000 /* 100 million */
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/*
 * Character class helpers. Each macro may evaluate its argument more
 * than once, so the argument must not have side effects.
 */

/*
 * Non-zero if c satisfies IS_ASCII_DIGIT or is a letter in a-f / A-F.
 * OR-ing with 0x20 maps 'A'..'F' onto 'a'..'f' before the range test.
 */
#define IS_HEX_DIGIT(c) \
    ((IS_ASCII_DIGIT(c)) || \
     (('a' <= ((c) | 0x20)) && (((c) | 0x20) <= 'f')))

/* Non-zero if c is in the range 'A'..'Z'. */
#define IS_UPPER(c) \
    (('A' <= (c)) && ((c) <= 'Z'))

/* Non-zero if c satisfies IS_ASCII_LETTER or IS_ASCII_DIGIT. */
#define IS_ALNUM(c) \
    (IS_ASCII_LETTER(c) || IS_ASCII_DIGIT(c))
62
63
/*
 * Insertion mode stored in ctxt->html.
 *
 * The numeric values are ordered: htmlnamePush() compares the current
 * mode with "<" and only ever moves it forward (to IN_HEAD when a
 * "head" element is pushed, to IN_BODY when a "body" element is pushed).
 */
typedef enum {
    INSERT_INITIAL = 1,     /* before "head" or "body" was pushed */
    INSERT_IN_HEAD = 3,     /* a "head" element has been pushed */
    INSERT_IN_BODY = 10     /* a "body" element has been pushed */
} htmlInsertMode;
68
69
/*
 * Bit set over the code points 0..63, consumed by htmlMaskMatch().
 *
 * Element [0] holds one bit per code point 0..31 (bit n = code point n),
 * element [1] holds one bit per code point 32..63 (bit n = code point
 * n + 32). Code points >= 64 never match.
 */
typedef const unsigned htmlAsciiMask[2];

/* '"' */
static htmlAsciiMask MASK_DQ = { 0, 1u << ('"' - 32) };

/* '\'' */
static htmlAsciiMask MASK_SQ = { 0, 1u << ('\'' - 32) };

/* '>' */
static htmlAsciiMask MASK_GT = { 0, 1u << ('>' - 32) };

/* '-' */
static htmlAsciiMask MASK_DASH = { 0, 1u << ('-' - 32) };

/* TAB, LF, FF, CR, space and '>' */
static htmlAsciiMask MASK_WS_GT = {
    1u << 0x09 | 1u << 0x0A | 1u << 0x0C | 1u << 0x0D,
    1u << (' ' - 32) | 1u << ('>' - 32)
};

/* '"' and '>' */
static htmlAsciiMask MASK_DQ_GT = {
    0,
    1u << ('"' - 32) | 1u << ('>' - 32)
};

/* '\'' and '>' */
static htmlAsciiMask MASK_SQ_GT = {
    0,
    1u << ('\'' - 32) | 1u << ('>' - 32)
};
99
100
static int htmlOmittedDefaultValue = 1;
101
102
static int
103
htmlParseElementInternal(htmlParserCtxtPtr ctxt);
104
105
/************************************************************************
106
 *                  *
107
 *    Some factorized error routines        *
108
 *                  *
109
 ************************************************************************/
110
111
/**
112
 * Handle an out-of-memory error
113
 *
114
 * @param ctxt  an HTML parser context
115
 */
116
static void
117
htmlErrMemory(xmlParserCtxtPtr ctxt)
118
945
{
119
945
    xmlCtxtErrMemory(ctxt);
120
945
}
121
122
/**
123
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
124
 *
125
 * @param ctxt  an HTML parser context
126
 * @param error  the error number
127
 * @param msg  the error message
128
 * @param str1  string infor
129
 * @param str2  string infor
130
 */
131
static void LIBXML_ATTR_FORMAT(3,0)
132
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
133
             const char *msg, const xmlChar *str1, const xmlChar *str2)
134
28.7k
{
135
28.7k
    xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
136
28.7k
               str1, str2, NULL, 0, msg, str1, str2);
137
28.7k
}
138
139
/************************************************************************
140
 *                  *
141
 *  Parser stacks related functions and macros    *
142
 *                  *
143
 ************************************************************************/
144
145
/**
146
 * Pushes a new element name on top of the name stack
147
 *
148
 * @param ctxt  an HTML parser context
149
 * @param value  the element name
150
 * @returns -1 in case of error, the index in the stack otherwise
151
 */
152
static int
153
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
154
269k
{
155
269k
    if ((ctxt->html < INSERT_IN_HEAD) && (xmlStrEqual(value, BAD_CAST "head")))
156
763
        ctxt->html = INSERT_IN_HEAD;
157
269k
    if ((ctxt->html < INSERT_IN_BODY) && (xmlStrEqual(value, BAD_CAST "body")))
158
6.46k
        ctxt->html = INSERT_IN_BODY;
159
269k
    if (ctxt->nameNr >= ctxt->nameMax) {
160
19.5k
        const xmlChar **tmp;
161
19.5k
        int newSize;
162
163
19.5k
        newSize = xmlGrowCapacity(ctxt->nameMax, sizeof(tmp[0]),
164
19.5k
                                  10, XML_MAX_ITEMS);
165
19.5k
        if (newSize < 0) {
166
0
            htmlErrMemory(ctxt);
167
0
            return (-1);
168
0
        }
169
19.5k
        tmp = xmlRealloc(ctxt->nameTab, newSize * sizeof(tmp[0]));
170
19.5k
        if (tmp == NULL) {
171
70
            htmlErrMemory(ctxt);
172
70
            return(-1);
173
70
        }
174
19.4k
        ctxt->nameTab = tmp;
175
19.4k
        ctxt->nameMax = newSize;
176
19.4k
    }
177
269k
    ctxt->nameTab[ctxt->nameNr] = value;
178
269k
    ctxt->name = value;
179
269k
    return (ctxt->nameNr++);
180
269k
}
181
/**
182
 * Pops the top element name from the name stack
183
 *
184
 * @param ctxt  an HTML parser context
185
 * @returns the name just removed
186
 */
187
static const xmlChar *
188
htmlnamePop(htmlParserCtxtPtr ctxt)
189
259k
{
190
259k
    const xmlChar *ret;
191
192
259k
    if (ctxt->nameNr <= 0)
193
0
        return (NULL);
194
259k
    ctxt->nameNr--;
195
259k
    if (ctxt->nameNr < 0)
196
0
        return (NULL);
197
259k
    if (ctxt->nameNr > 0)
198
177k
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
199
81.9k
    else
200
81.9k
        ctxt->name = NULL;
201
259k
    ret = ctxt->nameTab[ctxt->nameNr];
202
259k
    ctxt->nameTab[ctxt->nameNr] = NULL;
203
259k
    return (ret);
204
259k
}
205
206
/**
207
 * Pushes a new element name on top of the node info stack
208
 *
209
 * @param ctxt  an HTML parser context
210
 * @param value  the node info
211
 * @returns 0 in case of error, the index in the stack otherwise
212
 */
213
static int
214
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
215
0
{
216
0
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
217
0
        xmlParserNodeInfo *tmp;
218
0
        int newSize;
219
220
0
        newSize = xmlGrowCapacity(ctxt->nodeInfoMax, sizeof(tmp[0]),
221
0
                                  5, XML_MAX_ITEMS);
222
0
        if (newSize < 0) {
223
0
            htmlErrMemory(ctxt);
224
0
            return (0);
225
0
        }
226
0
        tmp = xmlRealloc(ctxt->nodeInfoTab, newSize * sizeof(tmp[0]));
227
0
        if (tmp == NULL) {
228
0
            htmlErrMemory(ctxt);
229
0
            return (0);
230
0
        }
231
0
        ctxt->nodeInfoTab = tmp;
232
0
        ctxt->nodeInfoMax = newSize;
233
0
    }
234
0
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
235
0
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
236
0
    return (ctxt->nodeInfoNr++);
237
0
}
238
239
/**
240
 * Pops the top element name from the node info stack
241
 *
242
 * @param ctxt  an HTML parser context
243
 * @returns 0 in case of error, the pointer to NodeInfo otherwise
244
 */
245
static htmlParserNodeInfo *
246
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
247
0
{
248
0
    if (ctxt->nodeInfoNr <= 0)
249
0
        return (NULL);
250
0
    ctxt->nodeInfoNr--;
251
0
    if (ctxt->nodeInfoNr < 0)
252
0
        return (NULL);
253
0
    if (ctxt->nodeInfoNr > 0)
254
0
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
255
0
    else
256
0
        ctxt->nodeInfo = NULL;
257
0
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
258
0
}
259
260
/*
 * Macros for accessing the input. They are meant for the parser only
 * and are not exported.
 *
 * All of them are "dirty": they expand to expressions on a variable
 * named ctxt that must be in scope at the point of use, and they operate
 * on raw xmlChar bytes. Use them only to look at or step over ASCII
 * content.
 *
 *   CUR      the byte at the current input position
 *   CUR_PTR  the current input pointer
 *   BASE_PTR the base pointer of the current input
 *   NXT(n)   the byte n positions after the current one
 *   UPPER    CUR passed through toupper()
 *   UPP(n)   NXT(n) passed through toupper()
 *   SKIP(n)  advance the input pointer and the column counter by n; the
 *            line counter is not touched, so the skipped bytes must not
 *            contain newlines
 *
 * SHRINK and GROW expand to a complete "if" statement including its
 * terminating semicolon. Use them only as stand-alone statements.
 */

#define UPPER (toupper(*ctxt->input->cur))

/* Expands to an unparenthesized comma expression. */
#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/*
 * Outside of progressive parsing, call xmlParserShrink() once more than
 * 2 * INPUT_CHUNK bytes lie before the current position and fewer than
 * 2 * INPUT_CHUNK bytes remain after it.
 */
#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

/*
 * Outside of progressive parsing, call xmlParserGrow() when fewer than
 * INPUT_CHUNK bytes remain after the current position.
 */
#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

/* Skip HTML whitespace at the current position, see htmlSkipBlankChars(). */
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
311
312
/**
313
 * Prescan to find encoding.
314
 *
315
 * Try to find an encoding in the current data available in the input
316
 * buffer.
317
 *
318
 * TODO: Implement HTML5 prescan algorithm.
319
 *
320
 * @param ctxt  the HTML parser context
321
 * @returns  an encoding string or NULL if not found
322
 */
323
static xmlChar *
324
0
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
325
0
    const xmlChar *start, *cur, *end;
326
0
    xmlChar *ret;
327
0
328
0
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
329
0
        (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
330
0
        return(NULL);
331
0
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
332
0
        return(NULL);
333
0
334
0
    start = ctxt->input->cur;
335
0
    end = ctxt->input->end;
336
0
    /* we also expect the input buffer to be zero terminated */
337
0
    if (*end != 0)
338
0
        return(NULL);
339
0
340
0
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
341
0
    if (cur == NULL)
342
0
        return(NULL);
343
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
344
0
    if (cur == NULL)
345
0
        return(NULL);
346
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
347
0
    if (cur == NULL)
348
0
        return(NULL);
349
0
    cur += 8;
350
0
    start = cur;
351
0
    while ((IS_ALNUM(*cur)) ||
352
0
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
353
0
           cur++;
354
0
    if (cur == start)
355
0
        return(NULL);
356
0
    ret = xmlStrndup(start, cur - start);
357
0
    if (ret == NULL)
358
0
        htmlErrMemory(ctxt);
359
0
    return(ret);
360
0
}
361
362
static int
363
173M
htmlMaskMatch(htmlAsciiMask mask, unsigned c) {
364
173M
    if (c >= 64)
365
115M
        return(0);
366
57.7M
    return((mask[c/32] >> (c & 31)) & 1);
367
173M
}
368
369
static int
370
htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len,
371
267M
                 int partial) {
372
267M
    unsigned c = str[0];
373
267M
    int size;
374
375
267M
    if (c < 0xC2) {
376
3.45M
        goto invalid;
377
264M
    } else if (c < 0xE0) {
378
34.4M
        if (len < 2)
379
566
            goto incomplete;
380
34.4M
        if ((str[1] & 0xC0) != 0x80)
381
77.6k
            goto invalid;
382
34.3M
        size = 2;
383
229M
    } else if (c < 0xF0) {
384
229M
        unsigned v;
385
386
229M
        if (len < 3)
387
267
            goto incomplete;
388
389
229M
        v = str[1] << 8 | str[2]; /* hint to generate 16-bit load */
390
229M
        v |= c << 16;
391
392
229M
        if (((v & 0x00C0C0) != 0x008080) ||
393
229M
            ((v & 0x0F2000) == 0x000000) ||
394
229M
            ((v & 0x0F2000) == 0x0D2000))
395
26.9k
            goto invalid;
396
397
229M
        size = 3;
398
229M
    } else {
399
434k
        unsigned v;
400
401
434k
        if (len < 4)
402
1.01k
            goto incomplete;
403
404
433k
        v = c << 24 | str[1] << 16 | str[2] << 8 | str[3];
405
406
433k
        if (((v & 0x00C0C0C0) != 0x00808080) ||
407
334k
            (v < 0xF0900000) || (v >= 0xF4900000))
408
99.1k
            goto invalid;
409
410
333k
        size = 4;
411
333k
    }
412
413
263M
    return(size);
414
415
1.84k
incomplete:
416
1.84k
    if (partial)
417
1.14k
        return(0);
418
419
3.65M
invalid:
420
    /* Only report the first error */
421
3.65M
    if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
422
2.05k
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
423
2.05k
                     "Invalid bytes in character encoding\n", NULL, NULL);
424
2.05k
        ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
425
2.05k
    }
426
427
3.65M
    return(-1);
428
1.84k
}
429
430
/**
431
 * skip all blanks character found at that point in the input streams.
432
 *
433
 * @param ctxt  the HTML parser context
434
 * @returns the number of space chars skipped
435
 */
436
437
static int
438
11.4M
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
439
11.4M
    const xmlChar *cur = ctxt->input->cur;
440
11.4M
    size_t avail = ctxt->input->end - cur;
441
11.4M
    int res = 0;
442
11.4M
    int line = ctxt->input->line;
443
11.4M
    int col = ctxt->input->col;
444
445
29.7M
    while (!PARSER_STOPPED(ctxt)) {
446
29.6M
        if (avail == 0) {
447
9.47k
            ctxt->input->cur = cur;
448
9.47k
            GROW;
449
9.47k
            cur = ctxt->input->cur;
450
9.47k
            avail = ctxt->input->end - cur;
451
452
9.47k
            if (avail == 0)
453
8.13k
                break;
454
9.47k
        }
455
456
29.6M
        if (*cur == '\n') {
457
6.57M
            line++;
458
6.57M
            col = 1;
459
23.0M
        } else if (IS_WS_HTML(*cur)) {
460
11.7M
            col++;
461
11.7M
        } else {
462
11.3M
            break;
463
11.3M
        }
464
465
18.3M
        cur += 1;
466
18.3M
        avail -= 1;
467
468
18.3M
  if (res < INT_MAX)
469
18.3M
      res++;
470
18.3M
    }
471
472
11.4M
    ctxt->input->cur = cur;
473
11.4M
    ctxt->input->line = line;
474
11.4M
    ctxt->input->col = col;
475
476
11.4M
    if (res > 8)
477
52.6k
        GROW;
478
479
11.4M
    return(res);
480
11.4M
}
481
482
483
484
/************************************************************************
485
 *                  *
486
 *  The list of HTML elements and their properties    *
487
 *                  *
488
 ************************************************************************/
489
490
/*
491
 *  Start Tag: 1 means the start tag can be omitted
492
 *  End Tag:   1 means the end tag can be omitted
493
 *             2 means it's forbidden (empty elements)
494
 *             3 means the tag is stylistic and should be closed easily
495
 *  Depr:      this element is deprecated
496
 *  DTD:       1 means that this element is valid only in the Loose DTD
497
 *             2 means that this element is valid only in the Frameset DTD
498
 *
499
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
500
 */
501
502
static const htmlElemDesc
503
html40ElementTable[] = {
504
{ "a",    0, 0, 0, 0, 0, 0, 1, "anchor ",
505
  NULL, NULL, NULL, NULL, NULL,
506
  0
507
},
508
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
509
  NULL, NULL, NULL, NULL, NULL,
510
  0
511
},
512
{ "acronym",  0, 0, 0, 0, 0, 0, 1, "",
513
  NULL, NULL, NULL, NULL, NULL,
514
  0
515
},
516
{ "address",  0, 0, 0, 0, 0, 0, 0, "information on author ",
517
  NULL, NULL, NULL, NULL, NULL,
518
  0
519
},
520
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
521
  NULL, NULL, NULL, NULL, NULL,
522
  0
523
},
524
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
525
  NULL, NULL, NULL, NULL, NULL,
526
  0
527
},
528
{ "b",    0, 3, 0, 0, 0, 0, 1, "bold text style",
529
  NULL, NULL, NULL, NULL, NULL,
530
  0
531
},
532
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
533
  NULL, NULL, NULL, NULL, NULL,
534
  0
535
},
536
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
537
  NULL, NULL, NULL, NULL, NULL,
538
  0
539
},
540
{ "bdo",  0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
541
  NULL, NULL, NULL, NULL, NULL,
542
  0
543
},
544
{ "bgsound",  0, 0, 2, 1, 0, 0, 0, "",
545
  NULL, NULL, NULL, NULL, NULL,
546
  0
547
},
548
{ "big",  0, 3, 0, 0, 0, 0, 1, "large text style",
549
  NULL, NULL, NULL, NULL, NULL,
550
  0
551
},
552
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
553
  NULL, NULL, NULL, NULL, NULL,
554
  0
555
},
556
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
557
  NULL, NULL, NULL, NULL, NULL,
558
  0
559
},
560
{ "br",   0, 2, 2, 1, 0, 0, 1, "forced line break ",
561
  NULL, NULL, NULL, NULL, NULL,
562
  0
563
},
564
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
565
  NULL, NULL, NULL, NULL, NULL,
566
  0
567
},
568
{ "caption",  0, 0, 0, 0, 0, 0, 0, "table caption ",
569
  NULL, NULL, NULL, NULL, NULL,
570
  0
571
},
572
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
573
  NULL, NULL, NULL, NULL, NULL,
574
  0
575
},
576
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
577
  NULL, NULL, NULL, NULL, NULL,
578
  0
579
},
580
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
581
  NULL, NULL, NULL, NULL, NULL,
582
  0
583
},
584
{ "col",  0, 2, 2, 1, 0, 0, 0, "table column ",
585
  NULL, NULL, NULL, NULL, NULL,
586
  0
587
},
588
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
589
  NULL, NULL, NULL, NULL, NULL,
590
  0
591
},
592
{ "dd",   0, 1, 0, 0, 0, 0, 0, "definition description ",
593
  NULL, NULL, NULL, NULL, NULL,
594
  0
595
},
596
{ "del",  0, 0, 0, 0, 0, 0, 2, "deleted text ",
597
  NULL, NULL, NULL, NULL, NULL,
598
  0
599
},
600
{ "dfn",  0, 0, 0, 0, 0, 0, 1, "instance definition",
601
  NULL, NULL, NULL, NULL, NULL,
602
  0
603
},
604
{ "dir",  0, 0, 0, 0, 1, 1, 0, "directory list",
605
  NULL, NULL, NULL, NULL, NULL,
606
  0
607
},
608
{ "div",  0, 0, 0, 0, 0, 0, 0, "generic language/style container",
609
  NULL, NULL, NULL, NULL, NULL,
610
  0
611
},
612
{ "dl",   0, 0, 0, 0, 0, 0, 0, "definition list ",
613
  NULL, NULL, NULL, NULL, NULL,
614
  0
615
},
616
{ "dt",   0, 1, 0, 0, 0, 0, 0, "definition term ",
617
  NULL, NULL, NULL, NULL, NULL,
618
  0
619
},
620
{ "em",   0, 3, 0, 0, 0, 0, 1, "emphasis",
621
  NULL, NULL, NULL, NULL, NULL,
622
  0
623
},
624
{ "embed",  0, 1, 2, 1, 1, 1, 1, "generic embedded object ",
625
  NULL, NULL, NULL, NULL, NULL,
626
  0
627
},
628
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
629
  NULL, NULL, NULL, NULL, NULL,
630
  0
631
},
632
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
633
  NULL, NULL, NULL, NULL, NULL,
634
  0
635
},
636
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
637
  NULL, NULL, NULL, NULL, NULL,
638
  0
639
},
640
{ "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
641
  NULL, NULL, NULL, NULL, NULL,
642
  0
643
},
644
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
645
  NULL, NULL, NULL, NULL, NULL,
646
  0
647
},
648
{ "h1",   0, 0, 0, 0, 0, 0, 0, "heading ",
649
  NULL, NULL, NULL, NULL, NULL,
650
  0
651
},
652
{ "h2",   0, 0, 0, 0, 0, 0, 0, "heading ",
653
  NULL, NULL, NULL, NULL, NULL,
654
  0
655
},
656
{ "h3",   0, 0, 0, 0, 0, 0, 0, "heading ",
657
  NULL, NULL, NULL, NULL, NULL,
658
  0
659
},
660
{ "h4",   0, 0, 0, 0, 0, 0, 0, "heading ",
661
  NULL, NULL, NULL, NULL, NULL,
662
  0
663
},
664
{ "h5",   0, 0, 0, 0, 0, 0, 0, "heading ",
665
  NULL, NULL, NULL, NULL, NULL,
666
  0
667
},
668
{ "h6",   0, 0, 0, 0, 0, 0, 0, "heading ",
669
  NULL, NULL, NULL, NULL, NULL,
670
  0
671
},
672
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
673
  NULL, NULL, NULL, NULL, NULL,
674
  0
675
},
676
{ "hr",   0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
677
  NULL, NULL, NULL, NULL, NULL,
678
  0
679
},
680
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
681
  NULL, NULL, NULL, NULL, NULL,
682
  0
683
},
684
{ "i",    0, 3, 0, 0, 0, 0, 1, "italic text style",
685
  NULL, NULL, NULL, NULL, NULL,
686
  0
687
},
688
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
689
  NULL, NULL, NULL, NULL, NULL,
690
  DATA_RAWTEXT
691
},
692
{ "img",  0, 2, 2, 1, 0, 0, 1, "embedded image ",
693
  NULL, NULL, NULL, NULL, NULL,
694
  0
695
},
696
{ "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
697
  NULL, NULL, NULL, NULL, NULL,
698
  0
699
},
700
{ "ins",  0, 0, 0, 0, 0, 0, 2, "inserted text",
701
  NULL, NULL, NULL, NULL, NULL,
702
  0
703
},
704
{ "isindex",  0, 2, 2, 1, 1, 1, 0, "single line prompt ",
705
  NULL, NULL, NULL, NULL, NULL,
706
  0
707
},
708
{ "kbd",  0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
709
  NULL, NULL, NULL, NULL, NULL,
710
  0
711
},
712
{ "keygen", 0, 0, 2, 1, 0, 0, 0, "",
713
  NULL, NULL, NULL, NULL, NULL,
714
  0
715
},
716
{ "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
717
  NULL, NULL, NULL, NULL, NULL,
718
  0
719
},
720
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
721
  NULL, NULL, NULL, NULL, NULL,
722
  0
723
},
724
{ "li",   0, 1, 1, 0, 0, 0, 0, "list item ",
725
  NULL, NULL, NULL, NULL, NULL,
726
  0
727
},
728
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
729
  NULL, NULL, NULL, NULL, NULL,
730
  0
731
},
732
{ "map",  0, 0, 0, 0, 0, 0, 2, "client-side image map ",
733
  NULL, NULL, NULL, NULL, NULL,
734
  0
735
},
736
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
737
  NULL, NULL, NULL, NULL, NULL,
738
  0
739
},
740
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
741
  NULL, NULL, NULL, NULL, NULL,
742
  0
743
},
744
{ "noembed",  0, 0, 0, 0, 0, 0, 0, "",
745
  NULL, NULL, NULL, NULL, NULL,
746
  DATA_RAWTEXT
747
},
748
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
749
  NULL, NULL, NULL, NULL, NULL,
750
  DATA_RAWTEXT
751
},
752
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
753
  NULL, NULL, NULL, NULL, NULL,
754
  0
755
},
756
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
757
  NULL, NULL, NULL, NULL, NULL,
758
  0
759
},
760
{ "ol",   0, 0, 0, 0, 0, 0, 0, "ordered list ",
761
  NULL, NULL, NULL, NULL, NULL,
762
  0
763
},
764
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
765
  NULL, NULL, NULL, NULL, NULL,
766
  0
767
},
768
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
769
  NULL, NULL, NULL, NULL, NULL,
770
  0
771
},
772
{ "p",    0, 1, 0, 0, 0, 0, 0, "paragraph ",
773
  NULL, NULL, NULL, NULL, NULL,
774
  0
775
},
776
{ "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
777
  NULL, NULL, NULL, NULL, NULL,
778
  0
779
},
780
{ "plaintext",  0, 0, 0, 0, 0, 0, 0, "",
781
  NULL, NULL, NULL, NULL, NULL,
782
  DATA_PLAINTEXT
783
},
784
{ "pre",  0, 0, 0, 0, 0, 0, 0, "preformatted text ",
785
  NULL, NULL, NULL, NULL, NULL,
786
  0
787
},
788
{ "q",    0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
789
  NULL, NULL, NULL, NULL, NULL,
790
  0
791
},
792
{ "s",    0, 3, 0, 0, 1, 1, 1, "strike-through text style",
793
  NULL, NULL, NULL, NULL, NULL,
794
  0
795
},
796
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
797
  NULL, NULL, NULL, NULL, NULL,
798
  0
799
},
800
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
801
  NULL, NULL, NULL, NULL, NULL,
802
  DATA_SCRIPT
803
},
804
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
805
  NULL, NULL, NULL, NULL, NULL,
806
  0
807
},
808
{ "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
809
  NULL, NULL, NULL, NULL, NULL,
810
  0
811
},
812
{ "source", 0, 0, 2, 1, 0, 0, 0, "",
813
  NULL, NULL, NULL, NULL, NULL,
814
  0
815
},
816
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
817
  NULL, NULL, NULL, NULL, NULL,
818
  0
819
},
820
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
821
  NULL, NULL, NULL, NULL, NULL,
822
  0
823
},
824
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
825
  NULL, NULL, NULL, NULL, NULL,
826
  0
827
},
828
{ "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
829
  NULL, NULL, NULL, NULL, NULL,
830
  DATA_RAWTEXT
831
},
832
{ "sub",  0, 3, 0, 0, 0, 0, 1, "subscript",
833
  NULL, NULL, NULL, NULL, NULL,
834
  0
835
},
836
{ "sup",  0, 3, 0, 0, 0, 0, 1, "superscript ",
837
  NULL, NULL, NULL, NULL, NULL,
838
  0
839
},
840
{ "table",  0, 0, 0, 0, 0, 0, 0, "",
841
  NULL, NULL, NULL, NULL, NULL,
842
  0
843
},
844
{ "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
845
  NULL, NULL, NULL, NULL, NULL,
846
  0
847
},
848
{ "td",   0, 0, 0, 0, 0, 0, 0, "table data cell",
849
  NULL, NULL, NULL, NULL, NULL,
850
  0
851
},
852
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
853
  NULL, NULL, NULL, NULL, NULL,
854
  DATA_RCDATA
855
},
856
{ "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
857
  NULL, NULL, NULL, NULL, NULL,
858
  0
859
},
860
{ "th",   0, 1, 0, 0, 0, 0, 0, "table header cell",
861
  NULL, NULL, NULL, NULL, NULL,
862
  0
863
},
864
{ "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
865
  NULL, NULL, NULL, NULL, NULL,
866
  0
867
},
868
{ "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
869
  NULL, NULL, NULL, NULL, NULL,
870
  DATA_RCDATA
871
},
872
{ "tr",   0, 0, 0, 0, 0, 0, 0, "table row ",
873
  NULL, NULL, NULL, NULL, NULL,
874
  0
875
},
876
{ "track",  0, 0, 2, 1, 0, 0, 0, "",
877
  NULL, NULL, NULL, NULL, NULL,
878
  0
879
},
880
{ "tt",   0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
881
  NULL, NULL, NULL, NULL, NULL,
882
  0
883
},
884
{ "u",    0, 3, 0, 0, 1, 1, 1, "underlined text style",
885
  NULL, NULL, NULL, NULL, NULL,
886
  0
887
},
888
{ "ul",   0, 0, 0, 0, 0, 0, 0, "unordered list ",
889
  NULL, NULL, NULL, NULL, NULL,
890
  0
891
},
892
{ "var",  0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
893
  NULL, NULL, NULL, NULL, NULL,
894
  0
895
},
896
{ "wbr",  0, 0, 2, 1, 0, 0, 0, "",
897
  NULL, NULL, NULL, NULL, NULL,
898
  0
899
},
900
{ "xmp",  0, 0, 0, 0, 0, 0, 1, "",
901
  NULL, NULL, NULL, NULL, NULL,
902
  DATA_RAWTEXT
903
}
904
};
905
906
/* One "start tag implies end of element" rule. */
typedef struct {
    const char *oldTag;     /* first name of the pair */
    const char *newTag;     /* second name of the pair */
} htmlStartCloseEntry;

/*
 * start tags that imply the end of current element
 *
 * The pairs are listed one oldTag per line (long groups wrap) and are
 * sorted by oldTag, then by newTag.
 * NOTE(review): lookups elsewhere in the file may depend on this
 * ordering; keep the table sorted when adding pairs.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" },
    { "option", "optgroup" }, { "option", "option" },
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    { "tt", "p" },
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1165
1166
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check #htmlIsScriptAttribute since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse events */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard events */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form events */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1191
1192
/*
 * End-tag priorities, used by the HTML parser to recover from broken
 * pages. Each element is given a priority so that the parser can decide
 * what to do with an unexpected end tag: an end tag may only close
 * elements whose priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;   /* element name; NULL marks the last entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }  /* Default priority, also terminates the table */
};
1219
1220
/************************************************************************
1221
 *                  *
1222
 *  functions to handle HTML specific data      *
1223
 *                  *
1224
 ************************************************************************/
1225
1226
static void
1227
200k
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
1228
    /*
1229
     * Capture end position and add node
1230
     */
1231
200k
    if ( ctxt->node != NULL && ctxt->record_info ) {
1232
0
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
1233
0
                                (CUR_PTR - ctxt->input->base);
1234
0
       ctxt->nodeInfo->end_line = ctxt->input->line;
1235
0
       ctxt->nodeInfo->node = ctxt->node;
1236
0
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
1237
0
       htmlNodeInfoPop(ctxt);
1238
0
    }
1239
200k
}
1240
1241
/**
 * Takes no arguments and has no effect.
 *
 * @deprecated This is a no-op.
 */
void
htmlInitAutoClose(void) {
    /* Intentionally empty. */
    return;
}
1247
1248
static int
1249
4.07M
htmlCompareTags(const void *key, const void *member) {
1250
4.07M
    const xmlChar *tag = (const xmlChar *) key;
1251
4.07M
    const htmlElemDesc *desc = (const htmlElemDesc *) member;
1252
1253
4.07M
    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1254
4.07M
}
1255
1256
/**
1257
 * Lookup the HTML tag in the ElementTable
1258
 *
1259
 * @deprecated Only supports HTML 4.
1260
 *
1261
 * @param tag  The tag name in lowercase
1262
 * @returns the related htmlElemDesc or NULL if not found.
1263
 */
1264
const htmlElemDesc *
1265
645k
htmlTagLookup(const xmlChar *tag) {
1266
645k
    if (tag == NULL)
1267
523
        return(NULL);
1268
1269
644k
    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1270
644k
                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1271
644k
                sizeof(htmlElemDesc), htmlCompareTags));
1272
645k
}
1273
1274
/**
1275
 * @param name  The name of the element to look up the priority for.
1276
 * @returns value: The "endtag" priority.
1277
 **/
1278
static int
1279
45.9k
htmlGetEndPriority (const xmlChar *name) {
1280
45.9k
    int i = 0;
1281
1282
534k
    while ((htmlEndPriority[i].name != NULL) &&
1283
492k
     (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1284
488k
  i++;
1285
1286
45.9k
    return(htmlEndPriority[i].priority);
1287
45.9k
}
1288
1289
1290
static int
1291
1.47M
htmlCompareStartClose(const void *vkey, const void *member) {
1292
1.47M
    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1293
1.47M
    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1294
1.47M
    int ret;
1295
1296
1.47M
    ret = strcmp(key->oldTag, entry->oldTag);
1297
1.47M
    if (ret == 0)
1298
126k
        ret = strcmp(key->newTag, entry->newTag);
1299
1300
1.47M
    return(ret);
1301
1.47M
}
1302
1303
/**
1304
 * Checks whether the new tag is one of the registered valid tags for
1305
 * closing old.
1306
 *
1307
 * @param newtag  The new tag name
1308
 * @param oldtag  The old tag name
1309
 * @returns 0 if no, 1 if yes.
1310
 */
1311
static int
1312
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1313
195k
{
1314
195k
    htmlStartCloseEntry key;
1315
195k
    void *res;
1316
1317
195k
    key.oldTag = (const char *) oldtag;
1318
195k
    key.newTag = (const char *) newtag;
1319
195k
    res = bsearch(&key, htmlStartClose,
1320
195k
            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1321
195k
            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1322
195k
    return(res != NULL);
1323
195k
}
1324
1325
/**
1326
 * The HTML DTD allows an ending tag to implicitly close other tags.
1327
 *
1328
 * @param ctxt  an HTML parser context
1329
 * @param newtag  The new tag name
1330
 */
1331
static void
1332
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1333
15.1k
{
1334
15.1k
    const htmlElemDesc *info;
1335
15.1k
    int i, priority;
1336
1337
15.1k
    if (ctxt->options & HTML_PARSE_HTML5)
1338
0
        return;
1339
1340
15.1k
    priority = htmlGetEndPriority(newtag);
1341
1342
45.2k
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1343
1344
45.2k
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1345
14.4k
            break;
1346
        /*
1347
         * A misplaced endtag can only close elements with lower
1348
         * or equal priority, so if we find an element with higher
1349
         * priority before we find an element with
1350
         * matching name, we just ignore this endtag
1351
         */
1352
30.8k
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1353
686
            return;
1354
30.8k
    }
1355
14.4k
    if (i < 0)
1356
0
        return;
1357
1358
31.0k
    while (!xmlStrEqual(newtag, ctxt->name)) {
1359
16.6k
        info = htmlTagLookup(ctxt->name);
1360
16.6k
        if ((info != NULL) && (info->endTag == 3)) {
1361
5.84k
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1362
5.84k
                   "Opening and ending tag mismatch: %s and %s\n",
1363
5.84k
       newtag, ctxt->name);
1364
5.84k
        }
1365
16.6k
  htmlParserFinishElementParsing(ctxt);
1366
16.6k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1367
16.6k
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1368
16.6k
  htmlnamePop(ctxt);
1369
16.6k
    }
1370
14.4k
}
1371
1372
/**
1373
 * Close all remaining tags at the end of the stream
1374
 *
1375
 * @param ctxt  an HTML parser context
1376
 */
1377
static void
1378
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1379
18.0k
{
1380
18.0k
    int i;
1381
1382
18.0k
    if (ctxt->options & HTML_PARSE_HTML5)
1383
6.94k
        return;
1384
1385
11.0k
    if (ctxt->nameNr == 0)
1386
3.16k
        return;
1387
138k
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1388
130k
  htmlParserFinishElementParsing(ctxt);
1389
130k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1390
130k
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1391
130k
  htmlnamePop(ctxt);
1392
130k
    }
1393
7.89k
}
1394
1395
/**
1396
 * The HTML DTD allows a tag to implicitly close other tags.
1397
 * The list is kept in htmlStartClose array. This function is
1398
 * called when a new tag has been detected and generates the
1399
 * appropriates closes if possible/needed.
1400
 * If newtag is NULL this mean we are at the end of the resource
1401
 * and we should check
1402
 *
1403
 * @param ctxt  an HTML parser context
1404
 * @param newtag  The new tag name or NULL
1405
 */
1406
static void
1407
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1408
189k
{
1409
189k
    if (ctxt->options & HTML_PARSE_HTML5)
1410
0
        return;
1411
1412
189k
    if (newtag == NULL)
1413
0
        return;
1414
1415
206k
    while ((ctxt->name != NULL) &&
1416
195k
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1417
16.3k
  htmlParserFinishElementParsing(ctxt);
1418
16.3k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419
16.3k
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420
16.3k
  htmlnamePop(ctxt);
1421
16.3k
    }
1422
189k
}
1423
1424
/**
1425
 * The HTML DTD allows a tag to implicitly close other tags.
1426
 * The list is kept in htmlStartClose array. This function checks
1427
 * if the element or one of it's children would autoclose the
1428
 * given tag.
1429
 *
1430
 * @deprecated Internal function, don't use.
1431
 *
1432
 * @param doc  the HTML document
1433
 * @param name  The tag name
1434
 * @param elem  the HTML element
1435
 * @returns 1 if autoclose, 0 otherwise
1436
 */
1437
int
1438
0
htmlAutoCloseTag(xmlDoc *doc, const xmlChar *name, xmlNode *elem) {
1439
0
    htmlNodePtr child;
1440
1441
0
    if (elem == NULL) return(1);
1442
0
    if (xmlStrEqual(name, elem->name)) return(0);
1443
0
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1444
0
    child = elem->children;
1445
0
    while (child != NULL) {
1446
0
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1447
0
  child = child->next;
1448
0
    }
1449
0
    return(0);
1450
0
}
1451
1452
/**
1453
 * The HTML DTD allows a tag to implicitly close other tags.
1454
 * The list is kept in htmlStartClose array. This function checks
1455
 * if a tag is autoclosed by one of it's child
1456
 *
1457
 * @deprecated Internal function, don't use.
1458
 *
1459
 * @param doc  the HTML document
1460
 * @param elem  the HTML element
1461
 * @returns 1 if autoclosed, 0 otherwise
1462
 */
1463
int
1464
0
htmlIsAutoClosed(xmlDoc *doc, xmlNode *elem) {
1465
0
    htmlNodePtr child;
1466
1467
0
    if (elem == NULL) return(1);
1468
0
    child = elem->children;
1469
0
    while (child != NULL) {
1470
0
  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471
0
  child = child->next;
1472
0
    }
1473
0
    return(0);
1474
0
}
1475
1476
/**
1477
 * The HTML DTD allows a tag to exists only implicitly
1478
 * called when a new tag has been detected and generates the
1479
 * appropriates implicit tags if missing
1480
 *
1481
 * @param ctxt  an HTML parser context
1482
 * @param newtag  The new tag name
1483
 */
1484
static void
1485
220k
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1486
220k
    int i;
1487
1488
220k
    if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1489
38.4k
        return;
1490
182k
    if (!htmlOmittedDefaultValue)
1491
0
  return;
1492
182k
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1493
848
  return;
1494
181k
    if (ctxt->nameNr <= 0) {
1495
7.26k
  htmlnamePush(ctxt, BAD_CAST"html");
1496
7.26k
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497
7.26k
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1498
7.26k
    }
1499
181k
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1500
3.74k
        return;
1501
177k
    if ((ctxt->nameNr <= 1) &&
1502
12.3k
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1503
12.0k
   (xmlStrEqual(newtag, BAD_CAST"style")) ||
1504
11.8k
   (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1505
11.1k
   (xmlStrEqual(newtag, BAD_CAST"link")) ||
1506
10.9k
   (xmlStrEqual(newtag, BAD_CAST"title")) ||
1507
10.7k
   (xmlStrEqual(newtag, BAD_CAST"base")))) {
1508
1.85k
        if (ctxt->html >= INSERT_IN_HEAD) {
1509
            /* we already saw or generated an <head> before */
1510
1.21k
            return;
1511
1.21k
        }
1512
        /*
1513
         * dropped OBJECT ... i you put it first BODY will be
1514
         * assumed !
1515
         */
1516
637
        htmlnamePush(ctxt, BAD_CAST"head");
1517
637
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1518
637
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1519
175k
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1520
175k
         (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1521
175k
         (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1522
175k
        if (ctxt->html >= INSERT_IN_BODY) {
1523
            /* we already saw or generated a <body> before */
1524
159k
            return;
1525
159k
        }
1526
31.3k
  for (i = 0;i < ctxt->nameNr;i++) {
1527
25.0k
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1528
0
    return;
1529
0
      }
1530
25.0k
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1531
9.39k
    return;
1532
9.39k
      }
1533
25.0k
  }
1534
1535
6.29k
  htmlnamePush(ctxt, BAD_CAST"body");
1536
6.29k
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1537
6.29k
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1538
6.29k
    }
1539
177k
}
1540
1541
/**
1542
 * Prepare for non-whitespace character data.
1543
 *
1544
 * @param ctxt  an HTML parser context
1545
 */
1546
1547
static void
1548
2.35M
htmlStartCharData(htmlParserCtxtPtr ctxt) {
1549
2.35M
    if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1550
2.31M
        return;
1551
31.5k
    if (!htmlOmittedDefaultValue)
1552
0
  return;
1553
1554
31.5k
    if (xmlStrEqual(ctxt->name, BAD_CAST "head"))
1555
666
        htmlAutoClose(ctxt, BAD_CAST "p");
1556
31.5k
    htmlCheckImplied(ctxt, BAD_CAST "p");
1557
31.5k
}
1558
1559
/**
1560
 * Check if an attribute is of content type Script
1561
 *
1562
 * @deprecated Only supports HTML 4.
1563
 *
1564
 * @param name  an attribute name
1565
 * @returns 1 is the attribute is a script 0 otherwise
1566
 */
1567
int
1568
0
htmlIsScriptAttribute(const xmlChar *name) {
1569
0
    unsigned int i;
1570
1571
0
    if (name == NULL)
1572
0
      return(0);
1573
    /*
1574
     * all script attributes start with 'on'
1575
     */
1576
0
    if ((name[0] != 'o') || (name[1] != 'n'))
1577
0
      return(0);
1578
0
    for (i = 0;
1579
0
   i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1580
0
   i++) {
1581
0
  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1582
0
      return(1);
1583
0
    }
1584
0
    return(0);
1585
0
}
1586
1587
/************************************************************************
1588
 *                  *
1589
 *  The list of HTML predefined entities      *
1590
 *                  *
1591
 ************************************************************************/
1592
1593
1594
/*
 * The HTML 4 predefined entities. Each entry holds the character value,
 * the entity name and a description string.
 *
 * The entries are listed by ascending character value and must stay in
 * that order: htmlEntityValueLookup() searches this table with
 * bsearch().
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
/*
 * the 4 absolute ones, plus apostrophe.
 */
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
{ 38, "amp", "ampersand, U+0026 ISOnum" },
{ 39, "apos", "single quote" },
{ 60, "lt", "less-than sign, U+003C ISOnum" },
{ 62, "gt", "greater-than sign, U+003E ISOnum" },

/*
 * A bunch still in the 128-255 range
 * Replacing them really depends on the charset used.
 */
{ 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
{ 161, "iexcl", "inverted exclamation mark, U+00A1 ISOnum" },
{ 162, "cent", "cent sign, U+00A2 ISOnum" },
{ 163, "pound", "pound sign, U+00A3 ISOnum" },
{ 164, "curren", "currency sign, U+00A4 ISOnum" },
{ 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
{ 166, "brvbar", "broken bar = broken vertical bar, U+00A6 ISOnum" },
{ 167, "sect", "section sign, U+00A7 ISOnum" },
{ 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
{ 169, "copy", "copyright sign, U+00A9 ISOnum" },
{ 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
{ 171, "laquo", "left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
{ 172, "not", "not sign, U+00AC ISOnum" },
{ 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
{ 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
{ 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
{ 176, "deg", "degree sign, U+00B0 ISOnum" },
{ 177, "plusmn", "plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
{ 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
{ 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
{ 180, "acute", "acute accent = spacing acute, U+00B4 ISOdia" },
{ 181, "micro", "micro sign, U+00B5 ISOnum" },
{ 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
{ 183, "middot", "middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
{ 184, "cedil", "cedilla = spacing cedilla, U+00B8 ISOdia" },
{ 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
{ 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
{ 187, "raquo", "right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
{ 188, "frac14", "vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
{ 189, "frac12", "vulgar fraction one half = fraction one half, U+00BD ISOnum" },
{ 190, "frac34", "vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
{ 191, "iquest", "inverted question mark = turned question mark, U+00BF ISOnum" },
{ 192, "Agrave", "latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
{ 193, "Aacute", "latin capital letter A with acute, U+00C1 ISOlat1" },
{ 194, "Acirc", "latin capital letter A with circumflex, U+00C2 ISOlat1" },
{ 195, "Atilde", "latin capital letter A with tilde, U+00C3 ISOlat1" },
{ 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
{ 197, "Aring", "latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
{ 198, "AElig", "latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
{ 199, "Ccedil", "latin capital letter C with cedilla, U+00C7 ISOlat1" },
{ 200, "Egrave", "latin capital letter E with grave, U+00C8 ISOlat1" },
{ 201, "Eacute", "latin capital letter E with acute, U+00C9 ISOlat1" },
{ 202, "Ecirc", "latin capital letter E with circumflex, U+00CA ISOlat1" },
{ 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
{ 204, "Igrave", "latin capital letter I with grave, U+00CC ISOlat1" },
{ 205, "Iacute", "latin capital letter I with acute, U+00CD ISOlat1" },
{ 206, "Icirc", "latin capital letter I with circumflex, U+00CE ISOlat1" },
{ 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
{ 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
{ 209, "Ntilde", "latin capital letter N with tilde, U+00D1 ISOlat1" },
{ 210, "Ograve", "latin capital letter O with grave, U+00D2 ISOlat1" },
{ 211, "Oacute", "latin capital letter O with acute, U+00D3 ISOlat1" },
{ 212, "Ocirc", "latin capital letter O with circumflex, U+00D4 ISOlat1" },
{ 213, "Otilde", "latin capital letter O with tilde, U+00D5 ISOlat1" },
{ 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
{ 215, "times", "multiplication sign, U+00D7 ISOnum" },
{ 216, "Oslash", "latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
{ 217, "Ugrave", "latin capital letter U with grave, U+00D9 ISOlat1" },
{ 218, "Uacute", "latin capital letter U with acute, U+00DA ISOlat1" },
{ 219, "Ucirc", "latin capital letter U with circumflex, U+00DB ISOlat1" },
{ 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
{ 221, "Yacute", "latin capital letter Y with acute, U+00DD ISOlat1" },
{ 222, "THORN", "latin capital letter THORN, U+00DE ISOlat1" },
{ 223, "szlig", "latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
{ 224, "agrave", "latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
{ 225, "aacute", "latin small letter a with acute, U+00E1 ISOlat1" },
{ 226, "acirc", "latin small letter a with circumflex, U+00E2 ISOlat1" },
{ 227, "atilde", "latin small letter a with tilde, U+00E3 ISOlat1" },
{ 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
{ 229, "aring", "latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
{ 230, "aelig", "latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
{ 231, "ccedil", "latin small letter c with cedilla, U+00E7 ISOlat1" },
{ 232, "egrave", "latin small letter e with grave, U+00E8 ISOlat1" },
{ 233, "eacute", "latin small letter e with acute, U+00E9 ISOlat1" },
{ 234, "ecirc", "latin small letter e with circumflex, U+00EA ISOlat1" },
{ 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
{ 236, "igrave", "latin small letter i with grave, U+00EC ISOlat1" },
{ 237, "iacute", "latin small letter i with acute, U+00ED ISOlat1" },
{ 238, "icirc", "latin small letter i with circumflex, U+00EE ISOlat1" },
{ 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
{ 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
{ 241, "ntilde", "latin small letter n with tilde, U+00F1 ISOlat1" },
{ 242, "ograve", "latin small letter o with grave, U+00F2 ISOlat1" },
{ 243, "oacute", "latin small letter o with acute, U+00F3 ISOlat1" },
{ 244, "ocirc", "latin small letter o with circumflex, U+00F4 ISOlat1" },
{ 245, "otilde", "latin small letter o with tilde, U+00F5 ISOlat1" },
{ 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
{ 247, "divide", "division sign, U+00F7 ISOnum" },
{ 248, "oslash", "latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
{ 249, "ugrave", "latin small letter u with grave, U+00F9 ISOlat1" },
{ 250, "uacute", "latin small letter u with acute, U+00FA ISOlat1" },
{ 251, "ucirc", "latin small letter u with circumflex, U+00FB ISOlat1" },
{ 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
{ 253, "yacute", "latin small letter y with acute, U+00FD ISOlat1" },
{ 254, "thorn", "latin small letter thorn with, U+00FE ISOlat1" },
{ 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

{ 338, "OElig", "latin capital ligature OE, U+0152 ISOlat2" },
{ 339, "oelig", "latin small ligature oe, U+0153 ISOlat2" },
{ 352, "Scaron", "latin capital letter S with caron, U+0160 ISOlat2" },
{ 353, "scaron", "latin small letter s with caron, U+0161 ISOlat2" },
{ 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

/*
 * Anything below should really be kept as entities references
 */
{ 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

{ 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
{ 732, "tilde", "small tilde, U+02DC ISOdia" },

{ 913, "Alpha", "greek capital letter alpha, U+0391" },
{ 914, "Beta", "greek capital letter beta, U+0392" },
{ 915, "Gamma", "greek capital letter gamma, U+0393 ISOgrk3" },
{ 916, "Delta", "greek capital letter delta, U+0394 ISOgrk3" },
{ 917, "Epsilon", "greek capital letter epsilon, U+0395" },
{ 918, "Zeta", "greek capital letter zeta, U+0396" },
{ 919, "Eta", "greek capital letter eta, U+0397" },
{ 920, "Theta", "greek capital letter theta, U+0398 ISOgrk3" },
{ 921, "Iota", "greek capital letter iota, U+0399" },
{ 922, "Kappa", "greek capital letter kappa, U+039A" },
{ 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
{ 924, "Mu", "greek capital letter mu, U+039C" },
{ 925, "Nu", "greek capital letter nu, U+039D" },
{ 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
{ 927, "Omicron", "greek capital letter omicron, U+039F" },
{ 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
{ 929, "Rho", "greek capital letter rho, U+03A1" },
{ 931, "Sigma", "greek capital letter sigma, U+03A3 ISOgrk3" },
{ 932, "Tau", "greek capital letter tau, U+03A4" },
{ 933, "Upsilon", "greek capital letter upsilon, U+03A5 ISOgrk3" },
{ 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
{ 935, "Chi", "greek capital letter chi, U+03A7" },
{ 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
{ 937, "Omega", "greek capital letter omega, U+03A9 ISOgrk3" },

{ 945, "alpha", "greek small letter alpha, U+03B1 ISOgrk3" },
{ 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
{ 947, "gamma", "greek small letter gamma, U+03B3 ISOgrk3" },
{ 948, "delta", "greek small letter delta, U+03B4 ISOgrk3" },
{ 949, "epsilon", "greek small letter epsilon, U+03B5 ISOgrk3" },
{ 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
{ 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
{ 952, "theta", "greek small letter theta, U+03B8 ISOgrk3" },
{ 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
{ 954, "kappa", "greek small letter kappa, U+03BA ISOgrk3" },
{ 955, "lambda", "greek small letter lambda, U+03BB ISOgrk3" },
{ 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
{ 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
{ 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
{ 959, "omicron", "greek small letter omicron, U+03BF NEW" },
{ 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
{ 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
{ 962, "sigmaf", "greek small letter final sigma, U+03C2 ISOgrk3" },
{ 963, "sigma", "greek small letter sigma, U+03C3 ISOgrk3" },
{ 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
{ 965, "upsilon", "greek small letter upsilon, U+03C5 ISOgrk3" },
{ 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
{ 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
{ 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
{ 969, "omega", "greek small letter omega, U+03C9 ISOgrk3" },
{ 977, "thetasym", "greek small letter theta symbol, U+03D1 NEW" },
{ 978, "upsih", "greek upsilon with hook symbol, U+03D2 NEW" },
{ 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

{ 8194, "ensp", "en space, U+2002 ISOpub" },
{ 8195, "emsp", "em space, U+2003 ISOpub" },
{ 8201, "thinsp", "thin space, U+2009 ISOpub" },
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
{ 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
{ 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
{ 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
{ 8211, "ndash", "en dash, U+2013 ISOpub" },
{ 8212, "mdash", "em dash, U+2014 ISOpub" },
{ 8216, "lsquo", "left single quotation mark, U+2018 ISOnum" },
{ 8217, "rsquo", "right single quotation mark, U+2019 ISOnum" },
{ 8218, "sbquo", "single low-9 quotation mark, U+201A NEW" },
{ 8220, "ldquo", "left double quotation mark, U+201C ISOnum" },
{ 8221, "rdquo", "right double quotation mark, U+201D ISOnum" },
{ 8222, "bdquo", "double low-9 quotation mark, U+201E NEW" },
{ 8224, "dagger", "dagger, U+2020 ISOpub" },
{ 8225, "Dagger", "double dagger, U+2021 ISOpub" },

{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
{ 8230, "hellip", "horizontal ellipsis = three dot leader, U+2026 ISOpub" },

{ 8240, "permil", "per mille sign, U+2030 ISOtech" },

{ 8242, "prime", "prime = minutes = feet, U+2032 ISOtech" },
{ 8243, "Prime", "double prime = seconds = inches, U+2033 ISOtech" },

{ 8249, "lsaquo", "single left-pointing angle quotation mark, U+2039 ISO proposed" },
{ 8250, "rsaquo", "single right-pointing angle quotation mark, U+203A ISO proposed" },

{ 8254, "oline", "overline = spacing overscore, U+203E NEW" },
{ 8260, "frasl", "fraction slash, U+2044 NEW" },

{ 8364, "euro", "euro sign, U+20AC NEW" },

{ 8465, "image", "blackletter capital I = imaginary part, U+2111 ISOamso" },
{ 8472, "weierp", "script capital P = power set = Weierstrass p, U+2118 ISOamso" },
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
{ 8482, "trade", "trade mark sign, U+2122 ISOnum" },
{ 8501, "alefsym", "alef symbol = first transfinite cardinal, U+2135 NEW" },
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
{ 8629, "crarr", "downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

{ 8704, "forall", "for all, U+2200 ISOtech" },
{ 8706, "part", "partial differential, U+2202 ISOtech" },
{ 8707, "exist", "there exists, U+2203 ISOtech" },
{ 8709, "empty", "empty set = null set = diameter, U+2205 ISOamso" },
{ 8711, "nabla", "nabla = backward difference, U+2207 ISOtech" },
{ 8712, "isin", "element of, U+2208 ISOtech" },
{ 8713, "notin", "not an element of, U+2209 ISOtech" },
{ 8715, "ni", "contains as member, U+220B ISOtech" },
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
{ 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
{ 8722, "minus", "minus sign, U+2212 ISOtech" },
{ 8727, "lowast", "asterisk operator, U+2217 ISOtech" },
{ 8730, "radic", "square root = radical sign, U+221A ISOtech" },
{ 8733, "prop", "proportional to, U+221D ISOtech" },
{ 8734, "infin", "infinity, U+221E ISOtech" },
{ 8736, "ang", "angle, U+2220 ISOamso" },
{ 8743, "and", "logical and = wedge, U+2227 ISOtech" },
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
{ 8745, "cap", "intersection = cap, U+2229 ISOtech" },
{ 8746, "cup", "union = cup, U+222A ISOtech" },
{ 8747, "int", "integral, U+222B ISOtech" },
{ 8756, "there4", "therefore, U+2234 ISOtech" },
{ 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
{ 8776, "asymp", "almost equal to = asymptotic to, U+2248 ISOamsr" },
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
{ 8801, "equiv", "identical to, U+2261 ISOtech" },
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
{ 8834, "sub", "subset of, U+2282 ISOtech" },
{ 8835, "sup", "superset of, U+2283 ISOtech" },
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
{ 8853, "oplus", "circled plus = direct sum, U+2295 ISOamsb" },
{ 8855, "otimes", "circled times = vector product, U+2297 ISOamsb" },
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
{ 8968, "lceil", "left ceiling = apl upstile, U+2308 ISOamsc" },
{ 8969, "rceil", "right ceiling, U+2309 ISOamsc" },
{ 8970, "lfloor", "left floor = apl downstile, U+230A ISOamsc" },
{ 8971, "rfloor", "right floor, U+230B ISOamsc" },
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
{ 9674, "loz", "lozenge, U+25CA ISOpub" },

{ 9824, "spades", "black spade suit, U+2660 ISOpub" },
{ 9827, "clubs", "black club suit = shamrock, U+2663 ISOpub" },
{ 9829, "hearts", "black heart suit = valentine, U+2665 ISOpub" },
{ 9830, "diams", "black diamond suit, U+2666 ISOpub" },

};
1876
1877
/************************************************************************
1878
 *                  *
1879
 *    Commodity functions to handle entities      *
1880
 *                  *
1881
 ************************************************************************/
1882
1883
/**
1884
 * Lookup the given entity in EntitiesTable
1885
 *
1886
 * @deprecated Only supports HTML 4.
1887
 *
1888
 * TODO: the linear scan is really ugly, an hash table is really needed.
1889
 *
1890
 * @param name  the entity name
1891
 * @returns the associated htmlEntityDesc if found, NULL otherwise.
1892
 */
1893
const htmlEntityDesc *
1894
0
htmlEntityLookup(const xmlChar *name) {
1895
0
    unsigned int i;
1896
1897
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
1898
0
                    sizeof(html40EntitiesTable[0]));i++) {
1899
0
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1900
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1901
0
  }
1902
0
    }
1903
0
    return(NULL);
1904
0
}
1905
1906
static int
1907
280M
htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
1908
280M
    const unsigned *key = vkey;
1909
280M
    const htmlEntityDesc *desc = vdesc;
1910
1911
280M
    return((int) *key - (int) desc->value);
1912
280M
}
1913
1914
/**
1915
 * Look up the entity with the given Unicode value in html40EntitiesTable
1916
 *
1917
 * @deprecated Only supports HTML 4.
1918
 *
1919
 * The table is searched with bsearch(), so it must stay sorted by value.
1920
 *
1921
 * @param value  the entity's unicode value
1922
 * @returns the associated htmlEntityDesc if found, NULL otherwise.
1923
 */
1924
const htmlEntityDesc *
1925
37.5M
htmlEntityValueLookup(unsigned int value) {
1926
37.5M
    const htmlEntityDesc *desc;
1927
37.5M
    size_t nmemb;
1928
1929
37.5M
    nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
1930
37.5M
    desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
1931
37.5M
                   htmlCompareEntityDesc);
1932
1933
37.5M
    return(desc);
1934
37.5M
}
1935
1936
/**
1937
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938
 * plus HTML entities block of chars out.
1939
 *
1940
 * @deprecated Internal function, don't use.
1941
 *
1942
 * @param out  a pointer to an array of bytes to store the result
1943
 * @param outlen  the length of `out`
1944
 * @param in  a pointer to an array of UTF-8 chars
1945
 * @param inlen  the length of `in`
1946
 * @returns the number of octets written to `out` on success, XML_ENC_ERR_SPACE if `out` is too small, XML_ENC_ERR_INTERNAL if `out`, `outlen` or `inlen` is NULL
1947
 * The value of `inlen` after return is the number of octets consumed
1948
 *     as the return value is positive, else unpredictable.
1949
 * The value of `outlen` after return is the number of octets written to `out`.
1950
 */
1951
int
1952
htmlUTF8ToHtml(unsigned char* out, int *outlen,
1953
1.15M
               const unsigned char* in, int *inlen) {
1954
1.15M
    const unsigned char* instart = in;
1955
1.15M
    const unsigned char* inend;
1956
1.15M
    unsigned char* outstart = out;
1957
1.15M
    unsigned char* outend;
1958
1.15M
    int ret = XML_ENC_ERR_SPACE;
1959
1960
1.15M
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
1961
0
        return(XML_ENC_ERR_INTERNAL);
1962
1963
1.15M
    if (in == NULL) {
1964
        /*
1965
   * initialization nothing to do
1966
   */
1967
12.5k
  *outlen = 0;
1968
12.5k
  *inlen = 0;
1969
12.5k
  return(XML_ENC_ERR_SUCCESS);
1970
12.5k
    }
1971
1972
1.14M
    inend = in + *inlen;
1973
1.14M
    outend = out + *outlen;
1974
252M
    while (in < inend) {
1975
251M
        const htmlEntityDesc *ent;
1976
251M
        const char *cp;
1977
251M
        char nbuf[16];
1978
251M
        unsigned c, d;
1979
251M
        int seqlen, len, i;
1980
1981
251M
  d = *in;
1982
1983
251M
  if (d < 0x80) {
1984
213M
            if (out >= outend)
1985
0
                goto done;
1986
213M
            *out++ = d;
1987
213M
            in += 1;
1988
213M
            continue;
1989
213M
        }
1990
1991
37.5M
        if (d < 0xE0)      { c = d & 0x1F; seqlen = 2; }
1992
15.7M
        else if (d < 0xF0) { c = d & 0x0F; seqlen = 3; }
1993
15.7M
        else               { c = d & 0x07; seqlen = 4; }
1994
1995
37.5M
  if (inend - in < seqlen)
1996
45.0k
      break;
1997
1998
106M
  for (i = 1; i < seqlen; i++) {
1999
68.9M
      d = in[i];
2000
68.9M
      c <<= 6;
2001
68.9M
      c |= d & 0x3F;
2002
68.9M
  }
2003
2004
        /*
2005
         * Try to lookup a predefined HTML entity for it
2006
         */
2007
37.5M
        ent = htmlEntityValueLookup(c);
2008
2009
37.5M
        if (ent == NULL) {
2010
33.3M
          snprintf(nbuf, sizeof(nbuf), "#%u", c);
2011
33.3M
          cp = nbuf;
2012
33.3M
        } else {
2013
4.14M
          cp = ent->name;
2014
4.14M
        }
2015
2016
37.5M
        len = strlen(cp);
2017
37.5M
        if (outend - out < len + 2)
2018
0
            goto done;
2019
2020
37.5M
        *out++ = '&';
2021
37.5M
        memcpy(out, cp, len);
2022
37.5M
        out += len;
2023
37.5M
        *out++ = ';';
2024
2025
37.5M
        in += seqlen;
2026
37.5M
    }
2027
2028
1.14M
    ret = out - outstart;
2029
2030
1.14M
done:
2031
1.14M
    *outlen = out - outstart;
2032
1.14M
    *inlen = in - instart;
2033
1.14M
    return(ret);
2034
1.14M
}
2035
2036
/**
2037
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2038
 * plus HTML entities block of chars out.
2039
 *
2040
 * @deprecated Only supports HTML 4.
2041
 *
2042
 * @param out  a pointer to an array of bytes to store the result
2043
 * @param outlen  the length of `out`
2044
 * @param in  a pointer to an array of UTF-8 chars
2045
 * @param inlen  the length of `in`
2046
 * @param quoteChar  the quote character to escape (' or ") or zero.
2047
 * @returns 0 on success, -2 if the input is not valid UTF-8, or -1 if a required argument is NULL
2048
 * The value of `inlen` after return is the number of octets consumed
2049
 *     as the return value is positive, else unpredictable.
2050
 * The value of `outlen` after return is the number of octets written to `out`.
2051
 */
2052
int
2053
htmlEncodeEntities(unsigned char* out, int *outlen,
2054
0
       const unsigned char* in, int *inlen, int quoteChar) {
2055
0
    const unsigned char* processed = in;
2056
0
    const unsigned char* outend;
2057
0
    const unsigned char* outstart = out;
2058
0
    const unsigned char* instart = in;
2059
0
    const unsigned char* inend;
2060
0
    unsigned int c, d;
2061
0
    int trailing;
2062
2063
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2064
0
        return(-1);
2065
0
    outend = out + (*outlen);
2066
0
    inend = in + (*inlen);
2067
0
    while (in < inend) {
2068
0
  d = *in++;
2069
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2070
0
  else if (d < 0xC0) {
2071
      /* trailing byte in leading position */
2072
0
      *outlen = out - outstart;
2073
0
      *inlen = processed - instart;
2074
0
      return(-2);
2075
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2076
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2077
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2078
0
  else {
2079
      /* no chance for this in Ascii */
2080
0
      *outlen = out - outstart;
2081
0
      *inlen = processed - instart;
2082
0
      return(-2);
2083
0
  }
2084
2085
0
  if (inend - in < trailing)
2086
0
      break;
2087
2088
0
  while (trailing--) {
2089
0
      if (((d= *in++) & 0xC0) != 0x80) {
2090
0
    *outlen = out - outstart;
2091
0
    *inlen = processed - instart;
2092
0
    return(-2);
2093
0
      }
2094
0
      c <<= 6;
2095
0
      c |= d & 0x3F;
2096
0
  }
2097
2098
  /* assertion: c is a single UCS-4 value */
2099
0
  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2100
0
      (c != '&') && (c != '<') && (c != '>')) {
2101
0
      if (out >= outend)
2102
0
    break;
2103
0
      *out++ = c;
2104
0
  } else {
2105
0
      const htmlEntityDesc * ent;
2106
0
      const char *cp;
2107
0
      char nbuf[16];
2108
0
      int len;
2109
2110
      /*
2111
       * Try to lookup a predefined HTML entity for it
2112
       */
2113
0
      ent = htmlEntityValueLookup(c);
2114
0
      if (ent == NULL) {
2115
0
    snprintf(nbuf, sizeof(nbuf), "#%u", c);
2116
0
    cp = nbuf;
2117
0
      }
2118
0
      else
2119
0
    cp = ent->name;
2120
0
      len = strlen(cp);
2121
0
      if (outend - out < len + 2)
2122
0
    break;
2123
0
      *out++ = '&';
2124
0
      memcpy(out, cp, len);
2125
0
      out += len;
2126
0
      *out++ = ';';
2127
0
  }
2128
0
  processed = in;
2129
0
    }
2130
0
    *outlen = out - outstart;
2131
0
    *inlen = processed - instart;
2132
0
    return(0);
2133
0
}
2134
2135
/************************************************************************
2136
 *                  *
2137
 *    Commodity functions, cleanup needed ?     *
2138
 *                  *
2139
 ************************************************************************/
2140
/*
2141
 * all tags allowing pc data from the html 4.01 loose dtd
2142
 * NOTE: it might be more appropriate to integrate this information
2143
 * into the html40ElementTable array but I don't want to risk any
2144
 * binary incompatibility
2145
 */
2146
static const char *const allowPCData[] = {
2147
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
2148
    "blockquote", "body", "button", "caption", "center", "cite", "code",
2149
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
2150
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
2151
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
2152
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
2153
};
2154
2155
/**
2156
 * Is this a sequence of blank chars that one can ignore ?
2157
 *
2158
 * @param ctxt  an HTML parser context
2159
 * @param str  a xmlChar *
2160
 * @param len  the size of `str`
2161
 * @returns 1 if ignorable, 0 if the whitespace must be kept, -1 if `str` contains non-whitespace characters.
2162
 */
2163
2164
1.41M
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2165
1.41M
    unsigned int i;
2166
1.41M
    int j;
2167
1.41M
    xmlNodePtr lastChild;
2168
1.41M
    xmlDtdPtr dtd;
2169
2170
2.83M
    for (j = 0;j < len;j++)
2171
2.78M
        if (!(IS_WS_HTML(str[j]))) return(-1);
2172
2173
50.6k
    if (CUR == 0) return(1);
2174
47.4k
    if (CUR != '<') return(0);
2175
6.08k
    if (ctxt->name == NULL)
2176
0
  return(1);
2177
6.08k
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2178
0
  return(1);
2179
6.08k
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2180
0
  return(1);
2181
2182
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2183
6.08k
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2184
1.67k
        dtd = xmlGetIntSubset(ctxt->myDoc);
2185
1.67k
        if (dtd != NULL && dtd->ExternalID != NULL) {
2186
596
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2187
402
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2188
388
                return(1);
2189
596
        }
2190
1.67k
    }
2191
2192
5.69k
    if (ctxt->node == NULL) return(0);
2193
5.45k
    lastChild = xmlGetLastChild(ctxt->node);
2194
50.7k
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2195
45.2k
  lastChild = lastChild->prev;
2196
5.45k
    if (lastChild == NULL) {
2197
1.47k
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2198
0
            (ctxt->node->content != NULL)) return(0);
2199
  /* keep ws in constructs like ...<b> </b>...
2200
     for all tags "b" allowing PCDATA */
2201
67.8k
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2202
66.8k
      if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2203
501
    return(0);
2204
501
      }
2205
66.8k
  }
2206
3.98k
    } else if (xmlNodeIsText(lastChild)) {
2207
3.09k
        return(0);
2208
3.09k
    } else {
2209
  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2210
     for all tags "p" allowing PCDATA */
2211
39.9k
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2212
39.4k
      if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2213
349
    return(0);
2214
349
      }
2215
39.4k
  }
2216
886
    }
2217
1.50k
    return(1);
2218
5.45k
}
2219
2220
/**
2221
 * Creates a new HTML document without a DTD node if `URI` and `publicId`
2222
 * are NULL
2223
 *
2224
 * @param URI  system ID (URI) of the DTD (optional)
2225
 * @param publicId  public ID of the DTD (optional)
2226
 * @returns a new document, or NULL if an allocation fails; no DTD is created if neither ID is provided
2227
 */
2228
xmlDoc *
2229
63.0k
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *publicId) {
2230
63.0k
    xmlDocPtr cur;
2231
2232
    /*
2233
     * Allocate a new document and fill the fields.
2234
     */
2235
63.0k
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2236
63.0k
    if (cur == NULL)
2237
239
  return(NULL);
2238
62.8k
    memset(cur, 0, sizeof(xmlDoc));
2239
2240
62.8k
    cur->type = XML_HTML_DOCUMENT_NODE;
2241
62.8k
    cur->version = NULL;
2242
62.8k
    cur->intSubset = NULL;
2243
62.8k
    cur->doc = cur;
2244
62.8k
    cur->name = NULL;
2245
62.8k
    cur->children = NULL;
2246
62.8k
    cur->extSubset = NULL;
2247
62.8k
    cur->oldNs = NULL;
2248
62.8k
    cur->encoding = NULL;
2249
62.8k
    cur->standalone = 1;
2250
62.8k
    cur->compression = 0;
2251
62.8k
    cur->ids = NULL;
2252
62.8k
    cur->refs = NULL;
2253
62.8k
    cur->_private = NULL;
2254
62.8k
    cur->charset = XML_CHAR_ENCODING_UTF8;
2255
62.8k
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2256
62.8k
    if ((publicId != NULL) ||
2257
41.2k
  (URI != NULL)) {
2258
37.5k
        xmlDtdPtr intSubset;
2259
2260
37.5k
  intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", publicId, URI);
2261
37.5k
        if (intSubset == NULL) {
2262
12
            xmlFree(cur);
2263
12
            return(NULL);
2264
12
        }
2265
37.5k
    }
2266
62.7k
    if ((xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2267
0
  xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2268
62.7k
    return(cur);
2269
62.8k
}
2270
2271
/**
2272
 * Creates a new HTML document
2273
 *
2274
 * @param URI  system ID (URI) of the DTD (optional)
2275
 * @param publicId  public ID of the DTD (optional)
2276
 * @returns a new document
2277
 */
2278
xmlDoc *
2279
22.8k
htmlNewDoc(const xmlChar *URI, const xmlChar *publicId) {
2280
22.8k
    if ((URI == NULL) && (publicId == NULL))
2281
7.45k
  return(htmlNewDocNoDtD(
2282
7.45k
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2283
7.45k
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2284
2285
15.3k
    return(htmlNewDocNoDtD(URI, publicId));
2286
22.8k
}
2287
2288
2289
/************************************************************************
2290
 *                  *
2291
 *      The parser itself       *
2292
 *  Relates to http://www.w3.org/TR/html40        *
2293
 *                  *
2294
 ************************************************************************/
2295
2296
/************************************************************************
2297
 *                  *
2298
 *      The parser itself       *
2299
 *                  *
2300
 ************************************************************************/
2301
2302
/**
2303
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2304
 * since HTML names are not case-sensitive.
2305
 *
2306
 * @param ctxt  an HTML parser context
2307
 * @param attr  whether this is an attribute name
2308
 * @returns the Tag Name parsed or NULL
2309
 */
2310
2311
static xmlHashedString
2312
5.59M
htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
2313
5.59M
    xmlHashedString ret;
2314
5.59M
    xmlChar buf[HTML_PARSER_BUFFER_SIZE];
2315
5.59M
    const xmlChar *in;
2316
5.59M
    size_t avail;
2317
5.59M
    int eof = PARSER_PROGRESSIVE(ctxt);
2318
5.59M
    int nbchar = 0;
2319
5.59M
    int stop = attr ? '=' : ' ';
2320
2321
5.59M
    in = ctxt->input->cur;
2322
5.59M
    avail = ctxt->input->end - in;
2323
2324
47.2M
    while (1) {
2325
47.2M
        int c, size;
2326
2327
47.2M
        if ((!eof) && (avail < 32)) {
2328
21.1k
            size_t oldAvail = avail;
2329
2330
21.1k
            ctxt->input->cur = in;
2331
2332
21.1k
            SHRINK;
2333
21.1k
            xmlParserGrow(ctxt);
2334
2335
21.1k
            in = ctxt->input->cur;
2336
21.1k
            avail = ctxt->input->end - in;
2337
2338
21.1k
            if (oldAvail == avail)
2339
18.1k
                eof = 1;
2340
21.1k
        }
2341
2342
47.2M
        if (avail == 0)
2343
2.83k
            break;
2344
2345
47.2M
        c = *in;
2346
47.2M
        size = 1;
2347
2348
47.2M
        if ((nbchar != 0) &&
2349
41.6M
            ((c == '/') || (c == '>') || (c == stop) ||
2350
41.2M
             (IS_WS_HTML(c))))
2351
5.59M
            break;
2352
2353
41.6M
        if (c == 0) {
2354
4.50M
            if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2355
1.35M
                buf[nbchar++] = 0xEF;
2356
1.35M
                buf[nbchar++] = 0xBF;
2357
1.35M
                buf[nbchar++] = 0xBD;
2358
1.35M
            }
2359
37.1M
        } else if (c < 0x80) {
2360
14.6M
            if (nbchar < HTML_PARSER_BUFFER_SIZE) {
2361
13.2M
                if (IS_UPPER(c))
2362
2.60M
                    c += 0x20;
2363
13.2M
                buf[nbchar++] = c;
2364
13.2M
            }
2365
22.4M
        } else {
2366
22.4M
            size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
2367
2368
22.4M
            if (size > 0) {
2369
21.6M
                if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) {
2370
1.49M
                    memcpy(buf + nbchar, in, size);
2371
1.49M
                    nbchar += size;
2372
1.49M
                }
2373
21.6M
            } else {
2374
794k
                size = 1;
2375
2376
794k
                if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2377
117k
                    buf[nbchar++] = 0xEF;
2378
117k
                    buf[nbchar++] = 0xBF;
2379
117k
                    buf[nbchar++] = 0xBD;
2380
117k
                }
2381
794k
            }
2382
22.4M
        }
2383
2384
41.6M
        in += size;
2385
41.6M
        avail -= size;
2386
41.6M
    }
2387
2388
5.59M
    ctxt->input->cur = in;
2389
2390
5.59M
    SHRINK;
2391
2392
5.59M
    ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar);
2393
5.59M
    if (ret.name == NULL)
2394
158
        htmlErrMemory(ctxt);
2395
2396
5.59M
    return(ret);
2397
5.59M
}
2398
2399
static const short htmlC1Remap[32] = {
2400
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
2401
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
2402
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
2403
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
2404
};
2405
2406
static const xmlChar *
2407
73.4k
htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
2408
73.4k
    int i = 0;
2409
73.4k
    int bits, hi;
2410
2411
73.4k
    if ((c >= 0x80) && (c < 0xA0)) {
2412
579
        c = htmlC1Remap[c - 0x80];
2413
72.8k
    } else if ((c <= 0) ||
2414
64.2k
               ((c >= 0xD800) && (c < 0xE000)) ||
2415
63.7k
               (c > 0x10FFFF)) {
2416
10.0k
        c = 0xFFFD;
2417
10.0k
    }
2418
2419
73.4k
    if      (c <    0x80) { bits =  0; hi = 0x00; }
2420
19.8k
    else if (c <   0x800) { bits =  6; hi = 0xC0; }
2421
14.7k
    else if (c < 0x10000) { bits = 12; hi = 0xE0; }
2422
3.03k
    else                  { bits = 18; hi = 0xF0; }
2423
2424
73.4k
    out[i++] = (c >> bits) | hi;
2425
2426
111k
    while (bits > 0) {
2427
37.7k
        bits -= 6;
2428
37.7k
        out[i++] = ((c >> bits) & 0x3F) | 0x80;
2429
37.7k
    }
2430
2431
73.4k
    *osize = i;
2432
73.4k
    return(out);
2433
73.4k
}
2434
2435
#include "codegen/html5ent.inc"
2436
2437
26.8k
#define ENT_F_SEMICOLON 0x80u
2438
42.9k
#define ENT_F_SUBTABLE  0x40u
2439
1.85M
#define ENT_F_ALL       0xC0u
2440
2441
static const xmlChar *
2442
htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
2443
605k
                     int *nlen, int *rlen) {
2444
605k
    const xmlChar *match = NULL;
2445
605k
    unsigned left, right;
2446
605k
    int first = string[0];
2447
605k
    size_t matchLen = 0;
2448
605k
    size_t soff = 1;
2449
2450
605k
    if (slen < 2)
2451
1.19k
        return(NULL);
2452
604k
    if (!IS_ASCII_LETTER(first))
2453
152k
        return(NULL);
2454
2455
    /*
2456
     * Look up range by first character
2457
     */
2458
451k
    first &= 63;
2459
451k
    left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8;
2460
451k
    right = left + htmlEntAlpha[first*3+2];
2461
2462
    /*
2463
     * Binary search
2464
     */
2465
2.27M
    while (left < right) {
2466
1.85M
        const xmlChar *bytes;
2467
1.85M
        unsigned mid;
2468
1.85M
        size_t len;
2469
1.85M
        int cmp;
2470
2471
1.85M
        mid = left + (right - left) / 2;
2472
1.85M
        bytes = htmlEntStrings + htmlEntValues[mid];
2473
1.85M
        len = bytes[0] & ~ENT_F_ALL;
2474
2475
1.85M
        cmp = string[soff] - bytes[1];
2476
2477
1.85M
        if (cmp == 0) {
2478
84.1k
            if (slen < len) {
2479
1.43k
                cmp = strncmp((const char *) string + soff + 1,
2480
1.43k
                              (const char *) bytes + 2,
2481
1.43k
                              slen - 1);
2482
                /* Prefix can never match */
2483
1.43k
                if (cmp == 0)
2484
0
                    break;
2485
82.6k
            } else {
2486
82.6k
                cmp = strncmp((const char *) string + soff + 1,
2487
82.6k
                              (const char *) bytes + 2,
2488
82.6k
                              len - 1);
2489
82.6k
            }
2490
84.1k
        }
2491
2492
1.85M
        if (cmp < 0) {
2493
1.72M
            right = mid;
2494
1.72M
        } else if (cmp > 0) {
2495
86.1k
            left = mid + 1;
2496
86.1k
        } else {
2497
42.9k
            int term = soff + len < slen ? string[soff + len] : 0;
2498
42.9k
            int isAlnum, isTerm;
2499
2500
42.9k
            isAlnum = IS_ALNUM(term);
2501
42.9k
            isTerm = ((term == ';') ||
2502
26.8k
                      ((bytes[0] & ENT_F_SEMICOLON) &&
2503
2.58k
                       ((!isAttr) ||
2504
502
                        ((!isAlnum) && (term != '=')))));
2505
2506
42.9k
            if (isTerm) {
2507
18.4k
                match = bytes + len + 1;
2508
18.4k
                matchLen = soff + len;
2509
18.4k
                if (term == ';')
2510
16.1k
                    matchLen += 1;
2511
18.4k
            }
2512
2513
42.9k
            if (bytes[0] & ENT_F_SUBTABLE) {
2514
17.2k
                if (isTerm)
2515
4.65k
                    match += 2;
2516
2517
17.2k
                if ((isAlnum) && (soff + len < slen)) {
2518
11.2k
                    left = mid + bytes[len + 1];
2519
11.2k
                    right = left + bytes[len + 2];
2520
11.2k
                    soff += len;
2521
11.2k
                    continue;
2522
11.2k
                }
2523
17.2k
            }
2524
2525
31.7k
            break;
2526
42.9k
        }
2527
1.85M
    }
2528
2529
451k
    if (match == NULL)
2530
433k
        return(NULL);
2531
2532
18.1k
    *nlen = matchLen;
2533
18.1k
    *rlen = match[0];
2534
18.1k
    return(match + 1);
2535
451k
}
2536
2537
/**
2538
 * Parse data until terminator is reached.
2539
 *
2540
 * @param ctxt  an HTML parser context
2541
 * @param mask  mask of terminating characters
2542
 * @param comment  true if parsing a comment
2543
 * @param refs  true if references are allowed
2544
 * @param maxLength  maximum output length
2545
 * @returns the parsed string or NULL in case of errors.
2546
 */
2547
2548
static xmlChar *
2549
htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
2550
1.11M
              int comment, int refs, int maxLength) {
2551
1.11M
    xmlParserInputPtr input = ctxt->input;
2552
1.11M
    xmlChar *ret = NULL;
2553
1.11M
    xmlChar *buffer;
2554
1.11M
    xmlChar utf8Char[4];
2555
1.11M
    size_t buffer_size;
2556
1.11M
    size_t used;
2557
1.11M
    int eof = PARSER_PROGRESSIVE(ctxt);
2558
1.11M
    int line, col;
2559
1.11M
    int termSkip = -1;
2560
2561
1.11M
    used = 0;
2562
1.11M
    buffer_size = ctxt->spaceMax;
2563
1.11M
    buffer = (xmlChar *) ctxt->spaceTab;
2564
1.11M
    if (buffer == NULL) {
2565
9.77k
        buffer_size = 500;
2566
9.77k
        buffer = xmlMalloc(buffer_size + 1);
2567
9.77k
        if (buffer == NULL) {
2568
217
            htmlErrMemory(ctxt);
2569
217
            return(NULL);
2570
217
        }
2571
9.77k
    }
2572
2573
1.11M
    line = input->line;
2574
1.11M
    col = input->col;
2575
2576
42.8M
    while (!PARSER_STOPPED(ctxt)) {
2577
42.7M
        const xmlChar *chunk, *in, *repl;
2578
42.7M
        size_t avail, chunkSize, extraSize;
2579
42.7M
        int replSize;
2580
42.7M
        int skip = 0;
2581
42.7M
        int ncr = 0;
2582
42.7M
        int ncrSize = 0;
2583
42.7M
        int cp = 0;
2584
2585
42.7M
        chunk = input->cur;
2586
42.7M
        avail = input->end - chunk;
2587
42.7M
        in = chunk;
2588
2589
42.7M
        repl = BAD_CAST "";
2590
42.7M
        replSize = 0;
2591
2592
173M
        while (!PARSER_STOPPED(ctxt)) {
2593
173M
            size_t j;
2594
173M
            int cur, size;
2595
2596
173M
            if ((!eof) && (avail <= 64)) {
2597
31.7k
                size_t oldAvail = avail;
2598
31.7k
                size_t off = in - chunk;
2599
2600
31.7k
                input->cur = in;
2601
2602
31.7k
                xmlParserGrow(ctxt);
2603
2604
31.7k
                in = input->cur;
2605
31.7k
                chunk = in - off;
2606
31.7k
                input->cur = chunk;
2607
31.7k
                avail = input->end - in;
2608
2609
31.7k
                if (oldAvail == avail)
2610
10.4k
                    eof = 1;
2611
31.7k
            }
2612
2613
173M
            if (avail == 0) {
2614
3.32k
                termSkip = 0;
2615
3.32k
                break;
2616
3.32k
            }
2617
2618
173M
            cur = *in;
2619
173M
            size = 1;
2620
173M
            col += 1;
2621
2622
173M
            if (htmlMaskMatch(mask, cur)) {
2623
1.14M
                if (comment) {
2624
164k
                    if (avail < 2) {
2625
67
                        termSkip = 1;
2626
164k
                    } else if (in[1] == '-') {
2627
146k
                        if  (avail < 3) {
2628
26
                            termSkip = 2;
2629
146k
                        } else if (in[2] == '>') {
2630
9.42k
                            termSkip = 3;
2631
136k
                        } else if (in[2] == '!') {
2632
3.60k
                            if (avail < 4)
2633
5
                                termSkip = 3;
2634
3.60k
                            else if (in[3] == '>')
2635
1.52k
                                termSkip = 4;
2636
3.60k
                        }
2637
146k
                    }
2638
2639
164k
                    if (termSkip >= 0)
2640
11.0k
                        break;
2641
980k
                } else {
2642
980k
                    termSkip = 0;
2643
980k
                    break;
2644
980k
                }
2645
1.14M
            }
2646
2647
172M
            if (ncr) {
2648
551k
                int lc = cur | 0x20;
2649
551k
                int digit;
2650
2651
551k
                if ((cur >= '0') && (cur <= '9')) {
2652
29.1k
                    digit = cur - '0';
2653
522k
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
2654
510k
                    digit = (lc - 'a') + 10;
2655
510k
                } else {
2656
11.9k
                    if (cur == ';') {
2657
1.26k
                        in += 1;
2658
1.26k
                        size += 1;
2659
1.26k
                        ncrSize += 1;
2660
1.26k
                    }
2661
11.9k
                    goto next_chunk;
2662
11.9k
                }
2663
2664
539k
                cp = cp * ncr + digit;
2665
539k
                if (cp >= 0x110000)
2666
510k
                    cp = 0x110000;
2667
2668
539k
                ncrSize += 1;
2669
2670
539k
                goto next_char;
2671
551k
            }
2672
2673
171M
            switch (cur) {
2674
78.4k
            case '&':
2675
78.4k
                if (!refs)
2676
31.6k
                    break;
2677
2678
46.8k
                j = 1;
2679
2680
46.8k
                if ((j < avail) && (in[j] == '#')) {
2681
33.2k
                    j += 1;
2682
33.2k
                    if (j < avail) {
2683
33.2k
                        if ((in[j] | 0x20) == 'x') {
2684
23.2k
                            j += 1;
2685
23.2k
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
2686
5.77k
                                ncr = 16;
2687
5.77k
                                size = 3;
2688
5.77k
                                ncrSize = 3;
2689
5.77k
                                cp = 0;
2690
5.77k
                            }
2691
23.2k
                        } else if (IS_ASCII_DIGIT(in[j])) {
2692
6.47k
                            ncr = 10;
2693
6.47k
                            size = 2;
2694
6.47k
                            ncrSize = 2;
2695
6.47k
                            cp = 0;
2696
6.47k
                        }
2697
33.2k
                    }
2698
33.2k
                } else {
2699
13.6k
                    repl = htmlFindEntityPrefix(in + j,
2700
13.6k
                                                avail - j,
2701
13.6k
                                                /* isAttr */ 1,
2702
13.6k
                                                &skip, &replSize);
2703
13.6k
                    if (repl != NULL) {
2704
1.01k
                        skip += 1;
2705
1.01k
                        goto next_chunk;
2706
1.01k
                    }
2707
2708
12.5k
                    skip = 0;
2709
12.5k
                }
2710
2711
45.8k
                break;
2712
2713
40.8M
            case '\0':
2714
40.8M
                skip = 1;
2715
40.8M
                repl = BAD_CAST "\xEF\xBF\xBD";
2716
40.8M
                replSize = 3;
2717
40.8M
                goto next_chunk;
2718
2719
12.4M
            case '\n':
2720
12.4M
                line += 1;
2721
12.4M
                col = 1;
2722
12.4M
                break;
2723
2724
56.9k
            case '\r':
2725
56.9k
                skip = 1;
2726
56.9k
                if (in[1] != 0x0A) {
2727
56.4k
                    repl = BAD_CAST "\x0A";
2728
56.4k
                    replSize = 1;
2729
56.4k
                }
2730
56.9k
                goto next_chunk;
2731
2732
118M
            default:
2733
118M
                if (cur < 0x80)
2734
6.29M
                    break;
2735
2736
112M
                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
2737
3.47k
                    xmlChar * guess;
2738
2739
3.47k
                    if (in > chunk)
2740
1.23k
                        goto next_chunk;
2741
2742
2.23k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2743
2.23k
                    guess = NULL;
2744
#else
2745
                    guess = htmlFindEncoding(ctxt);
2746
#endif
2747
2.23k
                    if (guess == NULL) {
2748
2.23k
                        xmlSwitchEncoding(ctxt,
2749
2.23k
                                XML_CHAR_ENCODING_WINDOWS_1252);
2750
2.23k
                    } else {
2751
0
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
2752
0
                        xmlFree(guess);
2753
0
                    }
2754
2.23k
                    input->flags |= XML_INPUT_HAS_ENCODING;
2755
2756
2.23k
                    eof = PARSER_PROGRESSIVE(ctxt);
2757
2.23k
                    goto restart;
2758
3.47k
                }
2759
2760
111M
                size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
2761
2762
111M
                if (size <= 0) {
2763
801k
                    skip = 1;
2764
801k
                    repl = BAD_CAST "\xEF\xBF\xBD";
2765
801k
                    replSize = 3;
2766
801k
                    goto next_chunk;
2767
801k
                }
2768
2769
111M
                break;
2770
171M
            }
2771
2772
130M
next_char:
2773
130M
            in += size;
2774
130M
            avail -= size;
2775
130M
        }
2776
2777
42.7M
next_chunk:
2778
42.7M
        if (ncrSize > 0) {
2779
12.2k
            skip = ncrSize;
2780
12.2k
            in -= ncrSize;
2781
2782
12.2k
            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
2783
12.2k
        }
2784
2785
42.7M
        chunkSize = in - chunk;
2786
42.7M
        extraSize = chunkSize + replSize;
2787
2788
42.7M
        if (extraSize > maxLength - used) {
2789
77
            htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
2790
77
                         "value too long\n", NULL, NULL);
2791
77
            goto error;
2792
77
        }
2793
2794
42.7M
        if (extraSize > buffer_size - used) {
2795
4.83k
            size_t newSize = (used + extraSize) * 2;
2796
4.83k
            xmlChar *tmp = xmlRealloc(buffer, newSize + 1);
2797
2798
4.83k
            if (tmp == NULL) {
2799
10
                htmlErrMemory(ctxt);
2800
10
                goto error;
2801
10
            }
2802
4.82k
            buffer = tmp;
2803
4.82k
            buffer_size = newSize;
2804
4.82k
        }
2805
2806
42.7M
        if (chunkSize > 0) {
2807
391k
            input->cur += chunkSize;
2808
391k
            memcpy(buffer + used, chunk, chunkSize);
2809
391k
            used += chunkSize;
2810
391k
        }
2811
2812
42.7M
        input->cur += skip;
2813
42.7M
        if (replSize > 0) {
2814
41.7M
            memcpy(buffer + used, repl, replSize);
2815
41.7M
            used += replSize;
2816
41.7M
        }
2817
2818
42.7M
        SHRINK;
2819
2820
42.7M
        if (termSkip >= 0)
2821
994k
            break;
2822
2823
41.7M
restart:
2824
41.7M
        ;
2825
41.7M
    }
2826
2827
1.11M
    if (termSkip > 0) {
2828
11.0k
        input->cur += termSkip;
2829
11.0k
        col += termSkip;
2830
11.0k
    }
2831
2832
1.11M
    input->line = line;
2833
1.11M
    input->col = col;
2834
2835
1.11M
    ret = xmlMalloc(used + 1);
2836
1.11M
    if (ret == NULL) {
2837
285
        htmlErrMemory(ctxt);
2838
1.10M
    } else {
2839
1.10M
        memcpy(ret, buffer, used);
2840
1.10M
        ret[used] = 0;
2841
1.10M
    }
2842
2843
1.11M
error:
2844
1.11M
    ctxt->spaceTab = (void *) buffer;
2845
1.11M
    ctxt->spaceMax = buffer_size;
2846
2847
1.11M
    return(ret);
2848
1.11M
}
2849
2850
/**
2851
 * @deprecated Internal function, don't use.
2852
 *
 * This is an inert stub: both arguments are marked ATTRIBUTE_UNUSED,
 * nothing is written through `str`, and the body unconditionally
 * returns NULL.
 *
2853
 * @param ctxt  an HTML parser context
2854
 * @param str  location to store the entity name
2855
 * @returns NULL.
2856
 */
2857
const htmlEntityDesc *
2858
htmlParseEntityRef(htmlParserCtxt *ctxt ATTRIBUTE_UNUSED,
2859
0
                   const xmlChar **str ATTRIBUTE_UNUSED) {
2860
0
    return(NULL);
2861
0
}
2862
2863
/**
2864
 * parse a value for an attribute
2865
 * Note: the parser won't do substitution of entities here, this
2866
 * will be handled later in #xmlStringGetNodeList, unless it was
2867
 * asked for ctxt->replaceEntities != 0
2868
 *
 * NOTE(review): this is the only visible caller that passes 1 as the
 * fourth argument of htmlParseData; presumably that flag enables
 * character reference expansion, which would make the note above
 * stale. Confirm against htmlParseData before relying on either.
 *
 * Three syntaxes are handled: double-quoted, single-quoted and
 * unquoted values. For quoted values the closing quote is consumed
 * only if it is actually present at the current position.
 *
2869
 * @param ctxt  an HTML parser context
2870
 * @returns the AttValue parsed or NULL.
2871
 */
2872
2873
static xmlChar *
2874
78.5k
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2875
78.5k
    xmlChar *ret = NULL;
2876
78.5k
    /* Length limit handed to htmlParseData; larger with HTML_PARSE_HUGE. */
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
2877
39.4k
                    XML_MAX_HUGE_LENGTH :
2878
78.5k
                    XML_MAX_TEXT_LENGTH;
2879
2880
78.5k
    if (CUR == '"') {
2881
9.58k
        SKIP(1);
2882
9.58k
  ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength);
2883
9.58k
        if (CUR == '"')
2884
9.47k
            SKIP(1);
2885
68.9k
    } else if (CUR == '\'') {
2886
963
        SKIP(1);
2887
963
  ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength);
2888
963
        if (CUR == '\'')
2889
927
            SKIP(1);
2890
67.9k
    } else {
2891
67.9k
        /* Unquoted value: the terminator mask is MASK_WS_GT. */
  ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength);
2892
67.9k
    }
2893
78.5k
    return(ret);
2894
78.5k
}
2895
2896
/**
 * Deliver a run of character data to the SAX handler.
 *
 * Does nothing when there is no SAX handler or SAX is disabled.
 * Data is reported through `characters` / `ignorableWhitespace` when
 * `mode` is 0 or DATA_RCDATA, or when no `cdataBlock` callback is
 * installed; otherwise it is reported through `cdataBlock`.
 *
 * @param ctxt  an HTML parser context
 * @param buf  the character data
 * @param size  number of bytes in `buf`
 * @param mode  the data mode the text was tokenized in
 */
static void
2897
htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
2898
22.0M
                        int size, int mode) {
2899
22.0M
    if ((ctxt->sax == NULL) || (ctxt->disableSAX))
2900
35
        return;
2901
2902
22.0M
    if ((mode == 0) || (mode == DATA_RCDATA) ||
2903
15.6M
        (ctxt->sax->cdataBlock == NULL)) {
2904
6.37M
        /* No open element, or the current element is html or head. */
        if ((ctxt->name == NULL) ||
2905
5.48M
            (xmlStrEqual(ctxt->name, BAD_CAST "html")) ||
2906
5.47M
            (xmlStrEqual(ctxt->name, BAD_CAST "head"))) {
2907
919k
            int i;
2908
2909
            /*
2910
             * Add leading whitespace to html or head elements before
2911
             * calling htmlStartCharData.
2912
             */
2913
5.13M
            for (i = 0; i < size; i++)
2914
5.09M
                if (!IS_WS_HTML(buf[i]))
2915
882k
                    break;
2916
2917
919k
            /* i is the length of the leading whitespace prefix. */
            if (i > 0) {
2918
41.4k
                if (!ctxt->keepBlanks) {
2919
34.5k
                    if (ctxt->sax->ignorableWhitespace != NULL)
2920
34.5k
                        ctxt->sax->ignorableWhitespace(ctxt->userData, buf, i);
2921
34.5k
                } else {
2922
6.89k
                    if (ctxt->sax->characters != NULL)
2923
6.89k
                        ctxt->sax->characters(ctxt->userData, buf, i);
2924
6.89k
                }
2925
2926
41.4k
                buf += i;
2927
41.4k
                size -= i;
2928
41.4k
            }
2929
2930
919k
            /* The data was whitespace only: nothing left to report. */
            if (size <= 0)
2931
36.8k
                return;
2932
2933
882k
            htmlStartCharData(ctxt);
2934
2935
882k
            if (PARSER_STOPPED(ctxt))
2936
77
                return;
2937
882k
        }
2938
2939
6.33M
        /*
         * The areBlanks() check is applied only in mode 0 and only
         * when keepBlanks is off.
         */
        if ((mode == 0) &&
2940
3.24M
            (!ctxt->keepBlanks) &&
2941
1.41M
            (areBlanks(ctxt, buf, size) > 0)) {
2942
5.08k
            if (ctxt->sax->ignorableWhitespace != NULL)
2943
5.08k
                ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size);
2944
6.33M
        } else {
2945
6.33M
            if (ctxt->sax->characters != NULL)
2946
6.33M
                ctxt->sax->characters(ctxt->userData, buf, size);
2947
6.33M
        }
2948
15.6M
    } else {
2949
        /*
2950
         * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2951
         */
2952
15.6M
        ctxt->sax->cdataBlock(ctxt->userData, buf, size);
2953
15.6M
    }
2954
22.0M
}
2955
2956
/**
2957
 * Parse character data and references.
2958
 *
 * The input is processed in chunks. Each pass of the outer loop scans
 * forward from input->cur until something needs special handling,
 * then (at next_chunk) reports the untouched bytes [chunk, in) via
 * htmlCharDataSAXCallback, advances the input by a further `skip`
 * bytes and, if `replSize` is non-zero, reports `repl` in their
 * place.
 *
 * The data mode is read from ctxt->endCheckState on entry. On exit
 * endCheckState is reset to 0 if the data section was completed and
 * is set to the current mode otherwise. input->line and input->col
 * are updated on exit.
 *
2959
 * @param ctxt  an HTML parser context
2960
 * @param partial  true if the input buffer is incomplete
2961
 * @returns 1 if all data was parsed, 0 otherwise.
2962
 */
2963
2964
static int
2965
264k
htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
2966
264k
    xmlParserInputPtr input = ctxt->input;
2967
264k
    xmlChar utf8Char[4];
2968
264k
    int complete = 0;
2969
264k
    int done = 0;
2970
264k
    int mode;
2971
264k
    int eof = PARSER_PROGRESSIVE(ctxt);
2972
264k
    int line, col;
2973
2974
264k
    mode = ctxt->endCheckState;
2975
2976
264k
    line = input->line;
2977
264k
    col = input->col;
2978
2979
42.6M
    while (!PARSER_STOPPED(ctxt)) {
2980
42.6M
        const xmlChar *chunk, *in, *repl;
2981
42.6M
        size_t avail;
2982
42.6M
        int replSize;
2983
42.6M
        int skip = 0;
2984
42.6M
        /* ncr: 0, or the radix (10 or 16) of the reference being read */
        int ncr = 0;
2985
42.6M
        /* ncrSize: bytes of the numeric reference scanned so far */
        int ncrSize = 0;
2986
42.6M
        /* cp: code point value accumulated from the digits */
        int cp = 0;
2987
2988
42.6M
        chunk = input->cur;
2989
42.6M
        avail = input->end - chunk;
2990
42.6M
        in = chunk;
2991
2992
42.6M
        repl = BAD_CAST "";
2993
42.6M
        replSize = 0;
2994
2995
195M
        while (!PARSER_STOPPED(ctxt)) {
2996
195M
            size_t j;
2997
195M
            int cur, size;
2998
2999
195M
            /*
             * Running low on lookahead. Unless end of input is already
             * known, try to read more. input->cur is temporarily moved
             * to `in` around the call and both pointers are re-derived
             * from it afterwards (presumably because xmlParserGrow can
             * relocate the buffer -- confirm).
             */
            if (avail <= 64) {
3000
1.01M
                if (!eof) {
3001
37.2k
                    size_t oldAvail = avail;
3002
37.2k
                    size_t off = in - chunk;
3003
3004
37.2k
                    input->cur = in;
3005
3006
37.2k
                    xmlParserGrow(ctxt);
3007
3008
37.2k
                    in = input->cur;
3009
37.2k
                    chunk = in - off;
3010
37.2k
                    input->cur = chunk;
3011
37.2k
                    avail = input->end - in;
3012
3013
37.2k
                    /* Nothing new arrived: stop trying to grow. */
                    if (oldAvail == avail)
3014
8.69k
                        eof = 1;
3015
37.2k
                }
3016
3017
1.01M
                if (avail == 0) {
3018
47.5k
                    /*
                     * Partial input ended inside a numeric reference:
                     * rewind `in` to the '&' and clear ncrSize so the
                     * reference is neither emitted nor skipped here.
                     */
                    if ((partial) && (ncr)) {
3019
660
                        in -= ncrSize;
3020
660
                        ncrSize = 0;
3021
660
                    }
3022
3023
47.5k
                    done = 1;
3024
47.5k
                    break;
3025
47.5k
                }
3026
1.01M
            }
3027
3028
            /* Accelerator */
3029
195M
            if (!ncr) {
3030
225M
                while (avail > 0) {
3031
225M
                    /*
                     * 256-bit set, one bit per byte value, of the
                     * bytes that need the slow path below: NUL, LF,
                     * CR (word 0), '&', '-', '<' (word 1) and all
                     * bytes >= 0x80 (words 4-7).
                     */
                    static const unsigned mask[8] = {
3032
225M
                        0x00002401, 0x10002040,
3033
225M
                        0x00000000, 0x00000000,
3034
225M
                        0xFFFFFFFF, 0xFFFFFFFF,
3035
225M
                        0xFFFFFFFF, 0xFFFFFFFF
3036
225M
                    };
3037
225M
                    cur = *in;
3038
225M
                    if ((1u << (cur & 0x1F)) & mask[cur >> 5])
3039
194M
                        break;
3040
30.7M
                    col += 1;
3041
30.7M
                    in += 1;
3042
30.7M
                    avail -= 1;
3043
30.7M
                }
3044
3045
194M
                /* Go back to the top to refill or to detect the end. */
                if ((!eof) && (avail <= 64))
3046
3.09k
                    continue;
3047
194M
                if (avail == 0)
3048
6.65k
                    continue;
3049
194M
            }
3050
3051
195M
            cur = *in;
3052
195M
            size = 1;
3053
195M
            col += 1;
3054
3055
195M
            /* Inside a numeric character reference: accumulate digits. */
            if (ncr) {
3056
640k
                int lc = cur | 0x20;
3057
640k
                int digit;
3058
3059
640k
                if ((cur >= '0') && (cur <= '9')) {
3060
32.1k
                    digit = cur - '0';
3061
608k
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
3062
547k
                    digit = (lc - 'a') + 10;
3063
547k
                } else {
3064
60.8k
                    /*
                     * First non-digit ends the reference; a ';' is
                     * counted as part of it.
                     */
                    if (cur == ';') {
3065
17.2k
                        in += 1;
3066
17.2k
                        size += 1;
3067
17.2k
                        ncrSize += 1;
3068
17.2k
                    }
3069
60.8k
                    goto next_chunk;
3070
60.8k
                }
3071
3072
579k
                cp = cp * ncr + digit;
3073
579k
                /* Clamp so that further digits cannot overflow cp. */
                if (cp >= 0x110000)
3074
496k
                    cp = 0x110000;
3075
3076
579k
                ncrSize += 1;
3077
3078
579k
                goto next_char;
3079
640k
            }
3080
3081
194M
            switch (cur) {
3082
406k
            case '<':
3083
406k
                /* In mode 0 any '<' ends the character data. */
                if (mode == 0) {
3084
196k
                    done = 1;
3085
196k
                    complete = 1;
3086
196k
                    goto next_chunk;
3087
196k
                }
3088
210k
                if (mode == DATA_PLAINTEXT)
3089
4.24k
                    break;
3090
3091
205k
                /* j is the lookahead offset from `in`. */
                j = 1;
3092
205k
                if (j < avail) {
3093
204k
                    if ((mode == DATA_SCRIPT) && (in[j] == '!')) {
3094
                        /* Check for comment start */
3095
3096
31.0k
                        j += 1;
3097
31.0k
                        if ((j < avail) && (in[j] == '-')) {
3098
24.0k
                            j += 1;
3099
24.0k
                            if ((j < avail) && (in[j] == '-'))
3100
22.8k
                                mode = DATA_SCRIPT_ESC1;
3101
24.0k
                        }
3102
173k
                    } else {
3103
173k
                        int i = 0;
3104
173k
                        int solidus = 0;
3105
3106
                        /* Check for tag */
3107
3108
173k
                        if (in[j] == '/') {
3109
31.8k
                            j += 1;
3110
31.8k
                            solidus = 1;
3111
31.8k
                        }
3112
3113
173k
                        if ((solidus) || (mode == DATA_SCRIPT_ESC1)) {
3114
130k
                            /*
                             * Compare the input with ctxt->name; input
                             * bytes are OR-ed with 0x20 first.
                             */
                            while ((j < avail) &&
3115
129k
                                   (ctxt->name[i] != 0) &&
3116
115k
                                   (ctxt->name[i] == (in[j] | 0x20))) {
3117
80.7k
                                i += 1;
3118
80.7k
                                j += 1;
3119
80.7k
                            }
3120
3121
49.8k
                            if ((ctxt->name[i] == 0) && (j < avail)) {
3122
13.8k
                                int c = in[j];
3123
3124
13.8k
                                /*
                                 * The whole name matched; it only
                                 * counts if followed by '>', '/' or
                                 * whitespace.
                                 */
                                if ((c == '>') || (c == '/') ||
3125
7.58k
                                    (IS_WS_HTML(c))) {
3126
7.58k
                                    if ((mode == DATA_SCRIPT_ESC1) &&
3127
1.13k
                                        (!solidus)) {
3128
878
                                        mode = DATA_SCRIPT_ESC2;
3129
6.70k
                                    } else if (mode == DATA_SCRIPT_ESC2) {
3130
755
                                        mode = DATA_SCRIPT_ESC1;
3131
5.95k
                                    } else {
3132
5.95k
                                        complete = 1;
3133
5.95k
                                        done = 1;
3134
5.95k
                                        goto next_chunk;
3135
5.95k
                                    }
3136
7.58k
                                }
3137
13.8k
                            }
3138
49.8k
                        }
3139
173k
                    }
3140
204k
                }
3141
3142
199k
                /* Lookahead ran off the end of partial input: stop. */
                if ((partial) && (j >= avail)) {
3143
2.90k
                    done = 1;
3144
2.90k
                    goto next_chunk;
3145
2.90k
                }
3146
3147
197k
                break;
3148
3149
286k
            case '-':
3150
286k
                if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2))
3151
137k
                    break;
3152
3153
                /* Check for comment end */
3154
3155
149k
                j = 1;
3156
149k
                if ((j < avail) && (in[j] == '-')) {
3157
137k
                    j += 1;
3158
137k
                    if ((j < avail) && (in[j] == '>'))
3159
22.2k
                        mode = DATA_SCRIPT;
3160
137k
                }
3161
3162
149k
                if ((partial) && (j >= avail)) {
3163
3.55k
                    done = 1;
3164
3.55k
                    goto next_chunk;
3165
3.55k
                }
3166
3167
145k
                break;
3168
3169
739k
            case '&':
3170
739k
                /* References are handled only in mode 0 and RCDATA. */
                if ((mode != 0) && (mode != DATA_RCDATA))
3171
52.7k
                    break;
3172
3173
686k
                j = 1;
3174
3175
686k
                if ((j < avail) && (in[j] == '#')) {
3176
88.9k
                    j += 1;
3177
88.9k
                    if (j < avail) {
3178
88.5k
                        /*
                         * Start a numeric reference only when at least
                         * one digit follows. size/ncrSize cover the
                         * "&#x" or "&#" prefix.
                         */
                        if ((in[j] | 0x20) == 'x') {
3179
69.9k
                            j += 1;
3180
69.9k
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
3181
50.3k
                                ncr = 16;
3182
50.3k
                                size = 3;
3183
50.3k
                                ncrSize = 3;
3184
50.3k
                                cp = 0;
3185
50.3k
                            }
3186
69.9k
                        } else if (IS_ASCII_DIGIT(in[j])) {
3187
11.5k
                            ncr = 10;
3188
11.5k
                            size = 2;
3189
11.5k
                            ncrSize = 2;
3190
11.5k
                            cp = 0;
3191
11.5k
                        }
3192
88.5k
                    }
3193
597k
                } else {
3194
597k
                    if (partial) {
3195
255k
                        int terminated = 0;
3196
255k
                        size_t i;
3197
3198
                        /*
3199
                         * &CounterClockwiseContourIntegral; has 33 bytes.
3200
                         */
3201
538k
                        /*
                         * With partial input, look up the name only if
                         * a terminating byte is already available (or
                         * 32 bytes were scanned).
                         */
                        for (i = 1; i < avail; i++) {
3202
532k
                            if ((i >= 32) ||
3203
532k
                                (!IS_ASCII_LETTER(in[i]) &&
3204
250k
                                 ((i < 2) || !IS_ASCII_DIGIT(in[i])))) {
3205
249k
                                terminated = 1;
3206
249k
                                break;
3207
249k
                            }
3208
532k
                        }
3209
3210
255k
                        if (!terminated) {
3211
5.68k
                            done = 1;
3212
5.68k
                            goto next_chunk;
3213
5.68k
                        }
3214
255k
                    }
3215
3216
591k
                    repl = htmlFindEntityPrefix(in + j,
3217
591k
                                                avail - j,
3218
591k
                                                /* isAttr */ 0,
3219
591k
                                                &skip, &replSize);
3220
591k
                    if (repl != NULL) {
3221
17.0k
                        /* Also skip the '&' itself. */
                        skip += 1;
3222
17.0k
                        goto next_chunk;
3223
17.0k
                    }
3224
3225
574k
                    skip = 0;
3226
574k
                }
3227
3228
663k
                if ((partial) && (j >= avail)) {
3229
689
                    done = 1;
3230
689
                    goto next_chunk;
3231
689
                }
3232
3233
662k
                break;
3234
3235
40.0M
            case '\0':
3236
40.0M
                skip = 1;
3237
3238
40.0M
                if (mode == 0) {
3239
                    /*
3240
                     * The HTML5 spec says that the tokenizer should
3241
                     * pass on U+0000 unmodified in normal data mode.
3242
                     * These characters should then be ignored in body
3243
                     * and other text, but should be replaced with
3244
                     * U+FFFD in foreign content.
3245
                     *
3246
                     * At least for now, we always strip U+0000 when
3247
                     * tokenizing.
3248
                     */
3249
23.3M
                    repl = BAD_CAST "";
3250
23.3M
                    replSize = 0;
3251
23.3M
                } else {
3252
16.7M
                    /* UTF-8 encoding of U+FFFD */
                    repl = BAD_CAST "\xEF\xBF\xBD";
3253
16.7M
                    replSize = 3;
3254
16.7M
                }
3255
3256
40.0M
                goto next_chunk;
3257
3258
20.1M
            case '\n':
3259
20.1M
                line += 1;
3260
20.1M
                col = 1;
3261
20.1M
                break;
3262
3263
193k
            case '\r':
3264
193k
                /* Need the byte after CR to decide; wait for more data. */
                if (partial && avail < 2) {
3265
261
                    done = 1;
3266
261
                    goto next_chunk;
3267
261
                }
3268
3269
193k
                /*
                 * The CR is always dropped. Unless an LF follows, an
                 * LF is emitted in its place.
                 */
                skip = 1;
3270
193k
                if (in[1] != 0x0A) {
3271
192k
                    repl = BAD_CAST "\x0A";
3272
192k
                    replSize = 1;
3273
192k
                }
3274
193k
                goto next_chunk;
3275
3276
133M
            default:
3277
133M
                if (cur < 0x80)
3278
0
                    break;
3279
3280
133M
                /*
                 * Non-ASCII byte seen before any encoding was set:
                 * flush what was scanned so far, then choose an
                 * encoding and restart the outer loop.
                 */
                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
3281
6.70k
                    xmlChar * guess;
3282
3283
6.70k
                    if (in > chunk)
3284
1.51k
                        goto next_chunk;
3285
3286
5.19k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3287
5.19k
                    guess = NULL;
3288
#else
3289
                    guess = htmlFindEncoding(ctxt);
3290
#endif
3291
5.19k
                    if (guess == NULL) {
3292
5.19k
                        xmlSwitchEncoding(ctxt,
3293
5.19k
                                XML_CHAR_ENCODING_WINDOWS_1252);
3294
5.19k
                    } else {
3295
0
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
3296
0
                        xmlFree(guess);
3297
0
                    }
3298
5.19k
                    input->flags |= XML_INPUT_HAS_ENCODING;
3299
3300
5.19k
                    eof = PARSER_PROGRESSIVE(ctxt);
3301
5.19k
                    goto restart;
3302
6.70k
                }
3303
3304
133M
                size = htmlValidateUtf8(ctxt, in, avail, partial);
3305
3306
133M
                if ((partial) && (size == 0)) {
3307
1.14k
                    done = 1;
3308
1.14k
                    goto next_chunk;
3309
1.14k
                }
3310
3311
133M
                /* Replace the offending byte with U+FFFD. */
                if (size <= 0) {
3312
2.06M
                    skip = 1;
3313
2.06M
                    repl = BAD_CAST "\xEF\xBF\xBD";
3314
2.06M
                    replSize = 3;
3315
2.06M
                    goto next_chunk;
3316
2.06M
                }
3317
3318
131M
                break;
3319
194M
            }
3320
3321
152M
next_char:
3322
152M
            in += size;
3323
152M
            avail -= size;
3324
152M
        }
3325
3326
42.6M
next_chunk:
3327
42.6M
        /*
         * `in` was advanced over the numeric reference while scanning.
         * Rewind to its '&' so that those bytes are skipped instead of
         * emitted, and emit the decoded code point as the replacement.
         */
        if (ncrSize > 0) {
3328
61.2k
            skip = ncrSize;
3329
61.2k
            in -= ncrSize;
3330
3331
61.2k
            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
3332
61.2k
        }
3333
3334
42.6M
        /* Report the literal bytes scanned in this chunk, if any. */
        if (in > chunk) {
3335
2.98M
            input->cur += in - chunk;
3336
2.98M
            htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);
3337
2.98M
        }
3338
3339
42.6M
        input->cur += skip;
3340
42.6M
        if (replSize > 0)
3341
19.0M
            htmlCharDataSAXCallback(ctxt, repl, replSize, mode);
3342
3343
42.6M
        SHRINK;
3344
3345
42.6M
        if (done)
3346
264k
            break;
3347
3348
42.3M
restart:
3349
42.3M
        ;
3350
42.3M
    }
3351
3352
264k
    input->line = line;
3353
264k
    input->col = col;
3354
3355
264k
    /* Keep the current mode for the next call unless the data ended. */
    if (complete)
3356
202k
        ctxt->endCheckState = 0;
3357
61.9k
    else
3358
61.9k
        ctxt->endCheckState = mode;
3359
3360
264k
    return(complete);
3361
264k
}
3362
3363
/**
3364
 * Parse an HTML comment
3365
 *
 * The comment text is passed to the SAX `comment` callback when a
 * handler is installed and SAX is not disabled. If htmlParseData
 * returns NULL, the function returns without invoking the callback.
 *
 * NOTE(review): presumably the opening delimiter has already been
 * consumed by the caller -- nothing here skips it. Confirm at the
 * call sites.
 *
3366
 * @param ctxt  an HTML parser context
3367
 * @param bogus  true if this is a bogus comment
3368
 */
3369
static void
3370
995k
htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
3371
995k
    /* Default to an empty comment for the branches that read no data. */
    const xmlChar *comment = BAD_CAST "";
3372
995k
    xmlChar *buf = NULL;
3373
995k
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3374
236k
                    XML_MAX_HUGE_LENGTH :
3375
995k
                    XML_MAX_TEXT_LENGTH;
3376
3377
995k
    if (bogus) {
3378
975k
        /* Bogus comment: read with MASK_GT, then consume the '>'. */
        buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength);
3379
975k
        if (CUR == '>')
3380
973k
            SKIP(1);
3381
975k
        comment = buf;
3382
975k
    } else {
3383
20.1k
        /* Make sure the two-byte lookahead below has data to look at. */
        if ((!PARSER_PROGRESSIVE(ctxt)) &&
3384
10.3k
            (ctxt->input->end - ctxt->input->cur < 2))
3385
72
            xmlParserGrow(ctxt);
3386
3387
20.1k
        /*
         * An immediate ">" or "->" closes the comment right away and
         * leaves it empty.
         */
        if (CUR == '>') {
3388
7.27k
            SKIP(1);
3389
12.9k
        } else if ((CUR == '-') && (NXT(1) == '>')) {
3390
974
            SKIP(2);
3391
11.9k
        } else {
3392
11.9k
            buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength);
3393
11.9k
            comment = buf;
3394
11.9k
        }
3395
20.1k
    }
3396
3397
995k
    /* Only reachable when htmlParseData returned NULL. */
    if (comment == NULL)
3398
215
        return;
3399
3400
995k
    if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3401
995k
        (!ctxt->disableSAX))
3402
879k
        ctxt->sax->comment(ctxt->userData, comment);
3403
3404
995k
    xmlFree(buf);
3405
995k
}
3406
3407
/**
3408
 * @deprecated Internal function, don't use.
3409
 *
 * This is an inert stub: the context argument is marked
 * ATTRIBUTE_UNUSED and the body unconditionally returns 0.
 *
3410
 * @param ctxt  an HTML parser context
3411
 * @returns 0
3412
 */
3413
int
3414
0
htmlParseCharRef(htmlParserCtxt *ctxt ATTRIBUTE_UNUSED) {
3415
0
    return(0);
3416
0
}
3417
3418
3419
/**
3420
 * Parse a DOCTYPE SYSTEM or PUBLIC literal.
3421
 *
 * The literal must start with a double or single quote at the current
 * position; otherwise NULL is returned without consuming input. The
 * closing quote is consumed only if it is present after the data.
 *
3422
 * @param ctxt  an HTML parser context
3423
 * @returns the literal or NULL in case of error.
3424
 */
3425
3426
static xmlChar *
3427
32.5k
htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
3428
32.5k
    xmlChar *ret;
3429
32.5k
    /* Length limit handed to htmlParseData; larger with HTML_PARSE_HUGE. */
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3430
4.16k
                    XML_MAX_TEXT_LENGTH :
3431
32.5k
                    XML_MAX_NAME_LENGTH;
3432
3433
32.5k
    if (CUR == '"') {
3434
1.16k
        SKIP(1);
3435
1.16k
        ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength);
3436
1.16k
        if (CUR == '"')
3437
848
            SKIP(1);
3438
31.3k
    } else if (CUR == '\'') {
3439
2.98k
        SKIP(1);
3440
2.98k
        ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength);
3441
2.98k
        if (CUR == '\'')
3442
2.11k
            SKIP(1);
3443
28.3k
    } else {
3444
28.3k
        return(NULL);
3445
28.3k
    }
3446
3447
4.15k
    return(ret);
3448
32.5k
}
3449
3450
/**
 * Skip the remainder of a DOCTYPE declaration.
 *
 * Advances the input up to and including the next '>', or to the end
 * of the input if there is none, and updates the input's line and
 * column counters along the way.
 *
 * @param ctxt  an HTML parser context
 */
static void
3451
44.0k
htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) {
3452
44.0k
    const xmlChar *in;
3453
44.0k
    size_t avail;
3454
44.0k
    int eof = PARSER_PROGRESSIVE(ctxt);
3455
44.0k
    int line, col;
3456
3457
44.0k
    line = ctxt->input->line;
3458
44.0k
    col = ctxt->input->col;
3459
3460
44.0k
    in = ctxt->input->cur;
3461
44.0k
    avail = ctxt->input->end - in;
3462
3463
36.7M
    while (!PARSER_STOPPED(ctxt)) {
3464
36.7M
        int cur;
3465
3466
36.7M
        /*
         * Running low on data: publish the position, try to read
         * more, then reload the pointers from the input.
         */
        if ((!eof) && (avail <= 64)) {
3467
2.65k
            size_t oldAvail = avail;
3468
3469
2.65k
            ctxt->input->cur = in;
3470
3471
2.65k
            xmlParserGrow(ctxt);
3472
3473
2.65k
            in = ctxt->input->cur;
3474
2.65k
            avail = ctxt->input->end - in;
3475
3476
2.65k
            /* Nothing new arrived: stop trying to grow. */
            if (oldAvail == avail)
3477
947
                eof = 1;
3478
2.65k
        }
3479
3480
36.7M
        if (avail == 0)
3481
902
            break;
3482
3483
36.7M
        col += 1;
3484
3485
36.7M
        cur = *in;
3486
36.7M
        if (cur == '>') {
3487
42.7k
            /* Consume the '>' and stop. */
            in += 1;
3488
42.7k
            break;
3489
36.6M
        } else if (cur == 0x0A) {
3490
143k
            line += 1;
3491
143k
            col = 1;
3492
143k
        }
3493
3494
36.6M
        in += 1;
3495
36.6M
        avail -= 1;
3496
3497
36.6M
        SHRINK;
3498
36.6M
    }
3499
3500
44.0k
    ctxt->input->cur = in;
3501
44.0k
    ctxt->input->line = line;
3502
44.0k
    ctxt->input->col = col;
3503
44.0k
}
3504
3505
/**
3506
 * Parse a DOCTYPE declaration.
3507
 *
 * Reads an optional name, then an optional PUBLIC identifier pair or
 * SYSTEM identifier (keywords are matched through the UPPER/UPP
 * macros), skips whatever remains of the declaration and reports the
 * result through the SAX `internalSubset` callback. Any of name,
 * publicId and URI may be NULL when the callback is invoked. All
 * three strings are freed before returning.
 *
3508
 * @param ctxt  an HTML parser context
3509
 */
3510
3511
static void
3512
44.0k
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3513
44.0k
    xmlChar *name = NULL;
3514
44.0k
    xmlChar *publicId = NULL;
3515
44.0k
    xmlChar *URI = NULL;
3516
44.0k
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3517
6.91k
                    XML_MAX_TEXT_LENGTH :
3518
44.0k
                    XML_MAX_NAME_LENGTH;
3519
3520
    /*
3521
     * We know that '<!DOCTYPE' has been detected.
3522
     */
3523
44.0k
    SKIP(9);
3524
3525
44.0k
    SKIP_BLANKS;
3526
3527
44.0k
    if ((ctxt->input->cur < ctxt->input->end) && (CUR != '>')) {
3528
40.3k
        name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength);
3529
3530
40.3k
        /* With HTML_PARSE_HTML5, lowercase ASCII letters of the name. */
        if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) {
3531
14.0k
            xmlChar *cur;
3532
3533
2.57M
            for (cur = name; *cur; cur++) {
3534
2.56M
                if (IS_UPPER(*cur))
3535
5.00k
                    *cur += 0x20;
3536
2.56M
            }
3537
14.0k
        }
3538
3539
40.3k
        SKIP_BLANKS;
3540
40.3k
    }
3541
3542
    /*
3543
     * Check for SystemID and publicId
3544
     */
3545
44.0k
    if ((UPPER == 'P') && (UPP(1) == 'U') &&
3546
32.5k
  (UPP(2) == 'B') && (UPP(3) == 'L') &&
3547
31.7k
  (UPP(4) == 'I') && (UPP(5) == 'C')) {
3548
29.5k
        SKIP(6);
3549
29.5k
        SKIP_BLANKS;
3550
29.5k
  publicId = htmlParseDoctypeLiteral(ctxt);
3551
29.5k
  /* Without a public identifier, no system identifier is read. */
  if (publicId == NULL)
3552
27.6k
            goto bogus;
3553
1.91k
        SKIP_BLANKS;
3554
1.91k
  URI = htmlParseDoctypeLiteral(ctxt);
3555
14.4k
    } else if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3556
2.19k
               (UPP(2) == 'S') && (UPP(3) == 'T') &&
3557
1.66k
         (UPP(4) == 'E') && (UPP(5) == 'M')) {
3558
1.08k
        SKIP(6);
3559
1.08k
        SKIP_BLANKS;
3560
1.08k
  URI = htmlParseDoctypeLiteral(ctxt);
3561
1.08k
    }
3562
3563
44.0k
/* Every path, not only the goto above, continues here. */
bogus:
3564
44.0k
    htmlSkipBogusDoctype(ctxt);
3565
3566
    /*
3567
     * Create or update the document accordingly to the DOCTYPE
3568
     */
3569
44.0k
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3570
44.0k
  (!ctxt->disableSAX))
3571
43.6k
  ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI);
3572
3573
44.0k
    xmlFree(name);
3574
44.0k
    xmlFree(URI);
3575
44.0k
    xmlFree(publicId);
3576
44.0k
}
3577
3578
/**
3579
 * parse an attribute
3580
 *
3581
 * [41] Attribute ::= Name Eq AttValue
3582
 *
3583
 * [25] Eq ::= S? '=' S?
3584
 *
3585
 * With namespace:
3586
 *
3587
 * [NS 11] Attribute ::= QName Eq AttValue
3588
 *
3589
 * Also the case QName == xmlns:??? is handled independently as a namespace
3590
 * definition.
3591
 *
 * NOTE(review): the productions and the xmlns remark above read like
 * text inherited from the XML parser. The code below only parses a
 * name via htmlParseHTMLName and, if an '=' follows, a value via
 * htmlParseAttValue; it contains no xmlns special case.
 *
 * `*value` is set to NULL when the name cannot be parsed or when no
 * '=' follows the name.
 *
3592
 * @param ctxt  an HTML parser context
3593
 * @param value  a xmlChar ** used to store the value of the attribute
3594
 * @returns the attribute name, and the value in *value.
3595
 */
3596
3597
static xmlHashedString
3598
5.28M
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3599
5.28M
    xmlHashedString hname;
3600
5.28M
    xmlChar *val = NULL;
3601
3602
5.28M
    *value = NULL;
3603
5.28M
    hname = htmlParseHTMLName(ctxt, 1);
3604
5.28M
    /* No name: hand back the result as is (its name field is NULL). */
    if (hname.name == NULL)
3605
20
        return(hname);
3606
3607
    /*
3608
     * read the value
3609
     */
3610
5.28M
    SKIP_BLANKS;
3611
5.28M
    if (CUR == '=') {
3612
78.5k
        SKIP(1);
3613
78.5k
  SKIP_BLANKS;
3614
78.5k
  val = htmlParseAttValue(ctxt);
3615
78.5k
    }
3616
3617
5.28M
    *value = val;
3618
5.28M
    return(hname);
3619
5.28M
}
3620
3621
/**
 * Check that an encoding leaves basic ASCII markup unchanged.
 *
 * Creates an input conversion handler for `encoding`, runs an 8-byte
 * ASCII sample through it and requires that all 8 bytes are consumed
 * and come out byte-for-byte identical. A parser error is raised when
 * the handler cannot be created or the sample does not round-trip.
 *
 * @param ctxt  an HTML parser context
 * @param encoding  the encoding name
 * @returns 0 if the encoding is acceptable, -1 otherwise.
 */
static int
3622
htmlCharEncCheckAsciiCompatible(htmlParserCtxt *ctxt,
3623
5.93k
                                const xmlChar *encoding) {
3624
5.93k
    xmlCharEncodingHandler *handler;
3625
5.93k
    /* 8-byte sample; the ninth array element holds the terminator. */
    xmlChar in[9] = "<a A=\"/>";
3626
5.93k
    xmlChar out[9];
3627
5.93k
    int inlen, outlen;
3628
5.93k
    int res;
3629
3630
5.93k
    res = xmlCreateCharEncodingHandler(
3631
5.93k
            (const char *) encoding,
3632
5.93k
            XML_ENC_INPUT | XML_ENC_HTML,
3633
5.93k
            ctxt->convImpl, ctxt->convCtxt,
3634
5.93k
            &handler);
3635
5.93k
    if (res != XML_ERR_OK) {
3636
3.01k
        xmlFatalErr(ctxt, res, (const char *) encoding);
3637
3.01k
        return(-1);
3638
3.01k
    }
3639
3640
    /* UTF-8 */
3641
2.92k
    if (handler == NULL)
3642
207
        return(0);
3643
3644
2.71k
    inlen = 8;
3645
2.71k
    outlen = 8;
3646
2.71k
    res = xmlEncInputChunk(handler, out, &outlen, in, &inlen, /* flush */ 1);
3647
3648
2.71k
    xmlCharEncCloseFunc(handler);
3649
3650
2.71k
    /* Require success, full consumption and identical output bytes. */
    if ((res != XML_ENC_ERR_SUCCESS) ||
3651
1.73k
        (inlen != 8) || (outlen != 8) ||
3652
1.73k
        (memcmp(in, out, 8) != 0)) {
3653
978
        htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3654
978
                     "Encoding %s isn't ASCII-compatible", encoding, NULL);
3655
978
        return(-1);
3656
978
    }
3657
3658
1.73k
    return(0);
3659
2.71k
}
3660
3661
/**
3662
 * Handle charset encoding in meta tag.
3663
 *
 * `atts` is read as a NULL-terminated sequence of name/value pairs.
 * A `charset` attribute with a value is used directly and ends the
 * scan. Otherwise, if `http-equiv` equals "Content-Type" and a
 * `content` attribute is present, the encoding is extracted from the
 * content value with htmlParseContentType. Attribute names and the
 * http-equiv value are compared case-insensitively.
 *
 * The encoding is declared only if it passes
 * htmlCharEncCheckAsciiCompatible.
 *
3664
 * @param ctxt  an HTML parser context
3665
 * @param atts  the attributes values
3666
 */
3667
static void
3668
8.70k
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3669
8.70k
    int i;
3670
8.70k
    const xmlChar *att, *value;
3671
8.70k
    int isContentType = 0;
3672
8.70k
    const xmlChar *content = NULL;
3673
8.70k
    xmlChar *encoding = NULL;
3674
3675
8.70k
    if ((ctxt == NULL) || (atts == NULL))
3676
0
  return;
3677
3678
8.70k
    i = 0;
3679
8.70k
    att = atts[i++];
3680
19.7k
    while (att != NULL) {
3681
15.2k
  value = atts[i++];
3682
15.2k
        /* Attributes without a value are ignored. */
        if (value != NULL) {
3683
11.8k
            if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3684
3.69k
                (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3685
3.35k
                isContentType = 1;
3686
8.47k
            } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3687
4.21k
                encoding = xmlStrdup(value);
3688
4.21k
                if (encoding == NULL)
3689
30
                    htmlErrMemory(ctxt);
3690
4.21k
                break;
3691
4.26k
            } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3692
3.43k
                /* Points into atts; not a copy. */
                content = value;
3693
3.43k
            }
3694
11.8k
        }
3695
11.0k
  att = atts[i++];
3696
11.0k
    }
3697
3698
8.70k
    if ((encoding == NULL) && (isContentType) && (content != NULL)) {
3699
3.09k
        htmlMetaEncodingOffsets off;
3700
3701
3.09k
        /* off.start and off.end delimit the encoding inside content. */
        if (htmlParseContentType(content, &off)) {
3702
1.77k
            encoding = xmlStrndup(content + off.start, off.end - off.start);
3703
1.77k
            if (encoding == NULL)
3704
22
                htmlErrMemory(ctxt);
3705
1.77k
        }
3706
3.09k
    }
3707
3708
8.70k
    if (encoding != NULL) {
3709
5.93k
        if (htmlCharEncCheckAsciiCompatible(ctxt, encoding) < 0) {
3710
3.99k
            xmlFree(encoding);
3711
3.99k
            return;
3712
3.99k
        }
3713
3714
1.94k
        /*
         * NOTE(review): encoding is not freed on this path, so
         * xmlSetDeclaredEncoding presumably takes ownership of it --
         * confirm.
         */
        xmlSetDeclaredEncoding(ctxt, encoding);
3715
1.94k
    }
3716
8.70k
}
3717
3718
/**
3719
 * Inserts a new attribute into the hash table.
3720
 *
3721
 * @param ctxt  parser context
3722
 * @param size  size of the hash table
3723
 * @param name  attribute name
3724
 * @param hashValue  hash value of name
3725
 * @param aindex  attribute index (this is a multiple of 5)
3726
 * @returns INT_MAX if no existing attribute was found, the attribute
3727
 * index if an attribute was found, -1 if a memory allocation failed.
3728
 */
3729
static int
3730
htmlAttrHashInsert(xmlParserCtxtPtr ctxt, unsigned size, const xmlChar *name,
3731
4.09M
                   unsigned hashValue, int aindex) {
3732
4.09M
    xmlAttrHashBucket *table = ctxt->attrHash;
3733
4.09M
    xmlAttrHashBucket *bucket;
3734
4.09M
    unsigned hindex;
3735
3736
4.09M
    hindex = hashValue & (size - 1);
3737
4.09M
    bucket = &table[hindex];
3738
3739
4.18M
    while (bucket->index >= 0) {
3740
3.69M
        const xmlChar **atts = &ctxt->atts[bucket->index];
3741
3742
3.69M
        if (name == atts[0])
3743
3.60M
            return(bucket->index);
3744
3745
96.0k
        hindex++;
3746
96.0k
        bucket++;
3747
96.0k
        if (hindex >= size) {
3748
2.79k
            hindex = 0;
3749
2.79k
            bucket = table;
3750
2.79k
        }
3751
96.0k
    }
3752
3753
488k
    bucket->index = aindex;
3754
3755
488k
    return(INT_MAX);
3756
4.09M
}
3757
3758
/**
3759
 * parse a start of tag either for rule element or
3760
 * EmptyElement. In both case we don't parse the tag closing chars.
3761
 *
3762
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3763
 *
3764
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3765
 *
3766
 * With namespace:
3767
 *
3768
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3769
 *
3770
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3771
 *
3772
 * @param ctxt  an HTML parser context
3773
 * @returns 0 in case of success, -1 in case of error and 1 if discarded
3774
 */
3775
3776
static void
3777
262k
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3778
262k
    const xmlChar *name;
3779
262k
    const xmlChar *attname;
3780
262k
    xmlChar *attvalue;
3781
262k
    const xmlChar **atts;
3782
262k
    int nbatts = 0;
3783
262k
    int maxatts;
3784
262k
    int i;
3785
262k
    int discardtag = 0;
3786
3787
262k
    ctxt->endCheckState = 0;
3788
3789
262k
    SKIP(1);
3790
3791
262k
    atts = ctxt->atts;
3792
262k
    maxatts = ctxt->maxatts;
3793
3794
262k
    GROW;
3795
262k
    name = htmlParseHTMLName(ctxt, 0).name;
3796
262k
    if (name == NULL)
3797
111
        return;
3798
3799
262k
    if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
3800
        /*
3801
         * Check for auto-closure of HTML elements.
3802
         */
3803
189k
        htmlAutoClose(ctxt, name);
3804
3805
        /*
3806
         * Check for implied HTML elements.
3807
         */
3808
189k
        htmlCheckImplied(ctxt, name);
3809
3810
        /*
3811
         * Avoid html at any level > 0, head at any level != 1
3812
         * or any attempt to recurse body
3813
         */
3814
189k
        if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3815
993
            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3816
993
                         "htmlParseStartTag: misplaced <html> tag\n",
3817
993
                         name, NULL);
3818
993
            discardtag = 1;
3819
993
            ctxt->depth++;
3820
993
        }
3821
189k
        if ((ctxt->nameNr != 1) &&
3822
184k
            (xmlStrEqual(name, BAD_CAST"head"))) {
3823
1.24k
            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3824
1.24k
                         "htmlParseStartTag: misplaced <head> tag\n",
3825
1.24k
                         name, NULL);
3826
1.24k
            discardtag = 1;
3827
1.24k
            ctxt->depth++;
3828
1.24k
        }
3829
189k
        if (xmlStrEqual(name, BAD_CAST"body")) {
3830
2.88k
            int indx;
3831
460k
            for (indx = 0;indx < ctxt->nameNr;indx++) {
3832
457k
                if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3833
926
                    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3834
926
                                 "htmlParseStartTag: misplaced <body> tag\n",
3835
926
                                 name, NULL);
3836
926
                    discardtag = 1;
3837
926
                    ctxt->depth++;
3838
926
                }
3839
457k
            }
3840
2.88k
        }
3841
189k
    }
3842
3843
    /*
3844
     * Now parse the attributes, it ends up with the ending
3845
     *
3846
     * (S Attribute)* S?
3847
     */
3848
262k
    SKIP_BLANKS;
3849
5.59M
    while ((ctxt->input->cur < ctxt->input->end) &&
3850
5.59M
           (CUR != '>') &&
3851
5.34M
     ((CUR != '/') || (NXT(1) != '>')) &&
3852
5.33M
           (PARSER_STOPPED(ctxt) == 0)) {
3853
5.33M
        xmlHashedString hattname;
3854
3855
        /*  unexpected-solidus-in-tag */
3856
5.33M
        if (CUR == '/') {
3857
77.1k
            SKIP(1);
3858
77.1k
            SKIP_BLANKS;
3859
77.1k
            continue;
3860
77.1k
        }
3861
5.26M
  GROW;
3862
5.26M
  hattname = htmlParseAttribute(ctxt, &attvalue);
3863
5.26M
        attname = hattname.name;
3864
3865
5.26M
        if (attname != NULL) {
3866
      /*
3867
       * Add the pair to atts
3868
       */
3869
5.26M
      if (nbatts + 4 > maxatts) {
3870
14.1k
          const xmlChar **tmp;
3871
14.1k
                unsigned *utmp;
3872
14.1k
                int newSize;
3873
3874
14.1k
                newSize = xmlGrowCapacity(maxatts,
3875
14.1k
                                          sizeof(tmp[0]) * 2 + sizeof(utmp[0]),
3876
14.1k
                                          11, HTML_MAX_ATTRS);
3877
14.1k
    if (newSize < 0) {
3878
0
        htmlErrMemory(ctxt);
3879
0
        goto failed;
3880
0
    }
3881
14.1k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3882
14.1k
                if (newSize < 2)
3883
7.57k
                    newSize = 2;
3884
14.1k
#endif
3885
14.1k
          tmp = xmlRealloc(atts, newSize * sizeof(tmp[0]) * 2);
3886
14.1k
    if (tmp == NULL) {
3887
34
        htmlErrMemory(ctxt);
3888
34
        goto failed;
3889
34
    }
3890
14.0k
                atts = tmp;
3891
14.0k
    ctxt->atts = tmp;
3892
3893
14.0k
          utmp = xmlRealloc(ctxt->attallocs, newSize * sizeof(utmp[0]));
3894
14.0k
    if (utmp == NULL) {
3895
56
        htmlErrMemory(ctxt);
3896
56
        goto failed;
3897
56
    }
3898
14.0k
                ctxt->attallocs = utmp;
3899
3900
14.0k
                maxatts = newSize * 2;
3901
14.0k
    ctxt->maxatts = maxatts;
3902
14.0k
      }
3903
3904
5.26M
            ctxt->attallocs[nbatts/2] = hattname.hashValue;
3905
5.26M
      atts[nbatts++] = attname;
3906
5.26M
      atts[nbatts++] = attvalue;
3907
3908
5.26M
            attvalue = NULL;
3909
5.26M
  }
3910
3911
5.26M
failed:
3912
5.26M
        if (attvalue != NULL)
3913
34
            xmlFree(attvalue);
3914
3915
5.26M
  SKIP_BLANKS;
3916
5.26M
    }
3917
3918
262k
    if (ctxt->input->cur >= ctxt->input->end) {
3919
3.80k
        discardtag = 1;
3920
3.80k
        goto done;
3921
3.80k
    }
3922
3923
    /*
3924
     * Verify that attribute names are unique.
3925
     */
3926
258k
    if (nbatts > 2) {
3927
20.8k
        unsigned attrHashSize;
3928
20.8k
        int j, k;
3929
3930
20.8k
        attrHashSize = 4;
3931
46.4k
        while (attrHashSize / 2 < (unsigned) nbatts / 2)
3932
25.5k
            attrHashSize *= 2;
3933
3934
20.8k
        if (attrHashSize > ctxt->attrHashMax) {
3935
3.65k
            xmlAttrHashBucket *tmp;
3936
3937
3.65k
            tmp = xmlRealloc(ctxt->attrHash, attrHashSize * sizeof(tmp[0]));
3938
3.65k
            if (tmp == NULL) {
3939
63
                htmlErrMemory(ctxt);
3940
63
                goto done;
3941
63
            }
3942
3943
3.59k
            ctxt->attrHash = tmp;
3944
3.59k
            ctxt->attrHashMax = attrHashSize;
3945
3.59k
        }
3946
3947
20.7k
        memset(ctxt->attrHash, -1, attrHashSize * sizeof(ctxt->attrHash[0]));
3948
3949
4.11M
        for (i = 0, j = 0, k = 0; i < nbatts; i += 2, k++) {
3950
4.09M
            unsigned hashValue;
3951
4.09M
            int res;
3952
3953
4.09M
            attname = atts[i];
3954
4.09M
            hashValue = ctxt->attallocs[k] | 0x80000000;
3955
3956
4.09M
            res = htmlAttrHashInsert(ctxt, attrHashSize, attname,
3957
4.09M
                                    hashValue, j);
3958
4.09M
            if (res < 0)
3959
0
                continue;
3960
3961
4.09M
            if (res == INT_MAX) {
3962
488k
                atts[j] = atts[i];
3963
488k
                atts[j+1] = atts[i+1];
3964
488k
                j += 2;
3965
3.60M
            } else {
3966
3.60M
                xmlFree((xmlChar *) atts[i+1]);
3967
3.60M
            }
3968
4.09M
        }
3969
3970
20.7k
        nbatts = j;
3971
20.7k
    }
3972
3973
258k
    if (nbatts > 0) {
3974
72.4k
        atts[nbatts] = NULL;
3975
72.4k
        atts[nbatts + 1] = NULL;
3976
3977
    /*
3978
     * Apple's new libiconv is so broken that you routinely run into
3979
     * issues when fuzz testing (by accident with an uninstrumented
3980
     * libiconv). Here's a harmless (?) example:
3981
     *
3982
     * printf '>'             | iconv -f shift_jis -t utf-8 | hexdump -C
3983
     * printf '\xfc\x00\x00'  | iconv -f shift_jis -t utf-8 | hexdump -C
3984
     * printf '>\xfc\x00\x00' | iconv -f shift_jis -t utf-8 | hexdump -C
3985
     *
3986
     * The last command fails to detect the illegal sequence.
3987
     */
3988
72.4k
#if !defined(__APPLE__) || \
3989
72.4k
    !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
3990
        /*
3991
         * Handle specific association to the META tag
3992
         */
3993
72.4k
        if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
3994
18.6k
            (strcmp((char *) name, "meta") == 0)) {
3995
8.70k
            htmlCheckMeta(ctxt, atts);
3996
8.70k
        }
3997
72.4k
#endif
3998
72.4k
    }
3999
4000
    /*
4001
     * SAX: Start of Element !
4002
     */
4003
258k
    if (!discardtag) {
4004
255k
        if (ctxt->options & HTML_PARSE_HTML5) {
4005
71.9k
            if (ctxt->nameNr > 0)
4006
59.3k
                htmlnamePop(ctxt);
4007
71.9k
        }
4008
4009
255k
  htmlnamePush(ctxt, name);
4010
255k
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4011
255k
      if (nbatts != 0)
4012
72.3k
    ctxt->sax->startElement(ctxt->userData, name, atts);
4013
183k
      else
4014
183k
    ctxt->sax->startElement(ctxt->userData, name, NULL);
4015
255k
  }
4016
255k
    }
4017
4018
262k
done:
4019
262k
    if (atts != NULL) {
4020
1.79M
        for (i = 1;i < nbatts;i += 2) {
4021
1.65M
      if (atts[i] != NULL)
4022
75.0k
    xmlFree((xmlChar *) atts[i]);
4023
1.65M
  }
4024
139k
    }
4025
262k
}
4026
4027
/**
4028
 * parse an end of tag
4029
 *
4030
 * [42] ETag ::= '</' Name S? '>'
4031
 *
4032
 * With namespace
4033
 *
4034
 * [NS 9] ETag ::= '</' QName S? '>'
4035
 *
4036
 * @param ctxt  an HTML parser context
4037
 * @returns 1 if the current level should be closed.
4038
 */
4039
4040
static void
4041
htmlParseEndTag(htmlParserCtxtPtr ctxt)
4042
50.5k
{
4043
50.5k
    const xmlChar *name;
4044
50.5k
    const xmlChar *oldname;
4045
50.5k
    int i;
4046
4047
50.5k
    ctxt->endCheckState = 0;
4048
4049
50.5k
    SKIP(2);
4050
4051
50.5k
    if (ctxt->input->cur >= ctxt->input->end) {
4052
43
        htmlStartCharData(ctxt);
4053
43
        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4054
36
            (ctxt->sax->characters != NULL))
4055
36
            ctxt->sax->characters(ctxt->userData,
4056
36
                                  BAD_CAST "</", 2);
4057
43
        return;
4058
43
    }
4059
4060
50.4k
    if (CUR == '>') {
4061
2.41k
        SKIP(1);
4062
2.41k
        return;
4063
2.41k
    }
4064
4065
48.0k
    if (!IS_ASCII_LETTER(CUR)) {
4066
1.97k
        htmlParseComment(ctxt, /* bogus */ 1);
4067
1.97k
        return;
4068
1.97k
    }
4069
4070
46.0k
    name = htmlParseHTMLName(ctxt, 0).name;
4071
46.0k
    if (name == NULL)
4072
27
        return;
4073
4074
    /*
4075
     * Parse and ignore attributes.
4076
     */
4077
46.0k
    SKIP_BLANKS;
4078
74.4k
    while ((ctxt->input->cur < ctxt->input->end) &&
4079
74.1k
           (CUR != '>') &&
4080
29.1k
     ((CUR != '/') || (NXT(1) != '>')) &&
4081
28.4k
           (ctxt->instate != XML_PARSER_EOF)) {
4082
28.3k
        xmlChar *attvalue = NULL;
4083
4084
        /*  unexpected-solidus-in-tag */
4085
28.3k
        if (CUR == '/') {
4086
2.14k
            SKIP(1);
4087
2.14k
            SKIP_BLANKS;
4088
2.14k
            continue;
4089
2.14k
        }
4090
26.2k
  GROW;
4091
26.2k
  htmlParseAttribute(ctxt, &attvalue);
4092
26.2k
        if (attvalue != NULL)
4093
1.03k
            xmlFree(attvalue);
4094
4095
26.2k
  SKIP_BLANKS;
4096
26.2k
    }
4097
4098
46.0k
    if (CUR == '>') {
4099
45.0k
        SKIP(1);
4100
45.0k
    } else if ((CUR == '/') && (NXT(1) == '>')) {
4101
710
        SKIP(2);
4102
710
    } else {
4103
308
        return;
4104
308
    }
4105
4106
45.7k
    if (ctxt->options & HTML_PARSE_HTML5) {
4107
13.9k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4108
13.9k
            ctxt->sax->endElement(ctxt->userData, name);
4109
13.9k
        return;
4110
13.9k
    }
4111
4112
    /*
4113
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
4114
     * out now.
4115
     */
4116
31.7k
    if ((ctxt->depth > 0) &&
4117
6.37k
        (xmlStrEqual(name, BAD_CAST "html") ||
4118
6.04k
         xmlStrEqual(name, BAD_CAST "body") ||
4119
5.56k
   xmlStrEqual(name, BAD_CAST "head"))) {
4120
1.36k
  ctxt->depth--;
4121
1.36k
  return;
4122
1.36k
    }
4123
4124
    /*
4125
     * If the name read is not one of the element in the parsing stack
4126
     * then return, it's just an error.
4127
     */
4128
1.09M
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4129
1.08M
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4130
15.1k
            break;
4131
1.08M
    }
4132
30.4k
    if (i < 0) {
4133
15.3k
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4134
15.3k
               "Unexpected end tag : %s\n", name, NULL);
4135
15.3k
        return;
4136
15.3k
    }
4137
4138
4139
    /*
4140
     * Check for auto-closure of HTML elements.
4141
     */
4142
4143
15.1k
    htmlAutoCloseOnClose(ctxt, name);
4144
4145
    /*
4146
     * Well formedness constraints, opening and closing must match.
4147
     * With the exception that the autoclose may have popped stuff out
4148
     * of the stack.
4149
     */
4150
15.1k
    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4151
686
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4152
686
                     "Opening and ending tag mismatch: %s and %s\n",
4153
686
                     name, ctxt->name);
4154
686
    }
4155
4156
    /*
4157
     * SAX: End of Tag
4158
     */
4159
15.1k
    oldname = ctxt->name;
4160
15.1k
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4161
14.4k
  htmlParserFinishElementParsing(ctxt);
4162
14.4k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4163
14.4k
            ctxt->sax->endElement(ctxt->userData, name);
4164
14.4k
        htmlnamePop(ctxt);
4165
14.4k
    }
4166
15.1k
}
4167
4168
/**
4169
 * Parse a content: comment, sub-element, reference or text.
4170
 * New version for non recursive htmlParseElementInternal
4171
 *
4172
 * @param ctxt  an HTML parser context
4173
 */
4174
4175
static void
4176
9.84k
htmlParseContent(htmlParserCtxtPtr ctxt) {
4177
9.84k
    GROW;
4178
4179
1.39M
    while ((PARSER_STOPPED(ctxt) == 0) &&
4180
1.39M
           (ctxt->input->cur < ctxt->input->end)) {
4181
1.38M
        int mode;
4182
4183
1.38M
        mode = ctxt->endCheckState;
4184
4185
1.38M
        if ((mode == 0) && (CUR == '<')) {
4186
1.28M
            if (NXT(1) == '/') {
4187
25.2k
          htmlParseEndTag(ctxt);
4188
1.26M
            } else if (NXT(1) == '!') {
4189
                /*
4190
                 * Sometimes DOCTYPE arrives in the middle of the document
4191
                 */
4192
387k
                if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4193
23.2k
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4194
21.5k
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4195
20.8k
                    (UPP(8) == 'E')) {
4196
20.4k
                    htmlParseDocTypeDecl(ctxt);
4197
366k
                } else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4198
8.72k
                    SKIP(4);
4199
8.72k
                    htmlParseComment(ctxt, /* bogus */ 0);
4200
357k
                } else {
4201
357k
                    SKIP(2);
4202
357k
                    htmlParseComment(ctxt, /* bogus */ 1);
4203
357k
                }
4204
873k
            } else if (NXT(1) == '?') {
4205
7.68k
                SKIP(1);
4206
7.68k
                htmlParseComment(ctxt, /* bogus */ 1);
4207
865k
            } else if (IS_ASCII_LETTER(NXT(1))) {
4208
131k
                htmlParseElementInternal(ctxt);
4209
734k
            } else {
4210
734k
                htmlStartCharData(ctxt);
4211
734k
                if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4212
734k
                    (ctxt->sax->characters != NULL))
4213
734k
                    ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4214
734k
                SKIP(1);
4215
734k
            }
4216
1.28M
        } else {
4217
104k
            htmlParseCharData(ctxt, /* partial */ 0);
4218
104k
        }
4219
4220
1.38M
        SHRINK;
4221
1.38M
        GROW;
4222
1.38M
    }
4223
4224
9.84k
    if (ctxt->input->cur >= ctxt->input->end)
4225
8.78k
        htmlAutoCloseOnEnd(ctxt);
4226
9.84k
}
4227
4228
/**
4229
 * Parse an HTML element, new version, non recursive
4230
 *
4231
 * @param ctxt  an HTML parser context
4232
 */
4233
static int
4234
262k
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4235
262k
    const xmlChar *name;
4236
262k
    const htmlElemDesc * info;
4237
262k
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4238
4239
262k
    if ((ctxt == NULL) || (ctxt->input == NULL))
4240
0
  return(0);
4241
4242
    /* Capture start position */
4243
262k
    if (ctxt->record_info) {
4244
0
        node_info.begin_pos = ctxt->input->consumed +
4245
0
                          (CUR_PTR - ctxt->input->base);
4246
0
  node_info.begin_line = ctxt->input->line;
4247
0
    }
4248
4249
262k
    htmlParseStartTag(ctxt);
4250
262k
    name = ctxt->name;
4251
262k
    if (name == NULL)
4252
2.29k
        return(0);
4253
4254
260k
    if (ctxt->record_info)
4255
0
        htmlNodeInfoPush(ctxt, &node_info);
4256
4257
    /*
4258
     * Check for an Empty Element labeled the XML/SGML way
4259
     */
4260
260k
    if ((CUR == '/') && (NXT(1) == '>')) {
4261
3.88k
        SKIP(2);
4262
3.88k
        htmlParserFinishElementParsing(ctxt);
4263
3.88k
        if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4264
2.24k
            if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4265
2.24k
                ctxt->sax->endElement(ctxt->userData, name);
4266
2.24k
        }
4267
3.88k
  htmlnamePop(ctxt);
4268
3.88k
  return(0);
4269
3.88k
    }
4270
4271
256k
    if (CUR != '>')
4272
2.29k
        return(0);
4273
254k
    SKIP(1);
4274
4275
    /*
4276
     * Lookup the info for that element.
4277
     */
4278
254k
    info = htmlTagLookup(name);
4279
4280
    /*
4281
     * Check for an Empty Element from DTD definition
4282
     */
4283
254k
    if ((info != NULL) && (info->empty)) {
4284
18.2k
        htmlParserFinishElementParsing(ctxt);
4285
18.2k
        if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4286
10.2k
            if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4287
10.2k
                ctxt->sax->endElement(ctxt->userData, name);
4288
10.2k
        }
4289
18.2k
  htmlnamePop(ctxt);
4290
18.2k
  return(0);
4291
18.2k
    }
4292
4293
235k
    if (info != NULL)
4294
79.1k
        ctxt->endCheckState = info->dataMode;
4295
4296
235k
    return(1);
4297
254k
}
4298
4299
/**
4300
 * This is kept for compatibility with previous code versions
4301
 *
4302
 * @deprecated Internal function, don't use.
4303
 *
4304
 * @param ctxt  an HTML parser context
4305
 */
4306
void
4307
0
htmlParseElement(htmlParserCtxt *ctxt) {
4308
0
    const xmlChar *oldptr;
4309
0
    int depth;
4310
4311
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
4312
0
  return;
4313
4314
0
    if (htmlParseElementInternal(ctxt) == 0)
4315
0
        return;
4316
4317
    /*
4318
     * Parse the content of the element:
4319
     */
4320
0
    depth = ctxt->nameNr;
4321
0
    while (CUR != 0) {
4322
0
  oldptr = ctxt->input->cur;
4323
0
  htmlParseContent(ctxt);
4324
0
  if (oldptr==ctxt->input->cur) break;
4325
0
  if (ctxt->nameNr < depth) break;
4326
0
    }
4327
4328
0
    if (CUR == 0) {
4329
0
  htmlAutoCloseOnEnd(ctxt);
4330
0
    }
4331
0
}
4332
4333
/**
4334
 * @param ctxt  parser context
4335
 * @param input  parser input
4336
 * @returns a node list.
4337
 */
4338
xmlNode *
4339
0
htmlCtxtParseContentInternal(htmlParserCtxt *ctxt, xmlParserInput *input) {
4340
0
    xmlNodePtr root;
4341
0
    xmlNodePtr list = NULL;
4342
0
    xmlChar *rootName = BAD_CAST "#root";
4343
4344
0
    root = xmlNewDocNode(ctxt->myDoc, NULL, rootName, NULL);
4345
0
    if (root == NULL) {
4346
0
        htmlErrMemory(ctxt);
4347
0
        return(NULL);
4348
0
    }
4349
4350
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
4351
0
        xmlFreeNode(root);
4352
0
        return(NULL);
4353
0
    }
4354
4355
0
    htmlnamePush(ctxt, rootName);
4356
0
    nodePush(ctxt, root);
4357
4358
0
    htmlParseContent(ctxt);
4359
4360
    /*
4361
     * Only check for truncated multi-byte sequences
4362
     */
4363
0
    xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
4364
4365
    /* TODO: Use xmlCtxtIsCatastrophicError */
4366
0
    if (ctxt->errNo != XML_ERR_NO_MEMORY) {
4367
0
        xmlNodePtr cur;
4368
4369
        /*
4370
         * Unlink newly created node list.
4371
         */
4372
0
        list = root->children;
4373
0
        root->children = NULL;
4374
0
        root->last = NULL;
4375
0
        for (cur = list; cur != NULL; cur = cur->next)
4376
0
            cur->parent = NULL;
4377
0
    }
4378
4379
0
    nodePop(ctxt);
4380
0
    htmlnamePop(ctxt);
4381
4382
0
    xmlCtxtPopInput(ctxt);
4383
4384
0
    xmlFreeNode(root);
4385
0
    return(list);
4386
0
}
4387
4388
/**
4389
 * Parse an HTML document and invoke the SAX handlers. This is useful
4390
 * if you're only interested in custom SAX callbacks. If you want a
4391
 * document tree, use #htmlCtxtParseDocument.
4392
 *
4393
 * @param ctxt  an HTML parser context
4394
 * @returns 0, -1 in case of error.
4395
 */
4396
int
4397
9.84k
htmlParseDocument(htmlParserCtxt *ctxt) {
4398
9.84k
    if ((ctxt == NULL) || (ctxt->input == NULL))
4399
0
  return(-1);
4400
4401
9.84k
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4402
9.84k
        ctxt->sax->setDocumentLocator(ctxt->userData,
4403
9.84k
                (xmlSAXLocator *) &xmlDefaultSAXLocator);
4404
9.84k
    }
4405
4406
9.84k
    xmlDetectEncoding(ctxt);
4407
4408
    /*
4409
     * TODO: Implement HTML5 prescan algorithm
4410
     */
4411
4412
    /*
4413
     * This is wrong but matches long-standing behavior. In most
4414
     * cases, a document starting with an XML declaration will
4415
     * specify UTF-8. The HTML5 prescan algorithm handles
4416
     * XML declarations in a better way.
4417
     */
4418
9.84k
    if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4419
9.41k
        (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4420
73
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4421
4422
    /*
4423
     * Wipe out everything which is before the first '<'
4424
     */
4425
9.84k
    SKIP_BLANKS;
4426
4427
9.84k
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4428
9.81k
  ctxt->sax->startDocument(ctxt->userData);
4429
4430
    /*
4431
     * Parse possible comments, PIs or doctype declarations
4432
     * before any content.
4433
     */
4434
9.84k
    ctxt->instate = XML_PARSER_MISC;
4435
191k
    while (CUR == '<') {
4436
186k
        if (NXT(1) == '!') {
4437
181k
            if ((NXT(2) == '-') && (NXT(3) == '-')) {
4438
1.58k
                SKIP(4);
4439
1.58k
                htmlParseComment(ctxt, /* bogus */ 0);
4440
179k
            } else if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4441
2.66k
                       (UPP(4) == 'C') && (UPP(5) == 'T') &&
4442
2.21k
                       (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4443
1.77k
                       (UPP(8) == 'E')) {
4444
1.56k
                htmlParseDocTypeDecl(ctxt);
4445
1.56k
                ctxt->instate = XML_PARSER_PROLOG;
4446
177k
            } else {
4447
177k
                SKIP(2);
4448
177k
                htmlParseComment(ctxt, /* bogus */ 1);
4449
177k
            }
4450
181k
        } else if (NXT(1) == '?') {
4451
995
            SKIP(1);
4452
995
            htmlParseComment(ctxt, /* bogus */ 1);
4453
4.80k
        } else {
4454
4.80k
            break;
4455
4.80k
        }
4456
182k
  SKIP_BLANKS;
4457
182k
        GROW;
4458
182k
    }
4459
4460
    /*
4461
     * Time to start parsing the tree itself
4462
     */
4463
9.84k
    ctxt->instate = XML_PARSER_CONTENT;
4464
9.84k
    htmlParseContent(ctxt);
4465
4466
    /*
4467
     * Only check for truncated multi-byte sequences
4468
     */
4469
9.84k
    xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
4470
4471
    /*
4472
     * SAX: end of the document processing.
4473
     */
4474
9.84k
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4475
9.84k
        ctxt->sax->endDocument(ctxt->userData);
4476
4477
9.84k
    if (! ctxt->wellFormed) return(-1);
4478
7.98k
    return(0);
4479
9.84k
}
4480
4481
4482
/************************************************************************
4483
 *                  *
4484
 *      Parser contexts handling      *
4485
 *                  *
4486
 ************************************************************************/
4487
4488
/**
4489
 * Initialize a parser context
4490
 *
4491
 * @param ctxt  an HTML parser context
4492
 * @param sax  SAX handler
4493
 * @param userData  user data
4494
 * @returns 0 in case of success and -1 in case of error
4495
 */
4496
static int
4497
htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4498
                   void *userData)
4499
21.0k
{
4500
21.0k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4501
21.0k
    size_t initialNodeTabSize = 1;
4502
#else
4503
    size_t initialNodeTabSize = 10;
4504
#endif
4505
4506
21.0k
    if (ctxt == NULL) return(-1);
4507
21.0k
    memset(ctxt, 0, sizeof(htmlParserCtxt));
4508
4509
21.0k
    ctxt->dict = xmlDictCreate();
4510
21.0k
    if (ctxt->dict == NULL)
4511
2
  return(-1);
4512
4513
21.0k
    if (ctxt->sax == NULL)
4514
21.0k
        ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4515
21.0k
    if (ctxt->sax == NULL)
4516
2
  return(-1);
4517
21.0k
    if (sax == NULL) {
4518
21.0k
        memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4519
21.0k
        xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4520
21.0k
        ctxt->userData = ctxt;
4521
21.0k
    } else {
4522
0
        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4523
0
        ctxt->userData = userData ? userData : ctxt;
4524
0
    }
4525
4526
    /* Allocate the Input stack */
4527
21.0k
    ctxt->inputTab = (htmlParserInputPtr *)
4528
21.0k
                      xmlMalloc(sizeof(htmlParserInputPtr));
4529
21.0k
    if (ctxt->inputTab == NULL)
4530
2
  return(-1);
4531
21.0k
    ctxt->inputNr = 0;
4532
21.0k
    ctxt->inputMax = 1;
4533
21.0k
    ctxt->input = NULL;
4534
21.0k
    ctxt->version = NULL;
4535
21.0k
    ctxt->encoding = NULL;
4536
21.0k
    ctxt->standalone = -1;
4537
21.0k
    ctxt->instate = XML_PARSER_START;
4538
4539
    /* Allocate the Node stack */
4540
21.0k
    ctxt->nodeTab = xmlMalloc(initialNodeTabSize * sizeof(htmlNodePtr));
4541
21.0k
    if (ctxt->nodeTab == NULL)
4542
2
  return(-1);
4543
21.0k
    ctxt->nodeNr = 0;
4544
21.0k
    ctxt->nodeMax = initialNodeTabSize;
4545
21.0k
    ctxt->node = NULL;
4546
4547
    /* Allocate the Name stack */
4548
21.0k
    ctxt->nameTab = xmlMalloc(initialNodeTabSize * sizeof(xmlChar *));
4549
21.0k
    if (ctxt->nameTab == NULL)
4550
2
  return(-1);
4551
21.0k
    ctxt->nameNr = 0;
4552
21.0k
    ctxt->nameMax = initialNodeTabSize;
4553
21.0k
    ctxt->name = NULL;
4554
4555
21.0k
    ctxt->nodeInfoTab = NULL;
4556
21.0k
    ctxt->nodeInfoNr  = 0;
4557
21.0k
    ctxt->nodeInfoMax = 0;
4558
4559
21.0k
    ctxt->myDoc = NULL;
4560
21.0k
    ctxt->wellFormed = 1;
4561
21.0k
    ctxt->replaceEntities = 0;
4562
21.0k
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4563
21.0k
    ctxt->html = INSERT_INITIAL;
4564
21.0k
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4565
21.0k
    ctxt->vctxt.userData = ctxt;
4566
21.0k
    ctxt->vctxt.error = xmlParserValidityError;
4567
21.0k
    ctxt->vctxt.warning = xmlParserValidityWarning;
4568
21.0k
    ctxt->record_info = 0;
4569
21.0k
    ctxt->validate = 0;
4570
21.0k
    ctxt->checkIndex = 0;
4571
21.0k
    ctxt->catalogs = NULL;
4572
21.0k
    xmlInitNodeInfoSeq(&ctxt->node_seq);
4573
21.0k
    return(0);
4574
21.0k
}
4575
4576
/**
4577
 * Free all the memory used by a parser context. However the parsed
4578
 * document in `ctxt->myDoc` is not freed.
4579
 *
4580
 * @param ctxt  an HTML parser context
4581
 */
4582
void
4583
htmlFreeParserCtxt(htmlParserCtxt *ctxt)
4584
19.7k
{
4585
19.7k
    xmlFreeParserCtxt(ctxt);
4586
19.7k
}
4587
4588
/**
4589
 * Allocate and initialize a new HTML parser context.
4590
 *
4591
 * This can be used to parse HTML documents into DOM trees with
4592
 * functions like #xmlCtxtReadFile or #xmlCtxtReadMemory.
4593
 *
4594
 * See #htmlCtxtUseOptions for parser options.
4595
 *
4596
 * See #xmlCtxtSetErrorHandler for advanced error handling.
4597
 *
4598
 * See #htmlNewSAXParserCtxt for custom SAX parsers.
4599
 *
4600
 * @returns the htmlParserCtxt or NULL in case of allocation error
4601
 */
4602
htmlParserCtxt *
4603
htmlNewParserCtxt(void)
4604
11.1k
{
4605
11.1k
    return(htmlNewSAXParserCtxt(NULL, NULL));
4606
11.1k
}
4607
4608
/**
4609
 * Allocate and initialize a new HTML SAX parser context. If `userData`
4610
 * is NULL, the parser context will be passed as user data.
4611
 *
4612
 * @since 2.11.0
4613
 *
4614
 * If you want support older versions, it's best to invoke
4615
 * #htmlNewParserCtxt and set `ctxt->sax` with struct assignment.
4616
 *
4617
 * Also see #htmlNewParserCtxt.
4618
 *
4619
 * @param sax  SAX handler
4620
 * @param userData  user data
4621
 * @returns the htmlParserCtxt or NULL in case of allocation error
4622
 */
4623
htmlParserCtxt *
4624
htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
4625
21.0k
{
4626
21.0k
    xmlParserCtxtPtr ctxt;
4627
4628
21.0k
    xmlInitParser();
4629
4630
21.0k
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4631
21.0k
    if (ctxt == NULL)
4632
23
  return(NULL);
4633
21.0k
    memset(ctxt, 0, sizeof(xmlParserCtxt));
4634
21.0k
    if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
4635
10
        htmlFreeParserCtxt(ctxt);
4636
10
  return(NULL);
4637
10
    }
4638
21.0k
    return(ctxt);
4639
21.0k
}
4640
4641
static htmlParserCtxtPtr
4642
htmlCreateMemoryParserCtxtInternal(const char *url,
4643
                                   const char *buffer, size_t size,
4644
0
                                   const char *encoding) {
4645
0
    xmlParserCtxtPtr ctxt;
4646
0
    xmlParserInputPtr input;
4647
4648
0
    if (buffer == NULL)
4649
0
  return(NULL);
4650
4651
0
    ctxt = htmlNewParserCtxt();
4652
0
    if (ctxt == NULL)
4653
0
  return(NULL);
4654
4655
0
    input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, 0);
4656
0
    if (input == NULL) {
4657
0
  xmlFreeParserCtxt(ctxt);
4658
0
        return(NULL);
4659
0
    }
4660
4661
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
4662
0
        xmlFreeInputStream(input);
4663
0
        xmlFreeParserCtxt(ctxt);
4664
0
        return(NULL);
4665
0
    }
4666
4667
0
    return(ctxt);
4668
0
}
4669
4670
/**
4671
 * Create a parser context for an HTML in-memory document. The input
4672
 * buffer must not contain any terminating null bytes.
4673
 *
4674
 * @deprecated Use #htmlNewParserCtxt and #htmlCtxtReadMemory.
4675
 *
4676
 * @param buffer  a pointer to a char array
4677
 * @param size  the size of the array
4678
 * @returns the new parser context or NULL
4679
 */
4680
htmlParserCtxt *
4681
0
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4682
0
    if (size <= 0)
4683
0
  return(NULL);
4684
4685
0
    return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
4686
0
}
4687
4688
/**
4689
 * Create a parser context for a null-terminated string.
4690
 *
4691
 * @param str  a pointer to an array of xmlChar
4692
 * @param url  URL of the document (optional)
4693
 * @param encoding  encoding (optional)
4694
 * @returns the new parser context or NULL if a memory allocation failed.
4695
 */
4696
static htmlParserCtxtPtr
4697
htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
4698
0
                        const char *encoding) {
4699
0
    xmlParserCtxtPtr ctxt;
4700
0
    xmlParserInputPtr input;
4701
4702
0
    if (str == NULL)
4703
0
  return(NULL);
4704
4705
0
    ctxt = htmlNewParserCtxt();
4706
0
    if (ctxt == NULL)
4707
0
  return(NULL);
4708
4709
0
    input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str,
4710
0
                                      encoding, 0);
4711
0
    if (input == NULL) {
4712
0
  xmlFreeParserCtxt(ctxt);
4713
0
  return(NULL);
4714
0
    }
4715
4716
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
4717
0
        xmlFreeInputStream(input);
4718
0
        xmlFreeParserCtxt(ctxt);
4719
0
        return(NULL);
4720
0
    }
4721
4722
0
    return(ctxt);
4723
0
}
4724
4725
#ifdef LIBXML_PUSH_ENABLED
4726
/************************************************************************
4727
 *                  *
4728
 *  Progressive parsing interfaces        *
4729
 *                  *
4730
 ************************************************************************/
4731
4732
/*
 * Scanner states used by htmlParseLookupGt while searching for the '>'
 * that ends a tag. LSTATE_TAG_NAME must stay 0 because the scan state
 * is reset by storing 0 in the context.
 */
typedef enum {
    LSTATE_TAG_NAME           = 0, /* in the tag name */
    LSTATE_BEFORE_ATTR_NAME   = 1, /* whitespace before an attribute name */
    LSTATE_ATTR_NAME          = 2, /* in an attribute name */
    LSTATE_AFTER_ATTR_NAME    = 3, /* whitespace after an attribute name */
    LSTATE_BEFORE_ATTR_VALUE  = 4, /* after '=', before the value */
    LSTATE_ATTR_VALUE_DQUOTED = 5, /* in a "double-quoted" value */
    LSTATE_ATTR_VALUE_SQUOTED = 6, /* in a 'single-quoted' value */
    LSTATE_ATTR_VALUE_UNQUOTED = 7 /* in an unquoted value */
} xmlLookupStates;
4742
4743
/**
4744
 * Check whether there's enough data in the input buffer to finish parsing
4745
 * a tag. This has to take quotes into account.
4746
 *
4747
 * @param ctxt  an HTML parser context
4748
 */
4749
static int
4750
137k
htmlParseLookupGt(xmlParserCtxtPtr ctxt) {
4751
137k
    const xmlChar *cur;
4752
137k
    const xmlChar *end = ctxt->input->end;
4753
137k
    int state = ctxt->endCheckState;
4754
137k
    size_t index;
4755
4756
137k
    if (ctxt->checkIndex == 0)
4757
85.0k
        cur = ctxt->input->cur + 2; /* Skip '<a' or '</' */
4758
52.6k
    else
4759
52.6k
        cur = ctxt->input->cur + ctxt->checkIndex;
4760
4761
49.2M
    while (cur < end) {
4762
49.2M
        int c = *cur++;
4763
4764
49.2M
        if (state != LSTATE_ATTR_VALUE_SQUOTED &&
4765
47.7M
            state != LSTATE_ATTR_VALUE_DQUOTED) {
4766
36.4M
            if (c == '/' &&
4767
28.4k
                state != LSTATE_BEFORE_ATTR_VALUE &&
4768
28.1k
                state != LSTATE_ATTR_VALUE_UNQUOTED) {
4769
27.3k
                state = LSTATE_BEFORE_ATTR_NAME;
4770
27.3k
                continue;
4771
36.4M
            } else if (c == '>') {
4772
82.5k
                ctxt->checkIndex = 0;
4773
82.5k
                ctxt->endCheckState = 0;
4774
82.5k
                return(0);
4775
82.5k
            }
4776
36.4M
        }
4777
4778
49.1M
        switch (state) {
4779
7.10M
            case LSTATE_TAG_NAME:
4780
7.10M
                if (IS_WS_HTML(c))
4781
18.3k
                    state = LSTATE_BEFORE_ATTR_NAME;
4782
7.10M
                break;
4783
4784
237k
            case LSTATE_BEFORE_ATTR_NAME:
4785
237k
                if (!IS_WS_HTML(c))
4786
44.0k
                    state = LSTATE_ATTR_NAME;
4787
237k
                break;
4788
4789
12.9M
            case LSTATE_ATTR_NAME:
4790
12.9M
                if (c == '=')
4791
20.1k
                    state = LSTATE_BEFORE_ATTR_VALUE;
4792
12.9M
                else if (IS_WS_HTML(c))
4793
1.91M
                    state = LSTATE_AFTER_ATTR_NAME;
4794
12.9M
                break;
4795
4796
3.98M
            case LSTATE_AFTER_ATTR_NAME:
4797
3.98M
                if (c == '=')
4798
271
                    state = LSTATE_BEFORE_ATTR_VALUE;
4799
3.98M
                else if (!IS_WS_HTML(c))
4800
1.89M
                    state = LSTATE_ATTR_NAME;
4801
3.98M
                break;
4802
4803
24.9k
            case LSTATE_BEFORE_ATTR_VALUE:
4804
24.9k
                if (c == '"')
4805
3.68k
                    state = LSTATE_ATTR_VALUE_DQUOTED;
4806
21.3k
                else if (c == '\'')
4807
408
                    state = LSTATE_ATTR_VALUE_SQUOTED;
4808
20.9k
                else if (!IS_WS_HTML(c))
4809
14.3k
                    state = LSTATE_ATTR_VALUE_UNQUOTED;
4810
24.9k
                break;
4811
4812
11.2M
            case LSTATE_ATTR_VALUE_DQUOTED:
4813
11.2M
                if (c == '"')
4814
3.58k
                    state = LSTATE_BEFORE_ATTR_NAME;
4815
11.2M
                break;
4816
4817
1.46M
            case LSTATE_ATTR_VALUE_SQUOTED:
4818
1.46M
                if (c == '\'')
4819
384
                    state = LSTATE_BEFORE_ATTR_NAME;
4820
1.46M
                break;
4821
4822
12.0M
            case LSTATE_ATTR_VALUE_UNQUOTED:
4823
12.0M
                if (IS_WS_HTML(c))
4824
4.21k
                    state = LSTATE_BEFORE_ATTR_NAME;
4825
12.0M
                break;
4826
49.1M
        }
4827
49.1M
    }
4828
4829
55.1k
    index = cur - ctxt->input->cur;
4830
55.1k
    if (index > LONG_MAX) {
4831
0
        ctxt->checkIndex = 0;
4832
0
        ctxt->endCheckState = 0;
4833
0
        return(0);
4834
0
    }
4835
55.1k
    ctxt->checkIndex = index;
4836
55.1k
    ctxt->endCheckState = state;
4837
55.1k
    return(-1);
4838
55.1k
}
4839
4840
/**
4841
 * Check whether the input buffer contains a string.
4842
 *
4843
 * @param ctxt  an XML parser context
4844
 * @param startDelta  delta to apply at the start
4845
 * @param str  string
4846
 * @param strLen  length of string
4847
 * @param extraLen  extra length
4848
 */
4849
static int
4850
htmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta,
4851
595k
                      const char *str, size_t strLen, size_t extraLen) {
4852
595k
    const xmlChar *end = ctxt->input->end;
4853
595k
    const xmlChar *cur, *term;
4854
595k
    size_t index, rescan;
4855
595k
    int ret;
4856
4857
595k
    if (ctxt->checkIndex == 0) {
4858
248k
        cur = ctxt->input->cur + startDelta;
4859
347k
    } else {
4860
347k
        cur = ctxt->input->cur + ctxt->checkIndex;
4861
347k
    }
4862
4863
595k
    term = BAD_CAST strstr((const char *) cur, str);
4864
595k
    if ((term != NULL) &&
4865
257k
        ((size_t) (ctxt->input->end - term) >= extraLen + 1)) {
4866
257k
        ctxt->checkIndex = 0;
4867
4868
257k
        if (term - ctxt->input->cur > INT_MAX / 2)
4869
0
            ret = INT_MAX / 2;
4870
257k
        else
4871
257k
            ret = term - ctxt->input->cur;
4872
4873
257k
        return(ret);
4874
257k
    }
4875
4876
    /* Rescan (strLen + extraLen - 1) characters. */
4877
337k
    rescan = strLen + extraLen - 1;
4878
337k
    if ((size_t) (end - cur) <= rescan)
4879
4.81k
        end = cur;
4880
332k
    else
4881
332k
        end -= rescan;
4882
337k
    index = end - ctxt->input->cur;
4883
337k
    if (index > INT_MAX / 2) {
4884
0
        ctxt->checkIndex = 0;
4885
0
        ret = INT_MAX / 2;
4886
337k
    } else {
4887
337k
        ctxt->checkIndex = index;
4888
337k
        ret = -1;
4889
337k
    }
4890
4891
337k
    return(ret);
4892
595k
}
4893
4894
/**
4895
 * Try to find a comment end tag in the input stream
4896
 * The search includes "-->" as well as WHATWG-recommended
4897
 * incorrectly-closed tags.
4898
 *
4899
 * @param ctxt  an HTML parser context
4900
 * @returns the index to the current parsing point if the full
4901
 * sequence is available, -1 otherwise.
4902
 */
4903
static int
4904
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
4905
126k
{
4906
126k
    int mark = 0;
4907
126k
    int offset;
4908
4909
140k
    while (1) {
4910
140k
  mark = htmlParseLookupString(ctxt, 2, "--", 2, 0);
4911
140k
  if (mark < 0)
4912
122k
            break;
4913
        /*
4914
         * <!-->    is a complete comment, but
4915
         * <!--!>   is not
4916
         * <!---!>  is not
4917
         * <!----!> is
4918
         */
4919
17.8k
        if ((NXT(mark+2) == '>') ||
4920
15.3k
      ((mark >= 4) && (NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
4921
2.87k
            ctxt->checkIndex = 0;
4922
2.87k
      break;
4923
2.87k
  }
4924
14.9k
        offset = (NXT(mark+2) == '!') ? 3 : 2;
4925
14.9k
        if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
4926
1.18k
      ctxt->checkIndex = mark;
4927
1.18k
            return(-1);
4928
1.18k
        }
4929
13.7k
  ctxt->checkIndex = mark + 1;
4930
13.7k
    }
4931
125k
    return mark;
4932
126k
}
4933
4934
4935
/**
 * Try to progress on parsing
 *
 * Runs the push parser state machine on the data currently buffered in
 * `ctxt->input`. Each loop iteration dispatches on `ctxt->instate` and
 * the function returns as soon as a state cannot proceed with the
 * available data. The look-ahead checks (htmlParseLookupGt,
 * htmlParseLookupString, htmlParseLookupCommentEnd and the minimum
 * length tests) are only applied when `terminate` is zero.
 *
 * This function does not return a value; progress is reflected in the
 * parser context.
 *
 * @param ctxt  an HTML parser context
 * @param terminate  last chunk indicator
 */
static void
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    while (PARSER_STOPPED(ctxt) == 0) {
        htmlParserInputPtr in;
        size_t avail;

        in = ctxt->input;
        if (in == NULL) break;
        /* Number of unread bytes in the current input. */
        avail = in->end - in->cur;

        switch (ctxt->instate) {
            case XML_PARSER_EOF:
                /*
                 * Document parsing is done !
                 */
                return;

            case XML_PARSER_START:
                /*
                 * Very first chars read from the document flow.
                 * Wait for at least 4 bytes unless this is the last chunk.
                 */
                if ((!terminate) && (avail < 4))
                    return;

                xmlDetectEncoding(ctxt);

                /*
                 * TODO: Implement HTML5 prescan algorithm
                 */

                /*
                 * This is wrong but matches long-standing behavior. In most
                 * cases, a document starting with an XML declaration will
                 * specify UTF-8. The HTML5 prescan algorithm handles
                 * XML declarations in a better way.
                 */
                if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
                    (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
                    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
                }

                /* fall through */

            case XML_PARSER_XML_DECL:
                /* Emit the document-start SAX events. */
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
                    ctxt->sax->setDocumentLocator(ctxt->userData,
                            (xmlSAXLocator *) &xmlDefaultSAXLocator);
                }
                if ((ctxt->sax) && (ctxt->sax->startDocument) &&
                    (!ctxt->disableSAX))
                    ctxt->sax->startDocument(ctxt->userData);

                /* Allow callback to modify state for tests */
                if ((ctxt->instate == XML_PARSER_START) ||
                    (ctxt->instate == XML_PARSER_XML_DECL))
                    ctxt->instate = XML_PARSER_MISC;
                break;

            case XML_PARSER_START_TAG:
                /* Wait until the whole start tag is buffered. */
                if ((!terminate) &&
                    (htmlParseLookupGt(ctxt) < 0))
                    return;

                htmlParseElementInternal(ctxt);

                ctxt->instate = XML_PARSER_CONTENT;
                break;

            case XML_PARSER_MISC: /* initial */
            case XML_PARSER_PROLOG: /* before html */
            case XML_PARSER_CONTENT: {
                int mode;

                /* Blanks are skipped only before the content state. */
                if ((ctxt->instate == XML_PARSER_MISC) ||
                    (ctxt->instate == XML_PARSER_PROLOG)) {
                    SKIP_BLANKS;
                    avail = in->end - in->cur;
                }

                if (avail < 1)
                    return;
                /*
                 * Note that endCheckState is also used by
                 * htmlParseLookupGt.
                 */
                mode = ctxt->endCheckState;

                if (mode != 0) {
                    /* A non-zero mode routes all input to character data. */
                    if (htmlParseCharData(ctxt, !terminate) == 0)
                        return;
                } else if (in->cur[0] == '<') {
                    int next;

                    /*
                     * Peek at the character after '<'. A lone '<' at the
                     * very end of the final chunk is handled as if it
                     * were followed by a space.
                     */
                    if (avail < 2) {
                        if (!terminate)
                            return;
                        next = ' ';
                    } else {
                        next = in->cur[1];
                    }

                    if (next == '!') {
                        if ((!terminate) && (avail < 4))
                            return;
                        /* "<!--": a regular comment. */
                        if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
                            if ((!terminate) &&
                                (htmlParseLookupCommentEnd(ctxt) < 0))
                                return;
                            SKIP(4);
                            htmlParseComment(ctxt, /* bogus */ 0);
                            /* don't change state */
                            break;
                        }

                        if ((!terminate) && (avail < 9))
                            return;
                        /* "<!DOCTYPE", matched via UPP(). */
                        if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
                            (UPP(4) == 'C') && (UPP(5) == 'T') &&
                            (UPP(6) == 'Y') && (UPP(7) == 'P') &&
                            (UPP(8) == 'E')) {
                            if ((!terminate) &&
                                (htmlParseLookupString(ctxt, 9, ">", 1,
                                                       0) < 0))
                                return;
                            htmlParseDocTypeDecl(ctxt);
                            if (ctxt->instate == XML_PARSER_MISC)
                                ctxt->instate = XML_PARSER_PROLOG;
                        } else {
                            /* Any other "<!..." is a bogus comment. */
                            if ((!terminate) &&
                                (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
                                return;
                            SKIP(2);
                            htmlParseComment(ctxt, /* bogus */ 1);
                        }
                    } else if (next == '?') {
                        /* "<?...": also parsed as a bogus comment. */
                        if ((!terminate) &&
                            (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
                            return;
                        SKIP(1);
                        htmlParseComment(ctxt, /* bogus */ 1);
                        /* don't change state */
                    } else if (next == '/') {
                        /* "</": handled by the END_TAG state next round. */
                        ctxt->instate = XML_PARSER_END_TAG;
                        ctxt->checkIndex = 0;
                    } else if (IS_ASCII_LETTER(next)) {
                        /* "<a": handled by the START_TAG state next round. */
                        ctxt->instate = XML_PARSER_START_TAG;
                        ctxt->checkIndex = 0;
                    } else {
                        /* A '<' that starts no markup is literal text. */
                        ctxt->instate = XML_PARSER_CONTENT;
                        htmlStartCharData(ctxt);
                        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
                            (ctxt->sax->characters != NULL))
                            ctxt->sax->characters(ctxt->userData,
                                                  BAD_CAST "<", 1);
                        SKIP(1);
                    }
                } else {
                    ctxt->instate = XML_PARSER_CONTENT;
                    /*
                     * We follow the logic of the XML push parser:
                     * with less than HTML_PARSER_BIG_BUFFER_SIZE bytes
                     * buffered, wait until the next '<' is available.
                     */
                    if (avail < HTML_PARSER_BIG_BUFFER_SIZE) {
                        if ((!terminate) &&
                            (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
                            return;
                    }
                    ctxt->checkIndex = 0;
                    if (htmlParseCharData(ctxt, !terminate) == 0)
                        return;
                }

                break;
            }

            case XML_PARSER_END_TAG:
                /* Wait until the whole end tag is buffered. */
                if ((!terminate) &&
                    (htmlParseLookupGt(ctxt) < 0))
                    return;
                htmlParseEndTag(ctxt);
                ctxt->instate = XML_PARSER_CONTENT;
                ctxt->checkIndex = 0;
                break;

            default:
                /* Unexpected state: report it and stop parsing. */
                htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
                             "HPP: internal error\n", NULL, NULL);
                ctxt->instate = XML_PARSER_EOF;
                break;
        }
    }
}
5133
5134
/**
5135
 * Parse a chunk of memory in push parser mode.
5136
 *
5137
 * Assumes that the parser context was initialized with
5138
 * #htmlCreatePushParserCtxt.
5139
 *
5140
 * The last chunk, which will often be empty, must be marked with
5141
 * the `terminate` flag. With the default SAX callbacks, the resulting
5142
 * document will be available in `ctxt->myDoc`. This pointer will not
5143
 * be freed by the library.
5144
 *
5145
 * If the document isn't well-formed, `ctxt->myDoc` is set to NULL.
5146
 *
5147
 * Since 2.14.0, #xmlCtxtGetDocument can be used to retrieve the
5148
 * result document.
5149
 *
5150
 * @param ctxt  an HTML parser context
5151
 * @param chunk  chunk of memory
5152
 * @param size  size of chunk in bytes
5153
 * @param terminate  last chunk indicator
5154
 * @returns an xmlParserErrors code (0 on success).
5155
 */
5156
int
5157
htmlParseChunk(htmlParserCtxt *ctxt, const char *chunk, int size,
5158
473k
              int terminate) {
5159
473k
    if ((ctxt == NULL) ||
5160
473k
        (ctxt->input == NULL) || (ctxt->input->buf == NULL) ||
5161
473k
        (size < 0) ||
5162
473k
        ((size > 0) && (chunk == NULL)))
5163
0
  return(XML_ERR_ARGUMENT);
5164
473k
    if (PARSER_STOPPED(ctxt) != 0)
5165
588
        return(ctxt->errNo);
5166
5167
472k
    if (size > 0)  {
5168
472k
  size_t pos = ctxt->input->cur - ctxt->input->base;
5169
472k
  int res;
5170
5171
472k
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5172
472k
        xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5173
472k
  if (res < 0) {
5174
653
            htmlParseErr(ctxt, ctxt->input->buf->error,
5175
653
                         "xmlParserInputBufferPush failed", NULL, NULL);
5176
653
      return (ctxt->errNo);
5177
653
  }
5178
472k
    }
5179
5180
471k
    htmlParseTryOrFinish(ctxt, terminate);
5181
5182
471k
    if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) {
5183
9.21k
        htmlAutoCloseOnEnd(ctxt);
5184
5185
        /*
5186
         * Only check for truncated multi-byte sequences
5187
         */
5188
9.21k
        xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
5189
5190
9.21k
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5191
9.21k
            ctxt->sax->endDocument(ctxt->userData);
5192
5193
9.21k
  ctxt->instate = XML_PARSER_EOF;
5194
9.21k
    }
5195
5196
471k
    return((xmlParserErrors) ctxt->errNo);
5197
472k
}
5198
5199
/************************************************************************
5200
 *                  *
5201
 *      User entry points       *
5202
 *                  *
5203
 ************************************************************************/
5204
5205
/**
5206
 * Create a parser context for using the HTML parser in push mode.
5207
 *
5208
 * @param sax  a SAX handler (optional)
5209
 * @param user_data  The user data returned on SAX callbacks (optional)
5210
 * @param chunk  a pointer to an array of chars (optional)
5211
 * @param size  number of chars in the array
5212
 * @param filename  only used for error reporting (optional)
5213
 * @param enc  encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5214
 * @returns the new parser context or NULL if a memory allocation
5215
 * failed.
5216
 */
5217
htmlParserCtxt *
5218
htmlCreatePushParserCtxt(htmlSAXHandler *sax, void *user_data,
5219
                         const char *chunk, int size, const char *filename,
5220
9.89k
       xmlCharEncoding enc) {
5221
9.89k
    htmlParserCtxtPtr ctxt;
5222
9.89k
    htmlParserInputPtr input;
5223
9.89k
    const char *encoding;
5224
5225
9.89k
    ctxt = htmlNewSAXParserCtxt(sax, user_data);
5226
9.89k
    if (ctxt == NULL)
5227
14
  return(NULL);
5228
5229
9.87k
    encoding = xmlGetCharEncodingName(enc);
5230
9.87k
    input = xmlNewPushInput(filename, chunk, size);
5231
9.87k
    if (input == NULL) {
5232
6
  htmlFreeParserCtxt(ctxt);
5233
6
  return(NULL);
5234
6
    }
5235
5236
9.87k
    if (xmlCtxtPushInput(ctxt, input) < 0) {
5237
0
        xmlFreeInputStream(input);
5238
0
        xmlFreeParserCtxt(ctxt);
5239
0
        return(NULL);
5240
0
    }
5241
5242
9.87k
    if (encoding != NULL)
5243
0
        xmlSwitchEncodingName(ctxt, encoding);
5244
5245
9.87k
    return(ctxt);
5246
9.87k
}
5247
#endif /* LIBXML_PUSH_ENABLED */
5248
5249
/**
5250
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5251
 * to handle parse events. If sax is NULL, fallback to the default DOM
5252
 * behavior and return a tree.
5253
 *
5254
 * @deprecated Use #htmlNewSAXParserCtxt and #htmlCtxtReadDoc.
5255
 *
5256
 * @param cur  a pointer to an array of xmlChar
5257
 * @param encoding  a free form C string describing the HTML document encoding, or NULL
5258
 * @param sax  the SAX handler block
5259
 * @param userData  if using SAX, this pointer will be provided on callbacks.
5260
 * @returns the resulting document tree unless SAX is NULL or the document is
5261
 *     not well formed.
5262
 */
5263
5264
xmlDoc *
5265
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5266
0
                htmlSAXHandler *sax, void *userData) {
5267
0
    htmlDocPtr ret;
5268
0
    htmlParserCtxtPtr ctxt;
5269
5270
0
    if (cur == NULL)
5271
0
        return(NULL);
5272
5273
0
    ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5274
0
    if (ctxt == NULL)
5275
0
        return(NULL);
5276
5277
0
    if (sax != NULL) {
5278
0
        *ctxt->sax = *sax;
5279
0
        ctxt->userData = userData;
5280
0
    }
5281
5282
0
    htmlParseDocument(ctxt);
5283
0
    ret = ctxt->myDoc;
5284
0
    htmlFreeParserCtxt(ctxt);
5285
5286
0
    return(ret);
5287
0
}
5288
5289
/**
5290
 * Parse an HTML in-memory document and build a tree.
5291
 *
5292
 * @deprecated Use #htmlReadDoc.
5293
 *
5294
 * This function uses deprecated global parser options.
5295
 *
5296
 * @param cur  a pointer to an array of xmlChar
5297
 * @param encoding  the encoding (optional)
5298
 * @returns the resulting document tree
5299
 */
5300
5301
xmlDoc *
5302
0
htmlParseDoc(const xmlChar *cur, const char *encoding) {
5303
0
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5304
0
}
5305
5306
5307
/**
5308
 * Create a parser context to read from a file.
5309
 *
5310
 * @deprecated Use #htmlNewParserCtxt and #htmlCtxtReadFile.
5311
 *
5312
 * A non-NULL encoding overrides encoding declarations in the document.
5313
 *
5314
 * Automatic support for ZLIB/Compress compressed document is provided
5315
 * by default if found at compile-time.
5316
 *
5317
 * @param filename  the filename
5318
 * @param encoding  optional encoding
5319
 * @returns the new parser context or NULL if a memory allocation failed.
5320
 */
5321
htmlParserCtxt *
5322
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5323
0
{
5324
0
    htmlParserCtxtPtr ctxt;
5325
0
    htmlParserInputPtr input;
5326
5327
0
    if (filename == NULL)
5328
0
        return(NULL);
5329
5330
0
    ctxt = htmlNewParserCtxt();
5331
0
    if (ctxt == NULL) {
5332
0
  return(NULL);
5333
0
    }
5334
5335
0
    input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5336
0
    if (input == NULL) {
5337
0
  xmlFreeParserCtxt(ctxt);
5338
0
  return(NULL);
5339
0
    }
5340
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
5341
0
        xmlFreeInputStream(input);
5342
0
        xmlFreeParserCtxt(ctxt);
5343
0
        return(NULL);
5344
0
    }
5345
5346
0
    return(ctxt);
5347
0
}
5348
5349
/**
5350
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5351
 * compressed document is provided by default if found at compile-time.
5352
 * It use the given SAX function block to handle the parsing callback.
5353
 * If sax is NULL, fallback to the default DOM tree building routines.
5354
 *
5355
 * @deprecated Use #htmlNewSAXParserCtxt and #htmlCtxtReadFile.
5356
 *
5357
 * @param filename  the filename
5358
 * @param encoding  encoding (optional)
5359
 * @param sax  the SAX handler block
5360
 * @param userData  if using SAX, this pointer will be provided on callbacks.
5361
 * @returns the resulting document tree unless SAX is NULL or the document is
5362
 *     not well formed.
5363
 */
5364
5365
xmlDoc *
5366
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandler *sax,
5367
0
                 void *userData) {
5368
0
    htmlDocPtr ret;
5369
0
    htmlParserCtxtPtr ctxt;
5370
0
    htmlSAXHandlerPtr oldsax = NULL;
5371
5372
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5373
0
    if (ctxt == NULL) return(NULL);
5374
0
    if (sax != NULL) {
5375
0
  oldsax = ctxt->sax;
5376
0
        ctxt->sax = sax;
5377
0
        ctxt->userData = userData;
5378
0
    }
5379
5380
0
    htmlParseDocument(ctxt);
5381
5382
0
    ret = ctxt->myDoc;
5383
0
    if (sax != NULL) {
5384
0
        ctxt->sax = oldsax;
5385
0
        ctxt->userData = NULL;
5386
0
    }
5387
0
    htmlFreeParserCtxt(ctxt);
5388
5389
0
    return(ret);
5390
0
}
5391
5392
/**
5393
 * Parse an HTML file and build a tree.
5394
 *
5395
 * @param filename  the filename
5396
 * @param encoding  encoding (optional)
5397
 * @returns the resulting document tree
5398
 */
5399
5400
xmlDoc *
5401
0
htmlParseFile(const char *filename, const char *encoding) {
5402
0
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5403
0
}
5404
5405
/**
5406
 * Set and return the previous value for handling HTML omitted tags.
5407
 *
5408
 * @deprecated Use HTML_PARSE_NOIMPLIED
5409
 *
5410
 * @param val  int 0 or 1
5411
 * @returns the last value for 0 for no handling, 1 for auto insertion.
5412
 */
5413
5414
int
5415
0
htmlHandleOmittedElem(int val) {
5416
0
    int old = htmlOmittedDefaultValue;
5417
5418
0
    htmlOmittedDefaultValue = val;
5419
0
    return(old);
5420
0
}
5421
5422
/**
5423
 * @deprecated Don't use.
5424
 *
5425
 * @param parent  HTML parent element
5426
 * @param elt  HTML element
5427
 * @returns 1
5428
 */
5429
int
5430
htmlElementAllowedHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5431
0
                       const xmlChar* elt ATTRIBUTE_UNUSED) {
5432
0
    return(1);
5433
0
}
5434
5435
/**
5436
 * @deprecated Don't use.
5437
 *
5438
 * @param parent  HTML parent element
5439
 * @param elt  HTML element
5440
 * @returns HTML_VALID
5441
 */
5442
htmlStatus
5443
htmlElementStatusHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5444
0
                      const htmlElemDesc* elt ATTRIBUTE_UNUSED) {
5445
0
    return(HTML_VALID);
5446
0
}
5447
5448
/**
5449
 * @deprecated Don't use.
5450
 *
5451
 * @param elt  HTML element
5452
 * @param attr  HTML attribute
5453
 * @param legacy  whether to allow deprecated attributes
5454
 * @returns HTML_VALID
5455
 */
5456
htmlStatus
5457
htmlAttrAllowed(const htmlElemDesc* elt ATTRIBUTE_UNUSED,
5458
                const xmlChar* attr ATTRIBUTE_UNUSED,
5459
0
                int legacy ATTRIBUTE_UNUSED) {
5460
0
    return(HTML_VALID);
5461
0
}
5462
5463
/**
5464
 * @deprecated Don't use.
5465
 *
5466
 * @param node  an xmlNode in a tree
5467
 * @param legacy  whether to allow deprecated elements (YES is faster here
5468
 *  for Element nodes)
5469
 * @returns HTML_VALID
5470
 */
5471
htmlStatus
5472
htmlNodeStatus(xmlNode *node ATTRIBUTE_UNUSED,
5473
0
               int legacy ATTRIBUTE_UNUSED) {
5474
0
    return(HTML_VALID);
5475
0
}
5476
5477
/************************************************************************
5478
 *                  *
5479
 *  New set (2.6.0) of simpler and more flexible APIs   *
5480
 *                  *
5481
 ************************************************************************/
5482
5483
/**
5484
 * Reset a parser context
5485
 *
5486
 * Same as #xmlCtxtReset.
5487
 *
5488
 * @param ctxt  an HTML parser context
5489
 */
5490
void
5491
htmlCtxtReset(htmlParserCtxt *ctxt)
5492
11.3k
{
5493
11.3k
    xmlCtxtReset(ctxt);
5494
11.3k
}
5495
5496
static int
5497
htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
5498
22.5k
{
5499
22.5k
    int allMask;
5500
5501
22.5k
    if (ctxt == NULL)
5502
22
        return(-1);
5503
5504
22.4k
    allMask = HTML_PARSE_RECOVER |
5505
22.4k
              HTML_PARSE_HTML5 |
5506
22.4k
              HTML_PARSE_NODEFDTD |
5507
22.4k
              HTML_PARSE_NOERROR |
5508
22.4k
              HTML_PARSE_NOWARNING |
5509
22.4k
              HTML_PARSE_PEDANTIC |
5510
22.4k
              HTML_PARSE_NOBLANKS |
5511
22.4k
              HTML_PARSE_NONET |
5512
22.4k
              HTML_PARSE_NOIMPLIED |
5513
22.4k
              HTML_PARSE_COMPACT |
5514
22.4k
              HTML_PARSE_HUGE |
5515
22.4k
              HTML_PARSE_IGNORE_ENC |
5516
22.4k
              HTML_PARSE_BIG_LINES;
5517
5518
22.4k
    ctxt->options = (ctxt->options & keepMask) | (options & allMask);
5519
5520
    /*
5521
     * For some options, struct members are historically the source
5522
     * of truth. See xmlCtxtSetOptionsInternal.
5523
     */
5524
22.4k
    ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1;
5525
5526
    /*
5527
     * Recover from character encoding errors
5528
     */
5529
22.4k
    ctxt->recovery = 1;
5530
5531
    /*
5532
     * Changing SAX callbacks is a bad idea. This should be fixed.
5533
     */
5534
22.4k
    if (options & HTML_PARSE_NOBLANKS) {
5535
8.02k
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5536
8.02k
    }
5537
22.4k
    if (options & HTML_PARSE_HUGE) {
5538
8.78k
        if (ctxt->dict != NULL)
5539
8.78k
            xmlDictSetLimit(ctxt->dict, 0);
5540
8.78k
    }
5541
5542
    /*
5543
     * It would be useful to allow this feature.
5544
     */
5545
22.4k
    ctxt->dictNames = 0;
5546
5547
    /*
5548
     * Allow XML_PARSE_NOENT which many users set on the HTML parser.
5549
     */
5550
22.4k
    return(options & ~allMask & ~XML_PARSE_NOENT);
5551
22.5k
}
5552
5553
/**
5554
 * Applies the options to the parser context. Unset options are
5555
 * cleared.
5556
 *
5557
 * @since 2.14.0
5558
 *
5559
 * With older versions, you can use #htmlCtxtUseOptions.
5560
 *
5561
 * @param ctxt  an HTML parser context
5562
 * @param options  a bitmask of htmlParserOption values
5563
 * @returns 0 in case of success, the set of unknown or unimplemented options
5564
 *         in case of error.
5565
 */
5566
int
5567
htmlCtxtSetOptions(htmlParserCtxt *ctxt, int options)
5568
0
{
5569
0
    return(htmlCtxtSetOptionsInternal(ctxt, options, 0));
5570
0
}
5571
5572
/**
5573
 * Applies the options to the parser context. The following options
5574
 * are never cleared and can only be enabled:
5575
 *
5576
 * @deprecated Use #htmlCtxtSetOptions.
5577
 *
5578
 * - HTML_PARSE_NODEFDTD
5579
 * - HTML_PARSE_NOERROR
5580
 * - HTML_PARSE_NOWARNING
5581
 * - HTML_PARSE_NOIMPLIED
5582
 * - HTML_PARSE_COMPACT
5583
 * - HTML_PARSE_HUGE
5584
 * - HTML_PARSE_IGNORE_ENC
5585
 * - HTML_PARSE_BIG_LINES
5586
 *
5587
 * @param ctxt  an HTML parser context
5588
 * @param options  a combination of htmlParserOption values
5589
 * @returns 0 in case of success, the set of unknown or unimplemented options
5590
 *         in case of error.
5591
 */
5592
int
5593
htmlCtxtUseOptions(htmlParserCtxt *ctxt, int options)
5594
22.5k
{
5595
22.5k
    int keepMask;
5596
5597
    /*
5598
     * For historic reasons, some options can only be enabled.
5599
     */
5600
22.5k
    keepMask = HTML_PARSE_NODEFDTD |
5601
22.5k
               HTML_PARSE_NOERROR |
5602
22.5k
               HTML_PARSE_NOWARNING |
5603
22.5k
               HTML_PARSE_NOIMPLIED |
5604
22.5k
               HTML_PARSE_COMPACT |
5605
22.5k
               HTML_PARSE_HUGE |
5606
22.5k
               HTML_PARSE_IGNORE_ENC |
5607
22.5k
               HTML_PARSE_BIG_LINES;
5608
5609
22.5k
    return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask));
5610
22.5k
}
5611
5612
/**
5613
 * Parse an HTML document and return the resulting document tree.
5614
 *
5615
 * @since 2.13.0
5616
 *
5617
 * @param ctxt  an HTML parser context
5618
 * @param input  parser input
5619
 * @returns the resulting document tree or NULL
5620
 */
5621
xmlDoc *
5622
htmlCtxtParseDocument(htmlParserCtxt *ctxt, xmlParserInput *input)
5623
9.84k
{
5624
9.84k
    htmlDocPtr ret;
5625
5626
9.84k
    if ((ctxt == NULL) || (input == NULL)) {
5627
0
        xmlFatalErr(ctxt, XML_ERR_ARGUMENT, NULL);
5628
0
        xmlFreeInputStream(input);
5629
0
        return(NULL);
5630
0
    }
5631
5632
    /* assert(ctxt->inputNr == 0); */
5633
9.84k
    while (ctxt->inputNr > 0)
5634
0
        xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5635
5636
9.84k
    if (xmlCtxtPushInput(ctxt, input) < 0) {
5637
0
        xmlFreeInputStream(input);
5638
0
        return(NULL);
5639
0
    }
5640
5641
9.84k
    ctxt->html = INSERT_INITIAL;
5642
9.84k
    htmlParseDocument(ctxt);
5643
5644
9.84k
    ret = xmlCtxtGetDocument(ctxt);
5645
5646
    /* assert(ctxt->inputNr == 1); */
5647
19.6k
    while (ctxt->inputNr > 0)
5648
9.84k
        xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5649
5650
9.84k
    return(ret);
5651
9.84k
}
5652
5653
/**
5654
 * Convenience function to parse an HTML document from a zero-terminated
5655
 * string.
5656
 *
5657
 * See #htmlCtxtReadDoc for details.
5658
 *
5659
 * @param str  a pointer to a zero terminated string
5660
 * @param url  only used for error reporting (optoinal)
5661
 * @param encoding  the document encoding (optional)
5662
 * @param options  a combination of htmlParserOption values
5663
 * @returns the resulting document tree.
5664
 */
5665
xmlDoc *
5666
htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
5667
            int options)
5668
0
{
5669
0
    htmlParserCtxtPtr ctxt;
5670
0
    xmlParserInputPtr input;
5671
0
    htmlDocPtr doc = NULL;
5672
5673
0
    ctxt = htmlNewParserCtxt();
5674
0
    if (ctxt == NULL)
5675
0
        return(NULL);
5676
5677
0
    htmlCtxtUseOptions(ctxt, options);
5678
5679
0
    input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, encoding,
5680
0
                                      XML_INPUT_BUF_STATIC);
5681
5682
0
    if (input != NULL)
5683
0
        doc = htmlCtxtParseDocument(ctxt, input);
5684
5685
0
    htmlFreeParserCtxt(ctxt);
5686
0
    return(doc);
5687
0
}
5688
5689
/**
5690
 * Convenience function to parse an HTML file from the filesystem,
5691
 * the network or a global user-defined resource loader.
5692
 *
5693
 * See #htmlCtxtReadFile for details.
5694
 *
5695
 * @param filename  a file or URL
5696
 * @param encoding  the document encoding (optional)
5697
 * @param options  a combination of htmlParserOption values
5698
 * @returns the resulting document tree.
5699
 */
5700
xmlDoc *
5701
htmlReadFile(const char *filename, const char *encoding, int options)
5702
0
{
5703
0
    htmlParserCtxtPtr ctxt;
5704
0
    xmlParserInputPtr input;
5705
0
    htmlDocPtr doc = NULL;
5706
5707
0
    ctxt = htmlNewParserCtxt();
5708
0
    if (ctxt == NULL)
5709
0
        return(NULL);
5710
5711
0
    htmlCtxtUseOptions(ctxt, options);
5712
5713
0
    input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5714
5715
0
    if (input != NULL)
5716
0
        doc = htmlCtxtParseDocument(ctxt, input);
5717
5718
0
    htmlFreeParserCtxt(ctxt);
5719
0
    return(doc);
5720
0
}
5721
5722
/**
5723
 * Convenience function to parse an HTML document from memory.
5724
 * The input buffer must not contain any terminating null bytes.
5725
 *
5726
 * See #htmlCtxtReadMemory for details.
5727
 *
5728
 * @param buffer  a pointer to a char array
5729
 * @param size  the size of the array
5730
 * @param url  only used for error reporting (optional)
5731
 * @param encoding  the document encoding, or NULL
5732
 * @param options  a combination of htmlParserOption values
5733
 * @returns the resulting document tree
5734
 */
5735
xmlDoc *
5736
htmlReadMemory(const char *buffer, int size, const char *url,
5737
               const char *encoding, int options)
5738
0
{
5739
0
    htmlParserCtxtPtr ctxt;
5740
0
    xmlParserInputPtr input;
5741
0
    htmlDocPtr doc = NULL;
5742
5743
0
    if (size < 0)
5744
0
  return(NULL);
5745
5746
0
    ctxt = htmlNewParserCtxt();
5747
0
    if (ctxt == NULL)
5748
0
        return(NULL);
5749
5750
0
    htmlCtxtUseOptions(ctxt, options);
5751
5752
0
    input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding,
5753
0
                                      XML_INPUT_BUF_STATIC);
5754
5755
0
    if (input != NULL)
5756
0
        doc = htmlCtxtParseDocument(ctxt, input);
5757
5758
0
    htmlFreeParserCtxt(ctxt);
5759
0
    return(doc);
5760
0
}
5761
5762
/**
5763
 * Convenience function to parse an HTML document from a
5764
 * file descriptor.
5765
 *
5766
 * NOTE that the file descriptor will not be closed when the
5767
 * context is freed or reset.
5768
 *
5769
 * See #htmlCtxtReadFd for details.
5770
 *
5771
 * @param fd  an open file descriptor
5772
 * @param url  only used for error reporting (optional)
5773
 * @param encoding  the document encoding, or NULL
5774
 * @param options  a combination of htmlParserOption values
5775
 * @returns the resulting document tree
5776
 */
5777
xmlDoc *
5778
htmlReadFd(int fd, const char *url, const char *encoding, int options)
5779
0
{
5780
0
    htmlParserCtxtPtr ctxt;
5781
0
    xmlParserInputPtr input;
5782
0
    htmlDocPtr doc = NULL;
5783
5784
0
    ctxt = htmlNewParserCtxt();
5785
0
    if (ctxt == NULL)
5786
0
        return(NULL);
5787
5788
0
    htmlCtxtUseOptions(ctxt, options);
5789
5790
0
    input = xmlCtxtNewInputFromFd(ctxt, url, fd, encoding, 0);
5791
5792
0
    if (input != NULL)
5793
0
        doc = htmlCtxtParseDocument(ctxt, input);
5794
5795
0
    htmlFreeParserCtxt(ctxt);
5796
0
    return(doc);
5797
0
}
5798
5799
/**
5800
 * Convenience function to parse an HTML document from I/O functions
5801
 * and context.
5802
 *
5803
 * See #htmlCtxtReadIO for details.
5804
 *
5805
 * @param ioread  an I/O read function
5806
 * @param ioclose  an I/O close function (optional)
5807
 * @param ioctx  an I/O handler
5808
 * @param url  only used for error reporting (optional)
5809
 * @param encoding  the document encoding (optional)
5810
 * @param options  a combination of htmlParserOption values
5811
 * @returns the resulting document tree
5812
 */
5813
xmlDoc *
5814
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5815
          void *ioctx, const char *url, const char *encoding, int options)
5816
0
{
5817
0
    htmlParserCtxtPtr ctxt;
5818
0
    xmlParserInputPtr input;
5819
0
    htmlDocPtr doc = NULL;
5820
5821
0
    ctxt = htmlNewParserCtxt();
5822
0
    if (ctxt == NULL)
5823
0
        return (NULL);
5824
5825
0
    htmlCtxtUseOptions(ctxt, options);
5826
5827
0
    input = xmlCtxtNewInputFromIO(ctxt, url, ioread, ioclose, ioctx,
5828
0
                                  encoding, 0);
5829
5830
0
    if (input != NULL)
5831
0
        doc = htmlCtxtParseDocument(ctxt, input);
5832
5833
0
    htmlFreeParserCtxt(ctxt);
5834
0
    return(doc);
5835
0
}
5836
5837
/**
5838
 * Parse an HTML in-memory document and build a tree.
5839
 *
5840
 * See #htmlCtxtUseOptions for details.
5841
 *
5842
 * @param ctxt  an HTML parser context
5843
 * @param str  a pointer to a zero terminated string
5844
 * @param URL  only used for error reporting (optional)
5845
 * @param encoding  the document encoding (optional)
5846
 * @param options  a combination of htmlParserOption values
5847
 * @returns the resulting document tree
5848
 */
5849
xmlDoc *
5850
htmlCtxtReadDoc(xmlParserCtxt *ctxt, const xmlChar *str,
5851
                const char *URL, const char *encoding, int options)
5852
0
{
5853
0
    xmlParserInputPtr input;
5854
5855
0
    if (ctxt == NULL)
5856
0
        return (NULL);
5857
5858
0
    htmlCtxtReset(ctxt);
5859
0
    htmlCtxtUseOptions(ctxt, options);
5860
5861
0
    input = xmlCtxtNewInputFromString(ctxt, URL, (const char *) str,
5862
0
                                      encoding, 0);
5863
0
    if (input == NULL)
5864
0
        return(NULL);
5865
5866
0
    return(htmlCtxtParseDocument(ctxt, input));
5867
0
}
5868
5869
/**
5870
 * Parse an HTML file from the filesystem, the network or a
5871
 * user-defined resource loader.
5872
 *
5873
 * See #htmlCtxtUseOptions for details.
5874
 *
5875
 * @param ctxt  an HTML parser context
5876
 * @param filename  a file or URL
5877
 * @param encoding  the document encoding (optional)
5878
 * @param options  a combination of htmlParserOption values
5879
 * @returns the resulting document tree
5880
 */
5881
xmlDoc *
5882
htmlCtxtReadFile(xmlParserCtxt *ctxt, const char *filename,
5883
                const char *encoding, int options)
5884
1.45k
{
5885
1.45k
    xmlParserInputPtr input;
5886
5887
1.45k
    if (ctxt == NULL)
5888
0
        return (NULL);
5889
5890
1.45k
    htmlCtxtReset(ctxt);
5891
1.45k
    htmlCtxtUseOptions(ctxt, options);
5892
5893
1.45k
    input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5894
1.45k
    if (input == NULL)
5895
1.45k
        return(NULL);
5896
5897
0
    return(htmlCtxtParseDocument(ctxt, input));
5898
1.45k
}
5899
5900
/**
5901
 * Parse an HTML in-memory document and build a tree. The input buffer must
5902
 * not contain any terminating null bytes.
5903
 *
5904
 * See #htmlCtxtUseOptions for details.
5905
 *
5906
 * @param ctxt  an HTML parser context
5907
 * @param buffer  a pointer to a char array
5908
 * @param size  the size of the array
5909
 * @param URL  only used for error reporting (optional)
5910
 * @param encoding  the document encoding (optinal)
5911
 * @param options  a combination of htmlParserOption values
5912
 * @returns the resulting document tree
5913
 */
5914
xmlDoc *
5915
htmlCtxtReadMemory(xmlParserCtxt *ctxt, const char *buffer, int size,
5916
                  const char *URL, const char *encoding, int options)
5917
9.87k
{
5918
9.87k
    xmlParserInputPtr input;
5919
5920
9.87k
    if ((ctxt == NULL) || (size < 0))
5921
0
        return (NULL);
5922
5923
9.87k
    htmlCtxtReset(ctxt);
5924
9.87k
    htmlCtxtUseOptions(ctxt, options);
5925
5926
9.87k
    input = xmlCtxtNewInputFromMemory(ctxt, URL, buffer, size, encoding,
5927
9.87k
                                      XML_INPUT_BUF_STATIC);
5928
9.87k
    if (input == NULL)
5929
29
        return(NULL);
5930
5931
9.84k
    return(htmlCtxtParseDocument(ctxt, input));
5932
9.87k
}
5933
5934
/**
5935
 * Parse an HTML from a file descriptor and build a tree.
5936
 *
5937
 * See #htmlCtxtUseOptions for details.
5938
 *
5939
 * NOTE that the file descriptor will not be closed when the
5940
 * context is freed or reset.
5941
 *
5942
 * @param ctxt  an HTML parser context
5943
 * @param fd  an open file descriptor
5944
 * @param URL  only used for error reporting (optional)
5945
 * @param encoding  the document encoding (optinal)
5946
 * @param options  a combination of htmlParserOption values
5947
 * @returns the resulting document tree
5948
 */
5949
xmlDoc *
5950
htmlCtxtReadFd(xmlParserCtxt *ctxt, int fd,
5951
              const char *URL, const char *encoding, int options)
5952
0
{
5953
0
    xmlParserInputPtr input;
5954
5955
0
    if (ctxt == NULL)
5956
0
        return(NULL);
5957
5958
0
    htmlCtxtReset(ctxt);
5959
0
    htmlCtxtUseOptions(ctxt, options);
5960
5961
0
    input = xmlCtxtNewInputFromFd(ctxt, URL, fd, encoding, 0);
5962
0
    if (input == NULL)
5963
0
        return(NULL);
5964
5965
0
    return(htmlCtxtParseDocument(ctxt, input));
5966
0
}
5967
5968
/**
5969
 * Parse an HTML document from I/O functions and source and build a tree.
5970
 *
5971
 * See #htmlCtxtUseOptions for details.
5972
 *
5973
 * @param ctxt  an HTML parser context
5974
 * @param ioread  an I/O read function
5975
 * @param ioclose  an I/O close function
5976
 * @param ioctx  an I/O handler
5977
 * @param URL  the base URL to use for the document
5978
 * @param encoding  the document encoding, or NULL
5979
 * @param options  a combination of htmlParserOption values
5980
 * @returns the resulting document tree
5981
 */
5982
xmlDoc *
5983
htmlCtxtReadIO(xmlParserCtxt *ctxt, xmlInputReadCallback ioread,
5984
              xmlInputCloseCallback ioclose, void *ioctx,
5985
        const char *URL,
5986
              const char *encoding, int options)
5987
0
{
5988
0
    xmlParserInputPtr input;
5989
5990
0
    if (ctxt == NULL)
5991
0
        return (NULL);
5992
5993
0
    htmlCtxtReset(ctxt);
5994
0
    htmlCtxtUseOptions(ctxt, options);
5995
5996
0
    input = xmlCtxtNewInputFromIO(ctxt, URL, ioread, ioclose, ioctx,
5997
0
                                  encoding, 0);
5998
0
    if (input == NULL)
5999
0
        return(NULL);
6000
6001
0
    return(htmlCtxtParseDocument(ctxt, input));
6002
0
}
6003
6004
#endif /* LIBXML_HTML_ENABLED */