Coverage Report

Created: 2025-07-01 06:27

/src/libxml2/HTMLparser.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLparser.c : an HTML parser
3
 *
4
 * References:
5
 *   HTML Living Standard
6
 *     https://html.spec.whatwg.org/multipage/parsing.html
7
 *
8
 * Tokenization now conforms to HTML5. Tree construction still follows
9
 * a custom, non-standard implementation. See:
10
 *
11
 *     https://gitlab.gnome.org/GNOME/libxml2/-/issues/211
12
 *
13
 * See Copyright for the status of this software.
14
 *
15
 * Author: Daniel Veillard
16
 */
17
18
#define IN_LIBXML
19
#include "libxml.h"
20
#ifdef LIBXML_HTML_ENABLED
21
22
#include <string.h>
23
#include <ctype.h>
24
#include <stdlib.h>
25
26
#include <libxml/HTMLparser.h>
27
#include <libxml/xmlmemory.h>
28
#include <libxml/tree.h>
29
#include <libxml/parser.h>
30
#include <libxml/parserInternals.h>
31
#include <libxml/xmlerror.h>
32
#include <libxml/HTMLtree.h>
33
#include <libxml/entities.h>
34
#include <libxml/encoding.h>
35
#include <libxml/xmlIO.h>
36
#include <libxml/uri.h>
37
38
#include "private/buf.h"
39
#include "private/dict.h"
40
#include "private/enc.h"
41
#include "private/error.h"
42
#include "private/html.h"
43
#include "private/io.h"
44
#include "private/memory.h"
45
#include "private/parser.h"
46
#include "private/tree.h"
47
48
/*
 * Numeric limits and buffer sizes used by the HTML parser.
 */
#define HTML_MAX_NAMELEN 1000
#define HTML_MAX_ATTRS 100000000 /* 100 million */
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/*
 * ASCII character classification helpers.
 *
 * Each macro may evaluate its argument more than once, so the argument
 * must not have side effects.
 */

/* True for '0'..'9', 'a'..'f' and 'A'..'F' (OR-ing 0x20 folds the case). */
#define IS_HEX_DIGIT(c) \
    ((IS_ASCII_DIGIT(c)) || \
     ((((c) | 0x20) >= 'a') && (((c) | 0x20) <= 'f')))

/* True for 'A'..'Z'. */
#define IS_UPPER(c) \
    (((c) >= 'A') && ((c) <= 'Z'))

/* True when c is an ASCII letter or an ASCII digit. */
#define IS_ALNUM(c) \
    (IS_ASCII_LETTER(c) || IS_ASCII_DIGIT(c))
62
63
/*
 * Insertion modes stored in ctxt->html.
 *
 * The numeric values are ordered: htmlnamePush() compares the current
 * mode with "<" before raising it, so a later mode must have a larger
 * value than an earlier one.
 */
typedef enum {
    INSERT_INITIAL = 1,     /* compares below both other modes */
    INSERT_IN_HEAD = 3,     /* set when "head" is pushed */
    INSERT_IN_BODY = 10     /* set when "body" is pushed */
} htmlInsertMode;
68
69
/*
 * Bit set over the code points 0..63, tested with htmlMaskMatch().
 *
 * Element 0 carries one bit per code point 0..31 (bit index equals the
 * code point); element 1 carries one bit per code point 32..63 (bit
 * index equals the code point minus 32).
 */
typedef const unsigned htmlAsciiMask[2];

/* Matches '"'. */
static htmlAsciiMask MASK_DQ = {
    0,
    (1u << ('"' - 32)),
};

/* Matches '\''. */
static htmlAsciiMask MASK_SQ = {
    0,
    (1u << ('\'' - 32)),
};

/* Matches '>'. */
static htmlAsciiMask MASK_GT = {
    0,
    (1u << ('>' - 32)),
};

/* Matches '-'. */
static htmlAsciiMask MASK_DASH = {
    0,
    (1u << ('-' - 32)),
};

/* Matches TAB, LF, FF, CR, space and '>'. */
static htmlAsciiMask MASK_WS_GT = {
    (1u << 0x09) | (1u << 0x0A) | (1u << 0x0C) | (1u << 0x0D),
    (1u << (' ' - 32)) | (1u << ('>' - 32)),
};

/* Matches '"' and '>'. */
static htmlAsciiMask MASK_DQ_GT = {
    0,
    (1u << ('"' - 32)) | (1u << ('>' - 32)),
};

/* Matches '\'' and '>'. */
static htmlAsciiMask MASK_SQ_GT = {
    0,
    (1u << ('\'' - 32)) | (1u << ('>' - 32)),
};
99
100
static int htmlOmittedDefaultValue = 1;
101
102
static int
103
htmlParseElementInternal(htmlParserCtxtPtr ctxt);
104
105
/************************************************************************
106
 *                  *
107
 *    Some factorized error routines        *
108
 *                  *
109
 ************************************************************************/
110
111
/**
112
 * Handle an out-of-memory error
113
 *
114
 * @param ctxt  an HTML parser context
115
 */
116
static void
117
htmlErrMemory(xmlParserCtxtPtr ctxt)
118
834
{
119
834
    xmlCtxtErrMemory(ctxt);
120
834
}
121
122
/**
123
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
124
 *
125
 * @param ctxt  an HTML parser context
126
 * @param error  the error number
127
 * @param msg  the error message
128
 * @param str1  string infor
129
 * @param str2  string infor
130
 */
131
static void LIBXML_ATTR_FORMAT(3,0)
132
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
133
             const char *msg, const xmlChar *str1, const xmlChar *str2)
134
83.7k
{
135
83.7k
    xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
136
83.7k
               str1, str2, NULL, 0, msg, str1, str2);
137
83.7k
}
138
139
/************************************************************************
140
 *                  *
141
 *  Parser stacks related functions and macros    *
142
 *                  *
143
 ************************************************************************/
144
145
/**
146
 * Pushes a new element name on top of the name stack
147
 *
148
 * @param ctxt  an HTML parser context
149
 * @param value  the element name
150
 * @returns -1 in case of error, the index in the stack otherwise
151
 */
152
static int
153
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
154
449k
{
155
449k
    if ((ctxt->html < INSERT_IN_HEAD) && (xmlStrEqual(value, BAD_CAST "head")))
156
890
        ctxt->html = INSERT_IN_HEAD;
157
449k
    if ((ctxt->html < INSERT_IN_BODY) && (xmlStrEqual(value, BAD_CAST "body")))
158
6.87k
        ctxt->html = INSERT_IN_BODY;
159
449k
    if (ctxt->nameNr >= ctxt->nameMax) {
160
20.8k
        const xmlChar **tmp;
161
20.8k
        int newSize;
162
163
20.8k
        newSize = xmlGrowCapacity(ctxt->nameMax, sizeof(tmp[0]),
164
20.8k
                                  10, XML_MAX_ITEMS);
165
20.8k
        if (newSize < 0) {
166
0
            htmlErrMemory(ctxt);
167
0
            return (-1);
168
0
        }
169
20.8k
        tmp = xmlRealloc(ctxt->nameTab, newSize * sizeof(tmp[0]));
170
20.8k
        if (tmp == NULL) {
171
57
            htmlErrMemory(ctxt);
172
57
            return(-1);
173
57
        }
174
20.7k
        ctxt->nameTab = tmp;
175
20.7k
        ctxt->nameMax = newSize;
176
20.7k
    }
177
449k
    ctxt->nameTab[ctxt->nameNr] = value;
178
449k
    ctxt->name = value;
179
449k
    return (ctxt->nameNr++);
180
449k
}
181
/**
182
 * Pops the top element name from the name stack
183
 *
184
 * @param ctxt  an HTML parser context
185
 * @returns the name just removed
186
 */
187
static const xmlChar *
188
htmlnamePop(htmlParserCtxtPtr ctxt)
189
433k
{
190
433k
    const xmlChar *ret;
191
192
433k
    if (ctxt->nameNr <= 0)
193
0
        return (NULL);
194
433k
    ctxt->nameNr--;
195
433k
    if (ctxt->nameNr < 0)
196
0
        return (NULL);
197
433k
    if (ctxt->nameNr > 0)
198
352k
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
199
81.1k
    else
200
81.1k
        ctxt->name = NULL;
201
433k
    ret = ctxt->nameTab[ctxt->nameNr];
202
433k
    ctxt->nameTab[ctxt->nameNr] = NULL;
203
433k
    return (ret);
204
433k
}
205
206
/**
207
 * Pushes a new element name on top of the node info stack
208
 *
209
 * @param ctxt  an HTML parser context
210
 * @param value  the node info
211
 * @returns 0 in case of error, the index in the stack otherwise
212
 */
213
static int
214
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
215
0
{
216
0
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
217
0
        xmlParserNodeInfo *tmp;
218
0
        int newSize;
219
220
0
        newSize = xmlGrowCapacity(ctxt->nodeInfoMax, sizeof(tmp[0]),
221
0
                                  5, XML_MAX_ITEMS);
222
0
        if (newSize < 0) {
223
0
            htmlErrMemory(ctxt);
224
0
            return (0);
225
0
        }
226
0
        tmp = xmlRealloc(ctxt->nodeInfoTab, newSize * sizeof(tmp[0]));
227
0
        if (tmp == NULL) {
228
0
            htmlErrMemory(ctxt);
229
0
            return (0);
230
0
        }
231
0
        ctxt->nodeInfoTab = tmp;
232
0
        ctxt->nodeInfoMax = newSize;
233
0
    }
234
0
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
235
0
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
236
0
    return (ctxt->nodeInfoNr++);
237
0
}
238
239
/**
240
 * Pops the top element name from the node info stack
241
 *
242
 * @param ctxt  an HTML parser context
243
 * @returns 0 in case of error, the pointer to NodeInfo otherwise
244
 */
245
static htmlParserNodeInfo *
246
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
247
0
{
248
0
    if (ctxt->nodeInfoNr <= 0)
249
0
        return (NULL);
250
0
    ctxt->nodeInfoNr--;
251
0
    if (ctxt->nodeInfoNr < 0)
252
0
        return (NULL);
253
0
    if (ctxt->nodeInfoNr > 0)
254
0
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
255
0
    else
256
0
        ctxt->nodeInfo = NULL;
257
0
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
258
0
}
259
260
/*
 * Macros for accessing the input. They are meant to be used only by the
 * parser and are not exported.
 *
 * All of them are "dirty": they expand to expressions on a variable
 * named ctxt that must be in scope at the point of use.
 *
 *   CUR         the current input byte. It should only be compared
 *               against ASCII values, otherwise it would break when
 *               running with UTF-8 encoding.
 *   UPPER       the current input byte passed through toupper().
 *   NXT(n)      the n'th next input byte. Same as CUR, it should be used
 *               only to compare against ASCII based substrings.
 *   UPP(n)      the n'th next input byte passed through toupper(). Same
 *               as CUR, it should be used only to compare against ASCII
 *               based substrings.
 *   SKIP(n)     advance the input pointer and the column by n. It must
 *               only be used to skip ASCII strings without newlines,
 *               since the line counter is not touched.
 *   CUR_PTR     the current input pointer.
 *   BASE_PTR    the base pointer of the current input.
 *   SHRINK      call xmlParserShrink() when not parsing progressively,
 *               more than 2 * INPUT_CHUNK bytes lie before the current
 *               position and fewer than 2 * INPUT_CHUNK bytes remain.
 *   GROW        call xmlParserGrow() when not parsing progressively and
 *               fewer than INPUT_CHUNK bytes remain.
 *   SKIP_BLANKS skip whitespace via htmlSkipBlankChars().
 *
 * SHRINK and GROW expand to a complete "if" statement including its
 * terminating semicolon; wrap them in braces when they form the body of
 * another "if".
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
311
312
/**
313
 * Prescan to find encoding.
314
 *
315
 * Try to find an encoding in the current data available in the input
316
 * buffer.
317
 *
318
 * TODO: Implement HTML5 prescan algorithm.
319
 *
320
 * @param ctxt  the HTML parser context
321
 * @returns  an encoding string or NULL if not found
322
 */
323
static xmlChar *
324
0
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
325
0
    const xmlChar *start, *cur, *end;
326
0
    xmlChar *ret;
327
0
328
0
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
329
0
        (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
330
0
        return(NULL);
331
0
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
332
0
        return(NULL);
333
0
334
0
    start = ctxt->input->cur;
335
0
    end = ctxt->input->end;
336
0
    /* we also expect the input buffer to be zero terminated */
337
0
    if (*end != 0)
338
0
        return(NULL);
339
0
340
0
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
341
0
    if (cur == NULL)
342
0
        return(NULL);
343
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
344
0
    if (cur == NULL)
345
0
        return(NULL);
346
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
347
0
    if (cur == NULL)
348
0
        return(NULL);
349
0
    cur += 8;
350
0
    start = cur;
351
0
    while ((IS_ALNUM(*cur)) ||
352
0
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
353
0
           cur++;
354
0
    if (cur == start)
355
0
        return(NULL);
356
0
    ret = xmlStrndup(start, cur - start);
357
0
    if (ret == NULL)
358
0
        htmlErrMemory(ctxt);
359
0
    return(ret);
360
0
}
361
362
static int
363
120M
htmlMaskMatch(htmlAsciiMask mask, unsigned c) {
364
120M
    if (c >= 64)
365
88.4M
        return(0);
366
32.1M
    return((mask[c/32] >> (c & 31)) & 1);
367
120M
}
368
369
static int
370
htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len,
371
217M
                 int partial) {
372
217M
    unsigned c = str[0];
373
217M
    int size;
374
375
217M
    if (c < 0xC2) {
376
5.08M
        goto invalid;
377
212M
    } else if (c < 0xE0) {
378
52.0M
        if (len < 2)
379
10.4k
            goto incomplete;
380
52.0M
        if ((str[1] & 0xC0) != 0x80)
381
2.19M
            goto invalid;
382
49.8M
        size = 2;
383
160M
    } else if (c < 0xF0) {
384
159M
        unsigned v;
385
386
159M
        if (len < 3)
387
275
            goto incomplete;
388
389
159M
        v = str[1] << 8 | str[2]; /* hint to generate 16-bit load */
390
159M
        v |= c << 16;
391
392
159M
        if (((v & 0x00C0C0) != 0x008080) ||
393
159M
            ((v & 0x0F2000) == 0x000000) ||
394
159M
            ((v & 0x0F2000) == 0x0D2000))
395
45.4k
            goto invalid;
396
397
159M
        size = 3;
398
159M
    } else {
399
324k
        unsigned v;
400
401
324k
        if (len < 4)
402
1.61k
            goto incomplete;
403
404
323k
        v = c << 24 | str[1] << 16 | str[2] << 8 | str[3];
405
406
323k
        if (((v & 0x00C0C0C0) != 0x00808080) ||
407
323k
            (v < 0xF0900000) || (v >= 0xF4900000))
408
174k
            goto invalid;
409
410
148k
        size = 4;
411
148k
    }
412
413
209M
    return(size);
414
415
12.3k
incomplete:
416
12.3k
    if (partial)
417
11.1k
        return(0);
418
419
7.50M
invalid:
420
    /* Only report the first error */
421
7.50M
    if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
422
2.23k
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
423
2.23k
                     "Invalid bytes in character encoding", NULL, NULL);
424
2.23k
        ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
425
2.23k
    }
426
427
7.50M
    return(-1);
428
12.3k
}
429
430
/**
431
 * skip all blanks character found at that point in the input streams.
432
 *
433
 * @param ctxt  the HTML parser context
434
 * @returns the number of space chars skipped
435
 */
436
437
static int
438
9.64M
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
439
9.64M
    const xmlChar *cur = ctxt->input->cur;
440
9.64M
    size_t avail = ctxt->input->end - cur;
441
9.64M
    int res = 0;
442
9.64M
    int line = ctxt->input->line;
443
9.64M
    int col = ctxt->input->col;
444
445
29.1M
    while (!PARSER_STOPPED(ctxt)) {
446
29.1M
        if (avail == 0) {
447
9.50k
            ctxt->input->cur = cur;
448
9.50k
            GROW;
449
9.50k
            cur = ctxt->input->cur;
450
9.50k
            avail = ctxt->input->end - cur;
451
452
9.50k
            if (avail == 0)
453
7.88k
                break;
454
9.50k
        }
455
456
29.1M
        if (*cur == '\n') {
457
3.29M
            line++;
458
3.29M
            col = 1;
459
25.8M
        } else if (IS_WS_HTML(*cur)) {
460
16.2M
            col++;
461
16.2M
        } else {
462
9.63M
            break;
463
9.63M
        }
464
465
19.5M
        cur += 1;
466
19.5M
        avail -= 1;
467
468
19.5M
  if (res < INT_MAX)
469
19.5M
      res++;
470
19.5M
    }
471
472
9.64M
    ctxt->input->cur = cur;
473
9.64M
    ctxt->input->line = line;
474
9.64M
    ctxt->input->col = col;
475
476
9.64M
    if (res > 8)
477
52.2k
        GROW;
478
479
9.64M
    return(res);
480
9.64M
}
481
482
483
484
/************************************************************************
485
 *                  *
486
 *  The list of HTML elements and their properties    *
487
 *                  *
488
 ************************************************************************/
489
490
/*
491
 *  Start Tag: 1 means the start tag can be omitted
492
 *  End Tag:   1 means the end tag can be omitted
493
 *             2 means it's forbidden (empty elements)
494
 *             3 means the tag is stylistic and should be closed easily
495
 *  Depr:      this element is deprecated
496
 *  DTD:       1 means that this element is valid only in the Loose DTD
497
 *             2 means that this element is valid only in the Frameset DTD
498
 *
499
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
500
 */
501
502
static const htmlElemDesc
503
html40ElementTable[] = {
504
{ "a",    0, 0, 0, 0, 0, 0, 1, "anchor ",
505
  NULL, NULL, NULL, NULL, NULL,
506
  0
507
},
508
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
509
  NULL, NULL, NULL, NULL, NULL,
510
  0
511
},
512
{ "acronym",  0, 0, 0, 0, 0, 0, 1, "",
513
  NULL, NULL, NULL, NULL, NULL,
514
  0
515
},
516
{ "address",  0, 0, 0, 0, 0, 0, 0, "information on author ",
517
  NULL, NULL, NULL, NULL, NULL,
518
  0
519
},
520
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
521
  NULL, NULL, NULL, NULL, NULL,
522
  0
523
},
524
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
525
  NULL, NULL, NULL, NULL, NULL,
526
  0
527
},
528
{ "b",    0, 3, 0, 0, 0, 0, 1, "bold text style",
529
  NULL, NULL, NULL, NULL, NULL,
530
  0
531
},
532
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
533
  NULL, NULL, NULL, NULL, NULL,
534
  0
535
},
536
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
537
  NULL, NULL, NULL, NULL, NULL,
538
  0
539
},
540
{ "bdo",  0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
541
  NULL, NULL, NULL, NULL, NULL,
542
  0
543
},
544
{ "bgsound",  0, 0, 2, 1, 0, 0, 0, "",
545
  NULL, NULL, NULL, NULL, NULL,
546
  0
547
},
548
{ "big",  0, 3, 0, 0, 0, 0, 1, "large text style",
549
  NULL, NULL, NULL, NULL, NULL,
550
  0
551
},
552
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
553
  NULL, NULL, NULL, NULL, NULL,
554
  0
555
},
556
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
557
  NULL, NULL, NULL, NULL, NULL,
558
  0
559
},
560
{ "br",   0, 2, 2, 1, 0, 0, 1, "forced line break ",
561
  NULL, NULL, NULL, NULL, NULL,
562
  0
563
},
564
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
565
  NULL, NULL, NULL, NULL, NULL,
566
  0
567
},
568
{ "caption",  0, 0, 0, 0, 0, 0, 0, "table caption ",
569
  NULL, NULL, NULL, NULL, NULL,
570
  0
571
},
572
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
573
  NULL, NULL, NULL, NULL, NULL,
574
  0
575
},
576
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
577
  NULL, NULL, NULL, NULL, NULL,
578
  0
579
},
580
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
581
  NULL, NULL, NULL, NULL, NULL,
582
  0
583
},
584
{ "col",  0, 2, 2, 1, 0, 0, 0, "table column ",
585
  NULL, NULL, NULL, NULL, NULL,
586
  0
587
},
588
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
589
  NULL, NULL, NULL, NULL, NULL,
590
  0
591
},
592
{ "dd",   0, 1, 0, 0, 0, 0, 0, "definition description ",
593
  NULL, NULL, NULL, NULL, NULL,
594
  0
595
},
596
{ "del",  0, 0, 0, 0, 0, 0, 2, "deleted text ",
597
  NULL, NULL, NULL, NULL, NULL,
598
  0
599
},
600
{ "dfn",  0, 0, 0, 0, 0, 0, 1, "instance definition",
601
  NULL, NULL, NULL, NULL, NULL,
602
  0
603
},
604
{ "dir",  0, 0, 0, 0, 1, 1, 0, "directory list",
605
  NULL, NULL, NULL, NULL, NULL,
606
  0
607
},
608
{ "div",  0, 0, 0, 0, 0, 0, 0, "generic language/style container",
609
  NULL, NULL, NULL, NULL, NULL,
610
  0
611
},
612
{ "dl",   0, 0, 0, 0, 0, 0, 0, "definition list ",
613
  NULL, NULL, NULL, NULL, NULL,
614
  0
615
},
616
{ "dt",   0, 1, 0, 0, 0, 0, 0, "definition term ",
617
  NULL, NULL, NULL, NULL, NULL,
618
  0
619
},
620
{ "em",   0, 3, 0, 0, 0, 0, 1, "emphasis",
621
  NULL, NULL, NULL, NULL, NULL,
622
  0
623
},
624
{ "embed",  0, 1, 2, 1, 1, 1, 1, "generic embedded object ",
625
  NULL, NULL, NULL, NULL, NULL,
626
  0
627
},
628
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
629
  NULL, NULL, NULL, NULL, NULL,
630
  0
631
},
632
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
633
  NULL, NULL, NULL, NULL, NULL,
634
  0
635
},
636
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
637
  NULL, NULL, NULL, NULL, NULL,
638
  0
639
},
640
{ "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
641
  NULL, NULL, NULL, NULL, NULL,
642
  0
643
},
644
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
645
  NULL, NULL, NULL, NULL, NULL,
646
  0
647
},
648
{ "h1",   0, 0, 0, 0, 0, 0, 0, "heading ",
649
  NULL, NULL, NULL, NULL, NULL,
650
  0
651
},
652
{ "h2",   0, 0, 0, 0, 0, 0, 0, "heading ",
653
  NULL, NULL, NULL, NULL, NULL,
654
  0
655
},
656
{ "h3",   0, 0, 0, 0, 0, 0, 0, "heading ",
657
  NULL, NULL, NULL, NULL, NULL,
658
  0
659
},
660
{ "h4",   0, 0, 0, 0, 0, 0, 0, "heading ",
661
  NULL, NULL, NULL, NULL, NULL,
662
  0
663
},
664
{ "h5",   0, 0, 0, 0, 0, 0, 0, "heading ",
665
  NULL, NULL, NULL, NULL, NULL,
666
  0
667
},
668
{ "h6",   0, 0, 0, 0, 0, 0, 0, "heading ",
669
  NULL, NULL, NULL, NULL, NULL,
670
  0
671
},
672
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
673
  NULL, NULL, NULL, NULL, NULL,
674
  0
675
},
676
{ "hr",   0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
677
  NULL, NULL, NULL, NULL, NULL,
678
  0
679
},
680
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
681
  NULL, NULL, NULL, NULL, NULL,
682
  0
683
},
684
{ "i",    0, 3, 0, 0, 0, 0, 1, "italic text style",
685
  NULL, NULL, NULL, NULL, NULL,
686
  0
687
},
688
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
689
  NULL, NULL, NULL, NULL, NULL,
690
  DATA_RAWTEXT
691
},
692
{ "img",  0, 2, 2, 1, 0, 0, 1, "embedded image ",
693
  NULL, NULL, NULL, NULL, NULL,
694
  0
695
},
696
{ "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
697
  NULL, NULL, NULL, NULL, NULL,
698
  0
699
},
700
{ "ins",  0, 0, 0, 0, 0, 0, 2, "inserted text",
701
  NULL, NULL, NULL, NULL, NULL,
702
  0
703
},
704
{ "isindex",  0, 2, 2, 1, 1, 1, 0, "single line prompt ",
705
  NULL, NULL, NULL, NULL, NULL,
706
  0
707
},
708
{ "kbd",  0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
709
  NULL, NULL, NULL, NULL, NULL,
710
  0
711
},
712
{ "keygen", 0, 0, 2, 1, 0, 0, 0, "",
713
  NULL, NULL, NULL, NULL, NULL,
714
  0
715
},
716
{ "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
717
  NULL, NULL, NULL, NULL, NULL,
718
  0
719
},
720
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
721
  NULL, NULL, NULL, NULL, NULL,
722
  0
723
},
724
{ "li",   0, 1, 1, 0, 0, 0, 0, "list item ",
725
  NULL, NULL, NULL, NULL, NULL,
726
  0
727
},
728
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
729
  NULL, NULL, NULL, NULL, NULL,
730
  0
731
},
732
{ "map",  0, 0, 0, 0, 0, 0, 2, "client-side image map ",
733
  NULL, NULL, NULL, NULL, NULL,
734
  0
735
},
736
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
737
  NULL, NULL, NULL, NULL, NULL,
738
  0
739
},
740
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
741
  NULL, NULL, NULL, NULL, NULL,
742
  0
743
},
744
{ "noembed",  0, 0, 0, 0, 0, 0, 0, "",
745
  NULL, NULL, NULL, NULL, NULL,
746
  DATA_RAWTEXT
747
},
748
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
749
  NULL, NULL, NULL, NULL, NULL,
750
  DATA_RAWTEXT
751
},
752
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
753
  NULL, NULL, NULL, NULL, NULL,
754
  0
755
},
756
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
757
  NULL, NULL, NULL, NULL, NULL,
758
  0
759
},
760
{ "ol",   0, 0, 0, 0, 0, 0, 0, "ordered list ",
761
  NULL, NULL, NULL, NULL, NULL,
762
  0
763
},
764
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
765
  NULL, NULL, NULL, NULL, NULL,
766
  0
767
},
768
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
769
  NULL, NULL, NULL, NULL, NULL,
770
  0
771
},
772
{ "p",    0, 1, 0, 0, 0, 0, 0, "paragraph ",
773
  NULL, NULL, NULL, NULL, NULL,
774
  0
775
},
776
{ "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
777
  NULL, NULL, NULL, NULL, NULL,
778
  0
779
},
780
{ "plaintext",  0, 0, 0, 0, 0, 0, 0, "",
781
  NULL, NULL, NULL, NULL, NULL,
782
  DATA_PLAINTEXT
783
},
784
{ "pre",  0, 0, 0, 0, 0, 0, 0, "preformatted text ",
785
  NULL, NULL, NULL, NULL, NULL,
786
  0
787
},
788
{ "q",    0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
789
  NULL, NULL, NULL, NULL, NULL,
790
  0
791
},
792
{ "s",    0, 3, 0, 0, 1, 1, 1, "strike-through text style",
793
  NULL, NULL, NULL, NULL, NULL,
794
  0
795
},
796
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
797
  NULL, NULL, NULL, NULL, NULL,
798
  0
799
},
800
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
801
  NULL, NULL, NULL, NULL, NULL,
802
  DATA_SCRIPT
803
},
804
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
805
  NULL, NULL, NULL, NULL, NULL,
806
  0
807
},
808
{ "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
809
  NULL, NULL, NULL, NULL, NULL,
810
  0
811
},
812
{ "source", 0, 0, 2, 1, 0, 0, 0, "",
813
  NULL, NULL, NULL, NULL, NULL,
814
  0
815
},
816
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
817
  NULL, NULL, NULL, NULL, NULL,
818
  0
819
},
820
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
821
  NULL, NULL, NULL, NULL, NULL,
822
  0
823
},
824
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
825
  NULL, NULL, NULL, NULL, NULL,
826
  0
827
},
828
{ "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
829
  NULL, NULL, NULL, NULL, NULL,
830
  DATA_RAWTEXT
831
},
832
{ "sub",  0, 3, 0, 0, 0, 0, 1, "subscript",
833
  NULL, NULL, NULL, NULL, NULL,
834
  0
835
},
836
{ "sup",  0, 3, 0, 0, 0, 0, 1, "superscript ",
837
  NULL, NULL, NULL, NULL, NULL,
838
  0
839
},
840
{ "table",  0, 0, 0, 0, 0, 0, 0, "",
841
  NULL, NULL, NULL, NULL, NULL,
842
  0
843
},
844
{ "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
845
  NULL, NULL, NULL, NULL, NULL,
846
  0
847
},
848
{ "td",   0, 0, 0, 0, 0, 0, 0, "table data cell",
849
  NULL, NULL, NULL, NULL, NULL,
850
  0
851
},
852
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
853
  NULL, NULL, NULL, NULL, NULL,
854
  DATA_RCDATA
855
},
856
{ "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
857
  NULL, NULL, NULL, NULL, NULL,
858
  0
859
},
860
{ "th",   0, 1, 0, 0, 0, 0, 0, "table header cell",
861
  NULL, NULL, NULL, NULL, NULL,
862
  0
863
},
864
{ "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
865
  NULL, NULL, NULL, NULL, NULL,
866
  0
867
},
868
{ "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
869
  NULL, NULL, NULL, NULL, NULL,
870
  DATA_RCDATA
871
},
872
{ "tr",   0, 0, 0, 0, 0, 0, 0, "table row ",
873
  NULL, NULL, NULL, NULL, NULL,
874
  0
875
},
876
{ "track",  0, 0, 2, 1, 0, 0, 0, "",
877
  NULL, NULL, NULL, NULL, NULL,
878
  0
879
},
880
{ "tt",   0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
881
  NULL, NULL, NULL, NULL, NULL,
882
  0
883
},
884
{ "u",    0, 3, 0, 0, 1, 1, 1, "underlined text style",
885
  NULL, NULL, NULL, NULL, NULL,
886
  0
887
},
888
{ "ul",   0, 0, 0, 0, 0, 0, 0, "unordered list ",
889
  NULL, NULL, NULL, NULL, NULL,
890
  0
891
},
892
{ "var",  0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
893
  NULL, NULL, NULL, NULL, NULL,
894
  0
895
},
896
{ "wbr",  0, 0, 2, 1, 0, 0, 0, "",
897
  NULL, NULL, NULL, NULL, NULL,
898
  0
899
},
900
{ "xmp",  0, 0, 0, 0, 0, 0, 1, "",
901
  NULL, NULL, NULL, NULL, NULL,
902
  DATA_RAWTEXT
903
}
904
};
905
906
/*
 * One (oldTag, newTag) pair of element names.
 *
 * NOTE(review): presumably oldTag is the currently open element and
 * newTag the element whose start tag implies that oldTag ends - confirm
 * against the code that looks entries up.
 */
typedef struct {
    const char *oldTag;     /* first element name of the pair */
    const char *newTag;     /* second element name of the pair */
} htmlStartCloseEntry;
910
911
/*
912
 * start tags that imply the end of current element
913
 */
914
static const htmlStartCloseEntry htmlStartClose[] = {
915
    { "a", "a" },
916
    { "a", "fieldset" },
917
    { "a", "table" },
918
    { "a", "td" },
919
    { "a", "th" },
920
    { "address", "dd" },
921
    { "address", "dl" },
922
    { "address", "dt" },
923
    { "address", "form" },
924
    { "address", "li" },
925
    { "address", "ul" },
926
    { "b", "center" },
927
    { "b", "p" },
928
    { "b", "td" },
929
    { "b", "th" },
930
    { "big", "p" },
931
    { "caption", "col" },
932
    { "caption", "colgroup" },
933
    { "caption", "tbody" },
934
    { "caption", "tfoot" },
935
    { "caption", "thead" },
936
    { "caption", "tr" },
937
    { "col", "col" },
938
    { "col", "colgroup" },
939
    { "col", "tbody" },
940
    { "col", "tfoot" },
941
    { "col", "thead" },
942
    { "col", "tr" },
943
    { "colgroup", "colgroup" },
944
    { "colgroup", "tbody" },
945
    { "colgroup", "tfoot" },
946
    { "colgroup", "thead" },
947
    { "colgroup", "tr" },
948
    { "dd", "dt" },
949
    { "dir", "dd" },
950
    { "dir", "dl" },
951
    { "dir", "dt" },
952
    { "dir", "form" },
953
    { "dir", "ul" },
954
    { "dl", "form" },
955
    { "dl", "li" },
956
    { "dt", "dd" },
957
    { "dt", "dl" },
958
    { "font", "center" },
959
    { "font", "td" },
960
    { "font", "th" },
961
    { "form", "form" },
962
    { "h1", "fieldset" },
963
    { "h1", "form" },
964
    { "h1", "li" },
965
    { "h1", "p" },
966
    { "h1", "table" },
967
    { "h2", "fieldset" },
968
    { "h2", "form" },
969
    { "h2", "li" },
970
    { "h2", "p" },
971
    { "h2", "table" },
972
    { "h3", "fieldset" },
973
    { "h3", "form" },
974
    { "h3", "li" },
975
    { "h3", "p" },
976
    { "h3", "table" },
977
    { "h4", "fieldset" },
978
    { "h4", "form" },
979
    { "h4", "li" },
980
    { "h4", "p" },
981
    { "h4", "table" },
982
    { "h5", "fieldset" },
983
    { "h5", "form" },
984
    { "h5", "li" },
985
    { "h5", "p" },
986
    { "h5", "table" },
987
    { "h6", "fieldset" },
988
    { "h6", "form" },
989
    { "h6", "li" },
990
    { "h6", "p" },
991
    { "h6", "table" },
992
    { "head", "a" },
993
    { "head", "abbr" },
994
    { "head", "acronym" },
995
    { "head", "address" },
996
    { "head", "b" },
997
    { "head", "bdo" },
998
    { "head", "big" },
999
    { "head", "blockquote" },
1000
    { "head", "body" },
1001
    { "head", "br" },
1002
    { "head", "center" },
1003
    { "head", "cite" },
1004
    { "head", "code" },
1005
    { "head", "dd" },
1006
    { "head", "dfn" },
1007
    { "head", "dir" },
1008
    { "head", "div" },
1009
    { "head", "dl" },
1010
    { "head", "dt" },
1011
    { "head", "em" },
1012
    { "head", "fieldset" },
1013
    { "head", "font" },
1014
    { "head", "form" },
1015
    { "head", "frameset" },
1016
    { "head", "h1" },
1017
    { "head", "h2" },
1018
    { "head", "h3" },
1019
    { "head", "h4" },
1020
    { "head", "h5" },
1021
    { "head", "h6" },
1022
    { "head", "hr" },
1023
    { "head", "i" },
1024
    { "head", "iframe" },
1025
    { "head", "img" },
1026
    { "head", "kbd" },
1027
    { "head", "li" },
1028
    { "head", "listing" },
1029
    { "head", "map" },
1030
    { "head", "menu" },
1031
    { "head", "ol" },
1032
    { "head", "p" },
1033
    { "head", "pre" },
1034
    { "head", "q" },
1035
    { "head", "s" },
1036
    { "head", "samp" },
1037
    { "head", "small" },
1038
    { "head", "span" },
1039
    { "head", "strike" },
1040
    { "head", "strong" },
1041
    { "head", "sub" },
1042
    { "head", "sup" },
1043
    { "head", "table" },
1044
    { "head", "tt" },
1045
    { "head", "u" },
1046
    { "head", "ul" },
1047
    { "head", "var" },
1048
    { "head", "xmp" },
1049
    { "hr", "form" },
1050
    { "i", "center" },
1051
    { "i", "p" },
1052
    { "i", "td" },
1053
    { "i", "th" },
1054
    { "legend", "fieldset" },
1055
    { "li", "li" },
1056
    { "link", "body" },
1057
    { "link", "frameset" },
1058
    { "listing", "dd" },
1059
    { "listing", "dl" },
1060
    { "listing", "dt" },
1061
    { "listing", "fieldset" },
1062
    { "listing", "form" },
1063
    { "listing", "li" },
1064
    { "listing", "table" },
1065
    { "listing", "ul" },
1066
    { "menu", "dd" },
1067
    { "menu", "dl" },
1068
    { "menu", "dt" },
1069
    { "menu", "form" },
1070
    { "menu", "ul" },
1071
    { "ol", "form" },
1072
    { "option", "optgroup" },
1073
    { "option", "option" },
1074
    { "p", "address" },
1075
    { "p", "blockquote" },
1076
    { "p", "body" },
1077
    { "p", "caption" },
1078
    { "p", "center" },
1079
    { "p", "col" },
1080
    { "p", "colgroup" },
1081
    { "p", "dd" },
1082
    { "p", "dir" },
1083
    { "p", "div" },
1084
    { "p", "dl" },
1085
    { "p", "dt" },
1086
    { "p", "fieldset" },
1087
    { "p", "form" },
1088
    { "p", "frameset" },
1089
    { "p", "h1" },
1090
    { "p", "h2" },
1091
    { "p", "h3" },
1092
    { "p", "h4" },
1093
    { "p", "h5" },
1094
    { "p", "h6" },
1095
    { "p", "head" },
1096
    { "p", "hr" },
1097
    { "p", "li" },
1098
    { "p", "listing" },
1099
    { "p", "menu" },
1100
    { "p", "ol" },
1101
    { "p", "p" },
1102
    { "p", "pre" },
1103
    { "p", "table" },
1104
    { "p", "tbody" },
1105
    { "p", "td" },
1106
    { "p", "tfoot" },
1107
    { "p", "th" },
1108
    { "p", "title" },
1109
    { "p", "tr" },
1110
    { "p", "ul" },
1111
    { "p", "xmp" },
1112
    { "pre", "dd" },
1113
    { "pre", "dl" },
1114
    { "pre", "dt" },
1115
    { "pre", "fieldset" },
1116
    { "pre", "form" },
1117
    { "pre", "li" },
1118
    { "pre", "table" },
1119
    { "pre", "ul" },
1120
    { "s", "p" },
1121
    { "script", "noscript" },
1122
    { "small", "p" },
1123
    { "span", "td" },
1124
    { "span", "th" },
1125
    { "strike", "p" },
1126
    { "style", "body" },
1127
    { "style", "frameset" },
1128
    { "tbody", "tbody" },
1129
    { "tbody", "tfoot" },
1130
    { "td", "tbody" },
1131
    { "td", "td" },
1132
    { "td", "tfoot" },
1133
    { "td", "th" },
1134
    { "td", "tr" },
1135
    { "tfoot", "tbody" },
1136
    { "th", "tbody" },
1137
    { "th", "td" },
1138
    { "th", "tfoot" },
1139
    { "th", "th" },
1140
    { "th", "tr" },
1141
    { "thead", "tbody" },
1142
    { "thead", "tfoot" },
1143
    { "title", "body" },
1144
    { "title", "frameset" },
1145
    { "tr", "tbody" },
1146
    { "tr", "tfoot" },
1147
    { "tr", "tr" },
1148
    { "tt", "p" },
1149
    { "u", "p" },
1150
    { "u", "td" },
1151
    { "u", "th" },
1152
    { "ul", "address" },
1153
    { "ul", "form" },
1154
    { "ul", "menu" },
1155
    { "ul", "pre" },
1156
    { "xmp", "dd" },
1157
    { "xmp", "dl" },
1158
    { "xmp", "dt" },
1159
    { "xmp", "fieldset" },
1160
    { "xmp", "form" },
1161
    { "xmp", "li" },
1162
    { "xmp", "table" },
1163
    { "xmp", "ul" }
1164
};
1165
1166
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check #htmlIsScriptAttribute since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset",
    "onchange", "onselect"
};
1191
1192
/*
 * End-tag priorities, used by the HTML parser to recover from broken
 * pages. Each element name is given a priority; when an unexpected
 * end tag shows up, it may only close open elements whose priority
 * is lower than or equal to its own.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* terminator: default priority */
};
1219
1220
/************************************************************************
1221
 *                  *
1222
 *  functions to handle HTML specific data      *
1223
 *                  *
1224
 ************************************************************************/
1225
1226
static void
1227
374k
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
1228
    /*
1229
     * Capture end position and add node
1230
     */
1231
374k
    if ( ctxt->node != NULL && ctxt->record_info ) {
1232
0
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
1233
0
                                (CUR_PTR - ctxt->input->base);
1234
0
       ctxt->nodeInfo->end_line = ctxt->input->line;
1235
0
       ctxt->nodeInfo->node = ctxt->node;
1236
0
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
1237
0
       htmlNodeInfoPop(ctxt);
1238
0
    }
1239
374k
}
1240
1241
/**
 * Does nothing.
 *
 * @deprecated This is a no-op.
 */
void
htmlInitAutoClose(void) {
    /* intentionally empty */
}
1247
1248
static int
1249
7.85M
htmlCompareTags(const void *key, const void *member) {
1250
7.85M
    const xmlChar *tag = (const xmlChar *) key;
1251
7.85M
    const htmlElemDesc *desc = (const htmlElemDesc *) member;
1252
1253
7.85M
    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1254
7.85M
}
1255
1256
/**
1257
 * Lookup the HTML tag in the ElementTable
1258
 *
1259
 * @deprecated Only supports HTML 4.
1260
 *
1261
 * @param tag  The tag name in lowercase
1262
 * @returns the related htmlElemDesc or NULL if not found.
1263
 */
1264
const htmlElemDesc *
1265
1.18M
htmlTagLookup(const xmlChar *tag) {
1266
1.18M
    if (tag == NULL)
1267
416
        return(NULL);
1268
1269
1.18M
    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1270
1.18M
                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1271
1.18M
                sizeof(htmlElemDesc), htmlCompareTags));
1272
1.18M
}
1273
1274
/**
1275
 * @param name  The name of the element to look up the priority for.
1276
 * @returns value: The "endtag" priority.
1277
 **/
1278
static int
1279
258k
htmlGetEndPriority (const xmlChar *name) {
1280
258k
    int i = 0;
1281
1282
3.06M
    while ((htmlEndPriority[i].name != NULL) &&
1283
3.06M
     (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1284
2.81M
  i++;
1285
1286
258k
    return(htmlEndPriority[i].priority);
1287
258k
}
1288
1289
1290
static int
1291
3.41M
htmlCompareStartClose(const void *vkey, const void *member) {
1292
3.41M
    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1293
3.41M
    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1294
3.41M
    int ret;
1295
1296
3.41M
    ret = strcmp(key->oldTag, entry->oldTag);
1297
3.41M
    if (ret == 0)
1298
255k
        ret = strcmp(key->newTag, entry->newTag);
1299
1300
3.41M
    return(ret);
1301
3.41M
}
1302
1303
/**
1304
 * Checks whether the new tag is one of the registered valid tags for
1305
 * closing old.
1306
 *
1307
 * @param newtag  The new tag name
1308
 * @param oldtag  The old tag name
1309
 * @returns 0 if no, 1 if yes.
1310
 */
1311
static int
1312
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1313
435k
{
1314
435k
    htmlStartCloseEntry key;
1315
435k
    void *res;
1316
1317
435k
    key.oldTag = (const char *) oldtag;
1318
435k
    key.newTag = (const char *) newtag;
1319
435k
    res = bsearch(&key, htmlStartClose,
1320
435k
            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1321
435k
            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1322
435k
    return(res != NULL);
1323
435k
}
1324
1325
/**
1326
 * The HTML DTD allows an ending tag to implicitly close other tags.
1327
 *
1328
 * @param ctxt  an HTML parser context
1329
 * @param newtag  The new tag name
1330
 */
1331
static void
1332
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1333
24.7k
{
1334
24.7k
    const htmlElemDesc *info;
1335
24.7k
    int i, priority;
1336
1337
24.7k
    if (ctxt->options & HTML_PARSE_HTML5)
1338
0
        return;
1339
1340
24.7k
    priority = htmlGetEndPriority(newtag);
1341
1342
257k
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1343
1344
257k
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1345
23.4k
            break;
1346
        /*
1347
         * A misplaced endtag can only close elements with lower
1348
         * or equal priority, so if we find an element with higher
1349
         * priority before we find an element with
1350
         * matching name, we just ignore this endtag
1351
         */
1352
233k
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1353
1.35k
            return;
1354
233k
    }
1355
23.4k
    if (i < 0)
1356
0
        return;
1357
1358
191k
    while (!xmlStrEqual(newtag, ctxt->name)) {
1359
167k
        info = htmlTagLookup(ctxt->name);
1360
167k
        if ((info != NULL) && (info->endTag == 3)) {
1361
7.81k
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1362
7.81k
                   "Opening and ending tag mismatch: %s and %s\n",
1363
7.81k
       newtag, ctxt->name);
1364
7.81k
        }
1365
167k
  htmlParserFinishElementParsing(ctxt);
1366
167k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1367
167k
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1368
167k
  htmlnamePop(ctxt);
1369
167k
    }
1370
23.4k
}
1371
1372
/**
1373
 * Close all remaining tags at the end of the stream
1374
 *
1375
 * @param ctxt  an HTML parser context
1376
 */
1377
static void
1378
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1379
17.7k
{
1380
17.7k
    int i;
1381
1382
17.7k
    if (ctxt->options & HTML_PARSE_HTML5)
1383
6.79k
        return;
1384
1385
10.9k
    if (ctxt->nameNr == 0)
1386
3.04k
        return;
1387
132k
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1388
124k
  htmlParserFinishElementParsing(ctxt);
1389
124k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1390
124k
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1391
124k
  htmlnamePop(ctxt);
1392
124k
    }
1393
7.86k
}
1394
1395
/**
1396
 * The HTML DTD allows a tag to implicitly close other tags.
1397
 * The list is kept in htmlStartClose array. This function is
1398
 * called when a new tag has been detected and generates the
1399
 * appropriates closes if possible/needed.
1400
 * If newtag is NULL this mean we are at the end of the resource
1401
 * and we should check
1402
 *
1403
 * @param ctxt  an HTML parser context
1404
 * @param newtag  The new tag name or NULL
1405
 */
1406
static void
1407
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1408
414k
{
1409
414k
    if (ctxt->options & HTML_PARSE_HTML5)
1410
0
        return;
1411
1412
414k
    if (newtag == NULL)
1413
0
        return;
1414
1415
445k
    while ((ctxt->name != NULL) &&
1416
445k
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1417
31.3k
  htmlParserFinishElementParsing(ctxt);
1418
31.3k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419
31.3k
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420
31.3k
  htmlnamePop(ctxt);
1421
31.3k
    }
1422
414k
}
1423
1424
/**
1425
 * The HTML DTD allows a tag to implicitly close other tags.
1426
 * The list is kept in htmlStartClose array. This function checks
1427
 * if the element or one of it's children would autoclose the
1428
 * given tag.
1429
 *
1430
 * @deprecated Internal function, don't use.
1431
 *
1432
 * @param doc  the HTML document
1433
 * @param name  The tag name
1434
 * @param elem  the HTML element
1435
 * @returns 1 if autoclose, 0 otherwise
1436
 */
1437
int
1438
0
htmlAutoCloseTag(xmlDoc *doc, const xmlChar *name, xmlNode *elem) {
1439
0
    htmlNodePtr child;
1440
1441
0
    if (elem == NULL) return(1);
1442
0
    if (xmlStrEqual(name, elem->name)) return(0);
1443
0
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1444
0
    child = elem->children;
1445
0
    while (child != NULL) {
1446
0
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1447
0
  child = child->next;
1448
0
    }
1449
0
    return(0);
1450
0
}
1451
1452
/**
1453
 * The HTML DTD allows a tag to implicitly close other tags.
1454
 * The list is kept in htmlStartClose array. This function checks
1455
 * if a tag is autoclosed by one of it's child
1456
 *
1457
 * @deprecated Internal function, don't use.
1458
 *
1459
 * @param doc  the HTML document
1460
 * @param elem  the HTML element
1461
 * @returns 1 if autoclosed, 0 otherwise
1462
 */
1463
int
1464
0
htmlIsAutoClosed(xmlDoc *doc, xmlNode *elem) {
1465
0
    htmlNodePtr child;
1466
1467
0
    if (elem == NULL) return(1);
1468
0
    child = elem->children;
1469
0
    while (child != NULL) {
1470
0
  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471
0
  child = child->next;
1472
0
    }
1473
0
    return(0);
1474
0
}
1475
1476
/**
1477
 * The HTML DTD allows a tag to exists only implicitly
1478
 * called when a new tag has been detected and generates the
1479
 * appropriates implicit tags if missing
1480
 *
1481
 * @param ctxt  an HTML parser context
1482
 * @param newtag  The new tag name
1483
 */
1484
static void
1485
1.49M
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1486
1.49M
    int i;
1487
1488
1.49M
    if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1489
89.8k
        return;
1490
1.40M
    if (!htmlOmittedDefaultValue)
1491
0
  return;
1492
1.40M
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1493
13.0k
  return;
1494
1.39M
    if (ctxt->nameNr <= 0) {
1495
8.17k
  htmlnamePush(ctxt, BAD_CAST"html");
1496
8.17k
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497
8.17k
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1498
8.17k
    }
1499
1.39M
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1500
6.40k
        return;
1501
1.38M
    if ((ctxt->nameNr <= 1) &&
1502
1.38M
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1503
24.7k
   (xmlStrEqual(newtag, BAD_CAST"style")) ||
1504
24.7k
   (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1505
24.7k
   (xmlStrEqual(newtag, BAD_CAST"link")) ||
1506
24.7k
   (xmlStrEqual(newtag, BAD_CAST"title")) ||
1507
24.7k
   (xmlStrEqual(newtag, BAD_CAST"base")))) {
1508
2.20k
        if (ctxt->html >= INSERT_IN_HEAD) {
1509
            /* we already saw or generated an <head> before */
1510
1.49k
            return;
1511
1.49k
        }
1512
        /*
1513
         * dropped OBJECT ... i you put it first BODY will be
1514
         * assumed !
1515
         */
1516
702
        htmlnamePush(ctxt, BAD_CAST"head");
1517
702
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1518
702
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1519
1.38M
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1520
1.38M
         (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1521
1.38M
         (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1522
1.38M
        if (ctxt->html >= INSERT_IN_BODY) {
1523
            /* we already saw or generated a <body> before */
1524
1.37M
            return;
1525
1.37M
        }
1526
15.7k
  for (i = 0;i < ctxt->nameNr;i++) {
1527
9.11k
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1528
0
    return;
1529
0
      }
1530
9.11k
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1531
1.22k
    return;
1532
1.22k
      }
1533
9.11k
  }
1534
1535
6.65k
  htmlnamePush(ctxt, BAD_CAST"body");
1536
6.65k
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1537
6.65k
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1538
6.65k
    }
1539
1.38M
}
1540
1541
/**
1542
 * Prepare for non-whitespace character data.
1543
 *
1544
 * @param ctxt  an HTML parser context
1545
 */
1546
1547
static void
1548
2.22M
htmlStartCharData(htmlParserCtxtPtr ctxt) {
1549
2.22M
    if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1550
1.14M
        return;
1551
1.08M
    if (!htmlOmittedDefaultValue)
1552
0
  return;
1553
1554
1.08M
    if (xmlStrEqual(ctxt->name, BAD_CAST "head"))
1555
792
        htmlAutoClose(ctxt, BAD_CAST "p");
1556
1.08M
    htmlCheckImplied(ctxt, BAD_CAST "p");
1557
1.08M
}
1558
1559
/**
1560
 * Check if an attribute is of content type Script
1561
 *
1562
 * @deprecated Only supports HTML 4.
1563
 *
1564
 * @param name  an attribute name
1565
 * @returns 1 is the attribute is a script 0 otherwise
1566
 */
1567
int
1568
0
htmlIsScriptAttribute(const xmlChar *name) {
1569
0
    unsigned int i;
1570
1571
0
    if (name == NULL)
1572
0
      return(0);
1573
    /*
1574
     * all script attributes start with 'on'
1575
     */
1576
0
    if ((name[0] != 'o') || (name[1] != 'n'))
1577
0
      return(0);
1578
0
    for (i = 0;
1579
0
   i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1580
0
   i++) {
1581
0
  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1582
0
      return(1);
1583
0
    }
1584
0
    return(0);
1585
0
}
1586
1587
/************************************************************************
1588
 *                  *
1589
 *  The list of HTML predefined entities      *
1590
 *                  *
1591
 ************************************************************************/
1592
1593
1594
static const htmlEntityDesc  html40EntitiesTable[] = {
1595
/*
1596
 * the 4 absolute ones, plus apostrophe.
1597
 */
1598
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1599
{ 38, "amp",  "ampersand, U+0026 ISOnum" },
1600
{ 39, "apos", "single quote" },
1601
{ 60, "lt", "less-than sign, U+003C ISOnum" },
1602
{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1603
1604
/*
1605
 * A bunch still in the 128-255 range
1606
 * Replacing them really depends on the charset used.
1607
 */
1608
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1609
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1610
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
1611
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
1612
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
1613
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1614
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1615
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
1616
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1617
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1618
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1619
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1620
{ 172,  "not",  "not sign, U+00AC ISOnum" },
1621
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1622
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1623
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1624
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1625
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1626
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1627
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1628
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1629
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
1630
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1631
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1632
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1633
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1634
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1635
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1636
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1637
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1638
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1639
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1640
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1641
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1642
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1643
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1644
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1645
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1646
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1647
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1648
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1649
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1650
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1651
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1652
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1653
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1654
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1655
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1656
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1657
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1658
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1659
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1660
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1661
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1662
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1663
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
1664
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1665
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1666
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1667
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1668
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1669
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1670
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1671
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1672
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1673
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1674
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1675
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1676
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1677
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1678
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1679
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1680
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1681
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1682
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1683
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1684
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1685
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1686
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1687
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1688
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1689
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1690
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1691
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1692
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1693
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1694
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1695
{ 247,  "divide","division sign, U+00F7 ISOnum" },
1696
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1697
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1698
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1699
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1700
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1701
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1702
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1703
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1704
1705
{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1706
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1707
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1708
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1709
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1710
1711
/*
1712
 * Anything below should really be kept as entity references
1713
 */
1714
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1715
1716
{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1717
{ 732,  "tilde","small tilde, U+02DC ISOdia" },
1718
1719
{ 913,  "Alpha","greek capital letter alpha, U+0391" },
1720
{ 914,  "Beta", "greek capital letter beta, U+0392" },
1721
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1722
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1723
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1724
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
1725
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
1726
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1727
{ 921,  "Iota", "greek capital letter iota, U+0399" },
1728
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
1729
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1730
{ 924,  "Mu", "greek capital letter mu, U+039C" },
1731
{ 925,  "Nu", "greek capital letter nu, U+039D" },
1732
{ 926,  "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1733
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
1734
{ 928,  "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1735
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
1736
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1737
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
1738
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1739
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1740
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
1741
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1742
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1743
1744
{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1745
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1746
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1747
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1748
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1749
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1750
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1751
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1752
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1753
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1754
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1755
{ 956,  "mu", "greek small letter mu, U+03BC ISOgrk3" },
1756
{ 957,  "nu", "greek small letter nu, U+03BD ISOgrk3" },
1757
{ 958,  "xi", "greek small letter xi, U+03BE ISOgrk3" },
1758
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1759
{ 960,  "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1760
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1761
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1762
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1763
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1764
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1765
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1766
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1767
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1768
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1769
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1770
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1771
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1772
1773
{ 8194, "ensp", "en space, U+2002 ISOpub" },
1774
{ 8195, "emsp", "em space, U+2003 ISOpub" },
1775
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1776
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1777
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1778
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1779
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1780
{ 8211, "ndash","en dash, U+2013 ISOpub" },
1781
{ 8212, "mdash","em dash, U+2014 ISOpub" },
1782
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1783
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1784
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1785
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1786
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1787
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1788
{ 8224, "dagger","dagger, U+2020 ISOpub" },
1789
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1790
1791
{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1792
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1793
1794
{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1795
1796
{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1797
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1798
1799
{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1800
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1801
1802
{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1803
{ 8260, "frasl","fraction slash, U+2044 NEW" },
1804
1805
{ 8364, "euro", "euro sign, U+20AC NEW" },
1806
1807
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1808
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1809
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1810
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1811
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1812
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1813
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1814
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1815
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1816
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1817
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1818
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1819
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1820
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1821
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1822
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1823
1824
{ 8704, "forall","for all, U+2200 ISOtech" },
1825
{ 8706, "part", "partial differential, U+2202 ISOtech" },
1826
{ 8707, "exist","there exists, U+2203 ISOtech" },
1827
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1828
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1829
{ 8712, "isin", "element of, U+2208 ISOtech" },
1830
{ 8713, "notin","not an element of, U+2209 ISOtech" },
1831
{ 8715, "ni", "contains as member, U+220B ISOtech" },
1832
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1833
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
1834
{ 8722, "minus","minus sign, U+2212 ISOtech" },
1835
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1836
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1837
{ 8733, "prop", "proportional to, U+221D ISOtech" },
1838
{ 8734, "infin","infinity, U+221E ISOtech" },
1839
{ 8736, "ang",  "angle, U+2220 ISOamso" },
1840
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
1841
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1842
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
1843
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
1844
{ 8747, "int",  "integral, U+222B ISOtech" },
1845
{ 8756, "there4","therefore, U+2234 ISOtech" },
1846
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
1847
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1848
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1849
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1850
{ 8801, "equiv","identical to, U+2261 ISOtech" },
1851
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1852
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1853
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
1854
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
1855
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1856
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1857
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1858
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1859
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1860
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1861
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1862
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1863
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1864
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1865
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1866
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1867
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1868
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },
1869
1870
{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1871
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1872
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1873
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1874
1875
};
1876
1877
/************************************************************************
1878
 *                  *
1879
 *    Commodity functions to handle entities      *
1880
 *                  *
1881
 ************************************************************************/
1882
1883
/**
1884
 * Lookup the given entity in EntitiesTable
1885
 *
1886
 * @deprecated Only supports HTML 4.
1887
 *
1888
 * TODO: the linear scan is really ugly, an hash table is really needed.
1889
 *
1890
 * @param name  the entity name
1891
 * @returns the associated htmlEntityDesc if found, NULL otherwise.
1892
 */
1893
const htmlEntityDesc *
1894
0
htmlEntityLookup(const xmlChar *name) {
1895
0
    unsigned int i;
1896
1897
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
1898
0
                    sizeof(html40EntitiesTable[0]));i++) {
1899
0
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1900
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1901
0
  }
1902
0
    }
1903
0
    return(NULL);
1904
0
}
1905
1906
static int
1907
13.1M
htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
1908
13.1M
    const unsigned *key = vkey;
1909
13.1M
    const htmlEntityDesc *desc = vdesc;
1910
1911
13.1M
    return((int) *key - (int) desc->value);
1912
13.1M
}
1913
1914
/**
1915
 * Lookup the given entity in EntitiesTable
1916
 *
1917
 * @deprecated Only supports HTML 4.
1918
 *
1919
 * TODO: the linear scan is really ugly, an hash table is really needed.
1920
 *
1921
 * @param value  the entity's unicode value
1922
 * @returns the associated htmlEntityDesc if found, NULL otherwise.
1923
 */
1924
const htmlEntityDesc *
1925
1.74M
htmlEntityValueLookup(unsigned int value) {
1926
1.74M
    const htmlEntityDesc *desc;
1927
1.74M
    size_t nmemb;
1928
1929
1.74M
    nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
1930
1.74M
    desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
1931
1.74M
                   htmlCompareEntityDesc);
1932
1933
1.74M
    return(desc);
1934
1.74M
}
1935
1936
/**
1937
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938
 * plus HTML entities block of chars out.
1939
 *
1940
 * @deprecated Internal function, don't use.
1941
 *
1942
 * @param out  a pointer to an array of bytes to store the result
1943
 * @param outlen  the length of `out`
1944
 * @param in  a pointer to an array of UTF-8 chars
1945
 * @param inlen  the length of `in`
1946
 * @returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1947
 * The value of `inlen` after return is the number of octets consumed
1948
 *     as the return value is positive, else unpredictable.
1949
 * The value of `outlen` after return is the number of octets consumed.
1950
 */
1951
int
1952
htmlUTF8ToHtml(unsigned char* out, int *outlen,
1953
226k
               const unsigned char* in, int *inlen) {
1954
226k
    const unsigned char* instart = in;
1955
226k
    const unsigned char* inend;
1956
226k
    unsigned char* outstart = out;
1957
226k
    unsigned char* outend;
1958
226k
    int ret = XML_ENC_ERR_SPACE;
1959
1960
226k
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
1961
0
        return(XML_ENC_ERR_INTERNAL);
1962
1963
226k
    if (in == NULL) {
1964
        /*
1965
   * initialization nothing to do
1966
   */
1967
10.2k
  *outlen = 0;
1968
10.2k
  *inlen = 0;
1969
10.2k
  return(XML_ENC_ERR_SUCCESS);
1970
10.2k
    }
1971
1972
215k
    inend = in + *inlen;
1973
215k
    outend = out + *outlen;
1974
83.5M
    while (in < inend) {
1975
83.3M
        const htmlEntityDesc *ent;
1976
83.3M
        const char *cp;
1977
83.3M
        char nbuf[16];
1978
83.3M
        unsigned c, d;
1979
83.3M
        int seqlen, len, i;
1980
1981
83.3M
  d = *in;
1982
1983
83.3M
  if (d < 0x80) {
1984
81.6M
            if (out >= outend)
1985
0
                goto done;
1986
81.6M
            *out++ = d;
1987
81.6M
            in += 1;
1988
81.6M
            continue;
1989
81.6M
        }
1990
1991
1.74M
        if (d < 0xE0)      { c = d & 0x1F; seqlen = 2; }
1992
761k
        else if (d < 0xF0) { c = d & 0x0F; seqlen = 3; }
1993
519k
        else               { c = d & 0x07; seqlen = 4; }
1994
1995
1.74M
  if (inend - in < seqlen)
1996
853
      break;
1997
1998
4.77M
  for (i = 1; i < seqlen; i++) {
1999
3.02M
      d = in[i];
2000
3.02M
      c <<= 6;
2001
3.02M
      c |= d & 0x3F;
2002
3.02M
  }
2003
2004
        /*
2005
         * Try to lookup a predefined HTML entity for it
2006
         */
2007
1.74M
        ent = htmlEntityValueLookup(c);
2008
2009
1.74M
        if (ent == NULL) {
2010
1.69M
          snprintf(nbuf, sizeof(nbuf), "#%u", c);
2011
1.69M
          cp = nbuf;
2012
1.69M
        } else {
2013
45.9k
          cp = ent->name;
2014
45.9k
        }
2015
2016
1.74M
        len = strlen(cp);
2017
1.74M
        if (outend - out < len + 2)
2018
0
            goto done;
2019
2020
1.74M
        *out++ = '&';
2021
1.74M
        memcpy(out, cp, len);
2022
1.74M
        out += len;
2023
1.74M
        *out++ = ';';
2024
2025
1.74M
        in += seqlen;
2026
1.74M
    }
2027
2028
215k
    ret = out - outstart;
2029
2030
215k
done:
2031
215k
    *outlen = out - outstart;
2032
215k
    *inlen = in - instart;
2033
215k
    return(ret);
2034
215k
}
2035
2036
/**
2037
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2038
 * plus HTML entities block of chars out.
2039
 *
2040
 * @deprecated Only supports HTML 4.
2041
 *
2042
 * @param out  a pointer to an array of bytes to store the result
2043
 * @param outlen  the length of `out`
2044
 * @param in  a pointer to an array of UTF-8 chars
2045
 * @param inlen  the length of `in`
2046
 * @param quoteChar  the quote character to escape (' or ") or zero.
2047
 * @returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2048
 * The value of `inlen` after return is the number of octets consumed
2049
 *     as the return value is positive, else unpredictable.
2050
 * The value of `outlen` after return is the number of octets consumed.
2051
 */
2052
int
2053
htmlEncodeEntities(unsigned char* out, int *outlen,
2054
0
       const unsigned char* in, int *inlen, int quoteChar) {
2055
0
    const unsigned char* processed = in;
2056
0
    const unsigned char* outend;
2057
0
    const unsigned char* outstart = out;
2058
0
    const unsigned char* instart = in;
2059
0
    const unsigned char* inend;
2060
0
    unsigned int c, d;
2061
0
    int trailing;
2062
2063
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2064
0
        return(-1);
2065
0
    outend = out + (*outlen);
2066
0
    inend = in + (*inlen);
2067
0
    while (in < inend) {
2068
0
  d = *in++;
2069
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2070
0
  else if (d < 0xC0) {
2071
      /* trailing byte in leading position */
2072
0
      *outlen = out - outstart;
2073
0
      *inlen = processed - instart;
2074
0
      return(-2);
2075
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2076
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2077
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2078
0
  else {
2079
      /* no chance for this in Ascii */
2080
0
      *outlen = out - outstart;
2081
0
      *inlen = processed - instart;
2082
0
      return(-2);
2083
0
  }
2084
2085
0
  if (inend - in < trailing)
2086
0
      break;
2087
2088
0
  while (trailing--) {
2089
0
      if (((d= *in++) & 0xC0) != 0x80) {
2090
0
    *outlen = out - outstart;
2091
0
    *inlen = processed - instart;
2092
0
    return(-2);
2093
0
      }
2094
0
      c <<= 6;
2095
0
      c |= d & 0x3F;
2096
0
  }
2097
2098
  /* assertion: c is a single UTF-4 value */
2099
0
  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2100
0
      (c != '&') && (c != '<') && (c != '>')) {
2101
0
      if (out >= outend)
2102
0
    break;
2103
0
      *out++ = c;
2104
0
  } else {
2105
0
      const htmlEntityDesc * ent;
2106
0
      const char *cp;
2107
0
      char nbuf[16];
2108
0
      int len;
2109
2110
      /*
2111
       * Try to lookup a predefined HTML entity for it
2112
       */
2113
0
      ent = htmlEntityValueLookup(c);
2114
0
      if (ent == NULL) {
2115
0
    snprintf(nbuf, sizeof(nbuf), "#%u", c);
2116
0
    cp = nbuf;
2117
0
      }
2118
0
      else
2119
0
    cp = ent->name;
2120
0
      len = strlen(cp);
2121
0
      if (outend - out < len + 2)
2122
0
    break;
2123
0
      *out++ = '&';
2124
0
      memcpy(out, cp, len);
2125
0
      out += len;
2126
0
      *out++ = ';';
2127
0
  }
2128
0
  processed = in;
2129
0
    }
2130
0
    *outlen = out - outstart;
2131
0
    *inlen = processed - instart;
2132
0
    return(0);
2133
0
}
2134
2135
/************************************************************************
2136
 *                  *
2137
 *    Commodity functions, cleanup needed ?     *
2138
 *                  *
2139
 ************************************************************************/
2140
/*
 * All tags allowing PCDATA from the HTML 4.01 loose DTD, in
 * alphabetical order.
 *
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility.
 *
 * Both the strings and the pointers to them are const, so the table
 * cannot be modified at run time.
 */
static const char *const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2154
2155
/**
2156
 * Is this a sequence of blank chars that one can ignore ?
2157
 *
2158
 * @param ctxt  an HTML parser context
2159
 * @param str  a xmlChar *
2160
 * @param len  the size of `str`
2161
 * @returns 1 if ignorable 0 if whitespace, -1 otherwise.
2162
 */
2163
2164
2.91M
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2165
2.91M
    unsigned int i;
2166
2.91M
    int j;
2167
2.91M
    xmlNodePtr lastChild;
2168
2.91M
    xmlDtdPtr dtd;
2169
2170
4.97M
    for (j = 0;j < len;j++)
2171
4.76M
        if (!(IS_WS_HTML(str[j]))) return(-1);
2172
2173
213k
    if (CUR == 0) return(1);
2174
199k
    if (CUR != '<') return(0);
2175
11.2k
    if (ctxt->name == NULL)
2176
0
  return(1);
2177
11.2k
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2178
0
  return(1);
2179
11.2k
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2180
0
  return(1);
2181
2182
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2183
11.2k
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2184
2.08k
        dtd = xmlGetIntSubset(ctxt->myDoc);
2185
2.08k
        if (dtd != NULL && dtd->ExternalID != NULL) {
2186
487
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2187
487
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2188
263
                return(1);
2189
487
        }
2190
2.08k
    }
2191
2192
11.0k
    if (ctxt->node == NULL) return(0);
2193
10.7k
    lastChild = xmlGetLastChild(ctxt->node);
2194
28.7k
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2195
18.0k
  lastChild = lastChild->prev;
2196
10.7k
    if (lastChild == NULL) {
2197
2.32k
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2198
2.32k
            (ctxt->node->content != NULL)) return(0);
2199
  /* keep ws in constructs like ...<b> </b>...
2200
     for all tags "b" allowing PCDATA */
2201
98.5k
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2202
97.1k
      if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2203
973
    return(0);
2204
973
      }
2205
97.1k
  }
2206
8.37k
    } else if (xmlNodeIsText(lastChild)) {
2207
5.33k
        return(0);
2208
5.33k
    } else {
2209
  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2210
     for all tags "p" allowing PCDATA */
2211
135k
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2212
133k
      if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2213
1.28k
    return(0);
2214
1.28k
      }
2215
133k
  }
2216
3.03k
    }
2217
3.10k
    return(1);
2218
10.7k
}
2219
2220
/**
2221
 * Creates a new HTML document without a DTD node if `URI` and `publicId`
2222
 * are NULL
2223
 *
2224
 * @param URI  system ID (URI) of the DTD (optional)
2225
 * @param publicId  public ID of the DTD (optional)
2226
 * @returns a new document, do not initialize the DTD if not provided
2227
 */
2228
xmlDoc *
2229
62.8k
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *publicId) {
2230
62.8k
    xmlDocPtr cur;
2231
2232
    /*
2233
     * Allocate a new document and fill the fields.
2234
     */
2235
62.8k
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2236
62.8k
    if (cur == NULL)
2237
207
  return(NULL);
2238
62.6k
    memset(cur, 0, sizeof(xmlDoc));
2239
2240
62.6k
    cur->type = XML_HTML_DOCUMENT_NODE;
2241
62.6k
    cur->version = NULL;
2242
62.6k
    cur->intSubset = NULL;
2243
62.6k
    cur->doc = cur;
2244
62.6k
    cur->name = NULL;
2245
62.6k
    cur->children = NULL;
2246
62.6k
    cur->extSubset = NULL;
2247
62.6k
    cur->oldNs = NULL;
2248
62.6k
    cur->encoding = NULL;
2249
62.6k
    cur->standalone = 1;
2250
62.6k
    cur->compression = 0;
2251
62.6k
    cur->ids = NULL;
2252
62.6k
    cur->refs = NULL;
2253
62.6k
    cur->_private = NULL;
2254
62.6k
    cur->charset = XML_CHAR_ENCODING_UTF8;
2255
62.6k
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2256
62.6k
    if ((publicId != NULL) ||
2257
62.6k
  (URI != NULL)) {
2258
37.2k
        xmlDtdPtr intSubset;
2259
2260
37.2k
  intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", publicId, URI);
2261
37.2k
        if (intSubset == NULL) {
2262
10
            xmlFree(cur);
2263
10
            return(NULL);
2264
10
        }
2265
37.2k
    }
2266
62.6k
    if ((xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2267
0
  xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2268
62.6k
    return(cur);
2269
62.6k
}
2270
2271
/**
2272
 * Creates a new HTML document
2273
 *
2274
 * @param URI  system ID (URI) of the DTD (optional)
2275
 * @param publicId  public ID of the DTD (optional)
2276
 * @returns a new document
2277
 */
2278
xmlDoc *
2279
22.3k
htmlNewDoc(const xmlChar *URI, const xmlChar *publicId) {
2280
22.3k
    if ((URI == NULL) && (publicId == NULL))
2281
7.90k
  return(htmlNewDocNoDtD(
2282
7.90k
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2283
7.90k
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2284
2285
14.4k
    return(htmlNewDocNoDtD(URI, publicId));
2286
22.3k
}
2287
2288
2289
/************************************************************************
2290
 *                  *
2291
 *      The parser itself       *
2292
 *  Relates to http://www.w3.org/TR/html40        *
2293
 *                  *
2294
 ************************************************************************/
2295
2296
/************************************************************************
2297
 *                  *
2298
 *      The parser itself       *
2299
 *                  *
2300
 ************************************************************************/
2301
2302
/**
2303
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2304
 * since HTML names are not case-sensitive.
2305
 *
2306
 * @param ctxt  an HTML parser context
2307
 * @param attr  whether this is an attribute name
2308
 * @returns the Tag Name parsed or NULL
2309
 */
2310
2311
static xmlHashedString
2312
4.86M
htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
2313
4.86M
    xmlHashedString ret;
2314
4.86M
    xmlChar buf[HTML_PARSER_BUFFER_SIZE];
2315
4.86M
    const xmlChar *in;
2316
4.86M
    size_t avail;
2317
4.86M
    int eof = PARSER_PROGRESSIVE(ctxt);
2318
4.86M
    int nbchar = 0;
2319
4.86M
    int stop = attr ? '=' : ' ';
2320
2321
4.86M
    in = ctxt->input->cur;
2322
4.86M
    avail = ctxt->input->end - in;
2323
2324
40.3M
    while (1) {
2325
40.3M
        int c, size;
2326
2327
40.3M
        if ((!eof) && (avail < 32)) {
2328
19.3k
            size_t oldAvail = avail;
2329
2330
19.3k
            ctxt->input->cur = in;
2331
2332
19.3k
            SHRINK;
2333
19.3k
            xmlParserGrow(ctxt);
2334
2335
19.3k
            in = ctxt->input->cur;
2336
19.3k
            avail = ctxt->input->end - in;
2337
2338
19.3k
            if (oldAvail == avail)
2339
17.1k
                eof = 1;
2340
19.3k
        }
2341
2342
40.3M
        if (avail == 0)
2343
2.63k
            break;
2344
2345
40.3M
        c = *in;
2346
40.3M
        size = 1;
2347
2348
40.3M
        if ((nbchar != 0) &&
2349
40.3M
            ((c == '/') || (c == '>') || (c == stop) ||
2350
35.5M
             (IS_WS_HTML(c))))
2351
4.85M
            break;
2352
2353
35.5M
        if (c == 0) {
2354
1.49M
            if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2355
801k
                buf[nbchar++] = 0xEF;
2356
801k
                buf[nbchar++] = 0xBF;
2357
801k
                buf[nbchar++] = 0xBD;
2358
801k
            }
2359
34.0M
        } else if (c < 0x80) {
2360
16.4M
            if (nbchar < HTML_PARSER_BUFFER_SIZE) {
2361
15.1M
                if (IS_UPPER(c))
2362
2.12M
                    c += 0x20;
2363
15.1M
                buf[nbchar++] = c;
2364
15.1M
            }
2365
17.5M
        } else {
2366
17.5M
            size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
2367
2368
17.5M
            if (size > 0) {
2369
17.2M
                if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) {
2370
1.57M
                    memcpy(buf + nbchar, in, size);
2371
1.57M
                    nbchar += size;
2372
1.57M
                }
2373
17.2M
            } else {
2374
352k
                size = 1;
2375
2376
352k
                if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2377
108k
                    buf[nbchar++] = 0xEF;
2378
108k
                    buf[nbchar++] = 0xBF;
2379
108k
                    buf[nbchar++] = 0xBD;
2380
108k
                }
2381
352k
            }
2382
17.5M
        }
2383
2384
35.5M
        in += size;
2385
35.5M
        avail -= size;
2386
35.5M
    }
2387
2388
4.86M
    ctxt->input->cur = in;
2389
2390
4.86M
    SHRINK;
2391
2392
4.86M
    ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar);
2393
4.86M
    if (ret.name == NULL)
2394
129
        htmlErrMemory(ctxt);
2395
2396
4.86M
    return(ret);
2397
4.86M
}
2398
2399
/*
 * Replacement code points for the range 0x80-0x9F, indexed by the
 * code point minus 0x80.
 * NOTE(review): the values appear to follow the Windows-1252 layout;
 * confirm against the HTML specification.
 */
static const short htmlC1Remap[32] = {
    /* 0x80 */ 0x20AC, 0x0081, 0x201A, 0x0192,
    /* 0x84 */ 0x201E, 0x2026, 0x2020, 0x2021,
    /* 0x88 */ 0x02C6, 0x2030, 0x0160, 0x2039,
    /* 0x8C */ 0x0152, 0x008D, 0x017D, 0x008F,
    /* 0x90 */ 0x0090, 0x2018, 0x2019, 0x201C,
    /* 0x94 */ 0x201D, 0x2022, 0x2013, 0x2014,
    /* 0x98 */ 0x02DC, 0x2122, 0x0161, 0x203A,
    /* 0x9C */ 0x0153, 0x009D, 0x017E, 0x0178
};
2405
2406
/*
 * Encode a code point as UTF-8 into `out`.
 *
 * Code points in 0x80-0x9F are first remapped through htmlC1Remap.
 * Values that are <= 0, in the surrogate range 0xD800-0xDFFF or above
 * 0x10FFFF are replaced with U+FFFD.
 *
 * c: the code point to encode
 * out: destination buffer; up to 4 bytes are written
 * osize: receives the number of bytes written
 *
 * Returns `out`.
 */
static const xmlChar *
htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
    xmlChar *p = out;
    int shift, marker;

    if ((c >= 0x80) && (c < 0xA0))
        c = htmlC1Remap[c - 0x80];
    else if ((c <= 0) || (c > 0x10FFFF) ||
             ((c >= 0xD800) && (c < 0xE000)))
        c = 0xFFFD;

    /* Pick the lead byte marker and the shift of its payload bits. */
    if (c >= 0x10000) {
        shift = 18;
        marker = 0xF0;
    } else if (c >= 0x800) {
        shift = 12;
        marker = 0xE0;
    } else if (c >= 0x80) {
        shift = 6;
        marker = 0xC0;
    } else {
        shift = 0;
        marker = 0x00;
    }

    *p++ = (c >> shift) | marker;

    /* Continuation bytes carry six bits each, most significant first. */
    for (shift -= 6; shift >= 0; shift -= 6)
        *p++ = ((c >> shift) & 0x3F) | 0x80;

    *osize = p - out;
    return(out);
}
2434
2435
#include "codegen/html5ent.inc"
2436
2437
48.3k
#define ENT_F_SEMICOLON 0x80u
2438
82.2k
#define ENT_F_SUBTABLE  0x40u
2439
2.04M
#define ENT_F_ALL       0xC0u
2440
2441
static const xmlChar *
2442
htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
2443
662k
                     int *nlen, int *rlen) {
2444
662k
    const xmlChar *match = NULL;
2445
662k
    unsigned left, right;
2446
662k
    int first = string[0];
2447
662k
    size_t matchLen = 0;
2448
662k
    size_t soff = 1;
2449
2450
662k
    if (slen < 2)
2451
948
        return(NULL);
2452
661k
    if (!IS_ASCII_LETTER(first))
2453
170k
        return(NULL);
2454
2455
    /*
2456
     * Look up range by first character
2457
     */
2458
490k
    first &= 63;
2459
490k
    left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8;
2460
490k
    right = left + htmlEntAlpha[first*3+2];
2461
2462
    /*
2463
     * Binary search
2464
     */
2465
2.47M
    while (left < right) {
2466
2.04M
        const xmlChar *bytes;
2467
2.04M
        unsigned mid;
2468
2.04M
        size_t len;
2469
2.04M
        int cmp;
2470
2471
2.04M
        mid = left + (right - left) / 2;
2472
2.04M
        bytes = htmlEntStrings + htmlEntValues[mid];
2473
2.04M
        len = bytes[0] & ~ENT_F_ALL;
2474
2475
2.04M
        cmp = string[soff] - bytes[1];
2476
2477
2.04M
        if (cmp == 0) {
2478
161k
            if (slen < len) {
2479
1.23k
                cmp = strncmp((const char *) string + soff + 1,
2480
1.23k
                              (const char *) bytes + 2,
2481
1.23k
                              slen - 1);
2482
                /* Prefix can never match */
2483
1.23k
                if (cmp == 0)
2484
0
                    break;
2485
160k
            } else {
2486
160k
                cmp = strncmp((const char *) string + soff + 1,
2487
160k
                              (const char *) bytes + 2,
2488
160k
                              len - 1);
2489
160k
            }
2490
161k
        }
2491
2492
2.04M
        if (cmp < 0) {
2493
1.81M
            right = mid;
2494
1.81M
        } else if (cmp > 0) {
2495
152k
            left = mid + 1;
2496
152k
        } else {
2497
82.2k
            int term = soff + len < slen ? string[soff + len] : 0;
2498
82.2k
            int isAlnum, isTerm;
2499
2500
82.2k
            isAlnum = IS_ALNUM(term);
2501
82.2k
            isTerm = ((term == ';') ||
2502
82.2k
                      ((bytes[0] & ENT_F_SEMICOLON) &&
2503
48.3k
                       ((!isAttr) ||
2504
4.29k
                        ((!isAlnum) && (term != '=')))));
2505
2506
82.2k
            if (isTerm) {
2507
37.8k
                match = bytes + len + 1;
2508
37.8k
                matchLen = soff + len;
2509
37.8k
                if (term == ';')
2510
33.9k
                    matchLen += 1;
2511
37.8k
            }
2512
2513
82.2k
            if (bytes[0] & ENT_F_SUBTABLE) {
2514
30.6k
                if (isTerm)
2515
8.70k
                    match += 2;
2516
2517
30.6k
                if ((isAlnum) && (soff + len < slen)) {
2518
18.7k
                    left = mid + bytes[len + 1];
2519
18.7k
                    right = left + bytes[len + 2];
2520
18.7k
                    soff += len;
2521
18.7k
                    continue;
2522
18.7k
                }
2523
30.6k
            }
2524
2525
63.4k
            break;
2526
82.2k
        }
2527
2.04M
    }
2528
2529
490k
    if (match == NULL)
2530
452k
        return(NULL);
2531
2532
37.7k
    *nlen = matchLen;
2533
37.7k
    *rlen = match[0];
2534
37.7k
    return(match + 1);
2535
490k
}
2536
2537
/**
2538
 * Parse data until terminator is reached.
2539
 *
2540
 * @param ctxt  an HTML parser context
2541
 * @param mask  mask of terminating characters
2542
 * @param comment  true if parsing a comment
2543
 * @param refs  true if references are allowed
2544
 * @param maxLength  maximum output length
2545
 * @returns the parsed string or NULL in case of errors.
2546
 */
2547
2548
static xmlChar *
2549
htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
2550
327k
              int comment, int refs, int maxLength) {
2551
327k
    xmlParserInputPtr input = ctxt->input;
2552
327k
    xmlChar *ret = NULL;
2553
327k
    xmlChar *buffer;
2554
327k
    xmlChar utf8Char[4];
2555
327k
    size_t buffer_size;
2556
327k
    size_t used;
2557
327k
    int eof = PARSER_PROGRESSIVE(ctxt);
2558
327k
    int line, col;
2559
327k
    int termSkip = -1;
2560
2561
327k
    used = 0;
2562
327k
    buffer_size = ctxt->spaceMax;
2563
327k
    buffer = (xmlChar *) ctxt->spaceTab;
2564
327k
    if (buffer == NULL) {
2565
9.78k
        buffer_size = 500;
2566
9.78k
        buffer = xmlMalloc(buffer_size + 1);
2567
9.78k
        if (buffer == NULL) {
2568
159
            htmlErrMemory(ctxt);
2569
159
            return(NULL);
2570
159
        }
2571
9.78k
    }
2572
2573
327k
    line = input->line;
2574
327k
    col = input->col;
2575
2576
22.9M
    while (!PARSER_STOPPED(ctxt)) {
2577
22.9M
        const xmlChar *chunk, *in, *repl;
2578
22.9M
        size_t avail, chunkSize, extraSize;
2579
22.9M
        int replSize;
2580
22.9M
        int skip = 0;
2581
22.9M
        int ncr = 0;
2582
22.9M
        int ncrSize = 0;
2583
22.9M
        int cp = 0;
2584
2585
22.9M
        chunk = input->cur;
2586
22.9M
        avail = input->end - chunk;
2587
22.9M
        in = chunk;
2588
2589
22.9M
        repl = BAD_CAST "";
2590
22.9M
        replSize = 0;
2591
2592
120M
        while (!PARSER_STOPPED(ctxt)) {
2593
120M
            size_t j;
2594
120M
            int cur, size;
2595
2596
120M
            if ((!eof) && (avail <= 64)) {
2597
24.7k
                size_t oldAvail = avail;
2598
24.7k
                size_t off = in - chunk;
2599
2600
24.7k
                input->cur = in;
2601
2602
24.7k
                xmlParserGrow(ctxt);
2603
2604
24.7k
                in = input->cur;
2605
24.7k
                chunk = in - off;
2606
24.7k
                input->cur = chunk;
2607
24.7k
                avail = input->end - in;
2608
2609
24.7k
                if (oldAvail == avail)
2610
10.1k
                    eof = 1;
2611
24.7k
            }
2612
2613
120M
            if (avail == 0) {
2614
3.26k
                termSkip = 0;
2615
3.26k
                break;
2616
3.26k
            }
2617
2618
120M
            cur = *in;
2619
120M
            size = 1;
2620
120M
            col += 1;
2621
2622
120M
            if (htmlMaskMatch(mask, cur)) {
2623
467k
                if (comment) {
2624
158k
                    if (avail < 2) {
2625
87
                        termSkip = 1;
2626
158k
                    } else if (in[1] == '-') {
2627
126k
                        if  (avail < 3) {
2628
19
                            termSkip = 2;
2629
126k
                        } else if (in[2] == '>') {
2630
12.7k
                            termSkip = 3;
2631
113k
                        } else if (in[2] == '!') {
2632
5.91k
                            if (avail < 4)
2633
5
                                termSkip = 3;
2634
5.90k
                            else if (in[3] == '>')
2635
2.06k
                                termSkip = 4;
2636
5.91k
                        }
2637
126k
                    }
2638
2639
158k
                    if (termSkip >= 0)
2640
14.9k
                        break;
2641
308k
                } else {
2642
308k
                    termSkip = 0;
2643
308k
                    break;
2644
308k
                }
2645
467k
            }
2646
2647
120M
            if (ncr) {
2648
156k
                int lc = cur | 0x20;
2649
156k
                int digit;
2650
2651
156k
                if ((cur >= '0') && (cur <= '9')) {
2652
24.2k
                    digit = cur - '0';
2653
132k
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
2654
122k
                    digit = (lc - 'a') + 10;
2655
122k
                } else {
2656
10.3k
                    if (cur == ';') {
2657
5.03k
                        in += 1;
2658
5.03k
                        size += 1;
2659
5.03k
                        ncrSize += 1;
2660
5.03k
                    }
2661
10.3k
                    goto next_chunk;
2662
10.3k
                }
2663
2664
146k
                cp = cp * ncr + digit;
2665
146k
                if (cp >= 0x110000)
2666
117k
                    cp = 0x110000;
2667
2668
146k
                ncrSize += 1;
2669
2670
146k
                goto next_char;
2671
156k
            }
2672
2673
120M
            switch (cur) {
2674
77.4k
            case '&':
2675
77.4k
                if (!refs)
2676
32.6k
                    break;
2677
2678
44.7k
                j = 1;
2679
2680
44.7k
                if ((j < avail) && (in[j] == '#')) {
2681
25.7k
                    j += 1;
2682
25.7k
                    if (j < avail) {
2683
25.7k
                        if ((in[j] | 0x20) == 'x') {
2684
15.4k
                            j += 1;
2685
15.4k
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
2686
2.99k
                                ncr = 16;
2687
2.99k
                                size = 3;
2688
2.99k
                                ncrSize = 3;
2689
2.99k
                                cp = 0;
2690
2.99k
                            }
2691
15.4k
                        } else if (IS_ASCII_DIGIT(in[j])) {
2692
7.70k
                            ncr = 10;
2693
7.70k
                            size = 2;
2694
7.70k
                            ncrSize = 2;
2695
7.70k
                            cp = 0;
2696
7.70k
                        }
2697
25.7k
                    }
2698
25.7k
                } else {
2699
19.0k
                    repl = htmlFindEntityPrefix(in + j,
2700
19.0k
                                                avail - j,
2701
19.0k
                                                /* isAttr */ 1,
2702
19.0k
                                                &skip, &replSize);
2703
19.0k
                    if (repl != NULL) {
2704
3.65k
                        skip += 1;
2705
3.65k
                        goto next_chunk;
2706
3.65k
                    }
2707
2708
15.4k
                    skip = 0;
2709
15.4k
                }
2710
2711
41.1k
                break;
2712
2713
20.3M
            case '\0':
2714
20.3M
                skip = 1;
2715
20.3M
                repl = BAD_CAST "\xEF\xBF\xBD";
2716
20.3M
                replSize = 3;
2717
20.3M
                goto next_chunk;
2718
2719
5.61M
            case '\n':
2720
5.61M
                line += 1;
2721
5.61M
                col = 1;
2722
5.61M
                break;
2723
2724
55.9k
            case '\r':
2725
55.9k
                skip = 1;
2726
55.9k
                if (in[1] != 0x0A) {
2727
55.1k
                    repl = BAD_CAST "\x0A";
2728
55.1k
                    replSize = 1;
2729
55.1k
                }
2730
55.9k
                goto next_chunk;
2731
2732
94.0M
            default:
2733
94.0M
                if (cur < 0x80)
2734
10.6M
                    break;
2735
2736
83.3M
                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
2737
3.55k
                    xmlChar * guess;
2738
2739
3.55k
                    if (in > chunk)
2740
1.26k
                        goto next_chunk;
2741
2742
2.29k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2743
2.29k
                    guess = NULL;
2744
#else
2745
                    guess = htmlFindEncoding(ctxt);
2746
#endif
2747
2.29k
                    if (guess == NULL) {
2748
2.29k
                        xmlSwitchEncoding(ctxt,
2749
2.29k
                                XML_CHAR_ENCODING_WINDOWS_1252);
2750
2.29k
                    } else {
2751
0
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
2752
0
                        xmlFree(guess);
2753
0
                    }
2754
2.29k
                    input->flags |= XML_INPUT_HAS_ENCODING;
2755
2756
2.29k
                    eof = PARSER_PROGRESSIVE(ctxt);
2757
2.29k
                    goto restart;
2758
3.55k
                }
2759
2760
83.3M
                size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
2761
2762
83.3M
                if (size <= 0) {
2763
2.16M
                    skip = 1;
2764
2.16M
                    repl = BAD_CAST "\xEF\xBF\xBD";
2765
2.16M
                    replSize = 3;
2766
2.16M
                    goto next_chunk;
2767
2.16M
                }
2768
2769
81.1M
                break;
2770
120M
            }
2771
2772
97.6M
next_char:
2773
97.6M
            in += size;
2774
97.6M
            avail -= size;
2775
97.6M
        }
2776
2777
22.9M
next_chunk:
2778
22.9M
        if (ncrSize > 0) {
2779
10.6k
            skip = ncrSize;
2780
10.6k
            in -= ncrSize;
2781
2782
10.6k
            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
2783
10.6k
        }
2784
2785
22.9M
        chunkSize = in - chunk;
2786
22.9M
        extraSize = chunkSize + replSize;
2787
2788
22.9M
        if (extraSize > maxLength - used) {
2789
290
            htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
2790
290
                         "value too long\n", NULL, NULL);
2791
290
            goto error;
2792
290
        }
2793
2794
22.9M
        if (extraSize > buffer_size - used) {
2795
4.32k
            size_t newSize = (used + extraSize) * 2;
2796
4.32k
            xmlChar *tmp = xmlRealloc(buffer, newSize + 1);
2797
2798
4.32k
            if (tmp == NULL) {
2799
11
                htmlErrMemory(ctxt);
2800
11
                goto error;
2801
11
            }
2802
4.31k
            buffer = tmp;
2803
4.31k
            buffer_size = newSize;
2804
4.31k
        }
2805
2806
22.9M
        if (chunkSize > 0) {
2807
655k
            input->cur += chunkSize;
2808
655k
            memcpy(buffer + used, chunk, chunkSize);
2809
655k
            used += chunkSize;
2810
655k
        }
2811
2812
22.9M
        input->cur += skip;
2813
22.9M
        if (replSize > 0) {
2814
22.6M
            memcpy(buffer + used, repl, replSize);
2815
22.6M
            used += replSize;
2816
22.6M
        }
2817
2818
22.9M
        SHRINK;
2819
2820
22.9M
        if (termSkip >= 0)
2821
326k
            break;
2822
2823
22.6M
restart:
2824
22.6M
        ;
2825
22.6M
    }
2826
2827
327k
    if (termSkip > 0) {
2828
14.9k
        input->cur += termSkip;
2829
14.9k
        col += termSkip;
2830
14.9k
    }
2831
2832
327k
    input->line = line;
2833
327k
    input->col = col;
2834
2835
327k
    ret = xmlMalloc(used + 1);
2836
327k
    if (ret == NULL) {
2837
268
        htmlErrMemory(ctxt);
2838
327k
    } else {
2839
327k
        memcpy(ret, buffer, used);
2840
327k
        ret[used] = 0;
2841
327k
    }
2842
2843
327k
error:
2844
327k
    ctxt->spaceTab = (void *) buffer;
2845
327k
    ctxt->spaceMax = buffer_size;
2846
2847
327k
    return(ret);
2848
327k
}
2849
2850
/**
2851
 * @deprecated Internal function, don't use.
2852
 *
2853
 * @param ctxt  an HTML parser context
2854
 * @param str  location to store the entity name
2855
 * @returns NULL.
2856
 */
2857
const htmlEntityDesc *
2858
htmlParseEntityRef(htmlParserCtxt *ctxt ATTRIBUTE_UNUSED,
2859
0
                   const xmlChar **str ATTRIBUTE_UNUSED) {
2860
0
    return(NULL);
2861
0
}
2862
2863
/**
2864
 * parse a value for an attribute
2865
 * Note: the parser won't do substitution of entities here, this
2866
 * will be handled later in #xmlStringGetNodeList, unless it was
2867
 * asked for ctxt->replaceEntities != 0
2868
 *
2869
 * @param ctxt  an HTML parser context
2870
 * @returns the AttValue parsed or NULL.
2871
 */
2872
2873
static xmlChar *
2874
191k
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2875
191k
    xmlChar *ret = NULL;
2876
191k
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
2877
140k
                    XML_MAX_HUGE_LENGTH :
2878
191k
                    XML_MAX_TEXT_LENGTH;
2879
2880
191k
    if (CUR == '"') {
2881
19.2k
        SKIP(1);
2882
19.2k
  ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength);
2883
19.2k
        if (CUR == '"')
2884
19.1k
            SKIP(1);
2885
171k
    } else if (CUR == '\'') {
2886
1.58k
        SKIP(1);
2887
1.58k
  ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength);
2888
1.58k
        if (CUR == '\'')
2889
1.52k
            SKIP(1);
2890
170k
    } else {
2891
170k
  ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength);
2892
170k
    }
2893
191k
    return(ret);
2894
191k
}
2895
2896
static void
2897
htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
2898
15.5M
                        int size, int mode) {
2899
15.5M
    if ((ctxt->sax == NULL) || (ctxt->disableSAX))
2900
6.23k
        return;
2901
2902
15.5M
    if ((mode == 0) || (mode == DATA_RCDATA) ||
2903
15.5M
        (ctxt->sax->cdataBlock == NULL)) {
2904
9.41M
        if ((ctxt->name == NULL) ||
2905
9.41M
            (xmlStrEqual(ctxt->name, BAD_CAST "html")) ||
2906
9.41M
            (xmlStrEqual(ctxt->name, BAD_CAST "head"))) {
2907
823k
            int i;
2908
2909
            /*
2910
             * Add leading whitespace to html or head elements before
2911
             * calling htmlStartCharData.
2912
             */
2913
1.51M
            for (i = 0; i < size; i++)
2914
1.46M
                if (!IS_WS_HTML(buf[i]))
2915
766k
                    break;
2916
2917
823k
            if (i > 0) {
2918
65.7k
                if (!ctxt->keepBlanks) {
2919
57.2k
                    if (ctxt->sax->ignorableWhitespace != NULL)
2920
57.2k
                        ctxt->sax->ignorableWhitespace(ctxt->userData, buf, i);
2921
57.2k
                } else {
2922
8.52k
                    if (ctxt->sax->characters != NULL)
2923
8.52k
                        ctxt->sax->characters(ctxt->userData, buf, i);
2924
8.52k
                }
2925
2926
65.7k
                buf += i;
2927
65.7k
                size -= i;
2928
65.7k
            }
2929
2930
823k
            if (size <= 0)
2931
56.7k
                return;
2932
2933
766k
            htmlStartCharData(ctxt);
2934
2935
766k
            if (PARSER_STOPPED(ctxt))
2936
83
                return;
2937
766k
        }
2938
2939
9.35M
        if ((mode == 0) &&
2940
9.35M
            (!ctxt->keepBlanks) &&
2941
9.35M
            (areBlanks(ctxt, buf, size) > 0)) {
2942
18.2k
            if (ctxt->sax->ignorableWhitespace != NULL)
2943
18.2k
                ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size);
2944
9.33M
        } else {
2945
9.33M
            if (ctxt->sax->characters != NULL)
2946
9.33M
                ctxt->sax->characters(ctxt->userData, buf, size);
2947
9.33M
        }
2948
9.35M
    } else {
2949
        /*
2950
         * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2951
         */
2952
6.09M
        ctxt->sax->cdataBlock(ctxt->userData, buf, size);
2953
6.09M
    }
2954
15.5M
}
2955
2956
/**
2957
 * Parse character data and references.
 *
 * Text is handed to htmlCharDataSAXCallback in chunks. The tokenizer
 * sub-state is read from ctxt->endCheckState on entry and stored back
 * on exit (reset to 0 once the end of the data has been found).
2958
 *
2959
 * @param ctxt  an HTML parser context
2960
 * @param partial  true if the input buffer is incomplete
2961
 * @returns 1 if all data was parsed, 0 otherwise.
2962
 */
2963
2964
static int
2965
387k
htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
2966
387k
    xmlParserInputPtr input = ctxt->input;
2967
387k
    xmlChar utf8Char[4];
2968
387k
    int complete = 0;
2969
387k
    int done = 0;
2970
387k
    int mode;
2971
387k
    int eof = PARSER_PROGRESSIVE(ctxt);
2972
387k
    int line, col;
2973
2974
387k
    /* Tokenizer sub-state carried in the context; written back below. */
    mode = ctxt->endCheckState;
2975
2976
387k
    line = input->line;
2977
387k
    col = input->col;
2978
2979
46.6M
    /*
     * Each outer iteration scans one chunk of literal text, then
     * flushes it followed by an optional replacement string.
     */
    while (!PARSER_STOPPED(ctxt)) {
2980
46.6M
        const xmlChar *chunk, *in, *repl;
2981
46.6M
        size_t avail;
2982
46.6M
        int replSize;
2983
46.6M
        int skip = 0;
2984
46.6M
        int ncr = 0;
2985
46.6M
        int ncrSize = 0;
2986
46.6M
        int cp = 0;
2987
2988
46.6M
        chunk = input->cur;
2989
46.6M
        avail = input->end - chunk;
2990
46.6M
        in = chunk;
2991
2992
46.6M
        repl = BAD_CAST "";
2993
46.6M
        replSize = 0;
2994
2995
176M
        while (!PARSER_STOPPED(ctxt)) {
2996
176M
            size_t j;
2997
176M
            int cur, size;
2998
2999
176M
            /*
             * Little input left: try to read more unless the end of
             * the input was already detected.
             */
            if (avail <= 64) {
3000
1.08M
                if (!eof) {
3001
37.0k
                    size_t oldAvail = avail;
3002
37.0k
                    size_t off = in - chunk;
3003
3004
37.0k
                    input->cur = in;
3005
3006
37.0k
                    xmlParserGrow(ctxt);
3007
3008
37.0k
                    in = input->cur;
3009
37.0k
                    /* Re-derive chunk from the cursor after growing. */
                    chunk = in - off;
3010
37.0k
                    input->cur = chunk;
3011
37.0k
                    avail = input->end - in;
3012
3013
37.0k
                    /* No new bytes arrived: treat as end of input. */
                    if (oldAvail == avail)
3014
9.16k
                        eof = 1;
3015
37.0k
                }
3016
3017
1.08M
                if (avail == 0) {
3018
62.9k
                    /*
                     * Unfinished numeric reference at the end of a
                     * partial buffer: back up so it stays unconsumed.
                     */
                    if ((partial) && (ncr)) {
3019
485
                        in -= ncrSize;
3020
485
                        ncrSize = 0;
3021
485
                    }
3022
3023
62.9k
                    done = 1;
3024
62.9k
                    break;
3025
62.9k
                }
3026
1.08M
            }
3027
3028
            /* Accelerator */
3029
176M
            if (!ncr) {
3030
224M
                while (avail > 0) {
3031
224M
                    /*
                     * Bitmap indexed by byte value; a set bit stops the
                     * fast scan. Set for NUL, LF, CR, '&', '-', '<' and
                     * every byte >= 0x80.
                     */
                    static const unsigned mask[8] = {
3032
224M
                        0x00002401, 0x10002040,
3033
224M
                        0x00000000, 0x00000000,
3034
224M
                        0xFFFFFFFF, 0xFFFFFFFF,
3035
224M
                        0xFFFFFFFF, 0xFFFFFFFF
3036
224M
                    };
3037
224M
                    cur = *in;
3038
224M
                    if ((1u << (cur & 0x1F)) & mask[cur >> 5])
3039
175M
                        break;
3040
49.4M
                    col += 1;
3041
49.4M
                    in += 1;
3042
49.4M
                    avail -= 1;
3043
49.4M
                }
3044
3045
175M
                /* Go back to the top so the low-input check runs again. */
                if ((!eof) && (avail <= 64))
3046
4.86k
                    continue;
3047
175M
                if (avail == 0)
3048
9.87k
                    continue;
3049
175M
            }
3050
3051
176M
            cur = *in;
3052
176M
            size = 1;
3053
176M
            col += 1;
3054
3055
176M
            /*
             * Inside a numeric character reference: ncr holds the base
             * (10 or 16), cp the value so far and ncrSize the number
             * of bytes the reference spans.
             */
            if (ncr) {
3056
1.18M
                int lc = cur | 0x20;
3057
1.18M
                int digit;
3058
3059
1.18M
                if ((cur >= '0') && (cur <= '9')) {
3060
416k
                    digit = cur - '0';
3061
768k
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
3062
715k
                    digit = (lc - 'a') + 10;
3063
715k
                } else {
3064
52.6k
                    /* End of digits; a trailing ';' is part of the reference. */
                    if (cur == ';') {
3065
21.7k
                        in += 1;
3066
21.7k
                        size += 1;
3067
21.7k
                        ncrSize += 1;
3068
21.7k
                    }
3069
52.6k
                    goto next_chunk;
3070
52.6k
                }
3071
3072
1.13M
                cp = cp * ncr + digit;
3073
1.13M
                /* Saturate just past U+10FFFF so cp stays bounded. */
                if (cp >= 0x110000)
3074
1.04M
                    cp = 0x110000;
3075
3076
1.13M
                ncrSize += 1;
3077
3078
1.13M
                goto next_char;
3079
1.18M
            }
3080
3081
175M
            switch (cur) {
3082
622k
            case '<':
3083
622k
                /* In normal data mode a '<' ends the character data. */
                if (mode == 0) {
3084
288k
                    done = 1;
3085
288k
                    complete = 1;
3086
288k
                    goto next_chunk;
3087
288k
                }
3088
333k
                if (mode == DATA_PLAINTEXT)
3089
3.74k
                    break;
3090
3091
329k
                j = 1;
3092
329k
                if (j < avail) {
3093
328k
                    if ((mode == DATA_SCRIPT) && (in[j] == '!')) {
3094
                        /* Check for comment start */
3095
3096
48.8k
                        j += 1;
3097
48.8k
                        if ((j < avail) && (in[j] == '-')) {
3098
44.9k
                            j += 1;
3099
44.9k
                            if ((j < avail) && (in[j] == '-'))
3100
43.2k
                                mode = DATA_SCRIPT_ESC1;
3101
44.9k
                        }
3102
279k
                    } else {
3103
279k
                        int i = 0;
3104
279k
                        int solidus = 0;
3105
3106
                        /* Check for tag */
3107
3108
279k
                        if (in[j] == '/') {
3109
43.1k
                            j += 1;
3110
43.1k
                            solidus = 1;
3111
43.1k
                        }
3112
3113
279k
                        if ((solidus) || (mode == DATA_SCRIPT_ESC1)) {
3114
168k
                            /*
                             * Compare the input, with bit 0x20 set, to
                             * the current element name.
                             */
                            while ((j < avail) &&
3115
168k
                                   (ctxt->name[i] != 0) &&
3116
168k
                                   (ctxt->name[i] == (in[j] | 0x20))) {
3117
89.1k
                                i += 1;
3118
89.1k
                                j += 1;
3119
89.1k
                            }
3120
3121
79.7k
                            if ((ctxt->name[i] == 0) && (j < avail)) {
3122
14.7k
                                int c = in[j];
3123
3124
14.7k
                                if ((c == '>') || (c == '/') ||
3125
14.7k
                                    (IS_WS_HTML(c))) {
3126
9.08k
                                    if ((mode == DATA_SCRIPT_ESC1) &&
3127
9.08k
                                        (!solidus)) {
3128
1.46k
                                        mode = DATA_SCRIPT_ESC2;
3129
7.61k
                                    } else if (mode == DATA_SCRIPT_ESC2) {
3130
1.25k
                                        mode = DATA_SCRIPT_ESC1;
3131
6.35k
                                    } else {
3132
6.35k
                                        /* Matching end tag: the data ends here. */
                                        complete = 1;
3133
6.35k
                                        done = 1;
3134
6.35k
                                        goto next_chunk;
3135
6.35k
                                    }
3136
9.08k
                                }
3137
14.7k
                            }
3138
79.7k
                        }
3139
279k
                    }
3140
328k
                }
3141
3142
323k
                /*
                 * The lookahead reached the end of a partial buffer:
                 * stop here and leave the '<' unconsumed.
                 */
                if ((partial) && (j >= avail)) {
3143
3.21k
                    done = 1;
3144
3.21k
                    goto next_chunk;
3145
3.21k
                }
3146
3147
320k
                break;
3148
3149
395k
            case '-':
3150
395k
                if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2))
3151
244k
                    break;
3152
3153
                /* Check for comment end */
3154
3155
151k
                j = 1;
3156
151k
                if ((j < avail) && (in[j] == '-')) {
3157
137k
                    j += 1;
3158
137k
                    if ((j < avail) && (in[j] == '>'))
3159
42.4k
                        mode = DATA_SCRIPT;
3160
137k
                }
3161
3162
151k
                if ((partial) && (j >= avail)) {
3163
953
                    done = 1;
3164
953
                    goto next_chunk;
3165
953
                }
3166
3167
150k
                break;
3168
3169
807k
            case '&':
3170
807k
                /* References are only handled in normal and RCDATA modes. */
                if ((mode != 0) && (mode != DATA_RCDATA))
3171
68.3k
                    break;
3172
3173
739k
                j = 1;
3174
3175
739k
                if ((j < avail) && (in[j] == '#')) {
3176
83.3k
                    j += 1;
3177
83.3k
                    if (j < avail) {
3178
82.9k
                        if ((in[j] | 0x20) == 'x') {
3179
55.5k
                            j += 1;
3180
55.5k
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
3181
33.6k
                                /* "&#x" prefix: 3 bytes, hexadecimal digits follow. */
                                ncr = 16;
3182
33.6k
                                size = 3;
3183
33.6k
                                ncrSize = 3;
3184
33.6k
                                cp = 0;
3185
33.6k
                            }
3186
55.5k
                        } else if (IS_ASCII_DIGIT(in[j])) {
3187
19.8k
                            /* "&#" prefix: 2 bytes, decimal digits follow. */
                            ncr = 10;
3188
19.8k
                            size = 2;
3189
19.8k
                            ncrSize = 2;
3190
19.8k
                            cp = 0;
3191
19.8k
                        }
3192
82.9k
                    }
3193
655k
                } else {
3194
655k
                    if (partial) {
3195
289k
                        int terminated = 0;
3196
289k
                        size_t i;
3197
3198
                        /*
3199
                         * &CounterClockwiseContourIntegral; has 33 bytes.
3200
                         */
3201
673k
                        for (i = 1; i < avail; i++) {
3202
660k
                            if ((i >= 32) ||
3203
660k
                                (!IS_ASCII_LETTER(in[i]) &&
3204
660k
                                 ((i < 2) || !IS_ASCII_DIGIT(in[i])))) {
3205
277k
                                terminated = 1;
3206
277k
                                break;
3207
277k
                            }
3208
660k
                        }
3209
3210
289k
                        /*
                         * The name candidate runs to the end of the
                         * buffer: stop and leave the '&' unconsumed.
                         */
                        if (!terminated) {
3211
12.6k
                            done = 1;
3212
12.6k
                            goto next_chunk;
3213
12.6k
                        }
3214
289k
                    }
3215
3216
643k
                    repl = htmlFindEntityPrefix(in + j,
3217
643k
                                                avail - j,
3218
643k
                                                /* isAttr */ 0,
3219
643k
                                                &skip, &replSize);
3220
643k
                    if (repl != NULL) {
3221
34.0k
                        /* Also skip the '&' itself. */
                        skip += 1;
3222
34.0k
                        goto next_chunk;
3223
34.0k
                    }
3224
3225
609k
                    skip = 0;
3226
609k
                }
3227
3228
692k
                if ((partial) && (j >= avail)) {
3229
799
                    done = 1;
3230
799
                    goto next_chunk;
3231
799
                }
3232
3233
691k
                break;
3234
3235
40.4M
            case '\0':
3236
40.4M
                skip = 1;
3237
3238
40.4M
                if (mode == 0) {
3239
                    /*
3240
                     * The HTML5 spec says that the tokenizer should
3241
                     * pass on U+0000 unmodified in normal data mode.
3242
                     * These characters should then be ignored in body
3243
                     * and other text, but should be replaced with
3244
                     * U+FFFD in foreign content.
3245
                     *
3246
                     * At least for now, we always strip U+0000 when
3247
                     * tokenizing.
3248
                     */
3249
34.9M
                    repl = BAD_CAST "";
3250
34.9M
                    replSize = 0;
3251
34.9M
                } else {
3252
5.56M
                    /* U+FFFD encoded as UTF-8. */
                    repl = BAD_CAST "\xEF\xBF\xBD";
3253
5.56M
                    replSize = 3;
3254
5.56M
                }
3255
3256
40.4M
                goto next_chunk;
3257
3258
15.8M
            case '\n':
3259
15.8M
                line += 1;
3260
15.8M
                col = 1;
3261
15.8M
                break;
3262
3263
687k
            case '\r':
3264
687k
                /* Need the following byte to tell CR from CRLF. */
                if (partial && avail < 2) {
3265
282
                    done = 1;
3266
282
                    goto next_chunk;
3267
282
                }
3268
3269
687k
                /*
                 * The CR is always skipped; unless an LF follows, an LF
                 * is emitted in its place.
                 */
                skip = 1;
3270
687k
                if (in[1] != 0x0A) {
3271
680k
                    repl = BAD_CAST "\x0A";
3272
680k
                    replSize = 1;
3273
680k
                }
3274
687k
                goto next_chunk;
3275
3276
116M
            default:
3277
116M
                if (cur < 0x80)
3278
0
                    break;
3279
3280
116M
                /*
                 * First non-ASCII byte while no encoding is flagged on
                 * the input: pick one and rescan.
                 */
                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
3281
6.61k
                    xmlChar * guess;
3282
3283
6.61k
                    /* Flush the pending text before switching. */
                    if (in > chunk)
3284
1.77k
                        goto next_chunk;
3285
3286
4.84k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3287
4.84k
                    guess = NULL;
3288
#else
3289
                    guess = htmlFindEncoding(ctxt);
3290
#endif
3291
4.84k
                    if (guess == NULL) {
3292
4.84k
                        xmlSwitchEncoding(ctxt,
3293
4.84k
                                XML_CHAR_ENCODING_WINDOWS_1252);
3294
4.84k
                    } else {
3295
0
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
3296
0
                        xmlFree(guess);
3297
0
                    }
3298
4.84k
                    input->flags |= XML_INPUT_HAS_ENCODING;
3299
3300
4.84k
                    eof = PARSER_PROGRESSIVE(ctxt);
3301
4.84k
                    goto restart;
3302
6.61k
                }
3303
3304
116M
                size = htmlValidateUtf8(ctxt, in, avail, partial);
3305
3306
116M
                /* A size of 0 on partial input: stop and leave the bytes unconsumed. */
                if ((partial) && (size == 0)) {
3307
11.1k
                    done = 1;
3308
11.1k
                    goto next_chunk;
3309
11.1k
                }
3310
3311
116M
                /* Not accepted as UTF-8: replace one byte with U+FFFD. */
                if (size <= 0) {
3312
4.98M
                    skip = 1;
3313
4.98M
                    repl = BAD_CAST "\xEF\xBF\xBD";
3314
4.98M
                    replSize = 3;
3315
4.98M
                    goto next_chunk;
3316
4.98M
                }
3317
3318
111M
                break;
3319
175M
            }
3320
3321
129M
next_char:
3322
129M
            in += size;
3323
129M
            avail -= size;
3324
129M
        }
3325
3326
46.6M
next_chunk:
3327
46.6M
        /*
         * A numeric reference was scanned: exclude its bytes from the
         * literal chunk, skip them and emit the code point instead.
         */
        if (ncrSize > 0) {
3328
52.9k
            skip = ncrSize;
3329
52.9k
            in -= ncrSize;
3330
3331
52.9k
            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
3332
52.9k
        }
3333
3334
46.6M
        /* Report the literal text scanned so far. */
        if (in > chunk) {
3335
4.20M
            input->cur += in - chunk;
3336
4.20M
            htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);
3337
4.20M
        }
3338
3339
46.6M
        /* Consume the replaced input, then report the replacement. */
        input->cur += skip;
3340
46.6M
        if (replSize > 0)
3341
11.3M
            htmlCharDataSAXCallback(ctxt, repl, replSize, mode);
3342
3343
46.6M
        SHRINK;
3344
3345
46.6M
        if (done)
3346
386k
            break;
3347
3348
46.2M
restart:
3349
46.2M
        ;
3350
46.2M
    }
3351
3352
387k
    input->line = line;
3353
387k
    input->col = col;
3354
3355
387k
    /* Keep the sub-state for the next call unless the data ended. */
    if (complete)
3356
294k
        ctxt->endCheckState = 0;
3357
92.2k
    else
3358
92.2k
        ctxt->endCheckState = mode;
3359
3360
387k
    return(complete);
3361
387k
}
3362
3363
/**
3364
 * Parse an HTML comment
3365
 *
3366
 * @param ctxt  an HTML parser context
3367
 * @param bogus  true if this is a bogus comment
3368
 */
3369
static void
3370
84.5k
htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
3371
84.5k
    const xmlChar *comment = BAD_CAST "";
3372
84.5k
    xmlChar *buf = NULL;
3373
84.5k
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3374
27.7k
                    XML_MAX_HUGE_LENGTH :
3375
84.5k
                    XML_MAX_TEXT_LENGTH;
3376
3377
84.5k
    if (bogus) {
3378
60.7k
        buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength);
3379
60.7k
        if (CUR == '>')
3380
59.2k
            SKIP(1);
3381
60.7k
        comment = buf;
3382
60.7k
    } else {
3383
23.7k
        if ((!PARSER_PROGRESSIVE(ctxt)) &&
3384
23.7k
            (ctxt->input->end - ctxt->input->cur < 2))
3385
80
            xmlParserGrow(ctxt);
3386
3387
23.7k
        if (CUR == '>') {
3388
6.91k
            SKIP(1);
3389
16.8k
        } else if ((CUR == '-') && (NXT(1) == '>')) {
3390
1.09k
            SKIP(2);
3391
15.7k
        } else {
3392
15.7k
            buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength);
3393
15.7k
            comment = buf;
3394
15.7k
        }
3395
23.7k
    }
3396
3397
84.5k
    if (comment == NULL)
3398
185
        return;
3399
3400
84.3k
    if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3401
84.3k
        (!ctxt->disableSAX))
3402
80.8k
        ctxt->sax->comment(ctxt->userData, comment);
3403
3404
84.3k
    xmlFree(buf);
3405
84.3k
}
3406
3407
/**
3408
 * @deprecated Internal function, don't use.
3409
 *
3410
 * @param ctxt  an HTML parser context
3411
 * @returns 0
3412
 */
3413
int
3414
0
htmlParseCharRef(htmlParserCtxt *ctxt ATTRIBUTE_UNUSED) {
3415
0
    return(0);
3416
0
}
3417
3418
3419
/**
3420
 * Parse a DOCTYPE SYTSTEM or PUBLIC literal.
3421
 *
3422
 * @param ctxt  an HTML parser context
3423
 * @returns the literal or NULL in case of error.
3424
 */
3425
3426
static xmlChar *
3427
43.8k
htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
3428
43.8k
    xmlChar *ret;
3429
43.8k
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3430
8.46k
                    XML_MAX_TEXT_LENGTH :
3431
43.8k
                    XML_MAX_NAME_LENGTH;
3432
3433
43.8k
    if (CUR == '"') {
3434
1.64k
        SKIP(1);
3435
1.64k
        ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength);
3436
1.64k
        if (CUR == '"')
3437
1.26k
            SKIP(1);
3438
42.1k
    } else if (CUR == '\'') {
3439
4.30k
        SKIP(1);
3440
4.30k
        ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength);
3441
4.30k
        if (CUR == '\'')
3442
3.45k
            SKIP(1);
3443
37.8k
    } else {
3444
37.8k
        return(NULL);
3445
37.8k
    }
3446
3447
5.95k
    return(ret);
3448
43.8k
}
3449
3450
static void
3451
62.6k
htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) {
3452
62.6k
    const xmlChar *in;
3453
62.6k
    size_t avail;
3454
62.6k
    int eof = PARSER_PROGRESSIVE(ctxt);
3455
62.6k
    int line, col;
3456
3457
62.6k
    line = ctxt->input->line;
3458
62.6k
    col = ctxt->input->col;
3459
3460
62.6k
    in = ctxt->input->cur;
3461
62.6k
    avail = ctxt->input->end - in;
3462
3463
53.3M
    while (!PARSER_STOPPED(ctxt)) {
3464
53.3M
        int cur;
3465
3466
53.3M
        if ((!eof) && (avail <= 64)) {
3467
2.62k
            size_t oldAvail = avail;
3468
3469
2.62k
            ctxt->input->cur = in;
3470
3471
2.62k
            xmlParserGrow(ctxt);
3472
3473
2.62k
            in = ctxt->input->cur;
3474
2.62k
            avail = ctxt->input->end - in;
3475
3476
2.62k
            if (oldAvail == avail)
3477
990
                eof = 1;
3478
2.62k
        }
3479
3480
53.3M
        if (avail == 0)
3481
898
            break;
3482
3483
53.3M
        col += 1;
3484
3485
53.3M
        cur = *in;
3486
53.3M
        if (cur == '>') {
3487
61.6k
            in += 1;
3488
61.6k
            break;
3489
53.3M
        } else if (cur == 0x0A) {
3490
134k
            line += 1;
3491
134k
            col = 1;
3492
134k
        }
3493
3494
53.3M
        in += 1;
3495
53.3M
        avail -= 1;
3496
3497
53.3M
        SHRINK;
3498
53.3M
    }
3499
3500
62.6k
    ctxt->input->cur = in;
3501
62.6k
    ctxt->input->line = line;
3502
62.6k
    ctxt->input->col = col;
3503
62.6k
}
3504
3505
/**
3506
 * Parse a DOCTYPE declaration.
3507
 *
3508
 * @param ctxt  an HTML parser context
3509
 */
3510
3511
static void
3512
62.6k
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3513
62.6k
    xmlChar *name = NULL;
3514
62.6k
    xmlChar *publicId = NULL;
3515
62.6k
    xmlChar *URI = NULL;
3516
62.6k
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3517
13.6k
                    XML_MAX_TEXT_LENGTH :
3518
62.6k
                    XML_MAX_NAME_LENGTH;
3519
3520
    /*
3521
     * We know that '<!DOCTYPE' has been detected.
3522
     */
3523
62.6k
    SKIP(9);
3524
3525
62.6k
    SKIP_BLANKS;
3526
3527
62.6k
    if ((ctxt->input->cur < ctxt->input->end) && (CUR != '>')) {
3528
54.1k
        name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength);
3529
3530
54.1k
        if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) {
3531
12.2k
            xmlChar *cur;
3532
3533
148k
            for (cur = name; *cur; cur++) {
3534
136k
                if (IS_UPPER(*cur))
3535
5.38k
                    *cur += 0x20;
3536
136k
            }
3537
12.2k
        }
3538
3539
54.1k
        SKIP_BLANKS;
3540
54.1k
    }
3541
3542
    /*
3543
     * Check for SystemID and publicId
3544
     */
3545
62.6k
    if ((UPPER == 'P') && (UPP(1) == 'U') &&
3546
62.6k
  (UPP(2) == 'B') && (UPP(3) == 'L') &&
3547
62.6k
  (UPP(4) == 'I') && (UPP(5) == 'C')) {
3548
39.4k
        SKIP(6);
3549
39.4k
        SKIP_BLANKS;
3550
39.4k
  publicId = htmlParseDoctypeLiteral(ctxt);
3551
39.4k
  if (publicId == NULL)
3552
36.8k
            goto bogus;
3553
2.62k
        SKIP_BLANKS;
3554
2.62k
  URI = htmlParseDoctypeLiteral(ctxt);
3555
23.1k
    } else if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3556
23.1k
               (UPP(2) == 'S') && (UPP(3) == 'T') &&
3557
23.1k
         (UPP(4) == 'E') && (UPP(5) == 'M')) {
3558
1.69k
        SKIP(6);
3559
1.69k
        SKIP_BLANKS;
3560
1.69k
  URI = htmlParseDoctypeLiteral(ctxt);
3561
1.69k
    }
3562
3563
62.6k
bogus:
3564
62.6k
    htmlSkipBogusDoctype(ctxt);
3565
3566
    /*
3567
     * Create or update the document accordingly to the DOCTYPE
3568
     */
3569
62.6k
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3570
62.6k
  (!ctxt->disableSAX))
3571
62.3k
  ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI);
3572
3573
62.6k
    xmlFree(name);
3574
62.6k
    xmlFree(URI);
3575
62.6k
    xmlFree(publicId);
3576
62.6k
}
3577
3578
/**
3579
 * parse an attribute
3580
 *
3581
 * [41] Attribute ::= Name Eq AttValue
3582
 *
3583
 * [25] Eq ::= S? '=' S?
3584
 *
3585
 * With namespace:
3586
 *
3587
 * [NS 11] Attribute ::= QName Eq AttValue
3588
 *
3589
 * Also the case QName == xmlns:??? is handled independently as a namespace
3590
 * definition.
3591
 *
3592
 * @param ctxt  an HTML parser context
3593
 * @param value  a xmlChar ** used to store the value of the attribute
3594
 * @returns the attribute name, and the value in *value.
3595
 */
3596
3597
static xmlHashedString
3598
4.30M
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3599
4.30M
    xmlHashedString hname;
3600
4.30M
    xmlChar *val = NULL;
3601
3602
4.30M
    *value = NULL;
3603
4.30M
    hname = htmlParseHTMLName(ctxt, 1);
3604
4.30M
    if (hname.name == NULL)
3605
21
        return(hname);
3606
3607
    /*
3608
     * read the value
3609
     */
3610
4.30M
    SKIP_BLANKS;
3611
4.30M
    if (CUR == '=') {
3612
191k
        SKIP(1);
3613
191k
  SKIP_BLANKS;
3614
191k
  val = htmlParseAttValue(ctxt);
3615
191k
    }
3616
3617
4.30M
    *value = val;
3618
4.30M
    return(hname);
3619
4.30M
}
3620
3621
static int
3622
htmlCharEncCheckAsciiCompatible(htmlParserCtxt *ctxt,
3623
6.44k
                                const xmlChar *encoding) {
3624
6.44k
    xmlCharEncodingHandler *handler;
3625
6.44k
    xmlChar in[9] = "<a A=\"/>";
3626
6.44k
    xmlChar out[9];
3627
6.44k
    int inlen, outlen;
3628
6.44k
    int res;
3629
3630
6.44k
    res = xmlCreateCharEncodingHandler(
3631
6.44k
            (const char *) encoding,
3632
6.44k
            XML_ENC_INPUT | XML_ENC_HTML,
3633
6.44k
            ctxt->convImpl, ctxt->convCtxt,
3634
6.44k
            &handler);
3635
6.44k
    if (res != XML_ERR_OK) {
3636
3.29k
        xmlFatalErr(ctxt, res, (const char *) encoding);
3637
3.29k
        return(-1);
3638
3.29k
    }
3639
3640
    /* UTF-8 */
3641
3.15k
    if (handler == NULL)
3642
260
        return(0);
3643
3644
2.89k
    inlen = 8;
3645
2.89k
    outlen = 8;
3646
2.89k
    res = xmlEncInputChunk(handler, out, &outlen, in, &inlen, /* flush */ 1);
3647
3648
2.89k
    xmlCharEncCloseFunc(handler);
3649
3650
2.89k
    if ((res != XML_ENC_ERR_SUCCESS) ||
3651
2.89k
        (inlen != 8) || (outlen != 8) ||
3652
2.89k
        (memcmp(in, out, 8) != 0)) {
3653
970
        htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3654
970
                     "Encoding %s isn't ASCII-compatible", encoding, NULL);
3655
970
        return(-1);
3656
970
    }
3657
3658
1.92k
    return(0);
3659
2.89k
}
3660
3661
/**
3662
 * Handle charset encoding in meta tag.
3663
 *
3664
 * @param ctxt  an HTML parser context
3665
 * @param atts  the attributes values
3666
 */
3667
static void
3668
9.86k
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3669
9.86k
    int i;
3670
9.86k
    const xmlChar *att, *value;
3671
9.86k
    int isContentType = 0;
3672
9.86k
    const xmlChar *content = NULL;
3673
9.86k
    xmlChar *encoding = NULL;
3674
3675
9.86k
    if ((ctxt == NULL) || (atts == NULL))
3676
0
  return;
3677
3678
9.86k
    i = 0;
3679
9.86k
    att = atts[i++];
3680
25.7k
    while (att != NULL) {
3681
20.6k
  value = atts[i++];
3682
20.6k
        if (value != NULL) {
3683
12.8k
            if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3684
12.8k
                (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3685
3.54k
                isContentType = 1;
3686
9.33k
            } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3687
4.75k
                encoding = xmlStrdup(value);
3688
4.75k
                if (encoding == NULL)
3689
28
                    htmlErrMemory(ctxt);
3690
4.75k
                break;
3691
4.75k
            } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3692
3.55k
                content = value;
3693
3.55k
            }
3694
12.8k
        }
3695
15.8k
  att = atts[i++];
3696
15.8k
    }
3697
3698
9.86k
    if ((encoding == NULL) && (isContentType) && (content != NULL)) {
3699
3.21k
        htmlMetaEncodingOffsets off;
3700
3701
3.21k
        if (htmlParseContentType(content, &off)) {
3702
1.74k
            encoding = xmlStrndup(content + off.start, off.end - off.start);
3703
1.74k
            if (encoding == NULL)
3704
23
                htmlErrMemory(ctxt);
3705
1.74k
        }
3706
3.21k
    }
3707
3708
9.86k
    if (encoding != NULL) {
3709
6.44k
        if (htmlCharEncCheckAsciiCompatible(ctxt, encoding) < 0) {
3710
4.26k
            xmlFree(encoding);
3711
4.26k
            return;
3712
4.26k
        }
3713
3714
2.18k
        xmlSetDeclaredEncoding(ctxt, encoding);
3715
2.18k
    }
3716
9.86k
}
3717
3718
/**
3719
 * Inserts a new attribute into the hash table.
3720
 *
3721
 * @param ctxt  parser context
3722
 * @param size  size of the hash table
3723
 * @param name  attribute name
3724
 * @param hashValue  hash value of name
3725
 * @param aindex  attribute index (this is a multiple of 5)
3726
 * @returns INT_MAX if no existing attribute was found, the attribute
3727
 * index if an attribute was found, -1 if a memory allocation failed.
3728
 */
3729
static int
3730
htmlAttrHashInsert(xmlParserCtxtPtr ctxt, unsigned size, const xmlChar *name,
3731
3.34M
                   unsigned hashValue, int aindex) {
3732
3.34M
    xmlAttrHashBucket *table = ctxt->attrHash;
3733
3.34M
    xmlAttrHashBucket *bucket;
3734
3.34M
    unsigned hindex;
3735
3736
3.34M
    hindex = hashValue & (size - 1);
3737
3.34M
    bucket = &table[hindex];
3738
3739
3.42M
    while (bucket->index >= 0) {
3740
2.92M
        const xmlChar **atts = &ctxt->atts[bucket->index];
3741
3742
2.92M
        if (name == atts[0])
3743
2.84M
            return(bucket->index);
3744
3745
77.0k
        hindex++;
3746
77.0k
        bucket++;
3747
77.0k
        if (hindex >= size) {
3748
4.53k
            hindex = 0;
3749
4.53k
            bucket = table;
3750
4.53k
        }
3751
77.0k
    }
3752
3753
499k
    bucket->index = aindex;
3754
3755
499k
    return(INT_MAX);
3756
3.34M
}
3757
3758
/**
3759
 * parse a start of tag either for rule element or
3760
 * EmptyElement. In both case we don't parse the tag closing chars.
3761
 *
3762
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3763
 *
3764
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3765
 *
3766
 * With namespace:
3767
 *
3768
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3769
 *
3770
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3771
 *
3772
 * @param ctxt  an HTML parser context
3773
 * @returns 0 in case of success, -1 in case of error and 1 if discarded
3774
 */
3775
3776
static void
3777
485k
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3778
485k
    const xmlChar *name;
3779
485k
    const xmlChar *attname;
3780
485k
    xmlChar *attvalue;
3781
485k
    const xmlChar **atts;
3782
485k
    int nbatts = 0;
3783
485k
    int maxatts;
3784
485k
    int i;
3785
485k
    int discardtag = 0;
3786
3787
485k
    ctxt->endCheckState = 0;
3788
3789
485k
    SKIP(1);
3790
3791
485k
    atts = ctxt->atts;
3792
485k
    maxatts = ctxt->maxatts;
3793
3794
485k
    GROW;
3795
485k
    name = htmlParseHTMLName(ctxt, 0).name;
3796
485k
    if (name == NULL)
3797
90
        return;
3798
3799
485k
    if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
3800
        /*
3801
         * Check for auto-closure of HTML elements.
3802
         */
3803
413k
        htmlAutoClose(ctxt, name);
3804
3805
        /*
3806
         * Check for implied HTML elements.
3807
         */
3808
413k
        htmlCheckImplied(ctxt, name);
3809
3810
        /*
3811
         * Avoid html at any level > 0, head at any level != 1
3812
         * or any attempt to recurse body
3813
         */
3814
413k
        if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3815
41.1k
            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3816
41.1k
                         "htmlParseStartTag: misplaced <html> tag\n",
3817
41.1k
                         name, NULL);
3818
41.1k
            discardtag = 1;
3819
41.1k
            ctxt->depth++;
3820
41.1k
        }
3821
413k
        if ((ctxt->nameNr != 1) &&
3822
413k
            (xmlStrEqual(name, BAD_CAST"head"))) {
3823
4.91k
            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3824
4.91k
                         "htmlParseStartTag: misplaced <head> tag\n",
3825
4.91k
                         name, NULL);
3826
4.91k
            discardtag = 1;
3827
4.91k
            ctxt->depth++;
3828
4.91k
        }
3829
413k
        if (xmlStrEqual(name, BAD_CAST"body")) {
3830
4.11k
            int indx;
3831
952k
            for (indx = 0;indx < ctxt->nameNr;indx++) {
3832
948k
                if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3833
2.46k
                    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3834
2.46k
                                 "htmlParseStartTag: misplaced <body> tag\n",
3835
2.46k
                                 name, NULL);
3836
2.46k
                    discardtag = 1;
3837
2.46k
                    ctxt->depth++;
3838
2.46k
                }
3839
948k
            }
3840
4.11k
        }
3841
413k
    }
3842
3843
    /*
3844
     * Now parse the attributes, it ends up with the ending
3845
     *
3846
     * (S Attribute)* S?
3847
     */
3848
485k
    SKIP_BLANKS;
3849
4.83M
    while ((ctxt->input->cur < ctxt->input->end) &&
3850
4.83M
           (CUR != '>') &&
3851
4.83M
     ((CUR != '/') || (NXT(1) != '>')) &&
3852
4.83M
           (PARSER_STOPPED(ctxt) == 0)) {
3853
4.34M
        xmlHashedString hattname;
3854
3855
        /*  unexpected-solidus-in-tag */
3856
4.34M
        if (CUR == '/') {
3857
84.0k
            SKIP(1);
3858
84.0k
            SKIP_BLANKS;
3859
84.0k
            continue;
3860
84.0k
        }
3861
4.26M
  GROW;
3862
4.26M
  hattname = htmlParseAttribute(ctxt, &attvalue);
3863
4.26M
        attname = hattname.name;
3864
3865
4.26M
        if (attname != NULL) {
3866
      /*
3867
       * Add the pair to atts
3868
       */
3869
4.26M
      if (nbatts + 4 > maxatts) {
3870
14.2k
          const xmlChar **tmp;
3871
14.2k
                unsigned *utmp;
3872
14.2k
                int newSize;
3873
3874
14.2k
                newSize = xmlGrowCapacity(maxatts,
3875
14.2k
                                          sizeof(tmp[0]) * 2 + sizeof(utmp[0]),
3876
14.2k
                                          11, HTML_MAX_ATTRS);
3877
14.2k
    if (newSize < 0) {
3878
0
        htmlErrMemory(ctxt);
3879
0
        goto failed;
3880
0
    }
3881
14.2k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3882
14.2k
                if (newSize < 2)
3883
7.71k
                    newSize = 2;
3884
14.2k
#endif
3885
14.2k
          tmp = xmlRealloc(atts, newSize * sizeof(tmp[0]) * 2);
3886
14.2k
    if (tmp == NULL) {
3887
44
        htmlErrMemory(ctxt);
3888
44
        goto failed;
3889
44
    }
3890
14.1k
                atts = tmp;
3891
14.1k
    ctxt->atts = tmp;
3892
3893
14.1k
          utmp = xmlRealloc(ctxt->attallocs, newSize * sizeof(utmp[0]));
3894
14.1k
    if (utmp == NULL) {
3895
61
        htmlErrMemory(ctxt);
3896
61
        goto failed;
3897
61
    }
3898
14.1k
                ctxt->attallocs = utmp;
3899
3900
14.1k
                maxatts = newSize * 2;
3901
14.1k
    ctxt->maxatts = maxatts;
3902
14.1k
      }
3903
3904
4.26M
            ctxt->attallocs[nbatts/2] = hattname.hashValue;
3905
4.26M
      atts[nbatts++] = attname;
3906
4.26M
      atts[nbatts++] = attvalue;
3907
3908
4.26M
            attvalue = NULL;
3909
4.26M
  }
3910
3911
4.26M
failed:
3912
4.26M
        if (attvalue != NULL)
3913
32
            xmlFree(attvalue);
3914
3915
4.26M
  SKIP_BLANKS;
3916
4.26M
    }
3917
3918
485k
    if (ctxt->input->cur >= ctxt->input->end) {
3919
3.54k
        discardtag = 1;
3920
3.54k
        goto done;
3921
3.54k
    }
3922
3923
    /*
3924
     * Verify that attribute names are unique.
3925
     */
3926
482k
    if (nbatts > 2) {
3927
37.2k
        unsigned attrHashSize;
3928
37.2k
        int j, k;
3929
3930
37.2k
        attrHashSize = 4;
3931
69.1k
        while (attrHashSize / 2 < (unsigned) nbatts / 2)
3932
31.9k
            attrHashSize *= 2;
3933
3934
37.2k
        if (attrHashSize > ctxt->attrHashMax) {
3935
4.29k
            xmlAttrHashBucket *tmp;
3936
3937
4.29k
            tmp = xmlRealloc(ctxt->attrHash, attrHashSize * sizeof(tmp[0]));
3938
4.29k
            if (tmp == NULL) {
3939
54
                htmlErrMemory(ctxt);
3940
54
                goto done;
3941
54
            }
3942
3943
4.24k
            ctxt->attrHash = tmp;
3944
4.24k
            ctxt->attrHashMax = attrHashSize;
3945
4.24k
        }
3946
3947
37.1k
        memset(ctxt->attrHash, -1, attrHashSize * sizeof(ctxt->attrHash[0]));
3948
3949
3.38M
        for (i = 0, j = 0, k = 0; i < nbatts; i += 2, k++) {
3950
3.34M
            unsigned hashValue;
3951
3.34M
            int res;
3952
3953
3.34M
            attname = atts[i];
3954
3.34M
            hashValue = ctxt->attallocs[k] | 0x80000000;
3955
3956
3.34M
            res = htmlAttrHashInsert(ctxt, attrHashSize, attname,
3957
3.34M
                                    hashValue, j);
3958
3.34M
            if (res < 0)
3959
0
                continue;
3960
3961
3.34M
            if (res == INT_MAX) {
3962
499k
                atts[j] = atts[i];
3963
499k
                atts[j+1] = atts[i+1];
3964
499k
                j += 2;
3965
2.84M
            } else {
3966
2.84M
                xmlFree((xmlChar *) atts[i+1]);
3967
2.84M
            }
3968
3.34M
        }
3969
3970
37.1k
        nbatts = j;
3971
37.1k
    }
3972
3973
482k
    if (nbatts > 0) {
3974
188k
        atts[nbatts] = NULL;
3975
188k
        atts[nbatts + 1] = NULL;
3976
3977
    /*
3978
     * Apple's new libiconv is so broken that you routinely run into
3979
     * issues when fuzz testing (by accident with an uninstrumented
3980
     * libiconv). Here's a harmless (?) example:
3981
     *
3982
     * printf '>'             | iconv -f shift_jis -t utf-8 | hexdump -C
3983
     * printf '\xfc\x00\x00'  | iconv -f shift_jis -t utf-8 | hexdump -C
3984
     * printf '>\xfc\x00\x00' | iconv -f shift_jis -t utf-8 | hexdump -C
3985
     *
3986
     * The last command fails to detect the illegal sequence.
3987
     */
3988
188k
#if !defined(__APPLE__) || \
3989
188k
    !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
3990
        /*
3991
         * Handle specific association to the META tag
3992
         */
3993
188k
        if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
3994
188k
            (strcmp((char *) name, "meta") == 0)) {
3995
9.86k
            htmlCheckMeta(ctxt, atts);
3996
9.86k
        }
3997
188k
#endif
3998
188k
    }
3999
4000
    /*
4001
     * SAX: Start of Element !
4002
     */
4003
482k
    if (!discardtag) {
4004
433k
        if (ctxt->options & HTML_PARSE_HTML5) {
4005
70.9k
            if (ctxt->nameNr > 0)
4006
58.7k
                htmlnamePop(ctxt);
4007
70.9k
        }
4008
4009
433k
  htmlnamePush(ctxt, name);
4010
433k
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4011
433k
      if (nbatts != 0)
4012
188k
    ctxt->sax->startElement(ctxt->userData, name, atts);
4013
245k
      else
4014
245k
    ctxt->sax->startElement(ctxt->userData, name, NULL);
4015
433k
  }
4016
433k
    }
4017
4018
485k
done:
4019
485k
    if (atts != NULL) {
4020
1.79M
        for (i = 1;i < nbatts;i += 2) {
4021
1.41M
      if (atts[i] != NULL)
4022
181k
    xmlFree((xmlChar *) atts[i]);
4023
1.41M
  }
4024
374k
    }
4025
485k
}
4026
4027
/**
4028
 * parse an end of tag
4029
 *
4030
 * [42] ETag ::= '</' Name S? '>'
4031
 *
4032
 * With namespace
4033
 *
4034
 * [NS 9] ETag ::= '</' QName S? '>'
4035
 *
4036
 * @param ctxt  an HTML parser context
4037
 * @returns 1 if the current level should be closed.
4038
 */
4039
4040
static void
4041
htmlParseEndTag(htmlParserCtxtPtr ctxt)
4042
80.3k
{
4043
80.3k
    const xmlChar *name;
4044
80.3k
    const xmlChar *oldname;
4045
80.3k
    int i;
4046
4047
80.3k
    ctxt->endCheckState = 0;
4048
4049
80.3k
    SKIP(2);
4050
4051
80.3k
    if (ctxt->input->cur >= ctxt->input->end) {
4052
24
        htmlStartCharData(ctxt);
4053
24
        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4054
24
            (ctxt->sax->characters != NULL))
4055
17
            ctxt->sax->characters(ctxt->userData,
4056
17
                                  BAD_CAST "</", 2);
4057
24
        return;
4058
24
    }
4059
4060
80.3k
    if (CUR == '>') {
4061
3.93k
        SKIP(1);
4062
3.93k
        return;
4063
3.93k
    }
4064
4065
76.3k
    if (!IS_ASCII_LETTER(CUR)) {
4066
3.80k
        htmlParseComment(ctxt, /* bogus */ 1);
4067
3.80k
        return;
4068
3.80k
    }
4069
4070
72.5k
    name = htmlParseHTMLName(ctxt, 0).name;
4071
72.5k
    if (name == NULL)
4072
18
        return;
4073
4074
    /*
4075
     * Parse and ignore attributes.
4076
     */
4077
72.5k
    SKIP_BLANKS;
4078
117k
    while ((ctxt->input->cur < ctxt->input->end) &&
4079
117k
           (CUR != '>') &&
4080
117k
     ((CUR != '/') || (NXT(1) != '>')) &&
4081
117k
           (ctxt->instate != XML_PARSER_EOF)) {
4082
44.8k
        xmlChar *attvalue = NULL;
4083
4084
        /*  unexpected-solidus-in-tag */
4085
44.8k
        if (CUR == '/') {
4086
6.41k
            SKIP(1);
4087
6.41k
            SKIP_BLANKS;
4088
6.41k
            continue;
4089
6.41k
        }
4090
38.4k
  GROW;
4091
38.4k
  htmlParseAttribute(ctxt, &attvalue);
4092
38.4k
        if (attvalue != NULL)
4093
1.67k
            xmlFree(attvalue);
4094
4095
38.4k
  SKIP_BLANKS;
4096
38.4k
    }
4097
4098
72.5k
    if (CUR == '>') {
4099
71.4k
        SKIP(1);
4100
71.4k
    } else if ((CUR == '/') && (NXT(1) == '>')) {
4101
808
        SKIP(2);
4102
808
    } else {
4103
303
        return;
4104
303
    }
4105
4106
72.2k
    if (ctxt->options & HTML_PARSE_HTML5) {
4107
15.9k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4108
15.9k
            ctxt->sax->endElement(ctxt->userData, name);
4109
15.9k
        return;
4110
15.9k
    }
4111
4112
    /*
4113
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
4114
     * out now.
4115
     */
4116
56.2k
    if ((ctxt->depth > 0) &&
4117
56.2k
        (xmlStrEqual(name, BAD_CAST "html") ||
4118
21.8k
         xmlStrEqual(name, BAD_CAST "body") ||
4119
21.8k
   xmlStrEqual(name, BAD_CAST "head"))) {
4120
8.98k
  ctxt->depth--;
4121
8.98k
  return;
4122
8.98k
    }
4123
4124
    /*
4125
     * If the name read is not one of the element in the parsing stack
4126
     * then return, it's just an error.
4127
     */
4128
3.24M
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4129
3.21M
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4130
24.7k
            break;
4131
3.21M
    }
4132
47.2k
    if (i < 0) {
4133
22.4k
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4134
22.4k
               "Unexpected end tag : %s\n", name, NULL);
4135
22.4k
        return;
4136
22.4k
    }
4137
4138
4139
    /*
4140
     * Check for auto-closure of HTML elements.
4141
     */
4142
4143
24.7k
    htmlAutoCloseOnClose(ctxt, name);
4144
4145
    /*
4146
     * Well formedness constraints, opening and closing must match.
4147
     * With the exception that the autoclose may have popped stuff out
4148
     * of the stack.
4149
     */
4150
24.7k
    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4151
1.35k
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4152
1.35k
                     "Opening and ending tag mismatch: %s and %s\n",
4153
1.35k
                     name, ctxt->name);
4154
1.35k
    }
4155
4156
    /*
4157
     * SAX: End of Tag
4158
     */
4159
24.7k
    oldname = ctxt->name;
4160
24.7k
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4161
23.4k
  htmlParserFinishElementParsing(ctxt);
4162
23.4k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4163
23.4k
            ctxt->sax->endElement(ctxt->userData, name);
4164
23.4k
        htmlnamePop(ctxt);
4165
23.4k
    }
4166
24.7k
}
4167
4168
/**
4169
 * Parse a content: comment, sub-element, reference or text.
4170
 * New version for non recursive htmlParseElementInternal
4171
 *
4172
 * @param ctxt  an HTML parser context
4173
 */
4174
4175
static void
4176
9.70k
htmlParseContent(htmlParserCtxtPtr ctxt) {
4177
9.70k
    GROW;
4178
4179
1.24M
    while ((PARSER_STOPPED(ctxt) == 0) &&
4180
1.24M
           (ctxt->input->cur < ctxt->input->end)) {
4181
1.23M
        int mode;
4182
4183
1.23M
        mode = ctxt->endCheckState;
4184
4185
1.23M
        if ((mode == 0) && (CUR == '<')) {
4186
1.08M
            if (NXT(1) == '/') {
4187
40.4k
          htmlParseEndTag(ctxt);
4188
1.04M
            } else if (NXT(1) == '!') {
4189
                /*
4190
                 * Sometimes DOCTYPE arrives in the middle of the document
4191
                 */
4192
51.7k
                if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4193
51.7k
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4194
51.7k
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4195
51.7k
                    (UPP(8) == 'E')) {
4196
30.0k
                    htmlParseDocTypeDecl(ctxt);
4197
30.0k
                } else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4198
10.0k
                    SKIP(4);
4199
10.0k
                    htmlParseComment(ctxt, /* bogus */ 0);
4200
11.5k
                } else {
4201
11.5k
                    SKIP(2);
4202
11.5k
                    htmlParseComment(ctxt, /* bogus */ 1);
4203
11.5k
                }
4204
988k
            } else if (NXT(1) == '?') {
4205
13.3k
                SKIP(1);
4206
13.3k
                htmlParseComment(ctxt, /* bogus */ 1);
4207
975k
            } else if (IS_ASCII_LETTER(NXT(1))) {
4208
243k
                htmlParseElementInternal(ctxt);
4209
731k
            } else {
4210
731k
                htmlStartCharData(ctxt);
4211
731k
                if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4212
731k
                    (ctxt->sax->characters != NULL))
4213
730k
                    ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4214
731k
                SKIP(1);
4215
731k
            }
4216
1.08M
        } else {
4217
151k
            htmlParseCharData(ctxt, /* partial */ 0);
4218
151k
        }
4219
4220
1.23M
        SHRINK;
4221
1.23M
        GROW;
4222
1.23M
    }
4223
4224
9.70k
    if (ctxt->input->cur >= ctxt->input->end)
4225
8.67k
        htmlAutoCloseOnEnd(ctxt);
4226
9.70k
}
4227
4228
/**
4229
 * Parse an HTML element, new version, non recursive
4230
 *
4231
 * @param ctxt  an HTML parser context
4232
 */
4233
static int
4234
485k
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4235
485k
    const xmlChar *name;
4236
485k
    const htmlElemDesc * info;
4237
485k
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4238
4239
485k
    if ((ctxt == NULL) || (ctxt->input == NULL))
4240
0
  return(0);
4241
4242
    /* Capture start position */
4243
485k
    if (ctxt->record_info) {
4244
0
        node_info.begin_pos = ctxt->input->consumed +
4245
0
                          (CUR_PTR - ctxt->input->base);
4246
0
  node_info.begin_line = ctxt->input->line;
4247
0
    }
4248
4249
485k
    htmlParseStartTag(ctxt);
4250
485k
    name = ctxt->name;
4251
485k
    if (name == NULL)
4252
2.04k
        return(0);
4253
4254
483k
    if (ctxt->record_info)
4255
0
        htmlNodeInfoPush(ctxt, &node_info);
4256
4257
    /*
4258
     * Check for an Empty Element labeled the XML/SGML way
4259
     */
4260
483k
    if ((CUR == '/') && (NXT(1) == '>')) {
4261
3.75k
        SKIP(2);
4262
3.75k
        htmlParserFinishElementParsing(ctxt);
4263
3.75k
        if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4264
2.81k
            if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4265
2.81k
                ctxt->sax->endElement(ctxt->userData, name);
4266
2.81k
        }
4267
3.75k
  htmlnamePop(ctxt);
4268
3.75k
  return(0);
4269
3.75k
    }
4270
4271
479k
    if (CUR != '>')
4272
2.17k
        return(0);
4273
477k
    SKIP(1);
4274
4275
    /*
4276
     * Lookup the info for that element.
4277
     */
4278
477k
    info = htmlTagLookup(name);
4279
4280
    /*
4281
     * Check for an Empty Element from DTD definition
4282
     */
4283
477k
    if ((info != NULL) && (info->empty)) {
4284
23.6k
        htmlParserFinishElementParsing(ctxt);
4285
23.6k
        if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4286
15.2k
            if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4287
15.2k
                ctxt->sax->endElement(ctxt->userData, name);
4288
15.2k
        }
4289
23.6k
  htmlnamePop(ctxt);
4290
23.6k
  return(0);
4291
23.6k
    }
4292
4293
454k
    if (info != NULL)
4294
133k
        ctxt->endCheckState = info->dataMode;
4295
4296
454k
    return(1);
4297
477k
}
4298
4299
/**
4300
 * This is kept for compatibility with previous code versions
4301
 *
4302
 * @deprecated Internal function, don't use.
4303
 *
4304
 * @param ctxt  an HTML parser context
4305
 */
4306
void
4307
0
htmlParseElement(htmlParserCtxt *ctxt) {
4308
0
    const xmlChar *oldptr;
4309
0
    int depth;
4310
4311
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
4312
0
  return;
4313
4314
0
    if (htmlParseElementInternal(ctxt) == 0)
4315
0
        return;
4316
4317
    /*
4318
     * Parse the content of the element:
4319
     */
4320
0
    depth = ctxt->nameNr;
4321
0
    while (CUR != 0) {
4322
0
  oldptr = ctxt->input->cur;
4323
0
  htmlParseContent(ctxt);
4324
0
  if (oldptr==ctxt->input->cur) break;
4325
0
  if (ctxt->nameNr < depth) break;
4326
0
    }
4327
4328
0
    if (CUR == 0) {
4329
0
  htmlAutoCloseOnEnd(ctxt);
4330
0
    }
4331
0
}
4332
4333
/**
4334
 * @param ctxt  parser context
4335
 * @param input  parser input
4336
 * @returns a node list.
4337
 */
4338
xmlNode *
4339
0
htmlCtxtParseContentInternal(htmlParserCtxt *ctxt, xmlParserInput *input) {
4340
0
    xmlNodePtr root;
4341
0
    xmlNodePtr list = NULL;
4342
0
    xmlChar *rootName = BAD_CAST "#root";
4343
4344
0
    root = xmlNewDocNode(ctxt->myDoc, NULL, rootName, NULL);
4345
0
    if (root == NULL) {
4346
0
        htmlErrMemory(ctxt);
4347
0
        return(NULL);
4348
0
    }
4349
4350
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
4351
0
        xmlFreeNode(root);
4352
0
        return(NULL);
4353
0
    }
4354
4355
0
    htmlnamePush(ctxt, rootName);
4356
0
    nodePush(ctxt, root);
4357
4358
0
    htmlParseContent(ctxt);
4359
4360
    /*
4361
     * Only check for truncated multi-byte sequences
4362
     */
4363
0
    xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
4364
4365
    /* TODO: Use xmlCtxtIsCatastrophicError */
4366
0
    if (ctxt->errNo != XML_ERR_NO_MEMORY) {
4367
0
        xmlNodePtr cur;
4368
4369
        /*
4370
         * Unlink newly created node list.
4371
         */
4372
0
        list = root->children;
4373
0
        root->children = NULL;
4374
0
        root->last = NULL;
4375
0
        for (cur = list; cur != NULL; cur = cur->next)
4376
0
            cur->parent = NULL;
4377
0
    }
4378
4379
0
    nodePop(ctxt);
4380
0
    htmlnamePop(ctxt);
4381
4382
0
    xmlCtxtPopInput(ctxt);
4383
4384
0
    xmlFreeNode(root);
4385
0
    return(list);
4386
0
}
4387
4388
/**
4389
 * Parse an HTML document and invoke the SAX handlers. This is useful
4390
 * if you're only interested in custom SAX callbacks. If you want a
4391
 * document tree, use #htmlCtxtParseDocument.
4392
 *
4393
 * @param ctxt  an HTML parser context
4394
 * @returns 0, -1 in case of error.
4395
 */
4396
int
4397
9.70k
htmlParseDocument(htmlParserCtxt *ctxt) {
4398
9.70k
    if ((ctxt == NULL) || (ctxt->input == NULL))
4399
0
  return(-1);
4400
4401
9.70k
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4402
9.70k
        ctxt->sax->setDocumentLocator(ctxt->userData,
4403
9.70k
                (xmlSAXLocator *) &xmlDefaultSAXLocator);
4404
9.70k
    }
4405
4406
9.70k
    xmlDetectEncoding(ctxt);
4407
4408
    /*
4409
     * TODO: Implement HTML5 prescan algorithm
4410
     */
4411
4412
    /*
4413
     * This is wrong but matches long-standing behavior. In most
4414
     * cases, a document starting with an XML declaration will
4415
     * specify UTF-8. The HTML5 prescan algorithm handles
4416
     * XML declarations in a better way.
4417
     */
4418
9.70k
    if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4419
9.70k
        (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4420
100
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4421
4422
    /*
4423
     * Wipe out everything which is before the first '<'
4424
     */
4425
9.70k
    SKIP_BLANKS;
4426
4427
9.70k
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4428
9.58k
  ctxt->sax->startDocument(ctxt->userData);
4429
4430
    /*
4431
     * Parse possible comments, PIs or doctype declarations
4432
     * before any content.
4433
     */
4434
9.70k
    ctxt->instate = XML_PARSER_MISC;
4435
17.2k
    while (CUR == '<') {
4436
11.9k
        if (NXT(1) == '!') {
4437
6.16k
            if ((NXT(2) == '-') && (NXT(3) == '-')) {
4438
1.92k
                SKIP(4);
4439
1.92k
                htmlParseComment(ctxt, /* bogus */ 0);
4440
4.23k
            } else if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4441
4.23k
                       (UPP(4) == 'C') && (UPP(5) == 'T') &&
4442
4.23k
                       (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4443
4.23k
                       (UPP(8) == 'E')) {
4444
1.23k
                htmlParseDocTypeDecl(ctxt);
4445
1.23k
                ctxt->instate = XML_PARSER_PROLOG;
4446
3.00k
            } else {
4447
3.00k
                SKIP(2);
4448
3.00k
                htmlParseComment(ctxt, /* bogus */ 1);
4449
3.00k
            }
4450
6.16k
        } else if (NXT(1) == '?') {
4451
1.36k
            SKIP(1);
4452
1.36k
            htmlParseComment(ctxt, /* bogus */ 1);
4453
4.42k
        } else {
4454
4.42k
            break;
4455
4.42k
        }
4456
7.53k
  SKIP_BLANKS;
4457
7.53k
        GROW;
4458
7.53k
    }
4459
4460
    /*
4461
     * Time to start parsing the tree itself
4462
     */
4463
9.70k
    ctxt->instate = XML_PARSER_CONTENT;
4464
9.70k
    htmlParseContent(ctxt);
4465
4466
    /*
4467
     * Only check for truncated multi-byte sequences
4468
     */
4469
9.70k
    xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
4470
4471
    /*
4472
     * SAX: end of the document processing.
4473
     */
4474
9.70k
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4475
9.70k
        ctxt->sax->endDocument(ctxt->userData);
4476
4477
9.70k
    if (! ctxt->wellFormed) return(-1);
4478
7.80k
    return(0);
4479
9.70k
}
4480
4481
4482
/************************************************************************
4483
 *                  *
4484
 *      Parser contexts handling      *
4485
 *                  *
4486
 ************************************************************************/
4487
4488
/**
4489
 * Initialize a parser context
4490
 *
4491
 * @param ctxt  an HTML parser context
4492
 * @param sax  SAX handler
4493
 * @param userData  user data
4494
 * @returns 0 in case of success and -1 in case of error
4495
 */
4496
static int
4497
htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4498
                   void *userData)
4499
20.6k
{
4500
20.6k
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4501
20.6k
    size_t initialNodeTabSize = 1;
4502
#else
4503
    size_t initialNodeTabSize = 10;
4504
#endif
4505
4506
20.6k
    if (ctxt == NULL) return(-1);
4507
20.6k
    memset(ctxt, 0, sizeof(htmlParserCtxt));
4508
4509
20.6k
    ctxt->dict = xmlDictCreate();
4510
20.6k
    if (ctxt->dict == NULL)
4511
2
  return(-1);
4512
4513
20.6k
    if (ctxt->sax == NULL)
4514
20.6k
        ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4515
20.6k
    if (ctxt->sax == NULL)
4516
2
  return(-1);
4517
20.6k
    if (sax == NULL) {
4518
20.6k
        memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4519
20.6k
        xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4520
20.6k
        ctxt->userData = ctxt;
4521
20.6k
    } else {
4522
0
        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4523
0
        ctxt->userData = userData ? userData : ctxt;
4524
0
    }
4525
4526
    /* Allocate the Input stack */
4527
20.6k
    ctxt->inputTab = (htmlParserInputPtr *)
4528
20.6k
                      xmlMalloc(sizeof(htmlParserInputPtr));
4529
20.6k
    if (ctxt->inputTab == NULL)
4530
4
  return(-1);
4531
20.6k
    ctxt->inputNr = 0;
4532
20.6k
    ctxt->inputMax = 1;
4533
20.6k
    ctxt->input = NULL;
4534
20.6k
    ctxt->version = NULL;
4535
20.6k
    ctxt->encoding = NULL;
4536
20.6k
    ctxt->standalone = -1;
4537
20.6k
    ctxt->instate = XML_PARSER_START;
4538
4539
    /* Allocate the Node stack */
4540
20.6k
    ctxt->nodeTab = xmlMalloc(initialNodeTabSize * sizeof(htmlNodePtr));
4541
20.6k
    if (ctxt->nodeTab == NULL)
4542
2
  return(-1);
4543
20.6k
    ctxt->nodeNr = 0;
4544
20.6k
    ctxt->nodeMax = initialNodeTabSize;
4545
20.6k
    ctxt->node = NULL;
4546
4547
    /* Allocate the Name stack */
4548
20.6k
    ctxt->nameTab = xmlMalloc(initialNodeTabSize * sizeof(xmlChar *));
4549
20.6k
    if (ctxt->nameTab == NULL)
4550
2
  return(-1);
4551
20.6k
    ctxt->nameNr = 0;
4552
20.6k
    ctxt->nameMax = initialNodeTabSize;
4553
20.6k
    ctxt->name = NULL;
4554
4555
20.6k
    ctxt->nodeInfoTab = NULL;
4556
20.6k
    ctxt->nodeInfoNr  = 0;
4557
20.6k
    ctxt->nodeInfoMax = 0;
4558
4559
20.6k
    ctxt->myDoc = NULL;
4560
20.6k
    ctxt->wellFormed = 1;
4561
20.6k
    ctxt->replaceEntities = 0;
4562
20.6k
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4563
20.6k
    ctxt->html = INSERT_INITIAL;
4564
20.6k
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4565
20.6k
    ctxt->vctxt.userData = ctxt;
4566
20.6k
    ctxt->vctxt.error = xmlParserValidityError;
4567
20.6k
    ctxt->vctxt.warning = xmlParserValidityWarning;
4568
20.6k
    ctxt->record_info = 0;
4569
20.6k
    ctxt->validate = 0;
4570
20.6k
    ctxt->checkIndex = 0;
4571
20.6k
    ctxt->catalogs = NULL;
4572
20.6k
    xmlInitNodeInfoSeq(&ctxt->node_seq);
4573
20.6k
    return(0);
4574
20.6k
}
4575
4576
/**
4577
 * Free all the memory used by a parser context. However the parsed
4578
 * document in `ctxt->myDoc` is not freed.
4579
 *
4580
 * @param ctxt  an HTML parser context
4581
 */
4582
void
4583
htmlFreeParserCtxt(htmlParserCtxt *ctxt)
4584
19.4k
{
4585
19.4k
    xmlFreeParserCtxt(ctxt);
4586
19.4k
}
4587
4588
/**
4589
 * Allocate and initialize a new HTML parser context.
4590
 *
4591
 * This can be used to parse HTML documents into DOM trees with
4592
 * functions like #xmlCtxtReadFile or #xmlCtxtReadMemory.
4593
 *
4594
 * See #htmlCtxtUseOptions for parser options.
4595
 *
4596
 * See #xmlCtxtSetErrorHandler for advanced error handling.
4597
 *
4598
 * See #htmlNewSAXParserCtxt for custom SAX parsers.
4599
 *
4600
 * @returns the htmlParserCtxt or NULL in case of allocation error
4601
 */
4602
htmlParserCtxt *
4603
htmlNewParserCtxt(void)
4604
10.9k
{
4605
10.9k
    return(htmlNewSAXParserCtxt(NULL, NULL));
4606
10.9k
}
4607
4608
/**
4609
 * Allocate and initialize a new HTML SAX parser context. If `userData`
4610
 * is NULL, the parser context will be passed as user data.
4611
 *
4612
 * @since 2.11.0
4613
 *
4614
 * If you want support older versions, it's best to invoke
4615
 * #htmlNewParserCtxt and set `ctxt->sax` with struct assignment.
4616
 *
4617
 * Also see #htmlNewParserCtxt.
4618
 *
4619
 * @param sax  SAX handler
4620
 * @param userData  user data
4621
 * @returns the htmlParserCtxt or NULL in case of allocation error
4622
 */
4623
htmlParserCtxt *
4624
htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
4625
20.7k
{
4626
20.7k
    xmlParserCtxtPtr ctxt;
4627
4628
20.7k
    xmlInitParser();
4629
4630
20.7k
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4631
20.7k
    if (ctxt == NULL)
4632
29
  return(NULL);
4633
20.6k
    memset(ctxt, 0, sizeof(xmlParserCtxt));
4634
20.6k
    if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
4635
12
        htmlFreeParserCtxt(ctxt);
4636
12
  return(NULL);
4637
12
    }
4638
20.6k
    return(ctxt);
4639
20.6k
}
4640
4641
static htmlParserCtxtPtr
4642
htmlCreateMemoryParserCtxtInternal(const char *url,
4643
                                   const char *buffer, size_t size,
4644
0
                                   const char *encoding) {
4645
0
    xmlParserCtxtPtr ctxt;
4646
0
    xmlParserInputPtr input;
4647
4648
0
    if (buffer == NULL)
4649
0
  return(NULL);
4650
4651
0
    ctxt = htmlNewParserCtxt();
4652
0
    if (ctxt == NULL)
4653
0
  return(NULL);
4654
4655
0
    input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, 0);
4656
0
    if (input == NULL) {
4657
0
  xmlFreeParserCtxt(ctxt);
4658
0
        return(NULL);
4659
0
    }
4660
4661
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
4662
0
        xmlFreeInputStream(input);
4663
0
        xmlFreeParserCtxt(ctxt);
4664
0
        return(NULL);
4665
0
    }
4666
4667
0
    return(ctxt);
4668
0
}
4669
4670
/**
4671
 * Create a parser context for an HTML in-memory document. The input
4672
 * buffer must not contain any terminating null bytes.
4673
 *
4674
 * @deprecated Use #htmlNewParserCtxt and #htmlCtxtReadMemory.
4675
 *
4676
 * @param buffer  a pointer to a char array
4677
 * @param size  the size of the array
4678
 * @returns the new parser context or NULL
4679
 */
4680
htmlParserCtxt *
4681
0
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
4682
0
    if (size <= 0)
4683
0
  return(NULL);
4684
4685
0
    return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
4686
0
}
4687
4688
/**
4689
 * Create a parser context for a null-terminated string.
4690
 *
4691
 * @param str  a pointer to an array of xmlChar
4692
 * @param url  URL of the document (optional)
4693
 * @param encoding  encoding (optional)
4694
 * @returns the new parser context or NULL if a memory allocation failed.
4695
 */
4696
static htmlParserCtxtPtr
4697
htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
4698
0
                        const char *encoding) {
4699
0
    xmlParserCtxtPtr ctxt;
4700
0
    xmlParserInputPtr input;
4701
4702
0
    if (str == NULL)
4703
0
  return(NULL);
4704
4705
0
    ctxt = htmlNewParserCtxt();
4706
0
    if (ctxt == NULL)
4707
0
  return(NULL);
4708
4709
0
    input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str,
4710
0
                                      encoding, 0);
4711
0
    if (input == NULL) {
4712
0
  xmlFreeParserCtxt(ctxt);
4713
0
  return(NULL);
4714
0
    }
4715
4716
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
4717
0
        xmlFreeInputStream(input);
4718
0
        xmlFreeParserCtxt(ctxt);
4719
0
        return(NULL);
4720
0
    }
4721
4722
0
    return(ctxt);
4723
0
}
4724
4725
#ifdef LIBXML_PUSH_ENABLED
4726
/************************************************************************
4727
 *                  *
4728
 *  Progressive parsing interfaces        *
4729
 *                  *
4730
 ************************************************************************/
4731
4732
/*
 * Scanner states used while looking for the end of a tag. The value 0
 * (LSTATE_TAG_NAME) is the state a fresh scan starts in.
 */
typedef enum {
    LSTATE_TAG_NAME            = 0,
    LSTATE_BEFORE_ATTR_NAME    = 1,
    LSTATE_ATTR_NAME           = 2,
    LSTATE_AFTER_ATTR_NAME     = 3,
    LSTATE_BEFORE_ATTR_VALUE   = 4,
    LSTATE_ATTR_VALUE_DQUOTED  = 5,
    LSTATE_ATTR_VALUE_SQUOTED  = 6,
    LSTATE_ATTR_VALUE_UNQUOTED = 7
} xmlLookupStates;
4742
4743
/**
4744
 * Check whether there's enough data in the input buffer to finish parsing
4745
 * a tag. This has to take quotes into account.
4746
 *
4747
 * @param ctxt  an HTML parser context
4748
 */
4749
static int
4750
180k
htmlParseLookupGt(xmlParserCtxtPtr ctxt) {
4751
180k
    const xmlChar *cur;
4752
180k
    const xmlChar *end = ctxt->input->end;
4753
180k
    int state = ctxt->endCheckState;
4754
180k
    size_t index;
4755
4756
180k
    if (ctxt->checkIndex == 0)
4757
146k
        cur = ctxt->input->cur + 2; /* Skip '<a' or '</' */
4758
33.6k
    else
4759
33.6k
        cur = ctxt->input->cur + ctxt->checkIndex;
4760
4761
32.5M
    while (cur < end) {
4762
32.5M
        int c = *cur++;
4763
4764
32.5M
        if (state != LSTATE_ATTR_VALUE_SQUOTED &&
4765
32.5M
            state != LSTATE_ATTR_VALUE_DQUOTED) {
4766
27.6M
            if (c == '/' &&
4767
27.6M
                state != LSTATE_BEFORE_ATTR_VALUE &&
4768
27.6M
                state != LSTATE_ATTR_VALUE_UNQUOTED) {
4769
74.7k
                state = LSTATE_BEFORE_ATTR_NAME;
4770
74.7k
                continue;
4771
27.5M
            } else if (c == '>') {
4772
144k
                ctxt->checkIndex = 0;
4773
144k
                ctxt->endCheckState = 0;
4774
144k
                return(0);
4775
144k
            }
4776
27.6M
        }
4777
4778
32.2M
        switch (state) {
4779
5.54M
            case LSTATE_TAG_NAME:
4780
5.54M
                if (IS_WS_HTML(c))
4781
48.3k
                    state = LSTATE_BEFORE_ATTR_NAME;
4782
5.54M
                break;
4783
4784
350k
            case LSTATE_BEFORE_ATTR_NAME:
4785
350k
                if (!IS_WS_HTML(c))
4786
120k
                    state = LSTATE_ATTR_NAME;
4787
350k
                break;
4788
4789
13.5M
            case LSTATE_ATTR_NAME:
4790
13.5M
                if (c == '=')
4791
49.3k
                    state = LSTATE_BEFORE_ATTR_VALUE;
4792
13.5M
                else if (IS_WS_HTML(c))
4793
1.13M
                    state = LSTATE_AFTER_ATTR_NAME;
4794
13.5M
                break;
4795
4796
2.38M
            case LSTATE_AFTER_ATTR_NAME:
4797
2.38M
                if (c == '=')
4798
325
                    state = LSTATE_BEFORE_ATTR_VALUE;
4799
2.38M
                else if (!IS_WS_HTML(c))
4800
1.10M
                    state = LSTATE_ATTR_NAME;
4801
2.38M
                break;
4802
4803
36.9k
            case LSTATE_BEFORE_ATTR_VALUE:
4804
36.9k
                if (c == '"')
4805
5.73k
                    state = LSTATE_ATTR_VALUE_DQUOTED;
4806
31.2k
                else if (c == '\'')
4807
633
                    state = LSTATE_ATTR_VALUE_SQUOTED;
4808
30.6k
                else if (!IS_WS_HTML(c))
4809
23.0k
                    state = LSTATE_ATTR_VALUE_UNQUOTED;
4810
36.9k
                break;
4811
4812
3.56M
            case LSTATE_ATTR_VALUE_DQUOTED:
4813
3.56M
                if (c == '"')
4814
5.61k
                    state = LSTATE_BEFORE_ATTR_NAME;
4815
3.56M
                break;
4816
4817
1.28M
            case LSTATE_ATTR_VALUE_SQUOTED:
4818
1.28M
                if (c == '\'')
4819
607
                    state = LSTATE_BEFORE_ATTR_NAME;
4820
1.28M
                break;
4821
4822
5.55M
            case LSTATE_ATTR_VALUE_UNQUOTED:
4823
5.55M
                if (IS_WS_HTML(c))
4824
6.62k
                    state = LSTATE_BEFORE_ATTR_NAME;
4825
5.55M
                break;
4826
32.2M
        }
4827
32.2M
    }
4828
4829
35.9k
    index = cur - ctxt->input->cur;
4830
35.9k
    if (index > LONG_MAX) {
4831
0
        ctxt->checkIndex = 0;
4832
0
        ctxt->endCheckState = 0;
4833
0
        return(0);
4834
0
    }
4835
35.9k
    ctxt->checkIndex = index;
4836
35.9k
    ctxt->endCheckState = state;
4837
35.9k
    return(-1);
4838
35.9k
}
4839
4840
/**
4841
 * Check whether the input buffer contains a string.
4842
 *
4843
 * @param ctxt  an XML parser context
4844
 * @param startDelta  delta to apply at the start
4845
 * @param str  string
4846
 * @param strLen  length of string
4847
 * @param extraLen  extra length
4848
 */
4849
static int
4850
htmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta,
4851
298k
                      const char *str, size_t strLen, size_t extraLen) {
4852
298k
    const xmlChar *end = ctxt->input->end;
4853
298k
    const xmlChar *cur, *term;
4854
298k
    size_t index, rescan;
4855
298k
    int ret;
4856
4857
298k
    if (ctxt->checkIndex == 0) {
4858
43.1k
        cur = ctxt->input->cur + startDelta;
4859
255k
    } else {
4860
255k
        cur = ctxt->input->cur + ctxt->checkIndex;
4861
255k
    }
4862
4863
298k
    term = BAD_CAST strstr((const char *) cur, str);
4864
298k
    if ((term != NULL) &&
4865
298k
        ((size_t) (ctxt->input->end - term) >= extraLen + 1)) {
4866
53.7k
        ctxt->checkIndex = 0;
4867
4868
53.7k
        if (term - ctxt->input->cur > INT_MAX / 2)
4869
0
            ret = INT_MAX / 2;
4870
53.7k
        else
4871
53.7k
            ret = term - ctxt->input->cur;
4872
4873
53.7k
        return(ret);
4874
53.7k
    }
4875
4876
    /* Rescan (strLen + extraLen - 1) characters. */
4877
245k
    rescan = strLen + extraLen - 1;
4878
245k
    if ((size_t) (end - cur) <= rescan)
4879
2.37k
        end = cur;
4880
242k
    else
4881
242k
        end -= rescan;
4882
245k
    index = end - ctxt->input->cur;
4883
245k
    if (index > INT_MAX / 2) {
4884
0
        ctxt->checkIndex = 0;
4885
0
        ret = INT_MAX / 2;
4886
245k
    } else {
4887
245k
        ctxt->checkIndex = index;
4888
245k
        ret = -1;
4889
245k
    }
4890
4891
245k
    return(ret);
4892
298k
}
4893
4894
/**
4895
 * Try to find a comment end tag in the input stream
4896
 * The search includes "-->" as well as WHATWG-recommended
4897
 * incorrectly-closed tags.
4898
 *
4899
 * @param ctxt  an HTML parser context
4900
 * @returns the index to the current parsing point if the full
4901
 * sequence is available, -1 otherwise.
4902
 */
4903
static int
4904
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
4905
32.8k
{
4906
32.8k
    int mark = 0;
4907
32.8k
    int offset;
4908
4909
47.0k
    while (1) {
4910
47.0k
  mark = htmlParseLookupString(ctxt, 2, "--", 2, 0);
4911
47.0k
  if (mark < 0)
4912
29.1k
            break;
4913
        /*
4914
         * <!-->    is a complete comment, but
4915
         * <!--!>   is not
4916
         * <!---!>  is not
4917
         * <!----!> is
4918
         */
4919
17.8k
        if ((NXT(mark+2) == '>') ||
4920
17.8k
      ((mark >= 4) && (NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
4921
2.39k
            ctxt->checkIndex = 0;
4922
2.39k
      break;
4923
2.39k
  }
4924
15.4k
        offset = (NXT(mark+2) == '!') ? 3 : 2;
4925
15.4k
        if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
4926
1.33k
      ctxt->checkIndex = mark;
4927
1.33k
            return(-1);
4928
1.33k
        }
4929
14.1k
  ctxt->checkIndex = mark + 1;
4930
14.1k
    }
4931
31.5k
    return mark;
4932
32.8k
}
4933
4934
4935
/**
4936
 * Try to progress on parsing
4937
 *
4938
 * @param ctxt  an HTML parser context
4939
 * @param terminate  last chunk indicator
4940
 * @returns zero if no parsing was possible
4941
 */
4942
static void
4943
392k
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4944
1.91M
    while (PARSER_STOPPED(ctxt) == 0) {
4945
1.91M
        htmlParserInputPtr in;
4946
1.91M
        size_t avail;
4947
4948
1.91M
  in = ctxt->input;
4949
1.91M
  if (in == NULL) break;
4950
1.91M
  avail = in->end - in->cur;
4951
4952
1.91M
        switch (ctxt->instate) {
4953
0
            case XML_PARSER_EOF:
4954
          /*
4955
     * Document parsing is done !
4956
     */
4957
0
          return;
4958
4959
14.1k
            case XML_PARSER_START:
4960
                /*
4961
                 * Very first chars read from the document flow.
4962
                 */
4963
14.1k
                if ((!terminate) && (avail < 4))
4964
4.40k
                    return;
4965
4966
9.72k
                xmlDetectEncoding(ctxt);
4967
4968
                /*
4969
                 * TODO: Implement HTML5 prescan algorithm
4970
                 */
4971
4972
                /*
4973
                 * This is wrong but matches long-standing behavior. In most
4974
                 * cases, a document starting with an XML declaration will
4975
                 * specify UTF-8. The HTML5 prescan algorithm handles
4976
                 * XML declarations in a better way.
4977
                 */
4978
9.72k
                if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4979
9.72k
                    (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
4980
100
                    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4981
100
                }
4982
4983
                /* fall through */
4984
4985
9.72k
            case XML_PARSER_XML_DECL:
4986
9.72k
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4987
9.72k
                    ctxt->sax->setDocumentLocator(ctxt->userData,
4988
9.72k
                            (xmlSAXLocator *) &xmlDefaultSAXLocator);
4989
9.72k
                }
4990
9.72k
    if ((ctxt->sax) && (ctxt->sax->startDocument) &&
4991
9.72k
              (!ctxt->disableSAX))
4992
9.64k
        ctxt->sax->startDocument(ctxt->userData);
4993
4994
                /* Allow callback to modify state for tests */
4995
9.72k
                if ((ctxt->instate == XML_PARSER_START) ||
4996
9.72k
                    (ctxt->instate == XML_PARSER_XML_DECL))
4997
9.67k
                    ctxt->instate = XML_PARSER_MISC;
4998
9.72k
    break;
4999
5000
275k
            case XML_PARSER_START_TAG:
5001
275k
    if ((!terminate) &&
5002
275k
        (htmlParseLookupGt(ctxt) < 0))
5003
32.7k
        return;
5004
5005
242k
                htmlParseElementInternal(ctxt);
5006
5007
242k
    ctxt->instate = XML_PARSER_CONTENT;
5008
242k
                break;
5009
5010
23.0k
            case XML_PARSER_MISC: /* initial */
5011
25.8k
            case XML_PARSER_PROLOG: /* before html */
5012
1.58M
            case XML_PARSER_CONTENT: {
5013
1.58M
                int mode;
5014
5015
1.58M
                if ((ctxt->instate == XML_PARSER_MISC) ||
5016
1.58M
                    (ctxt->instate == XML_PARSER_PROLOG)) {
5017
25.8k
                    SKIP_BLANKS;
5018
25.8k
                    avail = in->end - in->cur;
5019
25.8k
                }
5020
5021
1.58M
    if (avail < 1)
5022
8.64k
        return;
5023
                /*
5024
                 * Note that endCheckState is also used by
5025
                 * xmlParseLookupGt.
5026
                 */
5027
1.57M
                mode = ctxt->endCheckState;
5028
5029
1.57M
                if (mode != 0) {
5030
85.4k
                    if (htmlParseCharData(ctxt, !terminate) == 0)
5031
82.2k
                        return;
5032
1.48M
    } else if (in->cur[0] == '<') {
5033
1.18M
                    int next;
5034
5035
1.18M
                    if (avail < 2) {
5036
3.58k
                        if (!terminate)
5037
3.21k
                            return;
5038
369
                        next = ' ';
5039
1.18M
                    } else {
5040
1.18M
                        next = in->cur[1];
5041
1.18M
                    }
5042
5043
1.18M
                    if (next == '!') {
5044
147k
                        if ((!terminate) && (avail < 4))
5045
1.03k
                            return;
5046
146k
                        if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5047
42.1k
                            if ((!terminate) &&
5048
42.1k
                                (htmlParseLookupCommentEnd(ctxt) < 0))
5049
30.4k
                                return;
5050
11.7k
                            SKIP(4);
5051
11.7k
                            htmlParseComment(ctxt, /* bogus */ 0);
5052
                            /* don't change state */
5053
11.7k
                            break;
5054
42.1k
                        }
5055
5056
104k
                        if ((!terminate) && (avail < 9))
5057
2.78k
                            return;
5058
101k
                        if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5059
101k
                            (UPP(4) == 'C') && (UPP(5) == 'T') &&
5060
101k
                            (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5061
101k
                            (UPP(8) == 'E')) {
5062
48.3k
                            if ((!terminate) &&
5063
48.3k
                                (htmlParseLookupString(ctxt, 9, ">", 1,
5064
33.1k
                                                       0) < 0))
5065
17.0k
                                return;
5066
31.3k
                            htmlParseDocTypeDecl(ctxt);
5067
31.3k
                            if (ctxt->instate == XML_PARSER_MISC)
5068
480
                                ctxt->instate = XML_PARSER_PROLOG;
5069
53.2k
                        } else {
5070
53.2k
                            if ((!terminate) &&
5071
53.2k
                                (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5072
38.8k
                                return;
5073
14.4k
                            SKIP(2);
5074
14.4k
                            htmlParseComment(ctxt, /* bogus */ 1);
5075
14.4k
                        }
5076
1.03M
                    } else if (next == '?') {
5077
22.6k
                        if ((!terminate) &&
5078
22.6k
                            (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5079
9.54k
                            return;
5080
13.1k
                        SKIP(1);
5081
13.1k
                        htmlParseComment(ctxt, /* bogus */ 1);
5082
                        /* don't change state */
5083
1.01M
                    } else if (next == '/') {
5084
39.8k
                        ctxt->instate = XML_PARSER_END_TAG;
5085
39.8k
                        ctxt->checkIndex = 0;
5086
973k
                    } else if (IS_ASCII_LETTER(next)) {
5087
242k
                        ctxt->instate = XML_PARSER_START_TAG;
5088
242k
                        ctxt->checkIndex = 0;
5089
731k
                    } else {
5090
731k
                        ctxt->instate = XML_PARSER_CONTENT;
5091
731k
                        htmlStartCharData(ctxt);
5092
731k
                        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5093
731k
                            (ctxt->sax->characters != NULL))
5094
730k
                            ctxt->sax->characters(ctxt->userData,
5095
730k
                                                  BAD_CAST "<", 1);
5096
731k
                        SKIP(1);
5097
731k
                    }
5098
1.18M
                } else {
5099
301k
                    ctxt->instate = XML_PARSER_CONTENT;
5100
                    /*
5101
                     * We follow the logic of the XML push parser
5102
                     */
5103
301k
        if (avail < HTML_PARSER_BIG_BUFFER_SIZE) {
5104
174k
                        if ((!terminate) &&
5105
174k
                            (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
5106
150k
                            return;
5107
174k
                    }
5108
150k
                    ctxt->checkIndex = 0;
5109
150k
                    if (htmlParseCharData(ctxt, !terminate) == 0)
5110
6.78k
                        return;
5111
150k
    }
5112
5113
1.21M
    break;
5114
1.57M
      }
5115
5116
1.21M
            case XML_PARSER_END_TAG:
5117
43.0k
    if ((!terminate) &&
5118
43.0k
        (htmlParseLookupGt(ctxt) < 0))
5119
3.22k
        return;
5120
39.8k
    htmlParseEndTag(ctxt);
5121
39.8k
    ctxt->instate = XML_PARSER_CONTENT;
5122
39.8k
    ctxt->checkIndex = 0;
5123
39.8k
          break;
5124
5125
0
      default:
5126
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5127
0
           "HPP: internal error\n", NULL, NULL);
5128
0
    ctxt->instate = XML_PARSER_EOF;
5129
0
    break;
5130
1.91M
  }
5131
1.91M
    }
5132
392k
}
5133
5134
/**
5135
 * Parse a chunk of memory in push parser mode.
5136
 *
5137
 * Assumes that the parser context was initialized with
5138
 * #htmlCreatePushParserCtxt.
5139
 *
5140
 * The last chunk, which will often be empty, must be marked with
5141
 * the `terminate` flag. With the default SAX callbacks, the resulting
5142
 * document will be available in `ctxt->myDoc`. This pointer will not
5143
 * be freed by the library.
5144
 *
5145
 * If the document isn't well-formed, `ctxt->myDoc` is set to NULL.
5146
 *
5147
 * Since 2.14.0, #xmlCtxtGetDocument can be used to retrieve the
5148
 * result document.
5149
 *
5150
 * @param ctxt  an HTML parser context
5151
 * @param chunk  chunk of memory
5152
 * @param size  size of chunk in bytes
5153
 * @param terminate  last chunk indicator
5154
 * @returns an xmlParserErrors code (0 on success).
5155
 */
5156
int
5157
htmlParseChunk(htmlParserCtxt *ctxt, const char *chunk, int size,
5158
393k
              int terminate) {
5159
393k
    if ((ctxt == NULL) ||
5160
393k
        (ctxt->input == NULL) || (ctxt->input->buf == NULL) ||
5161
393k
        (size < 0) ||
5162
393k
        ((size > 0) && (chunk == NULL)))
5163
0
  return(XML_ERR_ARGUMENT);
5164
393k
    if (PARSER_STOPPED(ctxt) != 0)
5165
408
        return(ctxt->errNo);
5166
5167
392k
    if (size > 0)  {
5168
392k
  size_t pos = ctxt->input->cur - ctxt->input->base;
5169
392k
  int res;
5170
5171
392k
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5172
392k
        xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5173
392k
  if (res < 0) {
5174
118
            htmlParseErr(ctxt, ctxt->input->buf->error,
5175
118
                         "xmlParserInputBufferPush failed", NULL, NULL);
5176
118
            xmlHaltParser(ctxt);
5177
118
      return (ctxt->errNo);
5178
118
  }
5179
392k
    }
5180
5181
392k
    htmlParseTryOrFinish(ctxt, terminate);
5182
5183
392k
    if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) {
5184
9.03k
        htmlAutoCloseOnEnd(ctxt);
5185
5186
        /*
5187
         * Only check for truncated multi-byte sequences
5188
         */
5189
9.03k
        xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
5190
5191
9.03k
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5192
9.03k
            ctxt->sax->endDocument(ctxt->userData);
5193
5194
9.03k
  ctxt->instate = XML_PARSER_EOF;
5195
9.03k
    }
5196
5197
392k
    return((xmlParserErrors) ctxt->errNo);
5198
392k
}
5199
5200
/************************************************************************
5201
 *                  *
5202
 *      User entry points       *
5203
 *                  *
5204
 ************************************************************************/
5205
5206
/**
5207
 * Create a parser context for using the HTML parser in push mode.
5208
 *
5209
 * @param sax  a SAX handler (optional)
5210
 * @param user_data  The user data returned on SAX callbacks (optional)
5211
 * @param chunk  a pointer to an array of chars (optional)
5212
 * @param size  number of chars in the array
5213
 * @param filename  only used for error reporting (optional)
5214
 * @param enc  encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5215
 * @returns the new parser context or NULL if a memory allocation
5216
 * failed.
5217
 */
5218
htmlParserCtxt *
5219
htmlCreatePushParserCtxt(htmlSAXHandler *sax, void *user_data,
5220
                         const char *chunk, int size, const char *filename,
5221
9.75k
       xmlCharEncoding enc) {
5222
9.75k
    htmlParserCtxtPtr ctxt;
5223
9.75k
    htmlParserInputPtr input;
5224
9.75k
    const char *encoding;
5225
5226
9.75k
    ctxt = htmlNewSAXParserCtxt(sax, user_data);
5227
9.75k
    if (ctxt == NULL)
5228
20
  return(NULL);
5229
5230
9.73k
    encoding = xmlGetCharEncodingName(enc);
5231
9.73k
    input = xmlNewPushInput(filename, chunk, size);
5232
9.73k
    if (input == NULL) {
5233
6
  htmlFreeParserCtxt(ctxt);
5234
6
  return(NULL);
5235
6
    }
5236
5237
9.72k
    if (xmlCtxtPushInput(ctxt, input) < 0) {
5238
0
        xmlFreeInputStream(input);
5239
0
        xmlFreeParserCtxt(ctxt);
5240
0
        return(NULL);
5241
0
    }
5242
5243
9.72k
    if (encoding != NULL)
5244
0
        xmlSwitchEncodingName(ctxt, encoding);
5245
5246
9.72k
    return(ctxt);
5247
9.72k
}
5248
#endif /* LIBXML_PUSH_ENABLED */
5249
5250
/**
5251
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5252
 * to handle parse events. If sax is NULL, fallback to the default DOM
5253
 * behavior and return a tree.
5254
 *
5255
 * @deprecated Use #htmlNewSAXParserCtxt and #htmlCtxtReadDoc.
5256
 *
5257
 * @param cur  a pointer to an array of xmlChar
5258
 * @param encoding  a free form C string describing the HTML document encoding, or NULL
5259
 * @param sax  the SAX handler block
5260
 * @param userData  if using SAX, this pointer will be provided on callbacks.
5261
 * @returns the resulting document tree unless SAX is NULL or the document is
5262
 *     not well formed.
5263
 */
5264
5265
xmlDoc *
5266
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5267
0
                htmlSAXHandler *sax, void *userData) {
5268
0
    htmlDocPtr ret;
5269
0
    htmlParserCtxtPtr ctxt;
5270
5271
0
    if (cur == NULL)
5272
0
        return(NULL);
5273
5274
0
    ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5275
0
    if (ctxt == NULL)
5276
0
        return(NULL);
5277
5278
0
    if (sax != NULL) {
5279
0
        *ctxt->sax = *sax;
5280
0
        ctxt->userData = userData;
5281
0
    }
5282
5283
0
    htmlParseDocument(ctxt);
5284
0
    ret = ctxt->myDoc;
5285
0
    htmlFreeParserCtxt(ctxt);
5286
5287
0
    return(ret);
5288
0
}
5289
5290
/**
5291
 * Parse an HTML in-memory document and build a tree.
5292
 *
5293
 * @deprecated Use #htmlReadDoc.
5294
 *
5295
 * This function uses deprecated global parser options.
5296
 *
5297
 * @param cur  a pointer to an array of xmlChar
5298
 * @param encoding  the encoding (optional)
5299
 * @returns the resulting document tree
5300
 */
5301
5302
xmlDoc *
5303
0
htmlParseDoc(const xmlChar *cur, const char *encoding) {
5304
0
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5305
0
}
5306
5307
5308
/**
5309
 * Create a parser context to read from a file.
5310
 *
5311
 * @deprecated Use #htmlNewParserCtxt and #htmlCtxtReadFile.
5312
 *
5313
 * A non-NULL encoding overrides encoding declarations in the document.
5314
 *
5315
 * Automatic support for ZLIB/Compress compressed document is provided
5316
 * by default if found at compile-time.
5317
 *
5318
 * @param filename  the filename
5319
 * @param encoding  optional encoding
5320
 * @returns the new parser context or NULL if a memory allocation failed.
5321
 */
5322
htmlParserCtxt *
5323
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5324
0
{
5325
0
    htmlParserCtxtPtr ctxt;
5326
0
    htmlParserInputPtr input;
5327
5328
0
    if (filename == NULL)
5329
0
        return(NULL);
5330
5331
0
    ctxt = htmlNewParserCtxt();
5332
0
    if (ctxt == NULL) {
5333
0
  return(NULL);
5334
0
    }
5335
5336
0
    input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5337
0
    if (input == NULL) {
5338
0
  xmlFreeParserCtxt(ctxt);
5339
0
  return(NULL);
5340
0
    }
5341
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
5342
0
        xmlFreeInputStream(input);
5343
0
        xmlFreeParserCtxt(ctxt);
5344
0
        return(NULL);
5345
0
    }
5346
5347
0
    return(ctxt);
5348
0
}
5349
5350
/**
5351
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5352
 * compressed document is provided by default if found at compile-time.
5353
 * It use the given SAX function block to handle the parsing callback.
5354
 * If sax is NULL, fallback to the default DOM tree building routines.
5355
 *
5356
 * @deprecated Use #htmlNewSAXParserCtxt and #htmlCtxtReadFile.
5357
 *
5358
 * @param filename  the filename
5359
 * @param encoding  encoding (optional)
5360
 * @param sax  the SAX handler block
5361
 * @param userData  if using SAX, this pointer will be provided on callbacks.
5362
 * @returns the resulting document tree unless SAX is NULL or the document is
5363
 *     not well formed.
5364
 */
5365
5366
xmlDoc *
5367
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandler *sax,
5368
0
                 void *userData) {
5369
0
    htmlDocPtr ret;
5370
0
    htmlParserCtxtPtr ctxt;
5371
0
    htmlSAXHandlerPtr oldsax = NULL;
5372
5373
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5374
0
    if (ctxt == NULL) return(NULL);
5375
0
    if (sax != NULL) {
5376
0
  oldsax = ctxt->sax;
5377
0
        ctxt->sax = sax;
5378
0
        ctxt->userData = userData;
5379
0
    }
5380
5381
0
    htmlParseDocument(ctxt);
5382
5383
0
    ret = ctxt->myDoc;
5384
0
    if (sax != NULL) {
5385
0
        ctxt->sax = oldsax;
5386
0
        ctxt->userData = NULL;
5387
0
    }
5388
0
    htmlFreeParserCtxt(ctxt);
5389
5390
0
    return(ret);
5391
0
}
5392
5393
/**
5394
 * Parse an HTML file and build a tree.
5395
 *
5396
 * @param filename  the filename
5397
 * @param encoding  encoding (optional)
5398
 * @returns the resulting document tree
5399
 */
5400
5401
xmlDoc *
5402
0
htmlParseFile(const char *filename, const char *encoding) {
5403
0
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5404
0
}
5405
5406
/**
5407
 * Set and return the previous value for handling HTML omitted tags.
5408
 *
5409
 * @deprecated Use HTML_PARSE_NOIMPLIED
5410
 *
5411
 * @param val  int 0 or 1
5412
 * @returns the last value for 0 for no handling, 1 for auto insertion.
5413
 */
5414
5415
int
5416
0
htmlHandleOmittedElem(int val) {
5417
0
    int old = htmlOmittedDefaultValue;
5418
5419
0
    htmlOmittedDefaultValue = val;
5420
0
    return(old);
5421
0
}
5422
5423
/**
5424
 * @deprecated Don't use.
5425
 *
5426
 * @param parent  HTML parent element
5427
 * @param elt  HTML element
5428
 * @returns 1
5429
 */
5430
int
5431
htmlElementAllowedHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5432
0
                       const xmlChar* elt ATTRIBUTE_UNUSED) {
5433
0
    return(1);
5434
0
}
5435
5436
/**
5437
 * @deprecated Don't use.
5438
 *
5439
 * @param parent  HTML parent element
5440
 * @param elt  HTML element
5441
 * @returns HTML_VALID
5442
 */
5443
htmlStatus
5444
htmlElementStatusHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5445
0
                      const htmlElemDesc* elt ATTRIBUTE_UNUSED) {
5446
0
    return(HTML_VALID);
5447
0
}
5448
5449
/**
5450
 * @deprecated Don't use.
5451
 *
5452
 * @param elt  HTML element
5453
 * @param attr  HTML attribute
5454
 * @param legacy  whether to allow deprecated attributes
5455
 * @returns HTML_VALID
5456
 */
5457
htmlStatus
5458
htmlAttrAllowed(const htmlElemDesc* elt ATTRIBUTE_UNUSED,
5459
                const xmlChar* attr ATTRIBUTE_UNUSED,
5460
0
                int legacy ATTRIBUTE_UNUSED) {
5461
0
    return(HTML_VALID);
5462
0
}
5463
5464
/**
5465
 * @deprecated Don't use.
5466
 *
5467
 * @param node  an xmlNode in a tree
5468
 * @param legacy  whether to allow deprecated elements (YES is faster here
5469
 *  for Element nodes)
5470
 * @returns HTML_VALID
5471
 */
5472
htmlStatus
5473
htmlNodeStatus(xmlNode *node ATTRIBUTE_UNUSED,
5474
0
               int legacy ATTRIBUTE_UNUSED) {
5475
0
    return(HTML_VALID);
5476
0
}
5477
5478
/************************************************************************
5479
 *                  *
5480
 *  New set (2.6.0) of simpler and more flexible APIs   *
5481
 *                  *
5482
 ************************************************************************/
5483
5484
/**
5485
 * Reset a parser context
5486
 *
5487
 * Same as #xmlCtxtReset.
5488
 *
5489
 * @param ctxt  an HTML parser context
5490
 */
5491
void
5492
htmlCtxtReset(htmlParserCtxt *ctxt)
5493
11.0k
{
5494
11.0k
    xmlCtxtReset(ctxt);
5495
11.0k
}
5496
5497
static int
5498
htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
5499
22.0k
{
5500
22.0k
    int allMask;
5501
5502
22.0k
    if (ctxt == NULL)
5503
28
        return(-1);
5504
5505
22.0k
    allMask = HTML_PARSE_RECOVER |
5506
22.0k
              HTML_PARSE_HTML5 |
5507
22.0k
              HTML_PARSE_NODEFDTD |
5508
22.0k
              HTML_PARSE_NOERROR |
5509
22.0k
              HTML_PARSE_NOWARNING |
5510
22.0k
              HTML_PARSE_PEDANTIC |
5511
22.0k
              HTML_PARSE_NOBLANKS |
5512
22.0k
              HTML_PARSE_NONET |
5513
22.0k
              HTML_PARSE_NOIMPLIED |
5514
22.0k
              HTML_PARSE_COMPACT |
5515
22.0k
              HTML_PARSE_HUGE |
5516
22.0k
              HTML_PARSE_IGNORE_ENC |
5517
22.0k
              HTML_PARSE_BIG_LINES;
5518
5519
22.0k
    ctxt->options = (ctxt->options & keepMask) | (options & allMask);
5520
5521
    /*
5522
     * For some options, struct members are historically the source
5523
     * of truth. See xmlCtxtSetOptionsInternal.
5524
     */
5525
22.0k
    ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1;
5526
5527
    /*
5528
     * Changing SAX callbacks is a bad idea. This should be fixed.
5529
     */
5530
22.0k
    if (options & HTML_PARSE_NOBLANKS) {
5531
7.71k
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5532
7.71k
    }
5533
22.0k
    if (options & HTML_PARSE_HUGE) {
5534
8.23k
        if (ctxt->dict != NULL)
5535
8.23k
            xmlDictSetLimit(ctxt->dict, 0);
5536
8.23k
    }
5537
5538
    /*
5539
     * It would be useful to allow this feature.
5540
     */
5541
22.0k
    ctxt->dictNames = 0;
5542
5543
    /*
5544
     * Allow XML_PARSE_NOENT which many users set on the HTML parser.
5545
     */
5546
22.0k
    return(options & ~allMask & ~XML_PARSE_NOENT);
5547
22.0k
}
5548
5549
/**
5550
 * Applies the options to the parser context. Unset options are
5551
 * cleared.
5552
 *
5553
 * @since 2.14.0
5554
 *
5555
 * With older versions, you can use #htmlCtxtUseOptions.
5556
 *
5557
 * @param ctxt  an HTML parser context
5558
 * @param options  a bitmask of htmlParserOption values
5559
 * @returns 0 in case of success, the set of unknown or unimplemented options
5560
 *         in case of error.
5561
 */
5562
int
5563
htmlCtxtSetOptions(htmlParserCtxt *ctxt, int options)
5564
0
{
5565
0
    return(htmlCtxtSetOptionsInternal(ctxt, options, 0));
5566
0
}
5567
5568
/**
5569
 * Applies the options to the parser context. The following options
5570
 * are never cleared and can only be enabled:
5571
 *
5572
 * @deprecated Use #htmlCtxtSetOptions.
5573
 *
5574
 * - HTML_PARSE_NODEFDTD
5575
 * - HTML_PARSE_NOERROR
5576
 * - HTML_PARSE_NOWARNING
5577
 * - HTML_PARSE_NOIMPLIED
5578
 * - HTML_PARSE_COMPACT
5579
 * - HTML_PARSE_HUGE
5580
 * - HTML_PARSE_IGNORE_ENC
5581
 * - HTML_PARSE_BIG_LINES
5582
 *
5583
 * @param ctxt  an HTML parser context
5584
 * @param options  a combination of htmlParserOption values
5585
 * @returns 0 in case of success, the set of unknown or unimplemented options
5586
 *         in case of error.
5587
 */
5588
int
5589
htmlCtxtUseOptions(htmlParserCtxt *ctxt, int options)
5590
22.0k
{
5591
22.0k
    int keepMask;
5592
5593
    /*
5594
     * For historic reasons, some options can only be enabled.
5595
     */
5596
22.0k
    keepMask = HTML_PARSE_NODEFDTD |
5597
22.0k
               HTML_PARSE_NOERROR |
5598
22.0k
               HTML_PARSE_NOWARNING |
5599
22.0k
               HTML_PARSE_NOIMPLIED |
5600
22.0k
               HTML_PARSE_COMPACT |
5601
22.0k
               HTML_PARSE_HUGE |
5602
22.0k
               HTML_PARSE_IGNORE_ENC |
5603
22.0k
               HTML_PARSE_BIG_LINES;
5604
5605
22.0k
    return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask));
5606
22.0k
}
5607
5608
/**
5609
 * Parse an HTML document and return the resulting document tree.
5610
 *
5611
 * @since 2.13.0
5612
 *
5613
 * @param ctxt  an HTML parser context
5614
 * @param input  parser input
5615
 * @returns the resulting document tree or NULL
5616
 */
5617
xmlDoc *
5618
htmlCtxtParseDocument(htmlParserCtxt *ctxt, xmlParserInput *input)
5619
9.70k
{
5620
9.70k
    htmlDocPtr ret;
5621
5622
9.70k
    if ((ctxt == NULL) || (input == NULL)) {
5623
0
        xmlFatalErr(ctxt, XML_ERR_ARGUMENT, NULL);
5624
0
        xmlFreeInputStream(input);
5625
0
        return(NULL);
5626
0
    }
5627
5628
    /* assert(ctxt->inputNr == 0); */
5629
9.70k
    while (ctxt->inputNr > 0)
5630
0
        xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5631
5632
9.70k
    if (xmlCtxtPushInput(ctxt, input) < 0) {
5633
0
        xmlFreeInputStream(input);
5634
0
        return(NULL);
5635
0
    }
5636
5637
9.70k
    ctxt->html = INSERT_INITIAL;
5638
9.70k
    htmlParseDocument(ctxt);
5639
5640
9.70k
    ret = xmlCtxtGetDocument(ctxt);
5641
5642
    /* assert(ctxt->inputNr == 1); */
5643
19.4k
    while (ctxt->inputNr > 0)
5644
9.70k
        xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5645
5646
9.70k
    return(ret);
5647
9.70k
}
5648
5649
/**
5650
 * Convenience function to parse an HTML document from a zero-terminated
5651
 * string.
5652
 *
5653
 * See #htmlCtxtReadDoc for details.
5654
 *
5655
 * @param str  a pointer to a zero terminated string
5656
 * @param url  only used for error reporting (optoinal)
5657
 * @param encoding  the document encoding (optional)
5658
 * @param options  a combination of htmlParserOption values
5659
 * @returns the resulting document tree.
5660
 */
5661
xmlDoc *
5662
htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
5663
            int options)
5664
0
{
5665
0
    htmlParserCtxtPtr ctxt;
5666
0
    xmlParserInputPtr input;
5667
0
    htmlDocPtr doc = NULL;
5668
5669
0
    ctxt = htmlNewParserCtxt();
5670
0
    if (ctxt == NULL)
5671
0
        return(NULL);
5672
5673
0
    htmlCtxtUseOptions(ctxt, options);
5674
5675
0
    input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, encoding,
5676
0
                                      XML_INPUT_BUF_STATIC);
5677
5678
0
    if (input != NULL)
5679
0
        doc = htmlCtxtParseDocument(ctxt, input);
5680
5681
0
    htmlFreeParserCtxt(ctxt);
5682
0
    return(doc);
5683
0
}
5684
5685
/**
5686
 * Convenience function to parse an HTML file from the filesystem,
5687
 * the network or a global user-defined resource loader.
5688
 *
5689
 * See #htmlCtxtReadFile for details.
5690
 *
5691
 * @param filename  a file or URL
5692
 * @param encoding  the document encoding (optional)
5693
 * @param options  a combination of htmlParserOption values
5694
 * @returns the resulting document tree.
5695
 */
5696
xmlDoc *
5697
htmlReadFile(const char *filename, const char *encoding, int options)
5698
0
{
5699
0
    htmlParserCtxtPtr ctxt;
5700
0
    xmlParserInputPtr input;
5701
0
    htmlDocPtr doc = NULL;
5702
5703
0
    ctxt = htmlNewParserCtxt();
5704
0
    if (ctxt == NULL)
5705
0
        return(NULL);
5706
5707
0
    htmlCtxtUseOptions(ctxt, options);
5708
5709
0
    input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5710
5711
0
    if (input != NULL)
5712
0
        doc = htmlCtxtParseDocument(ctxt, input);
5713
5714
0
    htmlFreeParserCtxt(ctxt);
5715
0
    return(doc);
5716
0
}
5717
5718
/**
5719
 * Convenience function to parse an HTML document from memory.
5720
 * The input buffer must not contain any terminating null bytes.
5721
 *
5722
 * See #htmlCtxtReadMemory for details.
5723
 *
5724
 * @param buffer  a pointer to a char array
5725
 * @param size  the size of the array
5726
 * @param url  only used for error reporting (optional)
5727
 * @param encoding  the document encoding, or NULL
5728
 * @param options  a combination of htmlParserOption values
5729
 * @returns the resulting document tree
5730
 */
5731
xmlDoc *
5732
htmlReadMemory(const char *buffer, int size, const char *url,
5733
               const char *encoding, int options)
5734
0
{
5735
0
    htmlParserCtxtPtr ctxt;
5736
0
    xmlParserInputPtr input;
5737
0
    htmlDocPtr doc = NULL;
5738
5739
0
    if (size < 0)
5740
0
  return(NULL);
5741
5742
0
    ctxt = htmlNewParserCtxt();
5743
0
    if (ctxt == NULL)
5744
0
        return(NULL);
5745
5746
0
    htmlCtxtUseOptions(ctxt, options);
5747
5748
0
    input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding,
5749
0
                                      XML_INPUT_BUF_STATIC);
5750
5751
0
    if (input != NULL)
5752
0
        doc = htmlCtxtParseDocument(ctxt, input);
5753
5754
0
    htmlFreeParserCtxt(ctxt);
5755
0
    return(doc);
5756
0
}
5757
5758
/**
5759
 * Convenience function to parse an HTML document from a
5760
 * file descriptor.
5761
 *
5762
 * NOTE that the file descriptor will not be closed when the
5763
 * context is freed or reset.
5764
 *
5765
 * See #htmlCtxtReadFd for details.
5766
 *
5767
 * @param fd  an open file descriptor
5768
 * @param url  only used for error reporting (optional)
5769
 * @param encoding  the document encoding, or NULL
5770
 * @param options  a combination of htmlParserOption values
5771
 * @returns the resulting document tree
5772
 */
5773
xmlDoc *
5774
htmlReadFd(int fd, const char *url, const char *encoding, int options)
5775
0
{
5776
0
    htmlParserCtxtPtr ctxt;
5777
0
    xmlParserInputPtr input;
5778
0
    htmlDocPtr doc = NULL;
5779
5780
0
    ctxt = htmlNewParserCtxt();
5781
0
    if (ctxt == NULL)
5782
0
        return(NULL);
5783
5784
0
    htmlCtxtUseOptions(ctxt, options);
5785
5786
0
    input = xmlCtxtNewInputFromFd(ctxt, url, fd, encoding, 0);
5787
5788
0
    if (input != NULL)
5789
0
        doc = htmlCtxtParseDocument(ctxt, input);
5790
5791
0
    htmlFreeParserCtxt(ctxt);
5792
0
    return(doc);
5793
0
}
5794
5795
/**
5796
 * Convenience function to parse an HTML document from I/O functions
5797
 * and context.
5798
 *
5799
 * See #htmlCtxtReadIO for details.
5800
 *
5801
 * @param ioread  an I/O read function
5802
 * @param ioclose  an I/O close function (optional)
5803
 * @param ioctx  an I/O handler
5804
 * @param url  only used for error reporting (optional)
5805
 * @param encoding  the document encoding (optional)
5806
 * @param options  a combination of htmlParserOption values
5807
 * @returns the resulting document tree
5808
 */
5809
xmlDoc *
5810
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5811
          void *ioctx, const char *url, const char *encoding, int options)
5812
0
{
5813
0
    htmlParserCtxtPtr ctxt;
5814
0
    xmlParserInputPtr input;
5815
0
    htmlDocPtr doc = NULL;
5816
5817
0
    ctxt = htmlNewParserCtxt();
5818
0
    if (ctxt == NULL)
5819
0
        return (NULL);
5820
5821
0
    htmlCtxtUseOptions(ctxt, options);
5822
5823
0
    input = xmlCtxtNewInputFromIO(ctxt, url, ioread, ioclose, ioctx,
5824
0
                                  encoding, 0);
5825
5826
0
    if (input != NULL)
5827
0
        doc = htmlCtxtParseDocument(ctxt, input);
5828
5829
0
    htmlFreeParserCtxt(ctxt);
5830
0
    return(doc);
5831
0
}
5832
5833
/**
5834
 * Parse an HTML in-memory document and build a tree.
5835
 *
5836
 * See #htmlCtxtUseOptions for details.
5837
 *
5838
 * @param ctxt  an HTML parser context
5839
 * @param str  a pointer to a zero terminated string
5840
 * @param URL  only used for error reporting (optional)
5841
 * @param encoding  the document encoding (optional)
5842
 * @param options  a combination of htmlParserOption values
5843
 * @returns the resulting document tree
5844
 */
5845
xmlDoc *
5846
htmlCtxtReadDoc(xmlParserCtxt *ctxt, const xmlChar *str,
5847
                const char *URL, const char *encoding, int options)
5848
0
{
5849
0
    xmlParserInputPtr input;
5850
5851
0
    if (ctxt == NULL)
5852
0
        return (NULL);
5853
5854
0
    htmlCtxtReset(ctxt);
5855
0
    htmlCtxtUseOptions(ctxt, options);
5856
5857
0
    input = xmlCtxtNewInputFromString(ctxt, URL, (const char *) str,
5858
0
                                      encoding, 0);
5859
0
    if (input == NULL)
5860
0
        return(NULL);
5861
5862
0
    return(htmlCtxtParseDocument(ctxt, input));
5863
0
}
5864
5865
/**
5866
 * Parse an HTML file from the filesystem, the network or a
5867
 * user-defined resource loader.
5868
 *
5869
 * See #htmlCtxtUseOptions for details.
5870
 *
5871
 * @param ctxt  an HTML parser context
5872
 * @param filename  a file or URL
5873
 * @param encoding  the document encoding (optional)
5874
 * @param options  a combination of htmlParserOption values
5875
 * @returns the resulting document tree
5876
 */
5877
xmlDoc *
5878
htmlCtxtReadFile(xmlParserCtxt *ctxt, const char *filename,
5879
                const char *encoding, int options)
5880
1.34k
{
5881
1.34k
    xmlParserInputPtr input;
5882
5883
1.34k
    if (ctxt == NULL)
5884
0
        return (NULL);
5885
5886
1.34k
    htmlCtxtReset(ctxt);
5887
1.34k
    htmlCtxtUseOptions(ctxt, options);
5888
5889
1.34k
    input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5890
1.34k
    if (input == NULL)
5891
1.34k
        return(NULL);
5892
5893
0
    return(htmlCtxtParseDocument(ctxt, input));
5894
1.34k
}
5895
5896
/**
5897
 * Parse an HTML in-memory document and build a tree. The input buffer must
5898
 * not contain any terminating null bytes.
5899
 *
5900
 * See #htmlCtxtUseOptions for details.
5901
 *
5902
 * @param ctxt  an HTML parser context
5903
 * @param buffer  a pointer to a char array
5904
 * @param size  the size of the array
5905
 * @param URL  only used for error reporting (optional)
5906
 * @param encoding  the document encoding (optinal)
5907
 * @param options  a combination of htmlParserOption values
5908
 * @returns the resulting document tree
5909
 */
5910
xmlDoc *
5911
htmlCtxtReadMemory(xmlParserCtxt *ctxt, const char *buffer, int size,
5912
                  const char *URL, const char *encoding, int options)
5913
9.73k
{
5914
9.73k
    xmlParserInputPtr input;
5915
5916
9.73k
    if ((ctxt == NULL) || (size < 0))
5917
0
        return (NULL);
5918
5919
9.73k
    htmlCtxtReset(ctxt);
5920
9.73k
    htmlCtxtUseOptions(ctxt, options);
5921
5922
9.73k
    input = xmlCtxtNewInputFromMemory(ctxt, URL, buffer, size, encoding,
5923
9.73k
                                      XML_INPUT_BUF_STATIC);
5924
9.73k
    if (input == NULL)
5925
31
        return(NULL);
5926
5927
9.70k
    return(htmlCtxtParseDocument(ctxt, input));
5928
9.73k
}
5929
5930
/**
5931
 * Parse an HTML from a file descriptor and build a tree.
5932
 *
5933
 * See #htmlCtxtUseOptions for details.
5934
 *
5935
 * NOTE that the file descriptor will not be closed when the
5936
 * context is freed or reset.
5937
 *
5938
 * @param ctxt  an HTML parser context
5939
 * @param fd  an open file descriptor
5940
 * @param URL  only used for error reporting (optional)
5941
 * @param encoding  the document encoding (optinal)
5942
 * @param options  a combination of htmlParserOption values
5943
 * @returns the resulting document tree
5944
 */
5945
xmlDoc *
5946
htmlCtxtReadFd(xmlParserCtxt *ctxt, int fd,
5947
              const char *URL, const char *encoding, int options)
5948
0
{
5949
0
    xmlParserInputPtr input;
5950
5951
0
    if (ctxt == NULL)
5952
0
        return(NULL);
5953
5954
0
    htmlCtxtReset(ctxt);
5955
0
    htmlCtxtUseOptions(ctxt, options);
5956
5957
0
    input = xmlCtxtNewInputFromFd(ctxt, URL, fd, encoding, 0);
5958
0
    if (input == NULL)
5959
0
        return(NULL);
5960
5961
0
    return(htmlCtxtParseDocument(ctxt, input));
5962
0
}
5963
5964
/**
5965
 * Parse an HTML document from I/O functions and source and build a tree.
5966
 *
5967
 * See #htmlCtxtUseOptions for details.
5968
 *
5969
 * @param ctxt  an HTML parser context
5970
 * @param ioread  an I/O read function
5971
 * @param ioclose  an I/O close function
5972
 * @param ioctx  an I/O handler
5973
 * @param URL  the base URL to use for the document
5974
 * @param encoding  the document encoding, or NULL
5975
 * @param options  a combination of htmlParserOption values
5976
 * @returns the resulting document tree
5977
 */
5978
xmlDoc *
5979
htmlCtxtReadIO(xmlParserCtxt *ctxt, xmlInputReadCallback ioread,
5980
              xmlInputCloseCallback ioclose, void *ioctx,
5981
        const char *URL,
5982
              const char *encoding, int options)
5983
0
{
5984
0
    xmlParserInputPtr input;
5985
5986
0
    if (ctxt == NULL)
5987
0
        return (NULL);
5988
5989
0
    htmlCtxtReset(ctxt);
5990
0
    htmlCtxtUseOptions(ctxt, options);
5991
5992
0
    input = xmlCtxtNewInputFromIO(ctxt, URL, ioread, ioclose, ioctx,
5993
0
                                  encoding, 0);
5994
0
    if (input == NULL)
5995
0
        return(NULL);
5996
5997
0
    return(htmlCtxtParseDocument(ctxt, input));
5998
0
}
5999
6000
#endif /* LIBXML_HTML_ENABLED */