Coverage Report

Created: 2025-06-22 06:55

/src/libxml2/HTMLparser.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLparser.c : an HTML parser
3
 *
4
 * References:
5
 *   HTML Living Standard
6
 *     https://html.spec.whatwg.org/multipage/parsing.html
7
 *
8
 * Tokenization now conforms to HTML5. Tree construction still follows
9
 * a custom, non-standard implementation. See:
10
 *
11
 *     https://gitlab.gnome.org/GNOME/libxml2/-/issues/211
12
 *
13
 * See Copyright for the status of this software.
14
 *
15
 * Author: Daniel Veillard
16
 */
17
18
#define IN_LIBXML
19
#include "libxml.h"
20
#ifdef LIBXML_HTML_ENABLED
21
22
#include <string.h>
23
#include <ctype.h>
24
#include <stdlib.h>
25
26
#include <libxml/HTMLparser.h>
27
#include <libxml/xmlmemory.h>
28
#include <libxml/tree.h>
29
#include <libxml/parser.h>
30
#include <libxml/parserInternals.h>
31
#include <libxml/xmlerror.h>
32
#include <libxml/HTMLtree.h>
33
#include <libxml/entities.h>
34
#include <libxml/encoding.h>
35
#include <libxml/xmlIO.h>
36
#include <libxml/uri.h>
37
38
#include "private/buf.h"
39
#include "private/dict.h"
40
#include "private/enc.h"
41
#include "private/error.h"
42
#include "private/html.h"
43
#include "private/io.h"
44
#include "private/memory.h"
45
#include "private/parser.h"
46
#include "private/tree.h"
47
48
/* Limits and buffer sizes. */
#define HTML_MAX_NAMELEN 1000
#define HTML_MAX_ATTRS 100000000 /* 100 million */
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
#define HTML_PARSER_BUFFER_SIZE 100

/*
 * ASCII character class helpers. Each macro evaluates its argument more
 * than once, so do not pass expressions with side effects.
 */

/* True for '0'-'9', 'a'-'f' and 'A'-'F' (OR-ing 0x20 folds the case). */
#define IS_HEX_DIGIT(c) \
    ((IS_ASCII_DIGIT(c)) || \
     ((((c) | 0x20) >= 'a') && (((c) | 0x20) <= 'f')))

/* True for 'A'-'Z'. */
#define IS_UPPER(c) \
    (((c) >= 'A') && ((c) <= 'Z'))

/* True for ASCII letters and digits. */
#define IS_ALNUM(c) \
    (IS_ASCII_LETTER(c) || IS_ASCII_DIGIT(c))
62
63
/*
 * Insertion modes. The numeric values are ordered so that a later mode
 * compares greater than an earlier one; htmlnamePush() relies on this when
 * it tests ctxt->html with '<'.
 */
typedef enum {
    INSERT_INITIAL = 1,
    INSERT_IN_HEAD = 3,
    INSERT_IN_BODY = 10
} htmlInsertMode;
68
69
/*
 * A 64-bit set of code points 0..63 stored as two 32-bit words:
 * word 0 holds bits for 0x00-0x1F, word 1 holds bits for 0x20-0x3F
 * (bit index is the code point minus 32). Queried with htmlMaskMatch().
 */
typedef const unsigned htmlAsciiMask[2];

/* '"' */
static htmlAsciiMask MASK_DQ = { 0, (1u << ('"' - 32)) };

/* '\'' */
static htmlAsciiMask MASK_SQ = { 0, (1u << ('\'' - 32)) };

/* '>' */
static htmlAsciiMask MASK_GT = { 0, (1u << ('>' - 32)) };

/* '-' */
static htmlAsciiMask MASK_DASH = { 0, (1u << ('-' - 32)) };

/* TAB, LF, FF, CR, space and '>' */
static htmlAsciiMask MASK_WS_GT = {
    (1u << 0x09) | (1u << 0x0A) | (1u << 0x0C) | (1u << 0x0D),
    (1u << (' ' - 32)) | (1u << ('>' - 32))
};

/* '"' and '>' */
static htmlAsciiMask MASK_DQ_GT = {
    0, (1u << ('"' - 32)) | (1u << ('>' - 32))
};

/* '\'' and '>' */
static htmlAsciiMask MASK_SQ_GT = {
    0, (1u << ('\'' - 32)) | (1u << ('>' - 32))
};
99
100
static int htmlOmittedDefaultValue = 1;
101
102
static int
103
htmlParseElementInternal(htmlParserCtxtPtr ctxt);
104
105
/************************************************************************
106
 *                  *
107
 *    Some factorized error routines        *
108
 *                  *
109
 ************************************************************************/
110
111
/**
112
 * Handle an out-of-memory error
113
 *
114
 * @param ctxt  an HTML parser context
115
 */
116
static void
117
htmlErrMemory(xmlParserCtxtPtr ctxt)
118
0
{
119
0
    xmlCtxtErrMemory(ctxt);
120
0
}
121
122
/**
123
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
124
 *
125
 * @param ctxt  an HTML parser context
126
 * @param error  the error number
127
 * @param msg  the error message
128
 * @param str1  string infor
129
 * @param str2  string infor
130
 */
131
static void LIBXML_ATTR_FORMAT(3,0)
132
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
133
             const char *msg, const xmlChar *str1, const xmlChar *str2)
134
0
{
135
0
    xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
136
0
               str1, str2, NULL, 0, msg, str1, str2);
137
0
}
138
139
/************************************************************************
140
 *                  *
141
 *  Parser stacks related functions and macros    *
142
 *                  *
143
 ************************************************************************/
144
145
/**
146
 * Pushes a new element name on top of the name stack
147
 *
148
 * @param ctxt  an HTML parser context
149
 * @param value  the element name
150
 * @returns -1 in case of error, the index in the stack otherwise
151
 */
152
static int
153
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
154
0
{
155
0
    if ((ctxt->html < INSERT_IN_HEAD) && (xmlStrEqual(value, BAD_CAST "head")))
156
0
        ctxt->html = INSERT_IN_HEAD;
157
0
    if ((ctxt->html < INSERT_IN_BODY) && (xmlStrEqual(value, BAD_CAST "body")))
158
0
        ctxt->html = INSERT_IN_BODY;
159
0
    if (ctxt->nameNr >= ctxt->nameMax) {
160
0
        const xmlChar **tmp;
161
0
        int newSize;
162
163
0
        newSize = xmlGrowCapacity(ctxt->nameMax, sizeof(tmp[0]),
164
0
                                  10, XML_MAX_ITEMS);
165
0
        if (newSize < 0) {
166
0
            htmlErrMemory(ctxt);
167
0
            return (-1);
168
0
        }
169
0
        tmp = xmlRealloc(ctxt->nameTab, newSize * sizeof(tmp[0]));
170
0
        if (tmp == NULL) {
171
0
            htmlErrMemory(ctxt);
172
0
            return(-1);
173
0
        }
174
0
        ctxt->nameTab = tmp;
175
0
        ctxt->nameMax = newSize;
176
0
    }
177
0
    ctxt->nameTab[ctxt->nameNr] = value;
178
0
    ctxt->name = value;
179
0
    return (ctxt->nameNr++);
180
0
}
181
/**
182
 * Pops the top element name from the name stack
183
 *
184
 * @param ctxt  an HTML parser context
185
 * @returns the name just removed
186
 */
187
static const xmlChar *
188
htmlnamePop(htmlParserCtxtPtr ctxt)
189
0
{
190
0
    const xmlChar *ret;
191
192
0
    if (ctxt->nameNr <= 0)
193
0
        return (NULL);
194
0
    ctxt->nameNr--;
195
0
    if (ctxt->nameNr < 0)
196
0
        return (NULL);
197
0
    if (ctxt->nameNr > 0)
198
0
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
199
0
    else
200
0
        ctxt->name = NULL;
201
0
    ret = ctxt->nameTab[ctxt->nameNr];
202
0
    ctxt->nameTab[ctxt->nameNr] = NULL;
203
0
    return (ret);
204
0
}
205
206
/**
207
 * Pushes a new element name on top of the node info stack
208
 *
209
 * @param ctxt  an HTML parser context
210
 * @param value  the node info
211
 * @returns 0 in case of error, the index in the stack otherwise
212
 */
213
static int
214
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
215
0
{
216
0
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
217
0
        xmlParserNodeInfo *tmp;
218
0
        int newSize;
219
220
0
        newSize = xmlGrowCapacity(ctxt->nodeInfoMax, sizeof(tmp[0]),
221
0
                                  5, XML_MAX_ITEMS);
222
0
        if (newSize < 0) {
223
0
            htmlErrMemory(ctxt);
224
0
            return (0);
225
0
        }
226
0
        tmp = xmlRealloc(ctxt->nodeInfoTab, newSize * sizeof(tmp[0]));
227
0
        if (tmp == NULL) {
228
0
            htmlErrMemory(ctxt);
229
0
            return (0);
230
0
        }
231
0
        ctxt->nodeInfoTab = tmp;
232
0
        ctxt->nodeInfoMax = newSize;
233
0
    }
234
0
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
235
0
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
236
0
    return (ctxt->nodeInfoNr++);
237
0
}
238
239
/**
240
 * Pops the top element name from the node info stack
241
 *
242
 * @param ctxt  an HTML parser context
243
 * @returns 0 in case of error, the pointer to NodeInfo otherwise
244
 */
245
static htmlParserNodeInfo *
246
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
247
0
{
248
0
    if (ctxt->nodeInfoNr <= 0)
249
0
        return (NULL);
250
0
    ctxt->nodeInfoNr--;
251
0
    if (ctxt->nodeInfoNr < 0)
252
0
        return (NULL);
253
0
    if (ctxt->nodeInfoNr > 0)
254
0
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
255
0
    else
256
0
        ctxt->nodeInfo = NULL;
257
0
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
258
0
}
259
260
/*
 * Macros for accessing the input. They are meant for the parser only and
 * are not exported. All of them reference a variable named `ctxt` that
 * must be in scope at the point of use.
 *
 * Byte-level macros. They operate on raw xmlChar values, so they should
 * only be used to compare against or skip over ASCII:
 *
 *   CUR_PTR   the current position in the input
 *   BASE_PTR  the base pointer of the input
 *   CUR       the xmlChar at the current position
 *   NXT(n)    the xmlChar n positions after the current one
 *   UPPER     toupper() applied to CUR
 *   UPP(n)    toupper() applied to NXT(n)
 *   SKIP(n)   advance the current position and the column counter by n.
 *             The line counter is not updated, so only skip text that
 *             contains no newlines. The expansion is an unparenthesized
 *             comma expression; use it as a statement on its own.
 *
 * Buffer management. Both macros expand to a bare `if` statement that
 * already ends in a semicolon, so be careful when using them as the
 * unbraced body of another if/else:
 *
 *   SHRINK    unless parsing progressively, call xmlParserShrink() once
 *             more than 2 * INPUT_CHUNK bytes lie before the current
 *             position and fewer than 2 * INPUT_CHUNK bytes remain
 *   GROW      unless parsing progressively, call xmlParserGrow() when
 *             fewer than INPUT_CHUNK bytes remain
 *
 *   SKIP_BLANKS  shorthand for htmlSkipBlankChars(ctxt)
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
311
312
/**
313
 * Prescan to find encoding.
314
 *
315
 * Try to find an encoding in the current data available in the input
316
 * buffer.
317
 *
318
 * TODO: Implement HTML5 prescan algorithm.
319
 *
320
 * @param ctxt  the HTML parser context
321
 * @returns  an encoding string or NULL if not found
322
 */
323
static xmlChar *
324
0
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
325
0
    const xmlChar *start, *cur, *end;
326
0
    xmlChar *ret;
327
0
328
0
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
329
0
        (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
330
0
        return(NULL);
331
0
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
332
0
        return(NULL);
333
0
334
0
    start = ctxt->input->cur;
335
0
    end = ctxt->input->end;
336
0
    /* we also expect the input buffer to be zero terminated */
337
0
    if (*end != 0)
338
0
        return(NULL);
339
0
340
0
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
341
0
    if (cur == NULL)
342
0
        return(NULL);
343
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
344
0
    if (cur == NULL)
345
0
        return(NULL);
346
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
347
0
    if (cur == NULL)
348
0
        return(NULL);
349
0
    cur += 8;
350
0
    start = cur;
351
0
    while ((IS_ALNUM(*cur)) ||
352
0
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
353
0
           cur++;
354
0
    if (cur == start)
355
0
        return(NULL);
356
0
    ret = xmlStrndup(start, cur - start);
357
0
    if (ret == NULL)
358
0
        htmlErrMemory(ctxt);
359
0
    return(ret);
360
0
}
361
362
static int
363
0
htmlMaskMatch(htmlAsciiMask mask, unsigned c) {
364
0
    if (c >= 64)
365
0
        return(0);
366
0
    return((mask[c/32] >> (c & 31)) & 1);
367
0
}
368
369
static int
370
htmlValidateUtf8(xmlParserCtxtPtr ctxt, const xmlChar *str, size_t len,
371
0
                 int partial) {
372
0
    unsigned c = str[0];
373
0
    int size;
374
375
0
    if (c < 0xC2) {
376
0
        goto invalid;
377
0
    } else if (c < 0xE0) {
378
0
        if (len < 2)
379
0
            goto incomplete;
380
0
        if ((str[1] & 0xC0) != 0x80)
381
0
            goto invalid;
382
0
        size = 2;
383
0
    } else if (c < 0xF0) {
384
0
        unsigned v;
385
386
0
        if (len < 3)
387
0
            goto incomplete;
388
389
0
        v = str[1] << 8 | str[2]; /* hint to generate 16-bit load */
390
0
        v |= c << 16;
391
392
0
        if (((v & 0x00C0C0) != 0x008080) ||
393
0
            ((v & 0x0F2000) == 0x000000) ||
394
0
            ((v & 0x0F2000) == 0x0D2000))
395
0
            goto invalid;
396
397
0
        size = 3;
398
0
    } else {
399
0
        unsigned v;
400
401
0
        if (len < 4)
402
0
            goto incomplete;
403
404
0
        v = c << 24 | str[1] << 16 | str[2] << 8 | str[3];
405
406
0
        if (((v & 0x00C0C0C0) != 0x00808080) ||
407
0
            (v < 0xF0900000) || (v >= 0xF4900000))
408
0
            goto invalid;
409
410
0
        size = 4;
411
0
    }
412
413
0
    return(size);
414
415
0
incomplete:
416
0
    if (partial)
417
0
        return(0);
418
419
0
invalid:
420
    /* Only report the first error */
421
0
    if ((ctxt->input->flags & XML_INPUT_ENCODING_ERROR) == 0) {
422
0
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
423
0
                     "Invalid bytes in character encoding", NULL, NULL);
424
0
        ctxt->input->flags |= XML_INPUT_ENCODING_ERROR;
425
0
    }
426
427
0
    return(-1);
428
0
}
429
430
/**
431
 * skip all blanks character found at that point in the input streams.
432
 *
433
 * @param ctxt  the HTML parser context
434
 * @returns the number of space chars skipped
435
 */
436
437
static int
438
0
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
439
0
    const xmlChar *cur = ctxt->input->cur;
440
0
    size_t avail = ctxt->input->end - cur;
441
0
    int res = 0;
442
0
    int line = ctxt->input->line;
443
0
    int col = ctxt->input->col;
444
445
0
    while (!PARSER_STOPPED(ctxt)) {
446
0
        if (avail == 0) {
447
0
            ctxt->input->cur = cur;
448
0
            GROW;
449
0
            cur = ctxt->input->cur;
450
0
            avail = ctxt->input->end - cur;
451
452
0
            if (avail == 0)
453
0
                break;
454
0
        }
455
456
0
        if (*cur == '\n') {
457
0
            line++;
458
0
            col = 1;
459
0
        } else if (IS_WS_HTML(*cur)) {
460
0
            col++;
461
0
        } else {
462
0
            break;
463
0
        }
464
465
0
        cur += 1;
466
0
        avail -= 1;
467
468
0
  if (res < INT_MAX)
469
0
      res++;
470
0
    }
471
472
0
    ctxt->input->cur = cur;
473
0
    ctxt->input->line = line;
474
0
    ctxt->input->col = col;
475
476
0
    if (res > 8)
477
0
        GROW;
478
479
0
    return(res);
480
0
}
481
482
483
484
/************************************************************************
485
 *                  *
486
 *  The list of HTML elements and their properties    *
487
 *                  *
488
 ************************************************************************/
489
490
/*
491
 *  Start Tag: 1 means the start tag can be omitted
492
 *  End Tag:   1 means the end tag can be omitted
493
 *             2 means it's forbidden (empty elements)
494
 *             3 means the tag is stylistic and should be closed easily
495
 *  Depr:      this element is deprecated
496
 *  DTD:       1 means that this element is valid only in the Loose DTD
497
 *             2 means that this element is valid only in the Frameset DTD
498
 *
499
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
500
 */
501
502
static const htmlElemDesc
503
html40ElementTable[] = {
504
{ "a",    0, 0, 0, 0, 0, 0, 1, "anchor ",
505
  NULL, NULL, NULL, NULL, NULL,
506
  0
507
},
508
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
509
  NULL, NULL, NULL, NULL, NULL,
510
  0
511
},
512
{ "acronym",  0, 0, 0, 0, 0, 0, 1, "",
513
  NULL, NULL, NULL, NULL, NULL,
514
  0
515
},
516
{ "address",  0, 0, 0, 0, 0, 0, 0, "information on author ",
517
  NULL, NULL, NULL, NULL, NULL,
518
  0
519
},
520
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
521
  NULL, NULL, NULL, NULL, NULL,
522
  0
523
},
524
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
525
  NULL, NULL, NULL, NULL, NULL,
526
  0
527
},
528
{ "b",    0, 3, 0, 0, 0, 0, 1, "bold text style",
529
  NULL, NULL, NULL, NULL, NULL,
530
  0
531
},
532
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
533
  NULL, NULL, NULL, NULL, NULL,
534
  0
535
},
536
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
537
  NULL, NULL, NULL, NULL, NULL,
538
  0
539
},
540
{ "bdo",  0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
541
  NULL, NULL, NULL, NULL, NULL,
542
  0
543
},
544
{ "bgsound",  0, 0, 2, 1, 0, 0, 0, "",
545
  NULL, NULL, NULL, NULL, NULL,
546
  0
547
},
548
{ "big",  0, 3, 0, 0, 0, 0, 1, "large text style",
549
  NULL, NULL, NULL, NULL, NULL,
550
  0
551
},
552
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
553
  NULL, NULL, NULL, NULL, NULL,
554
  0
555
},
556
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
557
  NULL, NULL, NULL, NULL, NULL,
558
  0
559
},
560
{ "br",   0, 2, 2, 1, 0, 0, 1, "forced line break ",
561
  NULL, NULL, NULL, NULL, NULL,
562
  0
563
},
564
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
565
  NULL, NULL, NULL, NULL, NULL,
566
  0
567
},
568
{ "caption",  0, 0, 0, 0, 0, 0, 0, "table caption ",
569
  NULL, NULL, NULL, NULL, NULL,
570
  0
571
},
572
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
573
  NULL, NULL, NULL, NULL, NULL,
574
  0
575
},
576
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
577
  NULL, NULL, NULL, NULL, NULL,
578
  0
579
},
580
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
581
  NULL, NULL, NULL, NULL, NULL,
582
  0
583
},
584
{ "col",  0, 2, 2, 1, 0, 0, 0, "table column ",
585
  NULL, NULL, NULL, NULL, NULL,
586
  0
587
},
588
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
589
  NULL, NULL, NULL, NULL, NULL,
590
  0
591
},
592
{ "dd",   0, 1, 0, 0, 0, 0, 0, "definition description ",
593
  NULL, NULL, NULL, NULL, NULL,
594
  0
595
},
596
{ "del",  0, 0, 0, 0, 0, 0, 2, "deleted text ",
597
  NULL, NULL, NULL, NULL, NULL,
598
  0
599
},
600
{ "dfn",  0, 0, 0, 0, 0, 0, 1, "instance definition",
601
  NULL, NULL, NULL, NULL, NULL,
602
  0
603
},
604
{ "dir",  0, 0, 0, 0, 1, 1, 0, "directory list",
605
  NULL, NULL, NULL, NULL, NULL,
606
  0
607
},
608
{ "div",  0, 0, 0, 0, 0, 0, 0, "generic language/style container",
609
  NULL, NULL, NULL, NULL, NULL,
610
  0
611
},
612
{ "dl",   0, 0, 0, 0, 0, 0, 0, "definition list ",
613
  NULL, NULL, NULL, NULL, NULL,
614
  0
615
},
616
{ "dt",   0, 1, 0, 0, 0, 0, 0, "definition term ",
617
  NULL, NULL, NULL, NULL, NULL,
618
  0
619
},
620
{ "em",   0, 3, 0, 0, 0, 0, 1, "emphasis",
621
  NULL, NULL, NULL, NULL, NULL,
622
  0
623
},
624
{ "embed",  0, 1, 2, 1, 1, 1, 1, "generic embedded object ",
625
  NULL, NULL, NULL, NULL, NULL,
626
  0
627
},
628
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
629
  NULL, NULL, NULL, NULL, NULL,
630
  0
631
},
632
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
633
  NULL, NULL, NULL, NULL, NULL,
634
  0
635
},
636
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
637
  NULL, NULL, NULL, NULL, NULL,
638
  0
639
},
640
{ "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
641
  NULL, NULL, NULL, NULL, NULL,
642
  0
643
},
644
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
645
  NULL, NULL, NULL, NULL, NULL,
646
  0
647
},
648
{ "h1",   0, 0, 0, 0, 0, 0, 0, "heading ",
649
  NULL, NULL, NULL, NULL, NULL,
650
  0
651
},
652
{ "h2",   0, 0, 0, 0, 0, 0, 0, "heading ",
653
  NULL, NULL, NULL, NULL, NULL,
654
  0
655
},
656
{ "h3",   0, 0, 0, 0, 0, 0, 0, "heading ",
657
  NULL, NULL, NULL, NULL, NULL,
658
  0
659
},
660
{ "h4",   0, 0, 0, 0, 0, 0, 0, "heading ",
661
  NULL, NULL, NULL, NULL, NULL,
662
  0
663
},
664
{ "h5",   0, 0, 0, 0, 0, 0, 0, "heading ",
665
  NULL, NULL, NULL, NULL, NULL,
666
  0
667
},
668
{ "h6",   0, 0, 0, 0, 0, 0, 0, "heading ",
669
  NULL, NULL, NULL, NULL, NULL,
670
  0
671
},
672
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
673
  NULL, NULL, NULL, NULL, NULL,
674
  0
675
},
676
{ "hr",   0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
677
  NULL, NULL, NULL, NULL, NULL,
678
  0
679
},
680
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
681
  NULL, NULL, NULL, NULL, NULL,
682
  0
683
},
684
{ "i",    0, 3, 0, 0, 0, 0, 1, "italic text style",
685
  NULL, NULL, NULL, NULL, NULL,
686
  0
687
},
688
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
689
  NULL, NULL, NULL, NULL, NULL,
690
  DATA_RAWTEXT
691
},
692
{ "img",  0, 2, 2, 1, 0, 0, 1, "embedded image ",
693
  NULL, NULL, NULL, NULL, NULL,
694
  0
695
},
696
{ "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
697
  NULL, NULL, NULL, NULL, NULL,
698
  0
699
},
700
{ "ins",  0, 0, 0, 0, 0, 0, 2, "inserted text",
701
  NULL, NULL, NULL, NULL, NULL,
702
  0
703
},
704
{ "isindex",  0, 2, 2, 1, 1, 1, 0, "single line prompt ",
705
  NULL, NULL, NULL, NULL, NULL,
706
  0
707
},
708
{ "kbd",  0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
709
  NULL, NULL, NULL, NULL, NULL,
710
  0
711
},
712
{ "keygen", 0, 0, 2, 1, 0, 0, 0, "",
713
  NULL, NULL, NULL, NULL, NULL,
714
  0
715
},
716
{ "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
717
  NULL, NULL, NULL, NULL, NULL,
718
  0
719
},
720
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
721
  NULL, NULL, NULL, NULL, NULL,
722
  0
723
},
724
{ "li",   0, 1, 1, 0, 0, 0, 0, "list item ",
725
  NULL, NULL, NULL, NULL, NULL,
726
  0
727
},
728
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
729
  NULL, NULL, NULL, NULL, NULL,
730
  0
731
},
732
{ "map",  0, 0, 0, 0, 0, 0, 2, "client-side image map ",
733
  NULL, NULL, NULL, NULL, NULL,
734
  0
735
},
736
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
737
  NULL, NULL, NULL, NULL, NULL,
738
  0
739
},
740
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
741
  NULL, NULL, NULL, NULL, NULL,
742
  0
743
},
744
{ "noembed",  0, 0, 0, 0, 0, 0, 0, "",
745
  NULL, NULL, NULL, NULL, NULL,
746
  DATA_RAWTEXT
747
},
748
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
749
  NULL, NULL, NULL, NULL, NULL,
750
  DATA_RAWTEXT
751
},
752
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
753
  NULL, NULL, NULL, NULL, NULL,
754
  0
755
},
756
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
757
  NULL, NULL, NULL, NULL, NULL,
758
  0
759
},
760
{ "ol",   0, 0, 0, 0, 0, 0, 0, "ordered list ",
761
  NULL, NULL, NULL, NULL, NULL,
762
  0
763
},
764
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
765
  NULL, NULL, NULL, NULL, NULL,
766
  0
767
},
768
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
769
  NULL, NULL, NULL, NULL, NULL,
770
  0
771
},
772
{ "p",    0, 1, 0, 0, 0, 0, 0, "paragraph ",
773
  NULL, NULL, NULL, NULL, NULL,
774
  0
775
},
776
{ "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
777
  NULL, NULL, NULL, NULL, NULL,
778
  0
779
},
780
{ "plaintext",  0, 0, 0, 0, 0, 0, 0, "",
781
  NULL, NULL, NULL, NULL, NULL,
782
  DATA_PLAINTEXT
783
},
784
{ "pre",  0, 0, 0, 0, 0, 0, 0, "preformatted text ",
785
  NULL, NULL, NULL, NULL, NULL,
786
  0
787
},
788
{ "q",    0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
789
  NULL, NULL, NULL, NULL, NULL,
790
  0
791
},
792
{ "s",    0, 3, 0, 0, 1, 1, 1, "strike-through text style",
793
  NULL, NULL, NULL, NULL, NULL,
794
  0
795
},
796
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
797
  NULL, NULL, NULL, NULL, NULL,
798
  0
799
},
800
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
801
  NULL, NULL, NULL, NULL, NULL,
802
  DATA_SCRIPT
803
},
804
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
805
  NULL, NULL, NULL, NULL, NULL,
806
  0
807
},
808
{ "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
809
  NULL, NULL, NULL, NULL, NULL,
810
  0
811
},
812
{ "source", 0, 0, 2, 1, 0, 0, 0, "",
813
  NULL, NULL, NULL, NULL, NULL,
814
  0
815
},
816
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
817
  NULL, NULL, NULL, NULL, NULL,
818
  0
819
},
820
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
821
  NULL, NULL, NULL, NULL, NULL,
822
  0
823
},
824
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
825
  NULL, NULL, NULL, NULL, NULL,
826
  0
827
},
828
{ "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
829
  NULL, NULL, NULL, NULL, NULL,
830
  DATA_RAWTEXT
831
},
832
{ "sub",  0, 3, 0, 0, 0, 0, 1, "subscript",
833
  NULL, NULL, NULL, NULL, NULL,
834
  0
835
},
836
{ "sup",  0, 3, 0, 0, 0, 0, 1, "superscript ",
837
  NULL, NULL, NULL, NULL, NULL,
838
  0
839
},
840
{ "table",  0, 0, 0, 0, 0, 0, 0, "",
841
  NULL, NULL, NULL, NULL, NULL,
842
  0
843
},
844
{ "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
845
  NULL, NULL, NULL, NULL, NULL,
846
  0
847
},
848
{ "td",   0, 0, 0, 0, 0, 0, 0, "table data cell",
849
  NULL, NULL, NULL, NULL, NULL,
850
  0
851
},
852
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
853
  NULL, NULL, NULL, NULL, NULL,
854
  DATA_RCDATA
855
},
856
{ "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
857
  NULL, NULL, NULL, NULL, NULL,
858
  0
859
},
860
{ "th",   0, 1, 0, 0, 0, 0, 0, "table header cell",
861
  NULL, NULL, NULL, NULL, NULL,
862
  0
863
},
864
{ "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
865
  NULL, NULL, NULL, NULL, NULL,
866
  0
867
},
868
{ "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
869
  NULL, NULL, NULL, NULL, NULL,
870
  DATA_RCDATA
871
},
872
{ "tr",   0, 0, 0, 0, 0, 0, 0, "table row ",
873
  NULL, NULL, NULL, NULL, NULL,
874
  0
875
},
876
{ "track",  0, 0, 2, 1, 0, 0, 0, "",
877
  NULL, NULL, NULL, NULL, NULL,
878
  0
879
},
880
{ "tt",   0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
881
  NULL, NULL, NULL, NULL, NULL,
882
  0
883
},
884
{ "u",    0, 3, 0, 0, 1, 1, 1, "underlined text style",
885
  NULL, NULL, NULL, NULL, NULL,
886
  0
887
},
888
{ "ul",   0, 0, 0, 0, 0, 0, 0, "unordered list ",
889
  NULL, NULL, NULL, NULL, NULL,
890
  0
891
},
892
{ "var",  0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
893
  NULL, NULL, NULL, NULL, NULL,
894
  0
895
},
896
{ "wbr",  0, 0, 2, 1, 0, 0, 0, "",
897
  NULL, NULL, NULL, NULL, NULL,
898
  0
899
},
900
{ "xmp",  0, 0, 0, 0, 0, 0, 1, "",
901
  NULL, NULL, NULL, NULL, NULL,
902
  DATA_RAWTEXT
903
}
904
};
905
906
/*
 * One auto-close rule: while an element named oldTag is the current
 * element, a start tag named newTag implies the end of oldTag.
 */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * start tags that imply the end of current element
 *
 * Rules are grouped by oldTag. The pairs are in ascending order by
 * oldTag and then by newTag. NOTE(review): the lookup code is not
 * visible here and presumably depends on that ordering, so keep the
 * table sorted when adding rules.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    /* big */
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd */
    { "dd", "dt" },
    /* dir */
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    /* dl */
    { "dl", "form" }, { "dl", "li" },
    /* dt */
    { "dt", "dd" }, { "dt", "dl" },
    /* font */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    /* form */
    { "form", "form" },
    /* h1 - h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr */
    { "hr", "form" },
    /* i */
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend */
    { "legend", "fieldset" },
    /* li */
    { "li", "li" },
    /* link */
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    /* ol */
    { "ol", "form" },
    /* option */
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s */
    { "s", "p" },
    /* script */
    { "script", "noscript" },
    /* small */
    { "small", "p" },
    /* span */
    { "span", "td" }, { "span", "th" },
    /* strike */
    { "strike", "p" },
    /* style */
    { "style", "body" }, { "style", "frameset" },
    /* tbody */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    /* td */
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    /* tfoot */
    { "tfoot", "tbody" },
    /* th */
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    /* thead */
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title */
    { "title", "body" }, { "title", "frameset" },
    /* tr */
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt */
    { "tt", "p" },
    /* u */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    /* ul */
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1165
1166
/*
 * The list of HTML attributes which are of content %Script;
 * NOTE: when adding ones, check #htmlIsScriptAttribute since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1191
1192
/*
 * End-tag priorities, used by the HTML parser to recover from broken
 * pages. Each element gets a priority; a stray end tag may only close
 * open elements whose priority is lower than or equal to its own.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end-tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    /* Terminator: also supplies the priority of every unlisted element */
    { NULL,    100 }
};
1219
1220
/************************************************************************
1221
 *                  *
1222
 *  functions to handle HTML specific data      *
1223
 *                  *
1224
 ************************************************************************/
1225
1226
static void
1227
0
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
1228
    /*
1229
     * Capture end position and add node
1230
     */
1231
0
    if ( ctxt->node != NULL && ctxt->record_info ) {
1232
0
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
1233
0
                                (CUR_PTR - ctxt->input->base);
1234
0
       ctxt->nodeInfo->end_line = ctxt->input->line;
1235
0
       ctxt->nodeInfo->node = ctxt->node;
1236
0
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
1237
0
       htmlNodeInfoPop(ctxt);
1238
0
    }
1239
0
}
1240
1241
/**
 * Kept only so that existing callers keep linking; calling it has no
 * effect.
 *
 * @deprecated This is a no-op.
 */
void
htmlInitAutoClose(void) {
    /* intentionally empty */
}
1247
1248
static int
1249
0
htmlCompareTags(const void *key, const void *member) {
1250
0
    const xmlChar *tag = (const xmlChar *) key;
1251
0
    const htmlElemDesc *desc = (const htmlElemDesc *) member;
1252
1253
0
    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1254
0
}
1255
1256
/**
1257
 * Lookup the HTML tag in the ElementTable
1258
 *
1259
 * @deprecated Only supports HTML 4.
1260
 *
1261
 * @param tag  The tag name in lowercase
1262
 * @returns the related htmlElemDesc or NULL if not found.
1263
 */
1264
const htmlElemDesc *
1265
0
htmlTagLookup(const xmlChar *tag) {
1266
0
    if (tag == NULL)
1267
0
        return(NULL);
1268
1269
0
    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1270
0
                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1271
0
                sizeof(htmlElemDesc), htmlCompareTags));
1272
0
}
1273
1274
/**
1275
 * @param name  The name of the element to look up the priority for.
1276
 * @returns value: The "endtag" priority.
1277
 **/
1278
static int
1279
0
htmlGetEndPriority (const xmlChar *name) {
1280
0
    int i = 0;
1281
1282
0
    while ((htmlEndPriority[i].name != NULL) &&
1283
0
     (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1284
0
  i++;
1285
1286
0
    return(htmlEndPriority[i].priority);
1287
0
}
1288
1289
1290
static int
1291
0
htmlCompareStartClose(const void *vkey, const void *member) {
1292
0
    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1293
0
    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1294
0
    int ret;
1295
1296
0
    ret = strcmp(key->oldTag, entry->oldTag);
1297
0
    if (ret == 0)
1298
0
        ret = strcmp(key->newTag, entry->newTag);
1299
1300
0
    return(ret);
1301
0
}
1302
1303
/**
1304
 * Checks whether the new tag is one of the registered valid tags for
1305
 * closing old.
1306
 *
1307
 * @param newtag  The new tag name
1308
 * @param oldtag  The old tag name
1309
 * @returns 0 if no, 1 if yes.
1310
 */
1311
static int
1312
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1313
0
{
1314
0
    htmlStartCloseEntry key;
1315
0
    void *res;
1316
1317
0
    key.oldTag = (const char *) oldtag;
1318
0
    key.newTag = (const char *) newtag;
1319
0
    res = bsearch(&key, htmlStartClose,
1320
0
            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1321
0
            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1322
0
    return(res != NULL);
1323
0
}
1324
1325
/**
1326
 * The HTML DTD allows an ending tag to implicitly close other tags.
1327
 *
1328
 * @param ctxt  an HTML parser context
1329
 * @param newtag  The new tag name
1330
 */
1331
static void
1332
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1333
0
{
1334
0
    const htmlElemDesc *info;
1335
0
    int i, priority;
1336
1337
0
    if (ctxt->options & HTML_PARSE_HTML5)
1338
0
        return;
1339
1340
0
    priority = htmlGetEndPriority(newtag);
1341
1342
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1343
1344
0
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1345
0
            break;
1346
        /*
1347
         * A misplaced endtag can only close elements with lower
1348
         * or equal priority, so if we find an element with higher
1349
         * priority before we find an element with
1350
         * matching name, we just ignore this endtag
1351
         */
1352
0
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1353
0
            return;
1354
0
    }
1355
0
    if (i < 0)
1356
0
        return;
1357
1358
0
    while (!xmlStrEqual(newtag, ctxt->name)) {
1359
0
        info = htmlTagLookup(ctxt->name);
1360
0
        if ((info != NULL) && (info->endTag == 3)) {
1361
0
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1362
0
                   "Opening and ending tag mismatch: %s and %s\n",
1363
0
       newtag, ctxt->name);
1364
0
        }
1365
0
  htmlParserFinishElementParsing(ctxt);
1366
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1367
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1368
0
  htmlnamePop(ctxt);
1369
0
    }
1370
0
}
1371
1372
/**
1373
 * Close all remaining tags at the end of the stream
1374
 *
1375
 * @param ctxt  an HTML parser context
1376
 */
1377
static void
1378
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1379
0
{
1380
0
    int i;
1381
1382
0
    if (ctxt->options & HTML_PARSE_HTML5)
1383
0
        return;
1384
1385
0
    if (ctxt->nameNr == 0)
1386
0
        return;
1387
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1388
0
  htmlParserFinishElementParsing(ctxt);
1389
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1390
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1391
0
  htmlnamePop(ctxt);
1392
0
    }
1393
0
}
1394
1395
/**
1396
 * The HTML DTD allows a tag to implicitly close other tags.
1397
 * The list is kept in htmlStartClose array. This function is
1398
 * called when a new tag has been detected and generates the
1399
 * appropriates closes if possible/needed.
1400
 * If newtag is NULL this mean we are at the end of the resource
1401
 * and we should check
1402
 *
1403
 * @param ctxt  an HTML parser context
1404
 * @param newtag  The new tag name or NULL
1405
 */
1406
static void
1407
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1408
0
{
1409
0
    if (ctxt->options & HTML_PARSE_HTML5)
1410
0
        return;
1411
1412
0
    if (newtag == NULL)
1413
0
        return;
1414
1415
0
    while ((ctxt->name != NULL) &&
1416
0
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1417
0
  htmlParserFinishElementParsing(ctxt);
1418
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420
0
  htmlnamePop(ctxt);
1421
0
    }
1422
0
}
1423
1424
/**
1425
 * The HTML DTD allows a tag to implicitly close other tags.
1426
 * The list is kept in htmlStartClose array. This function checks
1427
 * if the element or one of it's children would autoclose the
1428
 * given tag.
1429
 *
1430
 * @deprecated Internal function, don't use.
1431
 *
1432
 * @param doc  the HTML document
1433
 * @param name  The tag name
1434
 * @param elem  the HTML element
1435
 * @returns 1 if autoclose, 0 otherwise
1436
 */
1437
int
1438
0
htmlAutoCloseTag(xmlDoc *doc, const xmlChar *name, xmlNode *elem) {
1439
0
    htmlNodePtr child;
1440
1441
0
    if (elem == NULL) return(1);
1442
0
    if (xmlStrEqual(name, elem->name)) return(0);
1443
0
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1444
0
    child = elem->children;
1445
0
    while (child != NULL) {
1446
0
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1447
0
  child = child->next;
1448
0
    }
1449
0
    return(0);
1450
0
}
1451
1452
/**
1453
 * The HTML DTD allows a tag to implicitly close other tags.
1454
 * The list is kept in htmlStartClose array. This function checks
1455
 * if a tag is autoclosed by one of it's child
1456
 *
1457
 * @deprecated Internal function, don't use.
1458
 *
1459
 * @param doc  the HTML document
1460
 * @param elem  the HTML element
1461
 * @returns 1 if autoclosed, 0 otherwise
1462
 */
1463
int
1464
0
htmlIsAutoClosed(xmlDoc *doc, xmlNode *elem) {
1465
0
    htmlNodePtr child;
1466
1467
0
    if (elem == NULL) return(1);
1468
0
    child = elem->children;
1469
0
    while (child != NULL) {
1470
0
  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471
0
  child = child->next;
1472
0
    }
1473
0
    return(0);
1474
0
}
1475
1476
/**
1477
 * The HTML DTD allows a tag to exists only implicitly
1478
 * called when a new tag has been detected and generates the
1479
 * appropriates implicit tags if missing
1480
 *
1481
 * @param ctxt  an HTML parser context
1482
 * @param newtag  The new tag name
1483
 */
1484
static void
1485
0
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1486
0
    int i;
1487
1488
0
    if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1489
0
        return;
1490
0
    if (!htmlOmittedDefaultValue)
1491
0
  return;
1492
0
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1493
0
  return;
1494
0
    if (ctxt->nameNr <= 0) {
1495
0
  htmlnamePush(ctxt, BAD_CAST"html");
1496
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1497
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1498
0
    }
1499
0
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1500
0
        return;
1501
0
    if ((ctxt->nameNr <= 1) &&
1502
0
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1503
0
   (xmlStrEqual(newtag, BAD_CAST"style")) ||
1504
0
   (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1505
0
   (xmlStrEqual(newtag, BAD_CAST"link")) ||
1506
0
   (xmlStrEqual(newtag, BAD_CAST"title")) ||
1507
0
   (xmlStrEqual(newtag, BAD_CAST"base")))) {
1508
0
        if (ctxt->html >= INSERT_IN_HEAD) {
1509
            /* we already saw or generated an <head> before */
1510
0
            return;
1511
0
        }
1512
        /*
1513
         * dropped OBJECT ... i you put it first BODY will be
1514
         * assumed !
1515
         */
1516
0
        htmlnamePush(ctxt, BAD_CAST"head");
1517
0
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1518
0
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1519
0
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1520
0
         (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1521
0
         (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1522
0
        if (ctxt->html >= INSERT_IN_BODY) {
1523
            /* we already saw or generated a <body> before */
1524
0
            return;
1525
0
        }
1526
0
  for (i = 0;i < ctxt->nameNr;i++) {
1527
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1528
0
    return;
1529
0
      }
1530
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1531
0
    return;
1532
0
      }
1533
0
  }
1534
1535
0
  htmlnamePush(ctxt, BAD_CAST"body");
1536
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1537
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1538
0
    }
1539
0
}
1540
1541
/**
1542
 * Prepare for non-whitespace character data.
1543
 *
1544
 * @param ctxt  an HTML parser context
1545
 */
1546
1547
static void
1548
0
htmlStartCharData(htmlParserCtxtPtr ctxt) {
1549
0
    if (ctxt->options & (HTML_PARSE_NOIMPLIED | HTML_PARSE_HTML5))
1550
0
        return;
1551
0
    if (!htmlOmittedDefaultValue)
1552
0
  return;
1553
1554
0
    if (xmlStrEqual(ctxt->name, BAD_CAST "head"))
1555
0
        htmlAutoClose(ctxt, BAD_CAST "p");
1556
0
    htmlCheckImplied(ctxt, BAD_CAST "p");
1557
0
}
1558
1559
/**
1560
 * Check if an attribute is of content type Script
1561
 *
1562
 * @deprecated Only supports HTML 4.
1563
 *
1564
 * @param name  an attribute name
1565
 * @returns 1 is the attribute is a script 0 otherwise
1566
 */
1567
int
1568
0
htmlIsScriptAttribute(const xmlChar *name) {
1569
0
    unsigned int i;
1570
1571
0
    if (name == NULL)
1572
0
      return(0);
1573
    /*
1574
     * all script attributes start with 'on'
1575
     */
1576
0
    if ((name[0] != 'o') || (name[1] != 'n'))
1577
0
      return(0);
1578
0
    for (i = 0;
1579
0
   i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1580
0
   i++) {
1581
0
  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1582
0
      return(1);
1583
0
    }
1584
0
    return(0);
1585
0
}
1586
1587
/************************************************************************
1588
 *                  *
1589
 *  The list of HTML predefined entities      *
1590
 *                  *
1591
 ************************************************************************/
1592
1593
1594
/*
 * Each entry is { code point, entity name, description }.
 *
 * The entries MUST stay sorted by ascending code point:
 * htmlEntityValueLookup() searches this table with bsearch().
 */
static const htmlEntityDesc html40EntitiesTable[] = {
    /*
     * the 4 absolute ones, plus apostrophe.
     */
    { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
    { 38, "amp", "ampersand, U+0026 ISOnum" },
    { 39, "apos", "single quote" },
    { 60, "lt", "less-than sign, U+003C ISOnum" },
    { 62, "gt", "greater-than sign, U+003E ISOnum" },

    /*
     * A bunch still in the 128-255 range
     * Replacing them depend really on the charset used.
     */
    { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
    { 161, "iexcl", "inverted exclamation mark, U+00A1 ISOnum" },
    { 162, "cent", "cent sign, U+00A2 ISOnum" },
    { 163, "pound", "pound sign, U+00A3 ISOnum" },
    { 164, "curren", "currency sign, U+00A4 ISOnum" },
    { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
    { 166, "brvbar", "broken bar = broken vertical bar, U+00A6 ISOnum" },
    { 167, "sect", "section sign, U+00A7 ISOnum" },
    { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
    { 169, "copy", "copyright sign, U+00A9 ISOnum" },
    { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
    { 171, "laquo", "left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
    { 172, "not", "not sign, U+00AC ISOnum" },
    { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
    { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
    { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
    { 176, "deg", "degree sign, U+00B0 ISOnum" },
    { 177, "plusmn", "plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
    { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
    { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
    { 180, "acute", "acute accent = spacing acute, U+00B4 ISOdia" },
    { 181, "micro", "micro sign, U+00B5 ISOnum" },
    { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
    { 183, "middot", "middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
    { 184, "cedil", "cedilla = spacing cedilla, U+00B8 ISOdia" },
    { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
    { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
    { 187, "raquo", "right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
    { 188, "frac14", "vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
    { 189, "frac12", "vulgar fraction one half = fraction one half, U+00BD ISOnum" },
    { 190, "frac34", "vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
    { 191, "iquest", "inverted question mark = turned question mark, U+00BF ISOnum" },
    { 192, "Agrave", "latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
    { 193, "Aacute", "latin capital letter A with acute, U+00C1 ISOlat1" },
    { 194, "Acirc", "latin capital letter A with circumflex, U+00C2 ISOlat1" },
    { 195, "Atilde", "latin capital letter A with tilde, U+00C3 ISOlat1" },
    { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
    { 197, "Aring", "latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
    { 198, "AElig", "latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
    { 199, "Ccedil", "latin capital letter C with cedilla, U+00C7 ISOlat1" },
    { 200, "Egrave", "latin capital letter E with grave, U+00C8 ISOlat1" },
    { 201, "Eacute", "latin capital letter E with acute, U+00C9 ISOlat1" },
    { 202, "Ecirc", "latin capital letter E with circumflex, U+00CA ISOlat1" },
    { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
    { 204, "Igrave", "latin capital letter I with grave, U+00CC ISOlat1" },
    { 205, "Iacute", "latin capital letter I with acute, U+00CD ISOlat1" },
    { 206, "Icirc", "latin capital letter I with circumflex, U+00CE ISOlat1" },
    { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
    { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
    { 209, "Ntilde", "latin capital letter N with tilde, U+00D1 ISOlat1" },
    { 210, "Ograve", "latin capital letter O with grave, U+00D2 ISOlat1" },
    { 211, "Oacute", "latin capital letter O with acute, U+00D3 ISOlat1" },
    { 212, "Ocirc", "latin capital letter O with circumflex, U+00D4 ISOlat1" },
    { 213, "Otilde", "latin capital letter O with tilde, U+00D5 ISOlat1" },
    { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
    { 215, "times", "multiplication sign, U+00D7 ISOnum" },
    { 216, "Oslash", "latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
    { 217, "Ugrave", "latin capital letter U with grave, U+00D9 ISOlat1" },
    { 218, "Uacute", "latin capital letter U with acute, U+00DA ISOlat1" },
    { 219, "Ucirc", "latin capital letter U with circumflex, U+00DB ISOlat1" },
    { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
    { 221, "Yacute", "latin capital letter Y with acute, U+00DD ISOlat1" },
    { 222, "THORN", "latin capital letter THORN, U+00DE ISOlat1" },
    { 223, "szlig", "latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
    { 224, "agrave", "latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
    { 225, "aacute", "latin small letter a with acute, U+00E1 ISOlat1" },
    { 226, "acirc", "latin small letter a with circumflex, U+00E2 ISOlat1" },
    { 227, "atilde", "latin small letter a with tilde, U+00E3 ISOlat1" },
    { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
    { 229, "aring", "latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
    { 230, "aelig", "latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
    { 231, "ccedil", "latin small letter c with cedilla, U+00E7 ISOlat1" },
    { 232, "egrave", "latin small letter e with grave, U+00E8 ISOlat1" },
    { 233, "eacute", "latin small letter e with acute, U+00E9 ISOlat1" },
    { 234, "ecirc", "latin small letter e with circumflex, U+00EA ISOlat1" },
    { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
    { 236, "igrave", "latin small letter i with grave, U+00EC ISOlat1" },
    { 237, "iacute", "latin small letter i with acute, U+00ED ISOlat1" },
    { 238, "icirc", "latin small letter i with circumflex, U+00EE ISOlat1" },
    { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
    { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
    { 241, "ntilde", "latin small letter n with tilde, U+00F1 ISOlat1" },
    { 242, "ograve", "latin small letter o with grave, U+00F2 ISOlat1" },
    { 243, "oacute", "latin small letter o with acute, U+00F3 ISOlat1" },
    { 244, "ocirc", "latin small letter o with circumflex, U+00F4 ISOlat1" },
    { 245, "otilde", "latin small letter o with tilde, U+00F5 ISOlat1" },
    { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
    { 247, "divide", "division sign, U+00F7 ISOnum" },
    { 248, "oslash", "latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
    { 249, "ugrave", "latin small letter u with grave, U+00F9 ISOlat1" },
    { 250, "uacute", "latin small letter u with acute, U+00FA ISOlat1" },
    { 251, "ucirc", "latin small letter u with circumflex, U+00FB ISOlat1" },
    { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
    { 253, "yacute", "latin small letter y with acute, U+00FD ISOlat1" },
    { 254, "thorn", "latin small letter thorn with, U+00FE ISOlat1" },
    { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

    { 338, "OElig", "latin capital ligature OE, U+0152 ISOlat2" },
    { 339, "oelig", "latin small ligature oe, U+0153 ISOlat2" },
    { 352, "Scaron", "latin capital letter S with caron, U+0160 ISOlat2" },
    { 353, "scaron", "latin small letter s with caron, U+0161 ISOlat2" },
    { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

    /*
     * Anything below should really be kept as entities references
     */
    { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

    { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
    { 732, "tilde", "small tilde, U+02DC ISOdia" },

    { 913, "Alpha", "greek capital letter alpha, U+0391" },
    { 914, "Beta", "greek capital letter beta, U+0392" },
    { 915, "Gamma", "greek capital letter gamma, U+0393 ISOgrk3" },
    { 916, "Delta", "greek capital letter delta, U+0394 ISOgrk3" },
    { 917, "Epsilon", "greek capital letter epsilon, U+0395" },
    { 918, "Zeta", "greek capital letter zeta, U+0396" },
    { 919, "Eta", "greek capital letter eta, U+0397" },
    { 920, "Theta", "greek capital letter theta, U+0398 ISOgrk3" },
    { 921, "Iota", "greek capital letter iota, U+0399" },
    { 922, "Kappa", "greek capital letter kappa, U+039A" },
    { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
    { 924, "Mu", "greek capital letter mu, U+039C" },
    { 925, "Nu", "greek capital letter nu, U+039D" },
    { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
    { 927, "Omicron", "greek capital letter omicron, U+039F" },
    { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
    { 929, "Rho", "greek capital letter rho, U+03A1" },
    { 931, "Sigma", "greek capital letter sigma, U+03A3 ISOgrk3" },
    { 932, "Tau", "greek capital letter tau, U+03A4" },
    { 933, "Upsilon", "greek capital letter upsilon, U+03A5 ISOgrk3" },
    { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
    { 935, "Chi", "greek capital letter chi, U+03A7" },
    { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
    { 937, "Omega", "greek capital letter omega, U+03A9 ISOgrk3" },

    { 945, "alpha", "greek small letter alpha, U+03B1 ISOgrk3" },
    { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
    { 947, "gamma", "greek small letter gamma, U+03B3 ISOgrk3" },
    { 948, "delta", "greek small letter delta, U+03B4 ISOgrk3" },
    { 949, "epsilon", "greek small letter epsilon, U+03B5 ISOgrk3" },
    { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
    { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
    { 952, "theta", "greek small letter theta, U+03B8 ISOgrk3" },
    { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
    { 954, "kappa", "greek small letter kappa, U+03BA ISOgrk3" },
    { 955, "lambda", "greek small letter lambda, U+03BB ISOgrk3" },
    { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
    { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
    { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
    { 959, "omicron", "greek small letter omicron, U+03BF NEW" },
    { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
    { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
    { 962, "sigmaf", "greek small letter final sigma, U+03C2 ISOgrk3" },
    { 963, "sigma", "greek small letter sigma, U+03C3 ISOgrk3" },
    { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
    { 965, "upsilon", "greek small letter upsilon, U+03C5 ISOgrk3" },
    { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
    { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
    { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
    { 969, "omega", "greek small letter omega, U+03C9 ISOgrk3" },
    { 977, "thetasym", "greek small letter theta symbol, U+03D1 NEW" },
    { 978, "upsih", "greek upsilon with hook symbol, U+03D2 NEW" },
    { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

    { 8194, "ensp", "en space, U+2002 ISOpub" },
    { 8195, "emsp", "em space, U+2003 ISOpub" },
    { 8201, "thinsp", "thin space, U+2009 ISOpub" },
    { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
    { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
    { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
    { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
    { 8211, "ndash", "en dash, U+2013 ISOpub" },
    { 8212, "mdash", "em dash, U+2014 ISOpub" },
    { 8216, "lsquo", "left single quotation mark, U+2018 ISOnum" },
    { 8217, "rsquo", "right single quotation mark, U+2019 ISOnum" },
    { 8218, "sbquo", "single low-9 quotation mark, U+201A NEW" },
    { 8220, "ldquo", "left double quotation mark, U+201C ISOnum" },
    { 8221, "rdquo", "right double quotation mark, U+201D ISOnum" },
    { 8222, "bdquo", "double low-9 quotation mark, U+201E NEW" },
    { 8224, "dagger", "dagger, U+2020 ISOpub" },
    { 8225, "Dagger", "double dagger, U+2021 ISOpub" },

    { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
    { 8230, "hellip", "horizontal ellipsis = three dot leader, U+2026 ISOpub" },

    { 8240, "permil", "per mille sign, U+2030 ISOtech" },

    { 8242, "prime", "prime = minutes = feet, U+2032 ISOtech" },
    { 8243, "Prime", "double prime = seconds = inches, U+2033 ISOtech" },

    { 8249, "lsaquo", "single left-pointing angle quotation mark, U+2039 ISO proposed" },
    { 8250, "rsaquo", "single right-pointing angle quotation mark, U+203A ISO proposed" },

    { 8254, "oline", "overline = spacing overscore, U+203E NEW" },
    { 8260, "frasl", "fraction slash, U+2044 NEW" },

    { 8364, "euro", "euro sign, U+20AC NEW" },

    { 8465, "image", "blackletter capital I = imaginary part, U+2111 ISOamso" },
    { 8472, "weierp", "script capital P = power set = Weierstrass p, U+2118 ISOamso" },
    { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
    { 8482, "trade", "trade mark sign, U+2122 ISOnum" },
    { 8501, "alefsym", "alef symbol = first transfinite cardinal, U+2135 NEW" },
    { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
    { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
    { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
    { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
    { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
    { 8629, "crarr", "downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
    { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
    { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
    { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
    { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
    { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

    { 8704, "forall", "for all, U+2200 ISOtech" },
    { 8706, "part", "partial differential, U+2202 ISOtech" },
    { 8707, "exist", "there exists, U+2203 ISOtech" },
    { 8709, "empty", "empty set = null set = diameter, U+2205 ISOamso" },
    { 8711, "nabla", "nabla = backward difference, U+2207 ISOtech" },
    { 8712, "isin", "element of, U+2208 ISOtech" },
    { 8713, "notin", "not an element of, U+2209 ISOtech" },
    { 8715, "ni", "contains as member, U+220B ISOtech" },
    { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
    { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
    { 8722, "minus", "minus sign, U+2212 ISOtech" },
    { 8727, "lowast", "asterisk operator, U+2217 ISOtech" },
    { 8730, "radic", "square root = radical sign, U+221A ISOtech" },
    { 8733, "prop", "proportional to, U+221D ISOtech" },
    { 8734, "infin", "infinity, U+221E ISOtech" },
    { 8736, "ang", "angle, U+2220 ISOamso" },
    { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
    { 8744, "or", "logical or = vee, U+2228 ISOtech" },
    { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
    { 8746, "cup", "union = cup, U+222A ISOtech" },
    { 8747, "int", "integral, U+222B ISOtech" },
    { 8756, "there4", "therefore, U+2234 ISOtech" },
    { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
    { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
    { 8776, "asymp", "almost equal to = asymptotic to, U+2248 ISOamsr" },
    { 8800, "ne", "not equal to, U+2260 ISOtech" },
    { 8801, "equiv", "identical to, U+2261 ISOtech" },
    { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
    { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
    { 8834, "sub", "subset of, U+2282 ISOtech" },
    { 8835, "sup", "superset of, U+2283 ISOtech" },
    { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
    { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
    { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
    { 8853, "oplus", "circled plus = direct sum, U+2295 ISOamsb" },
    { 8855, "otimes", "circled times = vector product, U+2297 ISOamsb" },
    { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
    { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
    { 8968, "lceil", "left ceiling = apl upstile, U+2308 ISOamsc" },
    { 8969, "rceil", "right ceiling, U+2309 ISOamsc" },
    { 8970, "lfloor", "left floor = apl downstile, U+230A ISOamsc" },
    { 8971, "rfloor", "right floor, U+230B ISOamsc" },
    { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
    { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
    { 9674, "loz", "lozenge, U+25CA ISOpub" },

    { 9824, "spades", "black spade suit, U+2660 ISOpub" },
    { 9827, "clubs", "black club suit = shamrock, U+2663 ISOpub" },
    { 9829, "hearts", "black heart suit = valentine, U+2665 ISOpub" },
    { 9830, "diams", "black diamond suit, U+2666 ISOpub" },

};
1876
1877
/************************************************************************
1878
 *                  *
1879
 *    Commodity functions to handle entities      *
1880
 *                  *
1881
 ************************************************************************/
1882
1883
/**
1884
 * Lookup the given entity in EntitiesTable
1885
 *
1886
 * @deprecated Only supports HTML 4.
1887
 *
1888
 * TODO: the linear scan is really ugly, an hash table is really needed.
1889
 *
1890
 * @param name  the entity name
1891
 * @returns the associated htmlEntityDesc if found, NULL otherwise.
1892
 */
1893
const htmlEntityDesc *
1894
0
htmlEntityLookup(const xmlChar *name) {
1895
0
    unsigned int i;
1896
1897
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
1898
0
                    sizeof(html40EntitiesTable[0]));i++) {
1899
0
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1900
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1901
0
  }
1902
0
    }
1903
0
    return(NULL);
1904
0
}
1905
1906
static int
1907
0
htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
1908
0
    const unsigned *key = vkey;
1909
0
    const htmlEntityDesc *desc = vdesc;
1910
1911
0
    return((int) *key - (int) desc->value);
1912
0
}
1913
1914
/**
1915
 * Lookup the given entity in EntitiesTable
1916
 *
1917
 * @deprecated Only supports HTML 4.
1918
 *
1919
 * TODO: the linear scan is really ugly, an hash table is really needed.
1920
 *
1921
 * @param value  the entity's unicode value
1922
 * @returns the associated htmlEntityDesc if found, NULL otherwise.
1923
 */
1924
const htmlEntityDesc *
1925
0
htmlEntityValueLookup(unsigned int value) {
1926
0
    const htmlEntityDesc *desc;
1927
0
    size_t nmemb;
1928
1929
0
    nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
1930
0
    desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
1931
0
                   htmlCompareEntityDesc);
1932
1933
0
    return(desc);
1934
0
}
1935
1936
/**
1937
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1938
 * plus HTML entities block of chars out.
1939
 *
1940
 * @deprecated Internal function, don't use.
1941
 *
1942
 * @param out  a pointer to an array of bytes to store the result
1943
 * @param outlen  the length of `out`
1944
 * @param in  a pointer to an array of UTF-8 chars
1945
 * @param inlen  the length of `in`
1946
 * @returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1947
 * The value of `inlen` after return is the number of octets consumed
1948
 *     as the return value is positive, else unpredictable.
1949
 * The value of `outlen` after return is the number of octets consumed.
1950
 */
1951
int
1952
htmlUTF8ToHtml(unsigned char* out, int *outlen,
1953
0
               const unsigned char* in, int *inlen) {
1954
0
    const unsigned char* instart = in;
1955
0
    const unsigned char* inend;
1956
0
    unsigned char* outstart = out;
1957
0
    unsigned char* outend;
1958
0
    int ret = XML_ENC_ERR_SPACE;
1959
1960
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL))
1961
0
        return(XML_ENC_ERR_INTERNAL);
1962
1963
0
    if (in == NULL) {
1964
        /*
1965
   * initialization nothing to do
1966
   */
1967
0
  *outlen = 0;
1968
0
  *inlen = 0;
1969
0
  return(XML_ENC_ERR_SUCCESS);
1970
0
    }
1971
1972
0
    inend = in + *inlen;
1973
0
    outend = out + *outlen;
1974
0
    while (in < inend) {
1975
0
        const htmlEntityDesc *ent;
1976
0
        const char *cp;
1977
0
        char nbuf[16];
1978
0
        unsigned c, d;
1979
0
        int seqlen, len, i;
1980
1981
0
  d = *in;
1982
1983
0
  if (d < 0x80) {
1984
0
            if (out >= outend)
1985
0
                goto done;
1986
0
            *out++ = d;
1987
0
            in += 1;
1988
0
            continue;
1989
0
        }
1990
1991
0
        if (d < 0xE0)      { c = d & 0x1F; seqlen = 2; }
1992
0
        else if (d < 0xF0) { c = d & 0x0F; seqlen = 3; }
1993
0
        else               { c = d & 0x07; seqlen = 4; }
1994
1995
0
  if (inend - in < seqlen)
1996
0
      break;
1997
1998
0
  for (i = 1; i < seqlen; i++) {
1999
0
      d = in[i];
2000
0
      c <<= 6;
2001
0
      c |= d & 0x3F;
2002
0
  }
2003
2004
        /*
2005
         * Try to lookup a predefined HTML entity for it
2006
         */
2007
0
        ent = htmlEntityValueLookup(c);
2008
2009
0
        if (ent == NULL) {
2010
0
          snprintf(nbuf, sizeof(nbuf), "#%u", c);
2011
0
          cp = nbuf;
2012
0
        } else {
2013
0
          cp = ent->name;
2014
0
        }
2015
2016
0
        len = strlen(cp);
2017
0
        if (outend - out < len + 2)
2018
0
            goto done;
2019
2020
0
        *out++ = '&';
2021
0
        memcpy(out, cp, len);
2022
0
        out += len;
2023
0
        *out++ = ';';
2024
2025
0
        in += seqlen;
2026
0
    }
2027
2028
0
    ret = out - outstart;
2029
2030
0
done:
2031
0
    *outlen = out - outstart;
2032
0
    *inlen = in - instart;
2033
0
    return(ret);
2034
0
}
2035
2036
/**
2037
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2038
 * plus HTML entities block of chars out.
2039
 *
2040
 * @deprecated Only supports HTML 4.
2041
 *
2042
 * @param out  a pointer to an array of bytes to store the result
2043
 * @param outlen  the length of `out`
2044
 * @param in  a pointer to an array of UTF-8 chars
2045
 * @param inlen  the length of `in`
2046
 * @param quoteChar  the quote character to escape (' or ") or zero.
2047
 * @returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2048
 * The value of `inlen` after return is the number of octets consumed
2049
 *     as the return value is positive, else unpredictable.
2050
 * The value of `outlen` after return is the number of octets consumed.
2051
 */
2052
int
2053
htmlEncodeEntities(unsigned char* out, int *outlen,
2054
0
       const unsigned char* in, int *inlen, int quoteChar) {
2055
0
    const unsigned char* processed = in;
2056
0
    const unsigned char* outend;
2057
0
    const unsigned char* outstart = out;
2058
0
    const unsigned char* instart = in;
2059
0
    const unsigned char* inend;
2060
0
    unsigned int c, d;
2061
0
    int trailing;
2062
2063
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2064
0
        return(-1);
2065
0
    outend = out + (*outlen);
2066
0
    inend = in + (*inlen);
2067
0
    while (in < inend) {
2068
0
  d = *in++;
2069
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2070
0
  else if (d < 0xC0) {
2071
      /* trailing byte in leading position */
2072
0
      *outlen = out - outstart;
2073
0
      *inlen = processed - instart;
2074
0
      return(-2);
2075
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2076
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2077
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2078
0
  else {
2079
      /* no chance for this in Ascii */
2080
0
      *outlen = out - outstart;
2081
0
      *inlen = processed - instart;
2082
0
      return(-2);
2083
0
  }
2084
2085
0
  if (inend - in < trailing)
2086
0
      break;
2087
2088
0
  while (trailing--) {
2089
0
      if (((d= *in++) & 0xC0) != 0x80) {
2090
0
    *outlen = out - outstart;
2091
0
    *inlen = processed - instart;
2092
0
    return(-2);
2093
0
      }
2094
0
      c <<= 6;
2095
0
      c |= d & 0x3F;
2096
0
  }
2097
2098
  /* assertion: c is a single UTF-4 value */
2099
0
  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2100
0
      (c != '&') && (c != '<') && (c != '>')) {
2101
0
      if (out >= outend)
2102
0
    break;
2103
0
      *out++ = c;
2104
0
  } else {
2105
0
      const htmlEntityDesc * ent;
2106
0
      const char *cp;
2107
0
      char nbuf[16];
2108
0
      int len;
2109
2110
      /*
2111
       * Try to lookup a predefined HTML entity for it
2112
       */
2113
0
      ent = htmlEntityValueLookup(c);
2114
0
      if (ent == NULL) {
2115
0
    snprintf(nbuf, sizeof(nbuf), "#%u", c);
2116
0
    cp = nbuf;
2117
0
      }
2118
0
      else
2119
0
    cp = ent->name;
2120
0
      len = strlen(cp);
2121
0
      if (outend - out < len + 2)
2122
0
    break;
2123
0
      *out++ = '&';
2124
0
      memcpy(out, cp, len);
2125
0
      out += len;
2126
0
      *out++ = ';';
2127
0
  }
2128
0
  processed = in;
2129
0
    }
2130
0
    *outlen = out - outstart;
2131
0
    *inlen = processed - instart;
2132
0
    return(0);
2133
0
}
2134
2135
/************************************************************************
2136
 *                  *
2137
 *    Commodity functions, cleanup needed ?     *
2138
 *                  *
2139
 ************************************************************************/
2140
/*
 * Names of all elements that allow PCDATA according to the HTML 4.01
 * loose DTD, one row per initial letter.
 * NOTE: this information might fit better in the html40ElementTable
 * array, but it is kept separate to avoid any risk of binary
 * incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2154
2155
/**
2156
 * Is this a sequence of blank chars that one can ignore ?
2157
 *
2158
 * @param ctxt  an HTML parser context
2159
 * @param str  a xmlChar *
2160
 * @param len  the size of `str`
2161
 * @returns 1 if ignorable 0 if whitespace, -1 otherwise.
2162
 */
2163
2164
0
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2165
0
    unsigned int i;
2166
0
    int j;
2167
0
    xmlNodePtr lastChild;
2168
0
    xmlDtdPtr dtd;
2169
2170
0
    for (j = 0;j < len;j++)
2171
0
        if (!(IS_WS_HTML(str[j]))) return(-1);
2172
2173
0
    if (CUR == 0) return(1);
2174
0
    if (CUR != '<') return(0);
2175
0
    if (ctxt->name == NULL)
2176
0
  return(1);
2177
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2178
0
  return(1);
2179
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2180
0
  return(1);
2181
2182
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2183
0
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2184
0
        dtd = xmlGetIntSubset(ctxt->myDoc);
2185
0
        if (dtd != NULL && dtd->ExternalID != NULL) {
2186
0
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2187
0
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2188
0
                return(1);
2189
0
        }
2190
0
    }
2191
2192
0
    if (ctxt->node == NULL) return(0);
2193
0
    lastChild = xmlGetLastChild(ctxt->node);
2194
0
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2195
0
  lastChild = lastChild->prev;
2196
0
    if (lastChild == NULL) {
2197
0
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2198
0
            (ctxt->node->content != NULL)) return(0);
2199
  /* keep ws in constructs like ...<b> </b>...
2200
     for all tags "b" allowing PCDATA */
2201
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2202
0
      if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2203
0
    return(0);
2204
0
      }
2205
0
  }
2206
0
    } else if (xmlNodeIsText(lastChild)) {
2207
0
        return(0);
2208
0
    } else {
2209
  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2210
     for all tags "p" allowing PCDATA */
2211
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2212
0
      if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2213
0
    return(0);
2214
0
      }
2215
0
  }
2216
0
    }
2217
0
    return(1);
2218
0
}
2219
2220
/**
2221
 * Creates a new HTML document without a DTD node if `URI` and `publicId`
2222
 * are NULL
2223
 *
2224
 * @param URI  system ID (URI) of the DTD (optional)
2225
 * @param publicId  public ID of the DTD (optional)
2226
 * @returns a new document, do not initialize the DTD if not provided
2227
 */
2228
xmlDoc *
2229
0
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *publicId) {
2230
0
    xmlDocPtr cur;
2231
2232
    /*
2233
     * Allocate a new document and fill the fields.
2234
     */
2235
0
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2236
0
    if (cur == NULL)
2237
0
  return(NULL);
2238
0
    memset(cur, 0, sizeof(xmlDoc));
2239
2240
0
    cur->type = XML_HTML_DOCUMENT_NODE;
2241
0
    cur->version = NULL;
2242
0
    cur->intSubset = NULL;
2243
0
    cur->doc = cur;
2244
0
    cur->name = NULL;
2245
0
    cur->children = NULL;
2246
0
    cur->extSubset = NULL;
2247
0
    cur->oldNs = NULL;
2248
0
    cur->encoding = NULL;
2249
0
    cur->standalone = 1;
2250
0
    cur->compression = 0;
2251
0
    cur->ids = NULL;
2252
0
    cur->refs = NULL;
2253
0
    cur->_private = NULL;
2254
0
    cur->charset = XML_CHAR_ENCODING_UTF8;
2255
0
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2256
0
    if ((publicId != NULL) ||
2257
0
  (URI != NULL)) {
2258
0
        xmlDtdPtr intSubset;
2259
2260
0
  intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", publicId, URI);
2261
0
        if (intSubset == NULL) {
2262
0
            xmlFree(cur);
2263
0
            return(NULL);
2264
0
        }
2265
0
    }
2266
0
    if ((xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2267
0
  xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2268
0
    return(cur);
2269
0
}
2270
2271
/**
2272
 * Creates a new HTML document
2273
 *
2274
 * @param URI  system ID (URI) of the DTD (optional)
2275
 * @param publicId  public ID of the DTD (optional)
2276
 * @returns a new document
2277
 */
2278
xmlDoc *
2279
0
htmlNewDoc(const xmlChar *URI, const xmlChar *publicId) {
2280
0
    if ((URI == NULL) && (publicId == NULL))
2281
0
  return(htmlNewDocNoDtD(
2282
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2283
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2284
2285
0
    return(htmlNewDocNoDtD(URI, publicId));
2286
0
}
2287
2288
2289
/************************************************************************
2290
 *                  *
2291
 *      The parser itself       *
2292
 *  Relates to http://www.w3.org/TR/html40        *
2293
 *                  *
2294
 ************************************************************************/
2295
2296
/************************************************************************
2297
 *                  *
2298
 *      The parser itself       *
2299
 *                  *
2300
 ************************************************************************/
2301
2302
/**
2303
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2304
 * since HTML names are not case-sensitive.
2305
 *
2306
 * @param ctxt  an HTML parser context
2307
 * @param attr  whether this is an attribute name
2308
 * @returns the Tag Name parsed or NULL
2309
 */
2310
2311
static xmlHashedString
2312
0
htmlParseHTMLName(htmlParserCtxtPtr ctxt, int attr) {
2313
0
    xmlHashedString ret;
2314
0
    xmlChar buf[HTML_PARSER_BUFFER_SIZE];
2315
0
    const xmlChar *in;
2316
0
    size_t avail;
2317
0
    int eof = PARSER_PROGRESSIVE(ctxt);
2318
0
    int nbchar = 0;
2319
0
    int stop = attr ? '=' : ' ';
2320
2321
0
    in = ctxt->input->cur;
2322
0
    avail = ctxt->input->end - in;
2323
2324
0
    while (1) {
2325
0
        int c, size;
2326
2327
0
        if ((!eof) && (avail < 32)) {
2328
0
            size_t oldAvail = avail;
2329
2330
0
            ctxt->input->cur = in;
2331
2332
0
            SHRINK;
2333
0
            xmlParserGrow(ctxt);
2334
2335
0
            in = ctxt->input->cur;
2336
0
            avail = ctxt->input->end - in;
2337
2338
0
            if (oldAvail == avail)
2339
0
                eof = 1;
2340
0
        }
2341
2342
0
        if (avail == 0)
2343
0
            break;
2344
2345
0
        c = *in;
2346
0
        size = 1;
2347
2348
0
        if ((nbchar != 0) &&
2349
0
            ((c == '/') || (c == '>') || (c == stop) ||
2350
0
             (IS_WS_HTML(c))))
2351
0
            break;
2352
2353
0
        if (c == 0) {
2354
0
            if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2355
0
                buf[nbchar++] = 0xEF;
2356
0
                buf[nbchar++] = 0xBF;
2357
0
                buf[nbchar++] = 0xBD;
2358
0
            }
2359
0
        } else if (c < 0x80) {
2360
0
            if (nbchar < HTML_PARSER_BUFFER_SIZE) {
2361
0
                if (IS_UPPER(c))
2362
0
                    c += 0x20;
2363
0
                buf[nbchar++] = c;
2364
0
            }
2365
0
        } else {
2366
0
            size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
2367
2368
0
            if (size > 0) {
2369
0
                if (nbchar + size <= HTML_PARSER_BUFFER_SIZE) {
2370
0
                    memcpy(buf + nbchar, in, size);
2371
0
                    nbchar += size;
2372
0
                }
2373
0
            } else {
2374
0
                size = 1;
2375
2376
0
                if (nbchar + 3 <= HTML_PARSER_BUFFER_SIZE) {
2377
0
                    buf[nbchar++] = 0xEF;
2378
0
                    buf[nbchar++] = 0xBF;
2379
0
                    buf[nbchar++] = 0xBD;
2380
0
                }
2381
0
            }
2382
0
        }
2383
2384
0
        in += size;
2385
0
        avail -= size;
2386
0
    }
2387
2388
0
    ctxt->input->cur = in;
2389
2390
0
    SHRINK;
2391
2392
0
    ret = xmlDictLookupHashed(ctxt->dict, buf, nbchar);
2393
0
    if (ret.name == NULL)
2394
0
        htmlErrMemory(ctxt);
2395
2396
0
    return(ret);
2397
0
}
2398
2399
static const short htmlC1Remap[32] = {
2400
    0x20AC, 0x0081, 0x201A, 0x0192, 0x201E, 0x2026, 0x2020, 0x2021,
2401
    0x02C6, 0x2030, 0x0160, 0x2039, 0x0152, 0x008D, 0x017D, 0x008F,
2402
    0x0090, 0x2018, 0x2019, 0x201C, 0x201D, 0x2022, 0x2013, 0x2014,
2403
    0x02DC, 0x2122, 0x0161, 0x203A, 0x0153, 0x009D, 0x017E, 0x0178
2404
};
2405
2406
static const xmlChar *
2407
0
htmlCodePointToUtf8(int c, xmlChar *out, int *osize) {
2408
0
    int i = 0;
2409
0
    int bits, hi;
2410
2411
0
    if ((c >= 0x80) && (c < 0xA0)) {
2412
0
        c = htmlC1Remap[c - 0x80];
2413
0
    } else if ((c <= 0) ||
2414
0
               ((c >= 0xD800) && (c < 0xE000)) ||
2415
0
               (c > 0x10FFFF)) {
2416
0
        c = 0xFFFD;
2417
0
    }
2418
2419
0
    if      (c <    0x80) { bits =  0; hi = 0x00; }
2420
0
    else if (c <   0x800) { bits =  6; hi = 0xC0; }
2421
0
    else if (c < 0x10000) { bits = 12; hi = 0xE0; }
2422
0
    else                  { bits = 18; hi = 0xF0; }
2423
2424
0
    out[i++] = (c >> bits) | hi;
2425
2426
0
    while (bits > 0) {
2427
0
        bits -= 6;
2428
0
        out[i++] = ((c >> bits) & 0x3F) | 0x80;
2429
0
    }
2430
2431
0
    *osize = i;
2432
0
    return(out);
2433
0
}
2434
2435
#include "codegen/html5ent.inc"
2436
2437
0
#define ENT_F_SEMICOLON 0x80u
2438
0
#define ENT_F_SUBTABLE  0x40u
2439
0
#define ENT_F_ALL       0xC0u
2440
2441
static const xmlChar *
2442
htmlFindEntityPrefix(const xmlChar *string, size_t slen, int isAttr,
2443
0
                     int *nlen, int *rlen) {
2444
0
    const xmlChar *match = NULL;
2445
0
    unsigned left, right;
2446
0
    int first = string[0];
2447
0
    size_t matchLen = 0;
2448
0
    size_t soff = 1;
2449
2450
0
    if (slen < 2)
2451
0
        return(NULL);
2452
0
    if (!IS_ASCII_LETTER(first))
2453
0
        return(NULL);
2454
2455
    /*
2456
     * Look up range by first character
2457
     */
2458
0
    first &= 63;
2459
0
    left = htmlEntAlpha[first*3] | htmlEntAlpha[first*3+1] << 8;
2460
0
    right = left + htmlEntAlpha[first*3+2];
2461
2462
    /*
2463
     * Binary search
2464
     */
2465
0
    while (left < right) {
2466
0
        const xmlChar *bytes;
2467
0
        unsigned mid;
2468
0
        size_t len;
2469
0
        int cmp;
2470
2471
0
        mid = left + (right - left) / 2;
2472
0
        bytes = htmlEntStrings + htmlEntValues[mid];
2473
0
        len = bytes[0] & ~ENT_F_ALL;
2474
2475
0
        cmp = string[soff] - bytes[1];
2476
2477
0
        if (cmp == 0) {
2478
0
            if (slen < len) {
2479
0
                cmp = strncmp((const char *) string + soff + 1,
2480
0
                              (const char *) bytes + 2,
2481
0
                              slen - 1);
2482
                /* Prefix can never match */
2483
0
                if (cmp == 0)
2484
0
                    break;
2485
0
            } else {
2486
0
                cmp = strncmp((const char *) string + soff + 1,
2487
0
                              (const char *) bytes + 2,
2488
0
                              len - 1);
2489
0
            }
2490
0
        }
2491
2492
0
        if (cmp < 0) {
2493
0
            right = mid;
2494
0
        } else if (cmp > 0) {
2495
0
            left = mid + 1;
2496
0
        } else {
2497
0
            int term = soff + len < slen ? string[soff + len] : 0;
2498
0
            int isAlnum, isTerm;
2499
2500
0
            isAlnum = IS_ALNUM(term);
2501
0
            isTerm = ((term == ';') ||
2502
0
                      ((bytes[0] & ENT_F_SEMICOLON) &&
2503
0
                       ((!isAttr) ||
2504
0
                        ((!isAlnum) && (term != '=')))));
2505
2506
0
            if (isTerm) {
2507
0
                match = bytes + len + 1;
2508
0
                matchLen = soff + len;
2509
0
                if (term == ';')
2510
0
                    matchLen += 1;
2511
0
            }
2512
2513
0
            if (bytes[0] & ENT_F_SUBTABLE) {
2514
0
                if (isTerm)
2515
0
                    match += 2;
2516
2517
0
                if ((isAlnum) && (soff + len < slen)) {
2518
0
                    left = mid + bytes[len + 1];
2519
0
                    right = left + bytes[len + 2];
2520
0
                    soff += len;
2521
0
                    continue;
2522
0
                }
2523
0
            }
2524
2525
0
            break;
2526
0
        }
2527
0
    }
2528
2529
0
    if (match == NULL)
2530
0
        return(NULL);
2531
2532
0
    *nlen = matchLen;
2533
0
    *rlen = match[0];
2534
0
    return(match + 1);
2535
0
}
2536
2537
/**
2538
 * Parse data until terminator is reached.
2539
 *
2540
 * @param ctxt  an HTML parser context
2541
 * @param mask  mask of terminating characters
2542
 * @param comment  true if parsing a comment
2543
 * @param refs  true if references are allowed
2544
 * @param maxLength  maximum output length
2545
 * @returns the parsed string or NULL in case of errors.
2546
 */
2547
2548
static xmlChar *
2549
htmlParseData(htmlParserCtxtPtr ctxt, htmlAsciiMask mask,
2550
0
              int comment, int refs, int maxLength) {
2551
0
    xmlParserInputPtr input = ctxt->input;
2552
0
    xmlChar *ret = NULL;
2553
0
    xmlChar *buffer;
2554
0
    xmlChar utf8Char[4];
2555
0
    size_t buffer_size;
2556
0
    size_t used;
2557
0
    int eof = PARSER_PROGRESSIVE(ctxt);
2558
0
    int line, col;
2559
0
    int termSkip = -1;
2560
2561
0
    used = 0;
2562
0
    buffer_size = ctxt->spaceMax;
2563
0
    buffer = (xmlChar *) ctxt->spaceTab;
2564
0
    if (buffer == NULL) {
2565
0
        buffer_size = 500;
2566
0
        buffer = xmlMalloc(buffer_size + 1);
2567
0
        if (buffer == NULL) {
2568
0
            htmlErrMemory(ctxt);
2569
0
            return(NULL);
2570
0
        }
2571
0
    }
2572
2573
0
    line = input->line;
2574
0
    col = input->col;
2575
2576
0
    while (!PARSER_STOPPED(ctxt)) {
2577
0
        const xmlChar *chunk, *in, *repl;
2578
0
        size_t avail, chunkSize, extraSize;
2579
0
        int replSize;
2580
0
        int skip = 0;
2581
0
        int ncr = 0;
2582
0
        int ncrSize = 0;
2583
0
        int cp = 0;
2584
2585
0
        chunk = input->cur;
2586
0
        avail = input->end - chunk;
2587
0
        in = chunk;
2588
2589
0
        repl = BAD_CAST "";
2590
0
        replSize = 0;
2591
2592
0
        while (!PARSER_STOPPED(ctxt)) {
2593
0
            size_t j;
2594
0
            int cur, size;
2595
2596
0
            if ((!eof) && (avail <= 64)) {
2597
0
                size_t oldAvail = avail;
2598
0
                size_t off = in - chunk;
2599
2600
0
                input->cur = in;
2601
2602
0
                xmlParserGrow(ctxt);
2603
2604
0
                in = input->cur;
2605
0
                chunk = in - off;
2606
0
                input->cur = chunk;
2607
0
                avail = input->end - in;
2608
2609
0
                if (oldAvail == avail)
2610
0
                    eof = 1;
2611
0
            }
2612
2613
0
            if (avail == 0) {
2614
0
                termSkip = 0;
2615
0
                break;
2616
0
            }
2617
2618
0
            cur = *in;
2619
0
            size = 1;
2620
0
            col += 1;
2621
2622
0
            if (htmlMaskMatch(mask, cur)) {
2623
0
                if (comment) {
2624
0
                    if (avail < 2) {
2625
0
                        termSkip = 1;
2626
0
                    } else if (in[1] == '-') {
2627
0
                        if  (avail < 3) {
2628
0
                            termSkip = 2;
2629
0
                        } else if (in[2] == '>') {
2630
0
                            termSkip = 3;
2631
0
                        } else if (in[2] == '!') {
2632
0
                            if (avail < 4)
2633
0
                                termSkip = 3;
2634
0
                            else if (in[3] == '>')
2635
0
                                termSkip = 4;
2636
0
                        }
2637
0
                    }
2638
2639
0
                    if (termSkip >= 0)
2640
0
                        break;
2641
0
                } else {
2642
0
                    termSkip = 0;
2643
0
                    break;
2644
0
                }
2645
0
            }
2646
2647
0
            if (ncr) {
2648
0
                int lc = cur | 0x20;
2649
0
                int digit;
2650
2651
0
                if ((cur >= '0') && (cur <= '9')) {
2652
0
                    digit = cur - '0';
2653
0
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
2654
0
                    digit = (lc - 'a') + 10;
2655
0
                } else {
2656
0
                    if (cur == ';') {
2657
0
                        in += 1;
2658
0
                        size += 1;
2659
0
                        ncrSize += 1;
2660
0
                    }
2661
0
                    goto next_chunk;
2662
0
                }
2663
2664
0
                cp = cp * ncr + digit;
2665
0
                if (cp >= 0x110000)
2666
0
                    cp = 0x110000;
2667
2668
0
                ncrSize += 1;
2669
2670
0
                goto next_char;
2671
0
            }
2672
2673
0
            switch (cur) {
2674
0
            case '&':
2675
0
                if (!refs)
2676
0
                    break;
2677
2678
0
                j = 1;
2679
2680
0
                if ((j < avail) && (in[j] == '#')) {
2681
0
                    j += 1;
2682
0
                    if (j < avail) {
2683
0
                        if ((in[j] | 0x20) == 'x') {
2684
0
                            j += 1;
2685
0
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
2686
0
                                ncr = 16;
2687
0
                                size = 3;
2688
0
                                ncrSize = 3;
2689
0
                                cp = 0;
2690
0
                            }
2691
0
                        } else if (IS_ASCII_DIGIT(in[j])) {
2692
0
                            ncr = 10;
2693
0
                            size = 2;
2694
0
                            ncrSize = 2;
2695
0
                            cp = 0;
2696
0
                        }
2697
0
                    }
2698
0
                } else {
2699
0
                    repl = htmlFindEntityPrefix(in + j,
2700
0
                                                avail - j,
2701
0
                                                /* isAttr */ 1,
2702
0
                                                &skip, &replSize);
2703
0
                    if (repl != NULL) {
2704
0
                        skip += 1;
2705
0
                        goto next_chunk;
2706
0
                    }
2707
2708
0
                    skip = 0;
2709
0
                }
2710
2711
0
                break;
2712
2713
0
            case '\0':
2714
0
                skip = 1;
2715
0
                repl = BAD_CAST "\xEF\xBF\xBD";
2716
0
                replSize = 3;
2717
0
                goto next_chunk;
2718
2719
0
            case '\n':
2720
0
                line += 1;
2721
0
                col = 1;
2722
0
                break;
2723
2724
0
            case '\r':
2725
0
                skip = 1;
2726
0
                if (in[1] != 0x0A) {
2727
0
                    repl = BAD_CAST "\x0A";
2728
0
                    replSize = 1;
2729
0
                }
2730
0
                goto next_chunk;
2731
2732
0
            default:
2733
0
                if (cur < 0x80)
2734
0
                    break;
2735
2736
0
                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
2737
0
                    xmlChar * guess;
2738
2739
0
                    if (in > chunk)
2740
0
                        goto next_chunk;
2741
2742
0
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
2743
0
                    guess = NULL;
2744
#else
2745
                    guess = htmlFindEncoding(ctxt);
2746
#endif
2747
0
                    if (guess == NULL) {
2748
0
                        xmlSwitchEncoding(ctxt,
2749
0
                                XML_CHAR_ENCODING_WINDOWS_1252);
2750
0
                    } else {
2751
0
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
2752
0
                        xmlFree(guess);
2753
0
                    }
2754
0
                    input->flags |= XML_INPUT_HAS_ENCODING;
2755
2756
0
                    eof = PARSER_PROGRESSIVE(ctxt);
2757
0
                    goto restart;
2758
0
                }
2759
2760
0
                size = htmlValidateUtf8(ctxt, in, avail, /* partial */ 0);
2761
2762
0
                if (size <= 0) {
2763
0
                    skip = 1;
2764
0
                    repl = BAD_CAST "\xEF\xBF\xBD";
2765
0
                    replSize = 3;
2766
0
                    goto next_chunk;
2767
0
                }
2768
2769
0
                break;
2770
0
            }
2771
2772
0
next_char:
2773
0
            in += size;
2774
0
            avail -= size;
2775
0
        }
2776
2777
0
next_chunk:
2778
0
        if (ncrSize > 0) {
2779
0
            skip = ncrSize;
2780
0
            in -= ncrSize;
2781
2782
0
            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
2783
0
        }
2784
2785
0
        chunkSize = in - chunk;
2786
0
        extraSize = chunkSize + replSize;
2787
2788
0
        if (extraSize > maxLength - used) {
2789
0
            htmlParseErr(ctxt, XML_ERR_RESOURCE_LIMIT,
2790
0
                         "value too long\n", NULL, NULL);
2791
0
            goto error;
2792
0
        }
2793
2794
0
        if (extraSize > buffer_size - used) {
2795
0
            size_t newSize = (used + extraSize) * 2;
2796
0
            xmlChar *tmp = xmlRealloc(buffer, newSize + 1);
2797
2798
0
            if (tmp == NULL) {
2799
0
                htmlErrMemory(ctxt);
2800
0
                goto error;
2801
0
            }
2802
0
            buffer = tmp;
2803
0
            buffer_size = newSize;
2804
0
        }
2805
2806
0
        if (chunkSize > 0) {
2807
0
            input->cur += chunkSize;
2808
0
            memcpy(buffer + used, chunk, chunkSize);
2809
0
            used += chunkSize;
2810
0
        }
2811
2812
0
        input->cur += skip;
2813
0
        if (replSize > 0) {
2814
0
            memcpy(buffer + used, repl, replSize);
2815
0
            used += replSize;
2816
0
        }
2817
2818
0
        SHRINK;
2819
2820
0
        if (termSkip >= 0)
2821
0
            break;
2822
2823
0
restart:
2824
0
        ;
2825
0
    }
2826
2827
0
    if (termSkip > 0) {
2828
0
        input->cur += termSkip;
2829
0
        col += termSkip;
2830
0
    }
2831
2832
0
    input->line = line;
2833
0
    input->col = col;
2834
2835
0
    ret = xmlMalloc(used + 1);
2836
0
    if (ret == NULL) {
2837
0
        htmlErrMemory(ctxt);
2838
0
    } else {
2839
0
        memcpy(ret, buffer, used);
2840
0
        ret[used] = 0;
2841
0
    }
2842
2843
0
error:
2844
0
    ctxt->spaceTab = (void *) buffer;
2845
0
    ctxt->spaceMax = buffer_size;
2846
2847
0
    return(ret);
2848
0
}
2849
2850
/**
2851
 * @deprecated Internal function, don't use.
2852
 *
2853
 * @param ctxt  an HTML parser context
2854
 * @param str  location to store the entity name
2855
 * @returns NULL.
2856
 */
2857
const htmlEntityDesc *
2858
htmlParseEntityRef(htmlParserCtxt *ctxt ATTRIBUTE_UNUSED,
2859
0
                   const xmlChar **str ATTRIBUTE_UNUSED) {
2860
0
    return(NULL);
2861
0
}
2862
2863
/**
2864
 * parse a value for an attribute
2865
 * Note: the parser won't do substitution of entities here, this
2866
 * will be handled later in #xmlStringGetNodeList, unless it was
2867
 * asked for ctxt->replaceEntities != 0
2868
 *
2869
 * @param ctxt  an HTML parser context
2870
 * @returns the AttValue parsed or NULL.
2871
 */
2872
2873
static xmlChar *
2874
0
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2875
0
    xmlChar *ret = NULL;
2876
0
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
2877
0
                    XML_MAX_HUGE_LENGTH :
2878
0
                    XML_MAX_TEXT_LENGTH;
2879
2880
0
    if (CUR == '"') {
2881
0
        SKIP(1);
2882
0
  ret = htmlParseData(ctxt, MASK_DQ, 0, 1, maxLength);
2883
0
        if (CUR == '"')
2884
0
            SKIP(1);
2885
0
    } else if (CUR == '\'') {
2886
0
        SKIP(1);
2887
0
  ret = htmlParseData(ctxt, MASK_SQ, 0, 1, maxLength);
2888
0
        if (CUR == '\'')
2889
0
            SKIP(1);
2890
0
    } else {
2891
0
  ret = htmlParseData(ctxt, MASK_WS_GT, 0, 1, maxLength);
2892
0
    }
2893
0
    return(ret);
2894
0
}
2895
2896
static void
2897
htmlCharDataSAXCallback(htmlParserCtxtPtr ctxt, const xmlChar *buf,
2898
0
                        int size, int mode) {
2899
0
    if ((ctxt->sax == NULL) || (ctxt->disableSAX))
2900
0
        return;
2901
2902
0
    if ((mode == 0) || (mode == DATA_RCDATA) ||
2903
0
        (ctxt->sax->cdataBlock == NULL)) {
2904
0
        if ((ctxt->name == NULL) ||
2905
0
            (xmlStrEqual(ctxt->name, BAD_CAST "html")) ||
2906
0
            (xmlStrEqual(ctxt->name, BAD_CAST "head"))) {
2907
0
            int i;
2908
2909
            /*
2910
             * Add leading whitespace to html or head elements before
2911
             * calling htmlStartCharData.
2912
             */
2913
0
            for (i = 0; i < size; i++)
2914
0
                if (!IS_WS_HTML(buf[i]))
2915
0
                    break;
2916
2917
0
            if (i > 0) {
2918
0
                if (!ctxt->keepBlanks) {
2919
0
                    if (ctxt->sax->ignorableWhitespace != NULL)
2920
0
                        ctxt->sax->ignorableWhitespace(ctxt->userData, buf, i);
2921
0
                } else {
2922
0
                    if (ctxt->sax->characters != NULL)
2923
0
                        ctxt->sax->characters(ctxt->userData, buf, i);
2924
0
                }
2925
2926
0
                buf += i;
2927
0
                size -= i;
2928
0
            }
2929
2930
0
            if (size <= 0)
2931
0
                return;
2932
2933
0
            htmlStartCharData(ctxt);
2934
2935
0
            if (PARSER_STOPPED(ctxt))
2936
0
                return;
2937
0
        }
2938
2939
0
        if ((mode == 0) &&
2940
0
            (!ctxt->keepBlanks) &&
2941
0
            (areBlanks(ctxt, buf, size) > 0)) {
2942
0
            if (ctxt->sax->ignorableWhitespace != NULL)
2943
0
                ctxt->sax->ignorableWhitespace(ctxt->userData, buf, size);
2944
0
        } else {
2945
0
            if (ctxt->sax->characters != NULL)
2946
0
                ctxt->sax->characters(ctxt->userData, buf, size);
2947
0
        }
2948
0
    } else {
2949
        /*
2950
         * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2951
         */
2952
0
        ctxt->sax->cdataBlock(ctxt->userData, buf, size);
2953
0
    }
2954
0
}
2955
2956
/**
 * Parse character data and references.
 *
 * Starting at input->cur, the text is scanned in chunks. Each chunk of
 * literal text is passed to htmlCharDataSAXCallback, followed by an
 * optional replacement for the construct that ended the chunk: a
 * numeric character reference, a named entity found by
 * htmlFindEntityPrefix, a U+0000 byte, a carriage return or a byte
 * sequence rejected by htmlValidateUtf8.
 *
 * The data mode is read from ctxt->endCheckState. On return it is reset
 * to 0 if the end of the data was found and stored back otherwise.
 * input->line and input->col are updated before returning.
 *
 * @param ctxt  an HTML parser context
 * @param partial  true if the input buffer is incomplete
 * @returns 1 if all data was parsed, 0 otherwise.
 */
2963
2964
static int
2965
0
htmlParseCharData(htmlParserCtxtPtr ctxt, int partial) {
2966
0
    xmlParserInputPtr input = ctxt->input;
2967
0
    xmlChar utf8Char[4];
2968
0
    int complete = 0;
2969
0
    int done = 0;
2970
0
    int mode;
2971
0
    /* While eof is set, xmlParserGrow is not called to fetch more input */
    int eof = PARSER_PROGRESSIVE(ctxt);
2972
0
    int line, col;
2973
2974
0
    /* Data mode; 0 is normal data, other values are DATA_* modes */
    mode = ctxt->endCheckState;
2975
2976
0
    line = input->line;
2977
0
    col = input->col;
2978
2979
0
    /*
     * Outer loop: each iteration scans one chunk of literal text, then
     * flushes it and at most one replacement string at next_chunk.
     */
    while (!PARSER_STOPPED(ctxt)) {
2980
0
        const xmlChar *chunk, *in, *repl;
2981
0
        size_t avail;
2982
0
        int replSize;
2983
0
        int skip = 0;
2984
0
        int ncr = 0;
2985
0
        int ncrSize = 0;
2986
0
        int cp = 0;
2987
2988
0
        chunk = input->cur;
2989
0
        avail = input->end - chunk;
2990
0
        in = chunk;
2991
2992
0
        repl = BAD_CAST "";
2993
0
        replSize = 0;
2994
2995
0
        /* Inner loop: advance 'in' one character (or reference part) at a time */
        while (!PARSER_STOPPED(ctxt)) {
2996
0
            size_t j;
2997
0
            int cur, size;
2998
2999
0
            if (avail <= 64) {
3000
0
                if (!eof) {
3001
0
                    size_t oldAvail = avail;
3002
0
                    size_t off = in - chunk;
3003
3004
0
                    input->cur = in;
3005
3006
0
                    xmlParserGrow(ctxt);
3007
3008
0
                    /*
                     * Re-derive the local pointers from input->cur after
                     * growing, keeping the chunk start 'off' bytes behind.
                     */
                    in = input->cur;
3009
0
                    chunk = in - off;
3010
0
                    input->cur = chunk;
3011
0
                    avail = input->end - in;
3012
3013
0
                    /* Growing added nothing: stop trying */
                    if (oldAvail == avail)
3014
0
                        eof = 1;
3015
0
                }
3016
3017
0
                if (avail == 0) {
3018
0
                    /*
                     * Unfinished numeric reference in partial input: back
                     * up to its start and clear ncrSize so that nothing
                     * is skipped or replaced for it at next_chunk.
                     */
                    if ((partial) && (ncr)) {
3019
0
                        in -= ncrSize;
3020
0
                        ncrSize = 0;
3021
0
                    }
3022
3023
0
                    done = 1;
3024
0
                    break;
3025
0
                }
3026
0
            }
3027
3028
            /*
             * Accelerator: skip over bytes that need no special handling.
             * The mask has bits set for 0x00, 0x0A, 0x0D, '&', '-', '<'
             * and for every byte >= 0x80; those stop the fast loop.
             */
3029
0
            if (!ncr) {
3030
0
                while (avail > 0) {
3031
0
                    static const unsigned mask[8] = {
3032
0
                        0x00002401, 0x10002040,
3033
0
                        0x00000000, 0x00000000,
3034
0
                        0xFFFFFFFF, 0xFFFFFFFF,
3035
0
                        0xFFFFFFFF, 0xFFFFFFFF
3036
0
                    };
3037
0
                    cur = *in;
3038
0
                    if ((1u << (cur & 0x1F)) & mask[cur >> 5])
3039
0
                        break;
3040
0
                    col += 1;
3041
0
                    in += 1;
3042
0
                    avail -= 1;
3043
0
                }
3044
3045
0
                /* Go back to the top of the loop to refill or finish */
                if ((!eof) && (avail <= 64))
3046
0
                    continue;
3047
0
                if (avail == 0)
3048
0
                    continue;
3049
0
            }
3050
3051
0
            cur = *in;
3052
0
            size = 1;
3053
0
            col += 1;
3054
3055
0
            /*
             * Inside a numeric character reference: ncr holds the base
             * (10 or 16), cp the accumulated value and ncrSize the number
             * of input bytes the reference occupies so far.
             */
            if (ncr) {
3056
0
                int lc = cur | 0x20;
3057
0
                int digit;
3058
3059
0
                if ((cur >= '0') && (cur <= '9')) {
3060
0
                    digit = cur - '0';
3061
0
                } else if ((ncr == 16) && (lc >= 'a') && (lc <= 'f')) {
3062
0
                    digit = (lc - 'a') + 10;
3063
0
                } else {
3064
0
                    /* End of the reference; a ';' is consumed with it */
                    if (cur == ';') {
3065
0
                        in += 1;
3066
0
                        size += 1;
3067
0
                        ncrSize += 1;
3068
0
                    }
3069
0
                    goto next_chunk;
3070
0
                }
3071
3072
0
                cp = cp * ncr + digit;
3073
0
                /* Clamp so that cp can't grow without bound */
                if (cp >= 0x110000)
3074
0
                    cp = 0x110000;
3075
3076
0
                ncrSize += 1;
3077
3078
0
                goto next_char;
3079
0
            }
3080
3081
0
            switch (cur) {
3082
0
            case '<':
3083
0
                /* In mode 0, any '<' ends the character data */
                if (mode == 0) {
3084
0
                    done = 1;
3085
0
                    complete = 1;
3086
0
                    goto next_chunk;
3087
0
                }
3088
0
                if (mode == DATA_PLAINTEXT)
3089
0
                    break;
3090
3091
0
                j = 1;
3092
0
                if (j < avail) {
3093
0
                    if ((mode == DATA_SCRIPT) && (in[j] == '!')) {
3094
                        /* Check for comment start */
3095
3096
0
                        j += 1;
3097
0
                        if ((j < avail) && (in[j] == '-')) {
3098
0
                            j += 1;
3099
0
                            if ((j < avail) && (in[j] == '-'))
3100
0
                                mode = DATA_SCRIPT_ESC1;
3101
0
                        }
3102
0
                    } else {
3103
0
                        int i = 0;
3104
0
                        int solidus = 0;
3105
3106
                        /* Check for tag */
3107
3108
0
                        if (in[j] == '/') {
3109
0
                            j += 1;
3110
0
                            solidus = 1;
3111
0
                        }
3112
3113
0
                        if ((solidus) || (mode == DATA_SCRIPT_ESC1)) {
3114
0
                            /*
                             * Compare the input, with bit 0x20 set on each
                             * byte, against the current element name.
                             */
                            while ((j < avail) &&
3115
0
                                   (ctxt->name[i] != 0) &&
3116
0
                                   (ctxt->name[i] == (in[j] | 0x20))) {
3117
0
                                i += 1;
3118
0
                                j += 1;
3119
0
                            }
3120
3121
0
                            if ((ctxt->name[i] == 0) && (j < avail)) {
3122
0
                                int c = in[j];
3123
3124
0
                                /* The name must be followed by '>', '/' or whitespace */
                                if ((c == '>') || (c == '/') ||
3125
0
                                    (IS_WS_HTML(c))) {
3126
0
                                    if ((mode == DATA_SCRIPT_ESC1) &&
3127
0
                                        (!solidus)) {
3128
0
                                        mode = DATA_SCRIPT_ESC2;
3129
0
                                    } else if (mode == DATA_SCRIPT_ESC2) {
3130
0
                                        mode = DATA_SCRIPT_ESC1;
3131
0
                                    } else {
3132
0
                                        complete = 1;
3133
0
                                        done = 1;
3134
0
                                        goto next_chunk;
3135
0
                                    }
3136
0
                                }
3137
0
                            }
3138
0
                        }
3139
0
                    }
3140
0
                }
3141
3142
0
                /* Lookahead ran out of partial input: stop before the '<' */
                if ((partial) && (j >= avail)) {
3143
0
                    done = 1;
3144
0
                    goto next_chunk;
3145
0
                }
3146
3147
0
                break;
3148
3149
0
            case '-':
3150
0
                if ((mode != DATA_SCRIPT_ESC1) && (mode != DATA_SCRIPT_ESC2))
3151
0
                    break;
3152
3153
                /* Check for comment end */
3154
3155
0
                j = 1;
3156
0
                if ((j < avail) && (in[j] == '-')) {
3157
0
                    j += 1;
3158
0
                    if ((j < avail) && (in[j] == '>'))
3159
0
                        mode = DATA_SCRIPT;
3160
0
                }
3161
3162
0
                if ((partial) && (j >= avail)) {
3163
0
                    done = 1;
3164
0
                    goto next_chunk;
3165
0
                }
3166
3167
0
                break;
3168
3169
0
            case '&':
3170
0
                /* References are only processed in mode 0 and DATA_RCDATA */
                if ((mode != 0) && (mode != DATA_RCDATA))
3171
0
                    break;
3172
3173
0
                j = 1;
3174
3175
0
                if ((j < avail) && (in[j] == '#')) {
3176
0
                    j += 1;
3177
0
                    if (j < avail) {
3178
0
                        if ((in[j] | 0x20) == 'x') {
3179
0
                            j += 1;
3180
0
                            /* "&#x" followed by a hex digit: 3 prefix bytes */
                            if ((j < avail) && (IS_HEX_DIGIT(in[j]))) {
3181
0
                                ncr = 16;
3182
0
                                size = 3;
3183
0
                                ncrSize = 3;
3184
0
                                cp = 0;
3185
0
                            }
3186
0
                        } else if (IS_ASCII_DIGIT(in[j])) {
3187
0
                            /* "&#" followed by a decimal digit: 2 prefix bytes */
                            ncr = 10;
3188
0
                            size = 2;
3189
0
                            ncrSize = 2;
3190
0
                            cp = 0;
3191
0
                        }
3192
0
                    }
3193
0
                } else {
3194
0
                    if (partial) {
3195
0
                        int terminated = 0;
3196
0
                        size_t i;
3197
3198
                        /*
3199
                         * &CounterClockwiseContourIntegral; has 33 bytes.
3200
                         */
3201
0
                        for (i = 1; i < avail; i++) {
3202
0
                            if ((i >= 32) ||
3203
0
                                (!IS_ASCII_LETTER(in[i]) &&
3204
0
                                 ((i < 2) || !IS_ASCII_DIGIT(in[i])))) {
3205
0
                                terminated = 1;
3206
0
                                break;
3207
0
                            }
3208
0
                        }
3209
3210
0
                        /* Name may continue in data not yet available */
                        if (!terminated) {
3211
0
                            done = 1;
3212
0
                            goto next_chunk;
3213
0
                        }
3214
0
                    }
3215
3216
0
                    repl = htmlFindEntityPrefix(in + j,
3217
0
                                                avail - j,
3218
0
                                                /* isAttr */ 0,
3219
0
                                                &skip, &replSize);
3220
0
                    if (repl != NULL) {
3221
0
                        /* Also skip the '&' itself */
                        skip += 1;
3222
0
                        goto next_chunk;
3223
0
                    }
3224
3225
0
                    skip = 0;
3226
0
                }
3227
3228
0
                if ((partial) && (j >= avail)) {
3229
0
                    done = 1;
3230
0
                    goto next_chunk;
3231
0
                }
3232
3233
0
                break;
3234
3235
0
            case '\0':
3236
0
                skip = 1;
3237
3238
0
                if (mode == 0) {
3239
                    /*
3240
                     * The HTML5 spec says that the tokenizer should
3241
                     * pass on U+0000 unmodified in normal data mode.
3242
                     * These characters should then be ignored in body
3243
                     * and other text, but should be replaced with
3244
                     * U+FFFD in foreign content.
3245
                     *
3246
                     * At least for now, we always strip U+0000 when
3247
                     * tokenizing.
3248
                     */
3249
0
                    repl = BAD_CAST "";
3250
0
                    replSize = 0;
3251
0
                } else {
3252
0
                    /* U+FFFD encoded as UTF-8 */
                    repl = BAD_CAST "\xEF\xBF\xBD";
3253
0
                    replSize = 3;
3254
0
                }
3255
3256
0
                goto next_chunk;
3257
3258
0
            case '\n':
3259
0
                line += 1;
3260
0
                col = 1;
3261
0
                break;
3262
3263
0
            case '\r':
3264
0
                /* Need the following byte to decide; wait for more data */
                if (partial && avail < 2) {
3265
0
                    done = 1;
3266
0
                    goto next_chunk;
3267
0
                }
3268
3269
0
                /*
                 * The CR is always skipped. Unless a LF follows, a LF is
                 * emitted in its place.
                 */
                skip = 1;
3270
0
                if (in[1] != 0x0A) {
3271
0
                    repl = BAD_CAST "\x0A";
3272
0
                    replSize = 1;
3273
0
                }
3274
0
                goto next_chunk;
3275
3276
0
            default:
3277
0
                if (cur < 0x80)
3278
0
                    break;
3279
3280
0
                /*
                 * Non-ASCII byte while no encoding has been set on the
                 * input: pick one, then rescan from input->cur.
                 */
                if ((input->flags & XML_INPUT_HAS_ENCODING) == 0) {
3281
0
                    xmlChar * guess;
3282
3283
0
                    /* Flush the text scanned so far before switching */
                    if (in > chunk)
3284
0
                        goto next_chunk;
3285
3286
0
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3287
0
                    guess = NULL;
3288
#else
3289
                    guess = htmlFindEncoding(ctxt);
3290
#endif
3291
0
                    if (guess == NULL) {
3292
0
                        xmlSwitchEncoding(ctxt,
3293
0
                                XML_CHAR_ENCODING_WINDOWS_1252);
3294
0
                    } else {
3295
0
                        xmlSwitchEncodingName(ctxt, (const char *) guess);
3296
0
                        xmlFree(guess);
3297
0
                    }
3298
0
                    input->flags |= XML_INPUT_HAS_ENCODING;
3299
3300
0
                    eof = PARSER_PROGRESSIVE(ctxt);
3301
0
                    goto restart;
3302
0
                }
3303
3304
0
                size = htmlValidateUtf8(ctxt, in, avail, partial);
3305
3306
0
                if ((partial) && (size == 0)) {
3307
0
                    done = 1;
3308
0
                    goto next_chunk;
3309
0
                }
3310
3311
0
                /* Rejected sequence: replace one byte with U+FFFD */
                if (size <= 0) {
3312
0
                    skip = 1;
3313
0
                    repl = BAD_CAST "\xEF\xBF\xBD";
3314
0
                    replSize = 3;
3315
0
                    goto next_chunk;
3316
0
                }
3317
3318
0
                break;
3319
0
            }
3320
3321
0
next_char:
3322
0
            in += size;
3323
0
            avail -= size;
3324
0
        }
3325
3326
0
next_chunk:
3327
0
        /*
         * A numeric reference was scanned: exclude it from the literal
         * text, skip its bytes and emit the code point instead.
         */
        if (ncrSize > 0) {
3328
0
            skip = ncrSize;
3329
0
            in -= ncrSize;
3330
3331
0
            repl = htmlCodePointToUtf8(cp, utf8Char, &replSize);
3332
0
        }
3333
3334
0
        /* Report the literal text [chunk, in) */
        if (in > chunk) {
3335
0
            input->cur += in - chunk;
3336
0
            htmlCharDataSAXCallback(ctxt, chunk, in - chunk, mode);
3337
0
        }
3338
3339
0
        /* Step over the replaced input, then report the replacement */
        input->cur += skip;
3340
0
        if (replSize > 0)
3341
0
            htmlCharDataSAXCallback(ctxt, repl, replSize, mode);
3342
3343
0
        SHRINK;
3344
3345
0
        if (done)
3346
0
            break;
3347
3348
0
restart:
3349
0
        ;
3350
0
    }
3351
3352
0
    input->line = line;
3353
0
    input->col = col;
3354
3355
0
    /* Keep the mode in the context unless the end of the data was found */
    if (complete)
3356
0
        ctxt->endCheckState = 0;
3357
0
    else
3358
0
        ctxt->endCheckState = mode;
3359
3360
0
    return(complete);
3361
0
}
3362
3363
/**
3364
 * Parse an HTML comment
3365
 *
3366
 * @param ctxt  an HTML parser context
3367
 * @param bogus  true if this is a bogus comment
3368
 */
3369
static void
3370
0
htmlParseComment(htmlParserCtxtPtr ctxt, int bogus) {
3371
0
    const xmlChar *comment = BAD_CAST "";
3372
0
    xmlChar *buf = NULL;
3373
0
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3374
0
                    XML_MAX_HUGE_LENGTH :
3375
0
                    XML_MAX_TEXT_LENGTH;
3376
3377
0
    if (bogus) {
3378
0
        buf = htmlParseData(ctxt, MASK_GT, 0, 0, maxLength);
3379
0
        if (CUR == '>')
3380
0
            SKIP(1);
3381
0
        comment = buf;
3382
0
    } else {
3383
0
        if ((!PARSER_PROGRESSIVE(ctxt)) &&
3384
0
            (ctxt->input->end - ctxt->input->cur < 2))
3385
0
            xmlParserGrow(ctxt);
3386
3387
0
        if (CUR == '>') {
3388
0
            SKIP(1);
3389
0
        } else if ((CUR == '-') && (NXT(1) == '>')) {
3390
0
            SKIP(2);
3391
0
        } else {
3392
0
            buf = htmlParseData(ctxt, MASK_DASH, 1, 0, maxLength);
3393
0
            comment = buf;
3394
0
        }
3395
0
    }
3396
3397
0
    if (comment == NULL)
3398
0
        return;
3399
3400
0
    if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3401
0
        (!ctxt->disableSAX))
3402
0
        ctxt->sax->comment(ctxt->userData, comment);
3403
3404
0
    xmlFree(buf);
3405
0
}
3406
3407
/**
3408
 * @deprecated Internal function, don't use.
3409
 *
3410
 * @param ctxt  an HTML parser context
3411
 * @returns 0
3412
 */
3413
int
3414
0
htmlParseCharRef(htmlParserCtxt *ctxt ATTRIBUTE_UNUSED) {
3415
0
    return(0);
3416
0
}
3417
3418
3419
/**
3420
 * Parse a DOCTYPE SYTSTEM or PUBLIC literal.
3421
 *
3422
 * @param ctxt  an HTML parser context
3423
 * @returns the literal or NULL in case of error.
3424
 */
3425
3426
static xmlChar *
3427
0
htmlParseDoctypeLiteral(htmlParserCtxtPtr ctxt) {
3428
0
    xmlChar *ret;
3429
0
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3430
0
                    XML_MAX_TEXT_LENGTH :
3431
0
                    XML_MAX_NAME_LENGTH;
3432
3433
0
    if (CUR == '"') {
3434
0
        SKIP(1);
3435
0
        ret = htmlParseData(ctxt, MASK_DQ_GT, 0, 0, maxLength);
3436
0
        if (CUR == '"')
3437
0
            SKIP(1);
3438
0
    } else if (CUR == '\'') {
3439
0
        SKIP(1);
3440
0
        ret = htmlParseData(ctxt, MASK_SQ_GT, 0, 0, maxLength);
3441
0
        if (CUR == '\'')
3442
0
            SKIP(1);
3443
0
    } else {
3444
0
        return(NULL);
3445
0
    }
3446
3447
0
    return(ret);
3448
0
}
3449
3450
static void
3451
0
htmlSkipBogusDoctype(htmlParserCtxtPtr ctxt) {
3452
0
    const xmlChar *in;
3453
0
    size_t avail;
3454
0
    int eof = PARSER_PROGRESSIVE(ctxt);
3455
0
    int line, col;
3456
3457
0
    line = ctxt->input->line;
3458
0
    col = ctxt->input->col;
3459
3460
0
    in = ctxt->input->cur;
3461
0
    avail = ctxt->input->end - in;
3462
3463
0
    while (!PARSER_STOPPED(ctxt)) {
3464
0
        int cur;
3465
3466
0
        if ((!eof) && (avail <= 64)) {
3467
0
            size_t oldAvail = avail;
3468
3469
0
            ctxt->input->cur = in;
3470
3471
0
            xmlParserGrow(ctxt);
3472
3473
0
            in = ctxt->input->cur;
3474
0
            avail = ctxt->input->end - in;
3475
3476
0
            if (oldAvail == avail)
3477
0
                eof = 1;
3478
0
        }
3479
3480
0
        if (avail == 0)
3481
0
            break;
3482
3483
0
        col += 1;
3484
3485
0
        cur = *in;
3486
0
        if (cur == '>') {
3487
0
            in += 1;
3488
0
            break;
3489
0
        } else if (cur == 0x0A) {
3490
0
            line += 1;
3491
0
            col = 1;
3492
0
        }
3493
3494
0
        in += 1;
3495
0
        avail -= 1;
3496
3497
0
        SHRINK;
3498
0
    }
3499
3500
0
    ctxt->input->cur = in;
3501
0
    ctxt->input->line = line;
3502
0
    ctxt->input->col = col;
3503
0
}
3504
3505
/**
3506
 * Parse a DOCTYPE declaration.
3507
 *
3508
 * @param ctxt  an HTML parser context
3509
 */
3510
3511
static void
3512
0
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3513
0
    xmlChar *name = NULL;
3514
0
    xmlChar *publicId = NULL;
3515
0
    xmlChar *URI = NULL;
3516
0
    int maxLength = (ctxt->options & HTML_PARSE_HUGE) ?
3517
0
                    XML_MAX_TEXT_LENGTH :
3518
0
                    XML_MAX_NAME_LENGTH;
3519
3520
    /*
3521
     * We know that '<!DOCTYPE' has been detected.
3522
     */
3523
0
    SKIP(9);
3524
3525
0
    SKIP_BLANKS;
3526
3527
0
    if ((ctxt->input->cur < ctxt->input->end) && (CUR != '>')) {
3528
0
        name = htmlParseData(ctxt, MASK_WS_GT, 0, 0, maxLength);
3529
3530
0
        if ((ctxt->options & HTML_PARSE_HTML5) && (name != NULL)) {
3531
0
            xmlChar *cur;
3532
3533
0
            for (cur = name; *cur; cur++) {
3534
0
                if (IS_UPPER(*cur))
3535
0
                    *cur += 0x20;
3536
0
            }
3537
0
        }
3538
3539
0
        SKIP_BLANKS;
3540
0
    }
3541
3542
    /*
3543
     * Check for SystemID and publicId
3544
     */
3545
0
    if ((UPPER == 'P') && (UPP(1) == 'U') &&
3546
0
  (UPP(2) == 'B') && (UPP(3) == 'L') &&
3547
0
  (UPP(4) == 'I') && (UPP(5) == 'C')) {
3548
0
        SKIP(6);
3549
0
        SKIP_BLANKS;
3550
0
  publicId = htmlParseDoctypeLiteral(ctxt);
3551
0
  if (publicId == NULL)
3552
0
            goto bogus;
3553
0
        SKIP_BLANKS;
3554
0
  URI = htmlParseDoctypeLiteral(ctxt);
3555
0
    } else if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3556
0
               (UPP(2) == 'S') && (UPP(3) == 'T') &&
3557
0
         (UPP(4) == 'E') && (UPP(5) == 'M')) {
3558
0
        SKIP(6);
3559
0
        SKIP_BLANKS;
3560
0
  URI = htmlParseDoctypeLiteral(ctxt);
3561
0
    }
3562
3563
0
bogus:
3564
0
    htmlSkipBogusDoctype(ctxt);
3565
3566
    /*
3567
     * Create or update the document accordingly to the DOCTYPE
3568
     */
3569
0
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3570
0
  (!ctxt->disableSAX))
3571
0
  ctxt->sax->internalSubset(ctxt->userData, name, publicId, URI);
3572
3573
0
    xmlFree(name);
3574
0
    xmlFree(URI);
3575
0
    xmlFree(publicId);
3576
0
}
3577
3578
/**
3579
 * parse an attribute
3580
 *
3581
 * [41] Attribute ::= Name Eq AttValue
3582
 *
3583
 * [25] Eq ::= S? '=' S?
3584
 *
3585
 * With namespace:
3586
 *
3587
 * [NS 11] Attribute ::= QName Eq AttValue
3588
 *
3589
 * Also the case QName == xmlns:??? is handled independently as a namespace
3590
 * definition.
3591
 *
3592
 * @param ctxt  an HTML parser context
3593
 * @param value  a xmlChar ** used to store the value of the attribute
3594
 * @returns the attribute name, and the value in *value.
3595
 */
3596
3597
static xmlHashedString
3598
0
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3599
0
    xmlHashedString hname;
3600
0
    xmlChar *val = NULL;
3601
3602
0
    *value = NULL;
3603
0
    hname = htmlParseHTMLName(ctxt, 1);
3604
0
    if (hname.name == NULL)
3605
0
        return(hname);
3606
3607
    /*
3608
     * read the value
3609
     */
3610
0
    SKIP_BLANKS;
3611
0
    if (CUR == '=') {
3612
0
        SKIP(1);
3613
0
  SKIP_BLANKS;
3614
0
  val = htmlParseAttValue(ctxt);
3615
0
    }
3616
3617
0
    *value = val;
3618
0
    return(hname);
3619
0
}
3620
3621
static int
3622
htmlCharEncCheckAsciiCompatible(htmlParserCtxt *ctxt,
3623
0
                                const xmlChar *encoding) {
3624
0
    xmlCharEncodingHandler *handler;
3625
0
    xmlChar in[9] = "<a A=\"/>";
3626
0
    xmlChar out[9];
3627
0
    int inlen, outlen;
3628
0
    int res;
3629
3630
0
    res = xmlCreateCharEncodingHandler(
3631
0
            (const char *) encoding,
3632
0
            XML_ENC_INPUT | XML_ENC_HTML,
3633
0
            ctxt->convImpl, ctxt->convCtxt,
3634
0
            &handler);
3635
0
    if (res != XML_ERR_OK) {
3636
0
        xmlFatalErr(ctxt, res, (const char *) encoding);
3637
0
        return(-1);
3638
0
    }
3639
3640
    /* UTF-8 */
3641
0
    if (handler == NULL)
3642
0
        return(0);
3643
3644
0
    inlen = 8;
3645
0
    outlen = 8;
3646
0
    res = xmlEncInputChunk(handler, out, &outlen, in, &inlen, /* flush */ 1);
3647
3648
0
    xmlCharEncCloseFunc(handler);
3649
3650
0
    if ((res != XML_ENC_ERR_SUCCESS) ||
3651
0
        (inlen != 8) || (outlen != 8) ||
3652
0
        (memcmp(in, out, 8) != 0)) {
3653
0
        htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3654
0
                     "Encoding %s isn't ASCII-compatible", encoding, NULL);
3655
0
        return(-1);
3656
0
    }
3657
3658
0
    return(0);
3659
0
}
3660
3661
/**
3662
 * Handle charset encoding in meta tag.
3663
 *
3664
 * @param ctxt  an HTML parser context
3665
 * @param atts  the attributes values
3666
 */
3667
static void
3668
0
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3669
0
    int i;
3670
0
    const xmlChar *att, *value;
3671
0
    int isContentType = 0;
3672
0
    const xmlChar *content = NULL;
3673
0
    xmlChar *encoding = NULL;
3674
3675
0
    if ((ctxt == NULL) || (atts == NULL))
3676
0
  return;
3677
3678
0
    i = 0;
3679
0
    att = atts[i++];
3680
0
    while (att != NULL) {
3681
0
  value = atts[i++];
3682
0
        if (value != NULL) {
3683
0
            if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3684
0
                (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3685
0
                isContentType = 1;
3686
0
            } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3687
0
                encoding = xmlStrdup(value);
3688
0
                if (encoding == NULL)
3689
0
                    htmlErrMemory(ctxt);
3690
0
                break;
3691
0
            } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3692
0
                content = value;
3693
0
            }
3694
0
        }
3695
0
  att = atts[i++];
3696
0
    }
3697
3698
0
    if ((encoding == NULL) && (isContentType) && (content != NULL)) {
3699
0
        htmlMetaEncodingOffsets off;
3700
3701
0
        if (htmlParseContentType(content, &off)) {
3702
0
            encoding = xmlStrndup(content + off.start, off.end - off.start);
3703
0
            if (encoding == NULL)
3704
0
                htmlErrMemory(ctxt);
3705
0
        }
3706
0
    }
3707
3708
0
    if (encoding != NULL) {
3709
0
        if (htmlCharEncCheckAsciiCompatible(ctxt, encoding) < 0) {
3710
0
            xmlFree(encoding);
3711
0
            return;
3712
0
        }
3713
3714
0
        xmlSetDeclaredEncoding(ctxt, encoding);
3715
0
    }
3716
0
}
3717
3718
/**
3719
 * Inserts a new attribute into the hash table.
3720
 *
3721
 * @param ctxt  parser context
3722
 * @param size  size of the hash table
3723
 * @param name  attribute name
3724
 * @param hashValue  hash value of name
3725
 * @param aindex  attribute index (this is a multiple of 5)
3726
 * @returns INT_MAX if no existing attribute was found, the attribute
3727
 * index if an attribute was found, -1 if a memory allocation failed.
3728
 */
3729
static int
3730
htmlAttrHashInsert(xmlParserCtxtPtr ctxt, unsigned size, const xmlChar *name,
3731
0
                   unsigned hashValue, int aindex) {
3732
0
    xmlAttrHashBucket *table = ctxt->attrHash;
3733
0
    xmlAttrHashBucket *bucket;
3734
0
    unsigned hindex;
3735
3736
0
    hindex = hashValue & (size - 1);
3737
0
    bucket = &table[hindex];
3738
3739
0
    while (bucket->index >= 0) {
3740
0
        const xmlChar **atts = &ctxt->atts[bucket->index];
3741
3742
0
        if (name == atts[0])
3743
0
            return(bucket->index);
3744
3745
0
        hindex++;
3746
0
        bucket++;
3747
0
        if (hindex >= size) {
3748
0
            hindex = 0;
3749
0
            bucket = table;
3750
0
        }
3751
0
    }
3752
3753
0
    bucket->index = aindex;
3754
3755
0
    return(INT_MAX);
3756
0
}
3757
3758
/**
3759
 * parse a start of tag either for rule element or
3760
 * EmptyElement. In both case we don't parse the tag closing chars.
3761
 *
3762
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3763
 *
3764
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3765
 *
3766
 * With namespace:
3767
 *
3768
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3769
 *
3770
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3771
 *
3772
 * @param ctxt  an HTML parser context
3773
 * This function does not return a value.
3774
 */
3775
3776
static void
3777
0
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3778
0
    const xmlChar *name;
3779
0
    const xmlChar *attname;
3780
0
    xmlChar *attvalue;
3781
0
    const xmlChar **atts;
3782
0
    int nbatts = 0;
3783
0
    int maxatts;
3784
0
    int i;
3785
0
    int discardtag = 0;
3786
3787
0
    ctxt->endCheckState = 0;
3788
3789
0
    SKIP(1);
3790
3791
0
    atts = ctxt->atts;
3792
0
    maxatts = ctxt->maxatts;
3793
3794
0
    GROW;
3795
0
    name = htmlParseHTMLName(ctxt, 0).name;
3796
0
    if (name == NULL)
3797
0
        return;
3798
3799
0
    if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
3800
        /*
3801
         * Check for auto-closure of HTML elements.
3802
         */
3803
0
        htmlAutoClose(ctxt, name);
3804
3805
        /*
3806
         * Check for implied HTML elements.
3807
         */
3808
0
        htmlCheckImplied(ctxt, name);
3809
3810
        /*
3811
         * Avoid html at any level > 0, head at any level != 1
3812
         * or any attempt to recurse body
3813
         */
3814
0
        if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3815
0
            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3816
0
                         "htmlParseStartTag: misplaced <html> tag\n",
3817
0
                         name, NULL);
3818
0
            discardtag = 1;
3819
0
            ctxt->depth++;
3820
0
        }
3821
0
        if ((ctxt->nameNr != 1) &&
3822
0
            (xmlStrEqual(name, BAD_CAST"head"))) {
3823
0
            htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3824
0
                         "htmlParseStartTag: misplaced <head> tag\n",
3825
0
                         name, NULL);
3826
0
            discardtag = 1;
3827
0
            ctxt->depth++;
3828
0
        }
3829
0
        if (xmlStrEqual(name, BAD_CAST"body")) {
3830
0
            int indx;
3831
0
            for (indx = 0;indx < ctxt->nameNr;indx++) {
3832
0
                if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3833
0
                    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3834
0
                                 "htmlParseStartTag: misplaced <body> tag\n",
3835
0
                                 name, NULL);
3836
0
                    discardtag = 1;
3837
0
                    ctxt->depth++;
3838
0
                }
3839
0
            }
3840
0
        }
3841
0
    }
3842
3843
    /*
3844
     * Now parse the attributes, it ends up with the ending
3845
     *
3846
     * (S Attribute)* S?
3847
     */
3848
0
    SKIP_BLANKS;
3849
0
    while ((ctxt->input->cur < ctxt->input->end) &&
3850
0
           (CUR != '>') &&
3851
0
     ((CUR != '/') || (NXT(1) != '>')) &&
3852
0
           (PARSER_STOPPED(ctxt) == 0)) {
3853
0
        xmlHashedString hattname;
3854
3855
        /*  unexpected-solidus-in-tag */
3856
0
        if (CUR == '/') {
3857
0
            SKIP(1);
3858
0
            SKIP_BLANKS;
3859
0
            continue;
3860
0
        }
3861
0
  GROW;
3862
0
  hattname = htmlParseAttribute(ctxt, &attvalue);
3863
0
        attname = hattname.name;
3864
3865
0
        if (attname != NULL) {
3866
      /*
3867
       * Add the pair to atts
3868
       */
3869
0
      if (nbatts + 4 > maxatts) {
3870
0
          const xmlChar **tmp;
3871
0
                unsigned *utmp;
3872
0
                int newSize;
3873
3874
0
                newSize = xmlGrowCapacity(maxatts,
3875
0
                                          sizeof(tmp[0]) * 2 + sizeof(utmp[0]),
3876
0
                                          11, HTML_MAX_ATTRS);
3877
0
    if (newSize < 0) {
3878
0
        htmlErrMemory(ctxt);
3879
0
        goto failed;
3880
0
    }
3881
0
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
3882
0
                if (newSize < 2)
3883
0
                    newSize = 2;
3884
0
#endif
3885
0
          tmp = xmlRealloc(atts, newSize * sizeof(tmp[0]) * 2);
3886
0
    if (tmp == NULL) {
3887
0
        htmlErrMemory(ctxt);
3888
0
        goto failed;
3889
0
    }
3890
0
                atts = tmp;
3891
0
    ctxt->atts = tmp;
3892
3893
0
          utmp = xmlRealloc(ctxt->attallocs, newSize * sizeof(utmp[0]));
3894
0
    if (utmp == NULL) {
3895
0
        htmlErrMemory(ctxt);
3896
0
        goto failed;
3897
0
    }
3898
0
                ctxt->attallocs = utmp;
3899
3900
0
                maxatts = newSize * 2;
3901
0
    ctxt->maxatts = maxatts;
3902
0
      }
3903
3904
0
            ctxt->attallocs[nbatts/2] = hattname.hashValue;
3905
0
      atts[nbatts++] = attname;
3906
0
      atts[nbatts++] = attvalue;
3907
3908
0
            attvalue = NULL;
3909
0
  }
3910
3911
0
failed:
3912
0
        if (attvalue != NULL)
3913
0
            xmlFree(attvalue);
3914
3915
0
  SKIP_BLANKS;
3916
0
    }
3917
3918
0
    if (ctxt->input->cur >= ctxt->input->end) {
3919
0
        discardtag = 1;
3920
0
        goto done;
3921
0
    }
3922
3923
    /*
3924
     * Verify that attribute names are unique.
3925
     */
3926
0
    if (nbatts > 2) {
3927
0
        unsigned attrHashSize;
3928
0
        int j, k;
3929
3930
0
        attrHashSize = 4;
3931
0
        while (attrHashSize / 2 < (unsigned) nbatts / 2)
3932
0
            attrHashSize *= 2;
3933
3934
0
        if (attrHashSize > ctxt->attrHashMax) {
3935
0
            xmlAttrHashBucket *tmp;
3936
3937
0
            tmp = xmlRealloc(ctxt->attrHash, attrHashSize * sizeof(tmp[0]));
3938
0
            if (tmp == NULL) {
3939
0
                htmlErrMemory(ctxt);
3940
0
                goto done;
3941
0
            }
3942
3943
0
            ctxt->attrHash = tmp;
3944
0
            ctxt->attrHashMax = attrHashSize;
3945
0
        }
3946
3947
0
        memset(ctxt->attrHash, -1, attrHashSize * sizeof(ctxt->attrHash[0]));
3948
3949
0
        for (i = 0, j = 0, k = 0; i < nbatts; i += 2, k++) {
3950
0
            unsigned hashValue;
3951
0
            int res;
3952
3953
0
            attname = atts[i];
3954
0
            hashValue = ctxt->attallocs[k] | 0x80000000;
3955
3956
0
            res = htmlAttrHashInsert(ctxt, attrHashSize, attname,
3957
0
                                    hashValue, j);
3958
0
            if (res < 0)
3959
0
                continue;
3960
3961
0
            if (res == INT_MAX) {
3962
0
                atts[j] = atts[i];
3963
0
                atts[j+1] = atts[i+1];
3964
0
                j += 2;
3965
0
            } else {
3966
0
                xmlFree((xmlChar *) atts[i+1]);
3967
0
            }
3968
0
        }
3969
3970
0
        nbatts = j;
3971
0
    }
3972
3973
0
    if (nbatts > 0) {
3974
0
        atts[nbatts] = NULL;
3975
0
        atts[nbatts + 1] = NULL;
3976
3977
    /*
3978
     * Apple's new libiconv is so broken that you routinely run into
3979
     * issues when fuzz testing (by accident with an uninstrumented
3980
     * libiconv). Here's a harmless (?) example:
3981
     *
3982
     * printf '>'             | iconv -f shift_jis -t utf-8 | hexdump -C
3983
     * printf '\xfc\x00\x00'  | iconv -f shift_jis -t utf-8 | hexdump -C
3984
     * printf '>\xfc\x00\x00' | iconv -f shift_jis -t utf-8 | hexdump -C
3985
     *
3986
     * The last command fails to detect the illegal sequence.
3987
     */
3988
0
#if !defined(__APPLE__) || \
3989
0
    !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION)
3990
        /*
3991
         * Handle specific association to the META tag
3992
         */
3993
0
        if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
3994
0
            (strcmp((char *) name, "meta") == 0)) {
3995
0
            htmlCheckMeta(ctxt, atts);
3996
0
        }
3997
0
#endif
3998
0
    }
3999
4000
    /*
4001
     * SAX: Start of Element !
4002
     */
4003
0
    if (!discardtag) {
4004
0
        if (ctxt->options & HTML_PARSE_HTML5) {
4005
0
            if (ctxt->nameNr > 0)
4006
0
                htmlnamePop(ctxt);
4007
0
        }
4008
4009
0
  htmlnamePush(ctxt, name);
4010
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4011
0
      if (nbatts != 0)
4012
0
    ctxt->sax->startElement(ctxt->userData, name, atts);
4013
0
      else
4014
0
    ctxt->sax->startElement(ctxt->userData, name, NULL);
4015
0
  }
4016
0
    }
4017
4018
0
done:
4019
0
    if (atts != NULL) {
4020
0
        for (i = 1;i < nbatts;i += 2) {
4021
0
      if (atts[i] != NULL)
4022
0
    xmlFree((xmlChar *) atts[i]);
4023
0
  }
4024
0
    }
4025
0
}
4026
4027
/**
4028
 * parse an end of tag
4029
 *
4030
 * [42] ETag ::= '</' Name S? '>'
4031
 *
4032
 * With namespace
4033
 *
4034
 * [NS 9] ETag ::= '</' QName S? '>'
4035
 *
4036
 * @param ctxt  an HTML parser context
4037
 * This function returns no value.
4038
 */
4039
4040
static void
4041
htmlParseEndTag(htmlParserCtxtPtr ctxt)
4042
0
{
4043
0
    const xmlChar *name;
4044
0
    const xmlChar *oldname;
4045
0
    int i;
4046
4047
0
    ctxt->endCheckState = 0;
4048
4049
0
    SKIP(2);
4050
4051
0
    if (ctxt->input->cur >= ctxt->input->end) {
4052
0
        htmlStartCharData(ctxt);
4053
0
        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4054
0
            (ctxt->sax->characters != NULL))
4055
0
            ctxt->sax->characters(ctxt->userData,
4056
0
                                  BAD_CAST "</", 2);
4057
0
        return;
4058
0
    }
4059
4060
0
    if (CUR == '>') {
4061
0
        SKIP(1);
4062
0
        return;
4063
0
    }
4064
4065
0
    if (!IS_ASCII_LETTER(CUR)) {
4066
0
        htmlParseComment(ctxt, /* bogus */ 1);
4067
0
        return;
4068
0
    }
4069
4070
0
    name = htmlParseHTMLName(ctxt, 0).name;
4071
0
    if (name == NULL)
4072
0
        return;
4073
4074
    /*
4075
     * Parse and ignore attributes.
4076
     */
4077
0
    SKIP_BLANKS;
4078
0
    while ((ctxt->input->cur < ctxt->input->end) &&
4079
0
           (CUR != '>') &&
4080
0
     ((CUR != '/') || (NXT(1) != '>')) &&
4081
0
           (ctxt->instate != XML_PARSER_EOF)) {
4082
0
        xmlChar *attvalue = NULL;
4083
4084
        /*  unexpected-solidus-in-tag */
4085
0
        if (CUR == '/') {
4086
0
            SKIP(1);
4087
0
            SKIP_BLANKS;
4088
0
            continue;
4089
0
        }
4090
0
  GROW;
4091
0
  htmlParseAttribute(ctxt, &attvalue);
4092
0
        if (attvalue != NULL)
4093
0
            xmlFree(attvalue);
4094
4095
0
  SKIP_BLANKS;
4096
0
    }
4097
4098
0
    if (CUR == '>') {
4099
0
        SKIP(1);
4100
0
    } else if ((CUR == '/') && (NXT(1) == '>')) {
4101
0
        SKIP(2);
4102
0
    } else {
4103
0
        return;
4104
0
    }
4105
4106
0
    if (ctxt->options & HTML_PARSE_HTML5) {
4107
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4108
0
            ctxt->sax->endElement(ctxt->userData, name);
4109
0
        return;
4110
0
    }
4111
4112
    /*
4113
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
4114
     * out now.
4115
     */
4116
0
    if ((ctxt->depth > 0) &&
4117
0
        (xmlStrEqual(name, BAD_CAST "html") ||
4118
0
         xmlStrEqual(name, BAD_CAST "body") ||
4119
0
   xmlStrEqual(name, BAD_CAST "head"))) {
4120
0
  ctxt->depth--;
4121
0
  return;
4122
0
    }
4123
4124
    /*
4125
     * If the name read is not one of the element in the parsing stack
4126
     * then return, it's just an error.
4127
     */
4128
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4129
0
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4130
0
            break;
4131
0
    }
4132
0
    if (i < 0) {
4133
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4134
0
               "Unexpected end tag : %s\n", name, NULL);
4135
0
        return;
4136
0
    }
4137
4138
4139
    /*
4140
     * Check for auto-closure of HTML elements.
4141
     */
4142
4143
0
    htmlAutoCloseOnClose(ctxt, name);
4144
4145
    /*
4146
     * Well formedness constraints, opening and closing must match.
4147
     * With the exception that the autoclose may have popped stuff out
4148
     * of the stack.
4149
     */
4150
0
    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4151
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4152
0
                     "Opening and ending tag mismatch: %s and %s\n",
4153
0
                     name, ctxt->name);
4154
0
    }
4155
4156
    /*
4157
     * SAX: End of Tag
4158
     */
4159
0
    oldname = ctxt->name;
4160
0
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4161
0
  htmlParserFinishElementParsing(ctxt);
4162
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4163
0
            ctxt->sax->endElement(ctxt->userData, name);
4164
0
        htmlnamePop(ctxt);
4165
0
    }
4166
0
}
4167
4168
/**
4169
 * Parse a content: comment, sub-element, reference or text.
4170
 * New version for non recursive htmlParseElementInternal
4171
 *
4172
 * @param ctxt  an HTML parser context
4173
 */
4174
4175
static void
4176
0
htmlParseContent(htmlParserCtxtPtr ctxt) {
4177
0
    GROW;
4178
4179
0
    while ((PARSER_STOPPED(ctxt) == 0) &&
4180
0
           (ctxt->input->cur < ctxt->input->end)) {
4181
0
        int mode;
4182
4183
0
        mode = ctxt->endCheckState;
4184
4185
0
        if ((mode == 0) && (CUR == '<')) {
4186
0
            if (NXT(1) == '/') {
4187
0
          htmlParseEndTag(ctxt);
4188
0
            } else if (NXT(1) == '!') {
4189
                /*
4190
                 * Sometimes DOCTYPE arrives in the middle of the document
4191
                 */
4192
0
                if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4193
0
                    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4194
0
                    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4195
0
                    (UPP(8) == 'E')) {
4196
0
                    htmlParseDocTypeDecl(ctxt);
4197
0
                } else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4198
0
                    SKIP(4);
4199
0
                    htmlParseComment(ctxt, /* bogus */ 0);
4200
0
                } else {
4201
0
                    SKIP(2);
4202
0
                    htmlParseComment(ctxt, /* bogus */ 1);
4203
0
                }
4204
0
            } else if (NXT(1) == '?') {
4205
0
                SKIP(1);
4206
0
                htmlParseComment(ctxt, /* bogus */ 1);
4207
0
            } else if (IS_ASCII_LETTER(NXT(1))) {
4208
0
                htmlParseElementInternal(ctxt);
4209
0
            } else {
4210
0
                htmlStartCharData(ctxt);
4211
0
                if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4212
0
                    (ctxt->sax->characters != NULL))
4213
0
                    ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4214
0
                SKIP(1);
4215
0
            }
4216
0
        } else {
4217
0
            htmlParseCharData(ctxt, /* partial */ 0);
4218
0
        }
4219
4220
0
        SHRINK;
4221
0
        GROW;
4222
0
    }
4223
4224
0
    if (ctxt->input->cur >= ctxt->input->end)
4225
0
        htmlAutoCloseOnEnd(ctxt);
4226
0
}
4227
4228
/**
4229
 * Parse an HTML element, new version, non recursive
4230
 *
4231
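 * @param ctxt  an HTML parser context
 * @returns 1 if the start tag was closed by '>' and the element is not
 * an empty element, 0 in all other cases.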
4232
 */
4233
static int
4234
0
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4235
0
    const xmlChar *name;
4236
0
    const htmlElemDesc * info;
4237
0
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4238
4239
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
4240
0
  return(0);
4241
4242
    /* Capture start position */
4243
0
    if (ctxt->record_info) {
4244
0
        node_info.begin_pos = ctxt->input->consumed +
4245
0
                          (CUR_PTR - ctxt->input->base);
4246
0
  node_info.begin_line = ctxt->input->line;
4247
0
    }
4248
4249
0
    htmlParseStartTag(ctxt);
4250
0
    name = ctxt->name;
4251
0
    if (name == NULL)
4252
0
        return(0);
4253
4254
0
    if (ctxt->record_info)
4255
0
        htmlNodeInfoPush(ctxt, &node_info);
4256
4257
    /*
4258
     * Check for an Empty Element labeled the XML/SGML way
4259
     */
4260
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4261
0
        SKIP(2);
4262
0
        htmlParserFinishElementParsing(ctxt);
4263
0
        if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4264
0
            if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4265
0
                ctxt->sax->endElement(ctxt->userData, name);
4266
0
        }
4267
0
  htmlnamePop(ctxt);
4268
0
  return(0);
4269
0
    }
4270
4271
0
    if (CUR != '>')
4272
0
        return(0);
4273
0
    SKIP(1);
4274
4275
    /*
4276
     * Lookup the info for that element.
4277
     */
4278
0
    info = htmlTagLookup(name);
4279
4280
    /*
4281
     * Check for an Empty Element from DTD definition
4282
     */
4283
0
    if ((info != NULL) && (info->empty)) {
4284
0
        htmlParserFinishElementParsing(ctxt);
4285
0
        if ((ctxt->options & HTML_PARSE_HTML5) == 0) {
4286
0
            if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4287
0
                ctxt->sax->endElement(ctxt->userData, name);
4288
0
        }
4289
0
  htmlnamePop(ctxt);
4290
0
  return(0);
4291
0
    }
4292
4293
0
    if (info != NULL)
4294
0
        ctxt->endCheckState = info->dataMode;
4295
4296
0
    return(1);
4297
0
}
4298
4299
/**
4300
 * This is kept for compatibility with previous code versions
4301
 *
4302
 * @deprecated Internal function, don't use.
4303
 *
4304
 * @param ctxt  an HTML parser context
4305
 */
4306
void
4307
0
htmlParseElement(htmlParserCtxt *ctxt) {
4308
0
    const xmlChar *oldptr;
4309
0
    int depth;
4310
4311
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
4312
0
  return;
4313
4314
0
    if (htmlParseElementInternal(ctxt) == 0)
4315
0
        return;
4316
4317
    /*
4318
     * Parse the content of the element:
4319
     */
4320
0
    depth = ctxt->nameNr;
4321
0
    while (CUR != 0) {
4322
0
  oldptr = ctxt->input->cur;
4323
0
  htmlParseContent(ctxt);
4324
0
  if (oldptr==ctxt->input->cur) break;
4325
0
  if (ctxt->nameNr < depth) break;
4326
0
    }
4327
4328
0
    if (CUR == 0) {
4329
0
  htmlAutoCloseOnEnd(ctxt);
4330
0
    }
4331
0
}
4332
4333
/**
4334
 * @param ctxt  parser context
4335
 * @param input  parser input
4336
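 * @returns the list of parsed nodes, detached from their temporary
 * parent, or NULL if nothing was parsed or an error occurred.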
4337
 */
4338
xmlNode *
4339
0
htmlCtxtParseContentInternal(htmlParserCtxt *ctxt, xmlParserInput *input) {
4340
0
    xmlNodePtr root;
4341
0
    xmlNodePtr list = NULL;
4342
0
    xmlChar *rootName = BAD_CAST "#root";
4343
4344
0
    root = xmlNewDocNode(ctxt->myDoc, NULL, rootName, NULL);
4345
0
    if (root == NULL) {
4346
0
        htmlErrMemory(ctxt);
4347
0
        return(NULL);
4348
0
    }
4349
4350
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
4351
0
        xmlFreeNode(root);
4352
0
        return(NULL);
4353
0
    }
4354
4355
0
    htmlnamePush(ctxt, rootName);
4356
0
    nodePush(ctxt, root);
4357
4358
0
    htmlParseContent(ctxt);
4359
4360
    /*
4361
     * Only check for truncated multi-byte sequences
4362
     */
4363
0
    xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
4364
4365
    /* TODO: Use xmlCtxtIsCatastrophicError */
4366
0
    if (ctxt->errNo != XML_ERR_NO_MEMORY) {
4367
0
        xmlNodePtr cur;
4368
4369
        /*
4370
         * Unlink newly created node list.
4371
         */
4372
0
        list = root->children;
4373
0
        root->children = NULL;
4374
0
        root->last = NULL;
4375
0
        for (cur = list; cur != NULL; cur = cur->next)
4376
0
            cur->parent = NULL;
4377
0
    }
4378
4379
0
    nodePop(ctxt);
4380
0
    htmlnamePop(ctxt);
4381
4382
0
    xmlCtxtPopInput(ctxt);
4383
4384
0
    xmlFreeNode(root);
4385
0
    return(list);
4386
0
}
4387
4388
/**
4389
 * Parse an HTML document and invoke the SAX handlers. This is useful
4390
 * if you're only interested in custom SAX callbacks. If you want a
4391
 * document tree, use #htmlCtxtParseDocument.
4392
 *
4393
 * @param ctxt  an HTML parser context
4394
 * @returns 0 if the document is well-formed, -1 otherwise or if the
 * context is invalid.
4395
 */
4396
int
4397
0
htmlParseDocument(htmlParserCtxt *ctxt) {
4398
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
4399
0
  return(-1);
4400
4401
0
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4402
0
        ctxt->sax->setDocumentLocator(ctxt->userData,
4403
0
                (xmlSAXLocator *) &xmlDefaultSAXLocator);
4404
0
    }
4405
4406
0
    xmlDetectEncoding(ctxt);
4407
4408
    /*
4409
     * TODO: Implement HTML5 prescan algorithm
4410
     */
4411
4412
    /*
4413
     * This is wrong but matches long-standing behavior. In most
4414
     * cases, a document starting with an XML declaration will
4415
     * specify UTF-8. The HTML5 prescan algorithm handles
4416
     * XML declarations in a better way.
4417
     */
4418
0
    if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4419
0
        (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4420
0
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4421
4422
    /*
4423
     * Wipe out everything which is before the first '<'
4424
     */
4425
0
    SKIP_BLANKS;
4426
4427
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4428
0
  ctxt->sax->startDocument(ctxt->userData);
4429
4430
    /*
4431
     * Parse possible comments and PIs before any content
4432
     */
4433
0
    while (CUR == '<') {
4434
0
        if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4435
0
            SKIP(4);
4436
0
            htmlParseComment(ctxt, /* bogus */ 0);
4437
0
        } else if (NXT(1) == '?') {
4438
0
            SKIP(1);
4439
0
            htmlParseComment(ctxt, /* bogus */ 1);
4440
0
        } else {
4441
0
            break;
4442
0
        }
4443
0
  SKIP_BLANKS;
4444
0
    }
4445
4446
    /*
4447
     * Then possibly doc type declaration(s) and more Misc
4448
     * (doctypedecl Misc*)?
4449
     */
4450
0
    if ((CUR == '<') && (NXT(1) == '!') &&
4451
0
  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4452
0
  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4453
0
  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4454
0
  (UPP(8) == 'E')) {
4455
0
        ctxt->instate = XML_PARSER_MISC;
4456
0
  htmlParseDocTypeDecl(ctxt);
4457
0
    }
4458
0
    SKIP_BLANKS;
4459
4460
    /*
4461
     * Parse possible comments and PIs before any content
4462
     */
4463
0
    ctxt->instate = XML_PARSER_PROLOG;
4464
0
    while (CUR == '<') {
4465
0
        if ((NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) {
4466
0
            SKIP(4);
4467
0
            htmlParseComment(ctxt, /* bogus */ 0);
4468
0
        } else if (NXT(1) == '?') {
4469
0
            SKIP(1);
4470
0
            htmlParseComment(ctxt, /* bogus */ 1);
4471
0
        } else {
4472
0
            break;
4473
0
        }
4474
0
  SKIP_BLANKS;
4475
0
    }
4476
4477
    /*
4478
     * Time to start parsing the tree itself
4479
     */
4480
0
    ctxt->instate = XML_PARSER_CONTENT;
4481
0
    htmlParseContent(ctxt);
4482
4483
    /*
4484
     * Only check for truncated multi-byte sequences
4485
     */
4486
0
    xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
4487
4488
    /*
4489
     * SAX: end of the document processing.
4490
     */
4491
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4492
0
        ctxt->sax->endDocument(ctxt->userData);
4493
4494
0
    if (! ctxt->wellFormed) return(-1);
4495
0
    return(0);
4496
0
}
4497
4498
4499
/************************************************************************
4500
 *                  *
4501
 *      Parser contexts handling      *
4502
 *                  *
4503
 ************************************************************************/
4504
4505
/**
4506
 * Initialize a parser context
4507
 *
4508
 * @param ctxt  an HTML parser context
4509
 * @param sax  SAX handler
4510
 * @param userData  user data
4511
 * @returns 0 in case of success and -1 in case of error
4512
 */
4513
static int
4514
htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4515
                   void *userData)
4516
0
{
4517
0
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
4518
0
    size_t initialNodeTabSize = 1;
4519
#else
4520
    size_t initialNodeTabSize = 10;
4521
#endif
4522
4523
0
    if (ctxt == NULL) return(-1);
4524
0
    memset(ctxt, 0, sizeof(htmlParserCtxt));
4525
4526
0
    ctxt->dict = xmlDictCreate();
4527
0
    if (ctxt->dict == NULL)
4528
0
  return(-1);
4529
4530
0
    if (ctxt->sax == NULL)
4531
0
        ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4532
0
    if (ctxt->sax == NULL)
4533
0
  return(-1);
4534
0
    if (sax == NULL) {
4535
0
        memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4536
0
        xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4537
0
        ctxt->userData = ctxt;
4538
0
    } else {
4539
0
        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4540
0
        ctxt->userData = userData ? userData : ctxt;
4541
0
    }
4542
4543
    /* Allocate the Input stack */
4544
0
    ctxt->inputTab = (htmlParserInputPtr *)
4545
0
                      xmlMalloc(sizeof(htmlParserInputPtr));
4546
0
    if (ctxt->inputTab == NULL)
4547
0
  return(-1);
4548
0
    ctxt->inputNr = 0;
4549
0
    ctxt->inputMax = 1;
4550
0
    ctxt->input = NULL;
4551
0
    ctxt->version = NULL;
4552
0
    ctxt->encoding = NULL;
4553
0
    ctxt->standalone = -1;
4554
0
    ctxt->instate = XML_PARSER_START;
4555
4556
    /* Allocate the Node stack */
4557
0
    ctxt->nodeTab = xmlMalloc(initialNodeTabSize * sizeof(htmlNodePtr));
4558
0
    if (ctxt->nodeTab == NULL)
4559
0
  return(-1);
4560
0
    ctxt->nodeNr = 0;
4561
0
    ctxt->nodeMax = initialNodeTabSize;
4562
0
    ctxt->node = NULL;
4563
4564
    /* Allocate the Name stack */
4565
0
    ctxt->nameTab = xmlMalloc(initialNodeTabSize * sizeof(xmlChar *));
4566
0
    if (ctxt->nameTab == NULL)
4567
0
  return(-1);
4568
0
    ctxt->nameNr = 0;
4569
0
    ctxt->nameMax = initialNodeTabSize;
4570
0
    ctxt->name = NULL;
4571
4572
0
    ctxt->nodeInfoTab = NULL;
4573
0
    ctxt->nodeInfoNr  = 0;
4574
0
    ctxt->nodeInfoMax = 0;
4575
4576
0
    ctxt->myDoc = NULL;
4577
0
    ctxt->wellFormed = 1;
4578
0
    ctxt->replaceEntities = 0;
4579
0
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4580
0
    ctxt->html = INSERT_INITIAL;
4581
0
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4582
0
    ctxt->vctxt.userData = ctxt;
4583
0
    ctxt->vctxt.error = xmlParserValidityError;
4584
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
4585
0
    ctxt->record_info = 0;
4586
0
    ctxt->validate = 0;
4587
0
    ctxt->checkIndex = 0;
4588
0
    ctxt->catalogs = NULL;
4589
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
4590
0
    return(0);
4591
0
}
4592
4593
/**
4594
 * Free all the memory used by a parser context. However the parsed
4595
 * document in `ctxt->myDoc` is not freed.
4596
 *
4597
 * @param ctxt  an HTML parser context
4598
 */
4599
void
4600
htmlFreeParserCtxt(htmlParserCtxt *ctxt)
4601
0
{
4602
0
    xmlFreeParserCtxt(ctxt);
4603
0
}
4604
4605
/**
4606
 * Allocate and initialize a new HTML parser context.
4607
 *
4608
 * This can be used to parse HTML documents into DOM trees with
4609
 * functions like #xmlCtxtReadFile or #xmlCtxtReadMemory.
4610
 *
4611
 * See #htmlCtxtUseOptions for parser options.
4612
 *
4613
 * See #xmlCtxtSetErrorHandler for advanced error handling.
4614
 *
4615
 * See #htmlNewSAXParserCtxt for custom SAX parsers.
4616
 *
4617
 * @returns the htmlParserCtxt or NULL in case of allocation error
4618
 */
4619
htmlParserCtxt *
htmlNewParserCtxt(void)
{
    /* No custom SAX handler and no user data: defaults are used. */
    htmlParserCtxt *ctxt = htmlNewSAXParserCtxt(NULL, NULL);

    return(ctxt);
}
4624
4625
/**
4626
 * Allocate and initialize a new HTML SAX parser context. If `userData`
4627
 * is NULL, the parser context will be passed as user data.
4628
 *
4629
 * @since 2.11.0
4630
 *
4631
 * If you want to support older versions, it's best to invoke
4632
 * #htmlNewParserCtxt and set `ctxt->sax` with struct assignment.
4633
 *
4634
 * Also see #htmlNewParserCtxt.
4635
 *
4636
 * @param sax  SAX handler
4637
 * @param userData  user data
4638
 * @returns the htmlParserCtxt or NULL in case of allocation error
4639
 */
4640
htmlParserCtxt *
4641
htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
4642
0
{
4643
0
    xmlParserCtxtPtr ctxt;
4644
4645
0
    xmlInitParser();
4646
4647
0
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4648
0
    if (ctxt == NULL)
4649
0
  return(NULL);
4650
0
    memset(ctxt, 0, sizeof(xmlParserCtxt));
4651
0
    if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
4652
0
        htmlFreeParserCtxt(ctxt);
4653
0
  return(NULL);
4654
0
    }
4655
0
    return(ctxt);
4656
0
}
4657
4658
static htmlParserCtxtPtr
4659
htmlCreateMemoryParserCtxtInternal(const char *url,
4660
                                   const char *buffer, size_t size,
4661
0
                                   const char *encoding) {
4662
0
    xmlParserCtxtPtr ctxt;
4663
0
    xmlParserInputPtr input;
4664
4665
0
    if (buffer == NULL)
4666
0
  return(NULL);
4667
4668
0
    ctxt = htmlNewParserCtxt();
4669
0
    if (ctxt == NULL)
4670
0
  return(NULL);
4671
4672
0
    input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding, 0);
4673
0
    if (input == NULL) {
4674
0
  xmlFreeParserCtxt(ctxt);
4675
0
        return(NULL);
4676
0
    }
4677
4678
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
4679
0
        xmlFreeInputStream(input);
4680
0
        xmlFreeParserCtxt(ctxt);
4681
0
        return(NULL);
4682
0
    }
4683
4684
0
    return(ctxt);
4685
0
}
4686
4687
/**
4688
 * Create a parser context for an HTML in-memory document. The input
4689
 * buffer must not contain any terminating null bytes.
4690
 *
4691
 * @deprecated Use #htmlNewParserCtxt and #htmlCtxtReadMemory.
4692
 *
4693
 * @param buffer  a pointer to a char array
4694
 * @param size  the size of the array
4695
 * @returns the new parser context or NULL
4696
 */
4697
htmlParserCtxt *
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
    /* Only strictly positive sizes are accepted. */
    if (size > 0)
        return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));

    return(NULL);
}
4704
4705
/**
4706
 * Create a parser context for a null-terminated string.
4707
 *
4708
 * @param str  a pointer to an array of xmlChar
4709
 * @param url  URL of the document (optional)
4710
 * @param encoding  encoding (optional)
4711
 * @returns the new parser context or NULL if a memory allocation failed.
4712
 */
4713
static htmlParserCtxtPtr
4714
htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
4715
0
                        const char *encoding) {
4716
0
    xmlParserCtxtPtr ctxt;
4717
0
    xmlParserInputPtr input;
4718
4719
0
    if (str == NULL)
4720
0
  return(NULL);
4721
4722
0
    ctxt = htmlNewParserCtxt();
4723
0
    if (ctxt == NULL)
4724
0
  return(NULL);
4725
4726
0
    input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str,
4727
0
                                      encoding, 0);
4728
0
    if (input == NULL) {
4729
0
  xmlFreeParserCtxt(ctxt);
4730
0
  return(NULL);
4731
0
    }
4732
4733
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
4734
0
        xmlFreeInputStream(input);
4735
0
        xmlFreeParserCtxt(ctxt);
4736
0
        return(NULL);
4737
0
    }
4738
4739
0
    return(ctxt);
4740
0
}
4741
4742
#ifdef LIBXML_PUSH_ENABLED
4743
/************************************************************************
4744
 *                  *
4745
 *  Progressive parsing interfaces        *
4746
 *                  *
4747
 ************************************************************************/
4748
4749
/* States of the scanner that looks for the end of a tag. */
typedef enum {
    LSTATE_TAG_NAME            = 0, /* inside the tag name */
    LSTATE_BEFORE_ATTR_NAME    = 1, /* before an attribute name */
    LSTATE_ATTR_NAME           = 2, /* inside an attribute name */
    LSTATE_AFTER_ATTR_NAME     = 3, /* after an attribute name */
    LSTATE_BEFORE_ATTR_VALUE   = 4, /* after '=', before the value */
    LSTATE_ATTR_VALUE_DQUOTED  = 5, /* inside a double-quoted value */
    LSTATE_ATTR_VALUE_SQUOTED  = 6, /* inside a single-quoted value */
    LSTATE_ATTR_VALUE_UNQUOTED = 7  /* inside an unquoted value */
} xmlLookupStates;
4759
4760
/**
4761
 * Check whether there's enough data in the input buffer to finish parsing
4762
 * a tag. This has to take quotes into account.
4763
 *
4764
 * @param ctxt  an HTML parser context
4765
 */
4766
static int
4767
0
htmlParseLookupGt(xmlParserCtxtPtr ctxt) {
4768
0
    const xmlChar *cur;
4769
0
    const xmlChar *end = ctxt->input->end;
4770
0
    int state = ctxt->endCheckState;
4771
0
    size_t index;
4772
4773
0
    if (ctxt->checkIndex == 0)
4774
0
        cur = ctxt->input->cur + 2; /* Skip '<a' or '</' */
4775
0
    else
4776
0
        cur = ctxt->input->cur + ctxt->checkIndex;
4777
4778
0
    while (cur < end) {
4779
0
        int c = *cur++;
4780
4781
0
        if (state != LSTATE_ATTR_VALUE_SQUOTED &&
4782
0
            state != LSTATE_ATTR_VALUE_DQUOTED) {
4783
0
            if (c == '/' &&
4784
0
                state != LSTATE_BEFORE_ATTR_VALUE &&
4785
0
                state != LSTATE_ATTR_VALUE_UNQUOTED) {
4786
0
                state = LSTATE_BEFORE_ATTR_NAME;
4787
0
                continue;
4788
0
            } else if (c == '>') {
4789
0
                ctxt->checkIndex = 0;
4790
0
                ctxt->endCheckState = 0;
4791
0
                return(0);
4792
0
            }
4793
0
        }
4794
4795
0
        switch (state) {
4796
0
            case LSTATE_TAG_NAME:
4797
0
                if (IS_WS_HTML(c))
4798
0
                    state = LSTATE_BEFORE_ATTR_NAME;
4799
0
                break;
4800
4801
0
            case LSTATE_BEFORE_ATTR_NAME:
4802
0
                if (!IS_WS_HTML(c))
4803
0
                    state = LSTATE_ATTR_NAME;
4804
0
                break;
4805
4806
0
            case LSTATE_ATTR_NAME:
4807
0
                if (c == '=')
4808
0
                    state = LSTATE_BEFORE_ATTR_VALUE;
4809
0
                else if (IS_WS_HTML(c))
4810
0
                    state = LSTATE_AFTER_ATTR_NAME;
4811
0
                break;
4812
4813
0
            case LSTATE_AFTER_ATTR_NAME:
4814
0
                if (c == '=')
4815
0
                    state = LSTATE_BEFORE_ATTR_VALUE;
4816
0
                else if (!IS_WS_HTML(c))
4817
0
                    state = LSTATE_ATTR_NAME;
4818
0
                break;
4819
4820
0
            case LSTATE_BEFORE_ATTR_VALUE:
4821
0
                if (c == '"')
4822
0
                    state = LSTATE_ATTR_VALUE_DQUOTED;
4823
0
                else if (c == '\'')
4824
0
                    state = LSTATE_ATTR_VALUE_SQUOTED;
4825
0
                else if (!IS_WS_HTML(c))
4826
0
                    state = LSTATE_ATTR_VALUE_UNQUOTED;
4827
0
                break;
4828
4829
0
            case LSTATE_ATTR_VALUE_DQUOTED:
4830
0
                if (c == '"')
4831
0
                    state = LSTATE_BEFORE_ATTR_NAME;
4832
0
                break;
4833
4834
0
            case LSTATE_ATTR_VALUE_SQUOTED:
4835
0
                if (c == '\'')
4836
0
                    state = LSTATE_BEFORE_ATTR_NAME;
4837
0
                break;
4838
4839
0
            case LSTATE_ATTR_VALUE_UNQUOTED:
4840
0
                if (IS_WS_HTML(c))
4841
0
                    state = LSTATE_BEFORE_ATTR_NAME;
4842
0
                break;
4843
0
        }
4844
0
    }
4845
4846
0
    index = cur - ctxt->input->cur;
4847
0
    if (index > LONG_MAX) {
4848
0
        ctxt->checkIndex = 0;
4849
0
        ctxt->endCheckState = 0;
4850
0
        return(0);
4851
0
    }
4852
0
    ctxt->checkIndex = index;
4853
0
    ctxt->endCheckState = state;
4854
0
    return(-1);
4855
0
}
4856
4857
/**
4858
 * Check whether the input buffer contains a string.
4859
 *
4860
 * @param ctxt  an XML parser context
4861
 * @param startDelta  delta to apply at the start
4862
 * @param str  string
4863
 * @param strLen  length of string
4864
 * @param extraLen  extra length
4865
 */
4866
static int
4867
htmlParseLookupString(xmlParserCtxtPtr ctxt, size_t startDelta,
4868
0
                      const char *str, size_t strLen, size_t extraLen) {
4869
0
    const xmlChar *end = ctxt->input->end;
4870
0
    const xmlChar *cur, *term;
4871
0
    size_t index, rescan;
4872
0
    int ret;
4873
4874
0
    if (ctxt->checkIndex == 0) {
4875
0
        cur = ctxt->input->cur + startDelta;
4876
0
    } else {
4877
0
        cur = ctxt->input->cur + ctxt->checkIndex;
4878
0
    }
4879
4880
0
    term = BAD_CAST strstr((const char *) cur, str);
4881
0
    if ((term != NULL) &&
4882
0
        ((size_t) (ctxt->input->end - term) >= extraLen + 1)) {
4883
0
        ctxt->checkIndex = 0;
4884
4885
0
        if (term - ctxt->input->cur > INT_MAX / 2)
4886
0
            ret = INT_MAX / 2;
4887
0
        else
4888
0
            ret = term - ctxt->input->cur;
4889
4890
0
        return(ret);
4891
0
    }
4892
4893
    /* Rescan (strLen + extraLen - 1) characters. */
4894
0
    rescan = strLen + extraLen - 1;
4895
0
    if ((size_t) (end - cur) <= rescan)
4896
0
        end = cur;
4897
0
    else
4898
0
        end -= rescan;
4899
0
    index = end - ctxt->input->cur;
4900
0
    if (index > INT_MAX / 2) {
4901
0
        ctxt->checkIndex = 0;
4902
0
        ret = INT_MAX / 2;
4903
0
    } else {
4904
0
        ctxt->checkIndex = index;
4905
0
        ret = -1;
4906
0
    }
4907
4908
0
    return(ret);
4909
0
}
4910
4911
/**
4912
 * Try to find a comment end tag in the input stream
4913
 * The search includes "-->" as well as WHATWG-recommended
4914
 * incorrectly-closed tags.
4915
 *
4916
 * @param ctxt  an HTML parser context
4917
 * @returns the index to the current parsing point if the full
4918
 * sequence is available, -1 otherwise.
4919
 */
4920
static int
4921
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
4922
0
{
4923
0
    int mark = 0;
4924
0
    int offset;
4925
4926
0
    while (1) {
4927
0
  mark = htmlParseLookupString(ctxt, 2, "--", 2, 0);
4928
0
  if (mark < 0)
4929
0
            break;
4930
        /*
4931
         * <!-->    is a complete comment, but
4932
         * <!--!>   is not
4933
         * <!---!>  is not
4934
         * <!----!> is
4935
         */
4936
0
        if ((NXT(mark+2) == '>') ||
4937
0
      ((mark >= 4) && (NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
4938
0
            ctxt->checkIndex = 0;
4939
0
      break;
4940
0
  }
4941
0
        offset = (NXT(mark+2) == '!') ? 3 : 2;
4942
0
        if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
4943
0
      ctxt->checkIndex = mark;
4944
0
            return(-1);
4945
0
        }
4946
0
  ctxt->checkIndex = mark + 1;
4947
0
    }
4948
0
    return mark;
4949
0
}
4950
4951
4952
/**
4953
 * Try to progress on parsing
4954
 *
4955
 * @param ctxt  an HTML parser context
4956
 * @param terminate  last chunk indicator
4957
 * @returns zero if no parsing was possible
4958
 */
4959
static void
4960
0
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
4961
0
    while (PARSER_STOPPED(ctxt) == 0) {
4962
0
        htmlParserInputPtr in;
4963
0
        size_t avail;
4964
4965
0
  in = ctxt->input;
4966
0
  if (in == NULL) break;
4967
0
  avail = in->end - in->cur;
4968
4969
0
        switch (ctxt->instate) {
4970
0
            case XML_PARSER_EOF:
4971
          /*
4972
     * Document parsing is done !
4973
     */
4974
0
          return;
4975
4976
0
            case XML_PARSER_START:
4977
                /*
4978
                 * Very first chars read from the document flow.
4979
                 */
4980
0
                if ((!terminate) && (avail < 4))
4981
0
                    return;
4982
4983
0
                xmlDetectEncoding(ctxt);
4984
4985
                /*
4986
                 * TODO: Implement HTML5 prescan algorithm
4987
                 */
4988
4989
                /*
4990
                 * This is wrong but matches long-standing behavior. In most
4991
                 * cases, a document starting with an XML declaration will
4992
                 * specify UTF-8. The HTML5 prescan algorithm handles
4993
                 * XML declarations in a better way.
4994
                 */
4995
0
                if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4996
0
                    (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
4997
0
                    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4998
0
                }
4999
5000
                /* fall through */
5001
5002
0
            case XML_PARSER_XML_DECL:
5003
0
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
5004
0
                    ctxt->sax->setDocumentLocator(ctxt->userData,
5005
0
                            (xmlSAXLocator *) &xmlDefaultSAXLocator);
5006
0
                }
5007
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5008
0
              (!ctxt->disableSAX))
5009
0
        ctxt->sax->startDocument(ctxt->userData);
5010
5011
                /* Allow callback to modify state for tests */
5012
0
                if ((ctxt->instate == XML_PARSER_START) ||
5013
0
                    (ctxt->instate == XML_PARSER_XML_DECL))
5014
0
                    ctxt->instate = XML_PARSER_MISC;
5015
0
    break;
5016
5017
0
            case XML_PARSER_START_TAG:
5018
0
    if ((!terminate) &&
5019
0
        (htmlParseLookupGt(ctxt) < 0))
5020
0
        return;
5021
5022
0
                htmlParseElementInternal(ctxt);
5023
5024
0
    ctxt->instate = XML_PARSER_CONTENT;
5025
0
                break;
5026
5027
0
            case XML_PARSER_MISC: /* initial */
5028
0
            case XML_PARSER_PROLOG: /* before html */
5029
0
            case XML_PARSER_CONTENT: {
5030
0
                int mode;
5031
5032
0
                if ((ctxt->instate == XML_PARSER_MISC) ||
5033
0
                    (ctxt->instate == XML_PARSER_PROLOG)) {
5034
0
                    SKIP_BLANKS;
5035
0
                    avail = in->end - in->cur;
5036
0
                }
5037
5038
0
    if (avail < 1)
5039
0
        return;
5040
                /*
5041
                 * Note that endCheckState is also used by
5042
                 * xmlParseLookupGt.
5043
                 */
5044
0
                mode = ctxt->endCheckState;
5045
5046
0
                if (mode != 0) {
5047
0
                    if (htmlParseCharData(ctxt, !terminate) == 0)
5048
0
                        return;
5049
0
    } else if (in->cur[0] == '<') {
5050
0
                    int next;
5051
5052
0
                    if (avail < 2) {
5053
0
                        if (!terminate)
5054
0
                            return;
5055
0
                        next = ' ';
5056
0
                    } else {
5057
0
                        next = in->cur[1];
5058
0
                    }
5059
5060
0
                    if (next == '!') {
5061
0
                        if ((!terminate) && (avail < 4))
5062
0
                            return;
5063
0
                        if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5064
0
                            if ((!terminate) &&
5065
0
                                (htmlParseLookupCommentEnd(ctxt) < 0))
5066
0
                                return;
5067
0
                            SKIP(4);
5068
0
                            htmlParseComment(ctxt, /* bogus */ 0);
5069
                            /* don't change state */
5070
0
                            break;
5071
0
                        }
5072
5073
0
                        if ((!terminate) && (avail < 9))
5074
0
                            return;
5075
0
                        if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5076
0
                            (UPP(4) == 'C') && (UPP(5) == 'T') &&
5077
0
                            (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5078
0
                            (UPP(8) == 'E')) {
5079
0
                            if ((!terminate) &&
5080
0
                                (htmlParseLookupString(ctxt, 9, ">", 1,
5081
0
                                                       0) < 0))
5082
0
                                return;
5083
0
                            htmlParseDocTypeDecl(ctxt);
5084
0
                            if (ctxt->instate == XML_PARSER_MISC)
5085
0
                                ctxt->instate = XML_PARSER_PROLOG;
5086
0
                            else
5087
0
                                ctxt->instate = XML_PARSER_CONTENT;
5088
0
                        } else {
5089
0
                            ctxt->instate = XML_PARSER_CONTENT;
5090
0
                            if ((!terminate) &&
5091
0
                                (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5092
0
                                return;
5093
0
                            SKIP(2);
5094
0
                            htmlParseComment(ctxt, /* bogus */ 1);
5095
0
                        }
5096
0
                    } else if (next == '?') {
5097
0
                        if ((!terminate) &&
5098
0
                            (htmlParseLookupString(ctxt, 2, ">", 1, 0) < 0))
5099
0
                            return;
5100
0
                        SKIP(1);
5101
0
                        htmlParseComment(ctxt, /* bogus */ 1);
5102
                        /* don't change state */
5103
0
                    } else if (next == '/') {
5104
0
                        ctxt->instate = XML_PARSER_END_TAG;
5105
0
                        ctxt->checkIndex = 0;
5106
0
                    } else if (IS_ASCII_LETTER(next)) {
5107
0
                        ctxt->instate = XML_PARSER_START_TAG;
5108
0
                        ctxt->checkIndex = 0;
5109
0
                    } else {
5110
0
                        ctxt->instate = XML_PARSER_CONTENT;
5111
0
                        htmlStartCharData(ctxt);
5112
0
                        if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5113
0
                            (ctxt->sax->characters != NULL))
5114
0
                            ctxt->sax->characters(ctxt->userData,
5115
0
                                                  BAD_CAST "<", 1);
5116
0
                        SKIP(1);
5117
0
                    }
5118
0
                } else {
5119
0
                    ctxt->instate = XML_PARSER_CONTENT;
5120
                    /*
5121
                     * We follow the logic of the XML push parser
5122
                     */
5123
0
        if (avail < HTML_PARSER_BIG_BUFFER_SIZE) {
5124
0
                        if ((!terminate) &&
5125
0
                            (htmlParseLookupString(ctxt, 0, "<", 1, 0) < 0))
5126
0
                            return;
5127
0
                    }
5128
0
                    ctxt->checkIndex = 0;
5129
0
                    if (htmlParseCharData(ctxt, !terminate) == 0)
5130
0
                        return;
5131
0
    }
5132
5133
0
    break;
5134
0
      }
5135
5136
0
            case XML_PARSER_END_TAG:
5137
0
    if ((!terminate) &&
5138
0
        (htmlParseLookupGt(ctxt) < 0))
5139
0
        return;
5140
0
    htmlParseEndTag(ctxt);
5141
0
    ctxt->instate = XML_PARSER_CONTENT;
5142
0
    ctxt->checkIndex = 0;
5143
0
          break;
5144
5145
0
      default:
5146
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5147
0
           "HPP: internal error\n", NULL, NULL);
5148
0
    ctxt->instate = XML_PARSER_EOF;
5149
0
    break;
5150
0
  }
5151
0
    }
5152
0
}
5153
5154
/**
5155
 * Parse a chunk of memory in push parser mode.
5156
 *
5157
 * Assumes that the parser context was initialized with
5158
 * #htmlCreatePushParserCtxt.
5159
 *
5160
 * The last chunk, which will often be empty, must be marked with
5161
 * the `terminate` flag. With the default SAX callbacks, the resulting
5162
 * document will be available in `ctxt->myDoc`. This pointer will not
5163
 * be freed by the library.
5164
 *
5165
 * If the document isn't well-formed, `ctxt->myDoc` is set to NULL.
5166
 *
5167
 * Since 2.14.0, #xmlCtxtGetDocument can be used to retrieve the
5168
 * result document.
5169
 *
5170
 * @param ctxt  an HTML parser context
5171
 * @param chunk  chunk of memory
5172
 * @param size  size of chunk in bytes
5173
 * @param terminate  last chunk indicator
5174
 * @returns an xmlParserErrors code (0 on success).
5175
 */
5176
int
5177
htmlParseChunk(htmlParserCtxt *ctxt, const char *chunk, int size,
5178
0
              int terminate) {
5179
0
    if ((ctxt == NULL) ||
5180
0
        (ctxt->input == NULL) || (ctxt->input->buf == NULL) ||
5181
0
        (size < 0) ||
5182
0
        ((size > 0) && (chunk == NULL)))
5183
0
  return(XML_ERR_ARGUMENT);
5184
0
    if (PARSER_STOPPED(ctxt) != 0)
5185
0
        return(ctxt->errNo);
5186
5187
0
    if (size > 0)  {
5188
0
  size_t pos = ctxt->input->cur - ctxt->input->base;
5189
0
  int res;
5190
5191
0
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5192
0
        xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5193
0
  if (res < 0) {
5194
0
            htmlParseErr(ctxt, ctxt->input->buf->error,
5195
0
                         "xmlParserInputBufferPush failed", NULL, NULL);
5196
0
            xmlHaltParser(ctxt);
5197
0
      return (ctxt->errNo);
5198
0
  }
5199
0
    }
5200
5201
0
    htmlParseTryOrFinish(ctxt, terminate);
5202
5203
0
    if ((terminate) && (ctxt->instate != XML_PARSER_EOF)) {
5204
0
        htmlAutoCloseOnEnd(ctxt);
5205
5206
        /*
5207
         * Only check for truncated multi-byte sequences
5208
         */
5209
0
        xmlParserCheckEOF(ctxt, XML_ERR_INTERNAL_ERROR);
5210
5211
0
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5212
0
            ctxt->sax->endDocument(ctxt->userData);
5213
5214
0
  ctxt->instate = XML_PARSER_EOF;
5215
0
    }
5216
5217
0
    return((xmlParserErrors) ctxt->errNo);
5218
0
}
5219
5220
/************************************************************************
5221
 *                  *
5222
 *      User entry points       *
5223
 *                  *
5224
 ************************************************************************/
5225
5226
/**
5227
 * Create a parser context for using the HTML parser in push mode.
5228
 *
5229
 * @param sax  a SAX handler (optional)
5230
 * @param user_data  The user data returned on SAX callbacks (optional)
5231
 * @param chunk  a pointer to an array of chars (optional)
5232
 * @param size  number of chars in the array
5233
 * @param filename  only used for error reporting (optional)
5234
 * @param enc  encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5235
 * @returns the new parser context or NULL if a memory allocation
5236
 * failed.
5237
 */
5238
htmlParserCtxt *
5239
htmlCreatePushParserCtxt(htmlSAXHandler *sax, void *user_data,
5240
                         const char *chunk, int size, const char *filename,
5241
0
       xmlCharEncoding enc) {
5242
0
    htmlParserCtxtPtr ctxt;
5243
0
    htmlParserInputPtr input;
5244
0
    const char *encoding;
5245
5246
0
    ctxt = htmlNewSAXParserCtxt(sax, user_data);
5247
0
    if (ctxt == NULL)
5248
0
  return(NULL);
5249
5250
0
    encoding = xmlGetCharEncodingName(enc);
5251
0
    input = xmlNewPushInput(filename, chunk, size);
5252
0
    if (input == NULL) {
5253
0
  htmlFreeParserCtxt(ctxt);
5254
0
  return(NULL);
5255
0
    }
5256
5257
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
5258
0
        xmlFreeInputStream(input);
5259
0
        xmlFreeParserCtxt(ctxt);
5260
0
        return(NULL);
5261
0
    }
5262
5263
0
    if (encoding != NULL)
5264
0
        xmlSwitchEncodingName(ctxt, encoding);
5265
5266
0
    return(ctxt);
5267
0
}
5268
#endif /* LIBXML_PUSH_ENABLED */
5269
5270
/**
5271
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5272
 * to handle parse events. If sax is NULL, fallback to the default DOM
5273
 * behavior and return a tree.
5274
 *
5275
 * @deprecated Use #htmlNewSAXParserCtxt and #htmlCtxtReadDoc.
5276
 *
5277
 * @param cur  a pointer to an array of xmlChar
5278
 * @param encoding  a free form C string describing the HTML document encoding, or NULL
5279
 * @param sax  the SAX handler block
5280
 * @param userData  if using SAX, this pointer will be provided on callbacks.
5281
 * @returns the resulting document tree unless SAX is NULL or the document is
5282
 *     not well formed.
5283
 */
5284
5285
xmlDoc *
5286
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5287
0
                htmlSAXHandler *sax, void *userData) {
5288
0
    htmlDocPtr ret;
5289
0
    htmlParserCtxtPtr ctxt;
5290
5291
0
    if (cur == NULL)
5292
0
        return(NULL);
5293
5294
0
    ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5295
0
    if (ctxt == NULL)
5296
0
        return(NULL);
5297
5298
0
    if (sax != NULL) {
5299
0
        *ctxt->sax = *sax;
5300
0
        ctxt->userData = userData;
5301
0
    }
5302
5303
0
    htmlParseDocument(ctxt);
5304
0
    ret = ctxt->myDoc;
5305
0
    htmlFreeParserCtxt(ctxt);
5306
5307
0
    return(ret);
5308
0
}
5309
5310
/**
5311
 * Parse an HTML in-memory document and build a tree.
5312
 *
5313
 * @deprecated Use #htmlReadDoc.
5314
 *
5315
 * This function uses deprecated global parser options.
5316
 *
5317
 * @param cur  a pointer to an array of xmlChar
5318
 * @param encoding  the encoding (optional)
5319
 * @returns the resulting document tree
5320
 */
5321
5322
xmlDoc *
5323
0
htmlParseDoc(const xmlChar *cur, const char *encoding) {
5324
0
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5325
0
}
5326
5327
5328
/**
5329
 * Create a parser context to read from a file.
5330
 *
5331
 * @deprecated Use #htmlNewParserCtxt and #htmlCtxtReadFile.
5332
 *
5333
 * A non-NULL encoding overrides encoding declarations in the document.
5334
 *
5335
 * Automatic support for ZLIB/Compress compressed document is provided
5336
 * by default if found at compile-time.
5337
 *
5338
 * @param filename  the filename
5339
 * @param encoding  optional encoding
5340
 * @returns the new parser context or NULL if a memory allocation failed.
5341
 */
5342
htmlParserCtxt *
5343
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5344
0
{
5345
0
    htmlParserCtxtPtr ctxt;
5346
0
    htmlParserInputPtr input;
5347
5348
0
    if (filename == NULL)
5349
0
        return(NULL);
5350
5351
0
    ctxt = htmlNewParserCtxt();
5352
0
    if (ctxt == NULL) {
5353
0
  return(NULL);
5354
0
    }
5355
5356
0
    input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5357
0
    if (input == NULL) {
5358
0
  xmlFreeParserCtxt(ctxt);
5359
0
  return(NULL);
5360
0
    }
5361
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
5362
0
        xmlFreeInputStream(input);
5363
0
        xmlFreeParserCtxt(ctxt);
5364
0
        return(NULL);
5365
0
    }
5366
5367
0
    return(ctxt);
5368
0
}
5369
5370
/**
5371
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5372
 * compressed document is provided by default if found at compile-time.
5373
 * It use the given SAX function block to handle the parsing callback.
5374
 * If sax is NULL, fallback to the default DOM tree building routines.
5375
 *
5376
 * @deprecated Use #htmlNewSAXParserCtxt and #htmlCtxtReadFile.
5377
 *
5378
 * @param filename  the filename
5379
 * @param encoding  encoding (optional)
5380
 * @param sax  the SAX handler block
5381
 * @param userData  if using SAX, this pointer will be provided on callbacks.
5382
 * @returns the resulting document tree unless SAX is NULL or the document is
5383
 *     not well formed.
5384
 */
5385
5386
xmlDoc *
5387
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandler *sax,
5388
0
                 void *userData) {
5389
0
    htmlDocPtr ret;
5390
0
    htmlParserCtxtPtr ctxt;
5391
0
    htmlSAXHandlerPtr oldsax = NULL;
5392
5393
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5394
0
    if (ctxt == NULL) return(NULL);
5395
0
    if (sax != NULL) {
5396
0
  oldsax = ctxt->sax;
5397
0
        ctxt->sax = sax;
5398
0
        ctxt->userData = userData;
5399
0
    }
5400
5401
0
    htmlParseDocument(ctxt);
5402
5403
0
    ret = ctxt->myDoc;
5404
0
    if (sax != NULL) {
5405
0
        ctxt->sax = oldsax;
5406
0
        ctxt->userData = NULL;
5407
0
    }
5408
0
    htmlFreeParserCtxt(ctxt);
5409
5410
0
    return(ret);
5411
0
}
5412
5413
/**
5414
 * Parse an HTML file and build a tree.
5415
 *
5416
 * @param filename  the filename
5417
 * @param encoding  encoding (optional)
5418
 * @returns the resulting document tree
5419
 */
5420
5421
xmlDoc *
5422
0
htmlParseFile(const char *filename, const char *encoding) {
5423
0
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5424
0
}
5425
5426
/**
5427
 * Set and return the previous value for handling HTML omitted tags.
5428
 *
5429
 * @deprecated Use HTML_PARSE_NOIMPLIED
5430
 *
5431
 * @param val  int 0 or 1
5432
 * @returns the last value for 0 for no handling, 1 for auto insertion.
5433
 */
5434
5435
int
5436
0
htmlHandleOmittedElem(int val) {
5437
0
    int old = htmlOmittedDefaultValue;
5438
5439
0
    htmlOmittedDefaultValue = val;
5440
0
    return(old);
5441
0
}
5442
5443
/**
5444
 * @deprecated Don't use.
5445
 *
5446
 * @param parent  HTML parent element
5447
 * @param elt  HTML element
5448
 * @returns 1
5449
 */
5450
int
5451
htmlElementAllowedHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5452
0
                       const xmlChar* elt ATTRIBUTE_UNUSED) {
5453
0
    return(1);
5454
0
}
5455
5456
/**
5457
 * @deprecated Don't use.
5458
 *
5459
 * @param parent  HTML parent element
5460
 * @param elt  HTML element
5461
 * @returns HTML_VALID
5462
 */
5463
htmlStatus
5464
htmlElementStatusHere(const htmlElemDesc* parent ATTRIBUTE_UNUSED,
5465
0
                      const htmlElemDesc* elt ATTRIBUTE_UNUSED) {
5466
0
    return(HTML_VALID);
5467
0
}
5468
5469
/**
5470
 * @deprecated Don't use.
5471
 *
5472
 * @param elt  HTML element
5473
 * @param attr  HTML attribute
5474
 * @param legacy  whether to allow deprecated attributes
5475
 * @returns HTML_VALID
5476
 */
5477
htmlStatus
5478
htmlAttrAllowed(const htmlElemDesc* elt ATTRIBUTE_UNUSED,
5479
                const xmlChar* attr ATTRIBUTE_UNUSED,
5480
0
                int legacy ATTRIBUTE_UNUSED) {
5481
0
    return(HTML_VALID);
5482
0
}
5483
5484
/**
5485
 * @deprecated Don't use.
5486
 *
5487
 * @param node  an xmlNode in a tree
5488
 * @param legacy  whether to allow deprecated elements (YES is faster here
5489
 *  for Element nodes)
5490
 * @returns HTML_VALID
5491
 */
5492
htmlStatus
5493
htmlNodeStatus(xmlNode *node ATTRIBUTE_UNUSED,
5494
0
               int legacy ATTRIBUTE_UNUSED) {
5495
0
    return(HTML_VALID);
5496
0
}
5497
5498
/************************************************************************
5499
 *                  *
5500
 *  New set (2.6.0) of simpler and more flexible APIs   *
5501
 *                  *
5502
 ************************************************************************/
5503
/**
 * Free a string unless it is owned by the dictionary named `dict` in the
 * scope where the macro is expanded. A NULL string is ignored, and with
 * a NULL `dict` every non-NULL string is freed.
 *
 * Note that this expands to a bare `if` statement that already ends in
 * a semicolon, so it must not be used as the unbraced body of an
 * `if`/`else`.
 *
 * @param str  a string
 */
#define DICT_FREE(str)                                              \
    if (((str) != NULL) &&                                          \
        ((dict == NULL) ||                                          \
         (!xmlDictOwns(dict, (const xmlChar *)(str)))))             \
        xmlFree((char *)(str));
5513
5514
/**
5515
 * Reset a parser context
5516
 *
5517
 * Same as #xmlCtxtReset.
5518
 *
5519
 * @param ctxt  an HTML parser context
5520
 */
5521
void
5522
htmlCtxtReset(htmlParserCtxt *ctxt)
5523
0
{
5524
0
    xmlCtxtReset(ctxt);
5525
0
}
5526
5527
static int
5528
htmlCtxtSetOptionsInternal(xmlParserCtxtPtr ctxt, int options, int keepMask)
5529
0
{
5530
0
    int allMask;
5531
5532
0
    if (ctxt == NULL)
5533
0
        return(-1);
5534
5535
0
    allMask = HTML_PARSE_RECOVER |
5536
0
              HTML_PARSE_HTML5 |
5537
0
              HTML_PARSE_NODEFDTD |
5538
0
              HTML_PARSE_NOERROR |
5539
0
              HTML_PARSE_NOWARNING |
5540
0
              HTML_PARSE_PEDANTIC |
5541
0
              HTML_PARSE_NOBLANKS |
5542
0
              HTML_PARSE_NONET |
5543
0
              HTML_PARSE_NOIMPLIED |
5544
0
              HTML_PARSE_COMPACT |
5545
0
              HTML_PARSE_HUGE |
5546
0
              HTML_PARSE_IGNORE_ENC |
5547
0
              HTML_PARSE_BIG_LINES;
5548
5549
0
    ctxt->options = (ctxt->options & keepMask) | (options & allMask);
5550
5551
    /*
5552
     * For some options, struct members are historically the source
5553
     * of truth. See xmlCtxtSetOptionsInternal.
5554
     */
5555
0
    ctxt->keepBlanks = (options & HTML_PARSE_NOBLANKS) ? 0 : 1;
5556
5557
    /*
5558
     * Changing SAX callbacks is a bad idea. This should be fixed.
5559
     */
5560
0
    if (options & HTML_PARSE_NOBLANKS) {
5561
0
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
5562
0
    }
5563
0
    if (options & HTML_PARSE_HUGE) {
5564
0
        if (ctxt->dict != NULL)
5565
0
            xmlDictSetLimit(ctxt->dict, 0);
5566
0
    }
5567
5568
    /*
5569
     * It would be useful to allow this feature.
5570
     */
5571
0
    ctxt->dictNames = 0;
5572
5573
    /*
5574
     * Allow XML_PARSE_NOENT which many users set on the HTML parser.
5575
     */
5576
0
    return(options & ~allMask & ~XML_PARSE_NOENT);
5577
0
}
5578
5579
/**
5580
 * Applies the options to the parser context. Unset options are
5581
 * cleared.
5582
 *
5583
 * @since 2.14.0
5584
 *
5585
 * With older versions, you can use #htmlCtxtUseOptions.
5586
 *
5587
 * @param ctxt  an HTML parser context
5588
 * @param options  a bitmask of htmlParserOption values
5589
 * @returns 0 in case of success, the set of unknown or unimplemented options
5590
 *         in case of error.
5591
 */
5592
int
5593
htmlCtxtSetOptions(htmlParserCtxt *ctxt, int options)
5594
0
{
5595
0
    return(htmlCtxtSetOptionsInternal(ctxt, options, 0));
5596
0
}
5597
5598
/**
5599
 * Applies the options to the parser context. The following options
5600
 * are never cleared and can only be enabled:
5601
 *
5602
 * @deprecated Use #htmlCtxtSetOptions.
5603
 *
5604
 * - HTML_PARSE_NODEFDTD
5605
 * - HTML_PARSE_NOERROR
5606
 * - HTML_PARSE_NOWARNING
5607
 * - HTML_PARSE_NOIMPLIED
5608
 * - HTML_PARSE_COMPACT
5609
 * - HTML_PARSE_HUGE
5610
 * - HTML_PARSE_IGNORE_ENC
5611
 * - HTML_PARSE_BIG_LINES
5612
 *
5613
 * @param ctxt  an HTML parser context
5614
 * @param options  a combination of htmlParserOption values
5615
 * @returns 0 in case of success, the set of unknown or unimplemented options
5616
 *         in case of error.
5617
 */
5618
int
5619
htmlCtxtUseOptions(htmlParserCtxt *ctxt, int options)
5620
0
{
5621
0
    int keepMask;
5622
5623
    /*
5624
     * For historic reasons, some options can only be enabled.
5625
     */
5626
0
    keepMask = HTML_PARSE_NODEFDTD |
5627
0
               HTML_PARSE_NOERROR |
5628
0
               HTML_PARSE_NOWARNING |
5629
0
               HTML_PARSE_NOIMPLIED |
5630
0
               HTML_PARSE_COMPACT |
5631
0
               HTML_PARSE_HUGE |
5632
0
               HTML_PARSE_IGNORE_ENC |
5633
0
               HTML_PARSE_BIG_LINES;
5634
5635
0
    return(htmlCtxtSetOptionsInternal(ctxt, options, keepMask));
5636
0
}
5637
5638
/**
5639
 * Parse an HTML document and return the resulting document tree.
5640
 *
5641
 * @since 2.13.0
5642
 *
5643
 * @param ctxt  an HTML parser context
5644
 * @param input  parser input
5645
 * @returns the resulting document tree or NULL
5646
 */
5647
xmlDoc *
5648
htmlCtxtParseDocument(htmlParserCtxt *ctxt, xmlParserInput *input)
5649
0
{
5650
0
    htmlDocPtr ret;
5651
5652
0
    if ((ctxt == NULL) || (input == NULL)) {
5653
0
        xmlFatalErr(ctxt, XML_ERR_ARGUMENT, NULL);
5654
0
        xmlFreeInputStream(input);
5655
0
        return(NULL);
5656
0
    }
5657
5658
    /* assert(ctxt->inputNr == 0); */
5659
0
    while (ctxt->inputNr > 0)
5660
0
        xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5661
5662
0
    if (xmlCtxtPushInput(ctxt, input) < 0) {
5663
0
        xmlFreeInputStream(input);
5664
0
        return(NULL);
5665
0
    }
5666
5667
0
    ctxt->html = INSERT_INITIAL;
5668
0
    htmlParseDocument(ctxt);
5669
5670
0
    ret = xmlCtxtGetDocument(ctxt);
5671
5672
    /* assert(ctxt->inputNr == 1); */
5673
0
    while (ctxt->inputNr > 0)
5674
0
        xmlFreeInputStream(xmlCtxtPopInput(ctxt));
5675
5676
0
    return(ret);
5677
0
}
5678
5679
/**
5680
 * Convenience function to parse an HTML document from a zero-terminated
5681
 * string.
5682
 *
5683
 * See #htmlCtxtReadDoc for details.
5684
 *
5685
 * @param str  a pointer to a zero terminated string
5686
 * @param url  only used for error reporting (optoinal)
5687
 * @param encoding  the document encoding (optional)
5688
 * @param options  a combination of htmlParserOption values
5689
 * @returns the resulting document tree.
5690
 */
5691
xmlDoc *
5692
htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
5693
            int options)
5694
0
{
5695
0
    htmlParserCtxtPtr ctxt;
5696
0
    xmlParserInputPtr input;
5697
0
    htmlDocPtr doc = NULL;
5698
5699
0
    ctxt = htmlNewParserCtxt();
5700
0
    if (ctxt == NULL)
5701
0
        return(NULL);
5702
5703
0
    htmlCtxtUseOptions(ctxt, options);
5704
5705
0
    input = xmlCtxtNewInputFromString(ctxt, url, (const char *) str, encoding,
5706
0
                                      XML_INPUT_BUF_STATIC);
5707
5708
0
    if (input != NULL)
5709
0
        doc = htmlCtxtParseDocument(ctxt, input);
5710
5711
0
    htmlFreeParserCtxt(ctxt);
5712
0
    return(doc);
5713
0
}
5714
5715
/**
5716
 * Convenience function to parse an HTML file from the filesystem,
5717
 * the network or a global user-defined resource loader.
5718
 *
5719
 * See #htmlCtxtReadFile for details.
5720
 *
5721
 * @param filename  a file or URL
5722
 * @param encoding  the document encoding (optional)
5723
 * @param options  a combination of htmlParserOption values
5724
 * @returns the resulting document tree.
5725
 */
5726
xmlDoc *
5727
htmlReadFile(const char *filename, const char *encoding, int options)
5728
0
{
5729
0
    htmlParserCtxtPtr ctxt;
5730
0
    xmlParserInputPtr input;
5731
0
    htmlDocPtr doc = NULL;
5732
5733
0
    ctxt = htmlNewParserCtxt();
5734
0
    if (ctxt == NULL)
5735
0
        return(NULL);
5736
5737
0
    htmlCtxtUseOptions(ctxt, options);
5738
5739
0
    input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5740
5741
0
    if (input != NULL)
5742
0
        doc = htmlCtxtParseDocument(ctxt, input);
5743
5744
0
    htmlFreeParserCtxt(ctxt);
5745
0
    return(doc);
5746
0
}
5747
5748
/**
5749
 * Convenience function to parse an HTML document from memory.
5750
 * The input buffer must not contain any terminating null bytes.
5751
 *
5752
 * See #htmlCtxtReadMemory for details.
5753
 *
5754
 * @param buffer  a pointer to a char array
5755
 * @param size  the size of the array
5756
 * @param url  only used for error reporting (optional)
5757
 * @param encoding  the document encoding, or NULL
5758
 * @param options  a combination of htmlParserOption values
5759
 * @returns the resulting document tree
5760
 */
5761
xmlDoc *
5762
htmlReadMemory(const char *buffer, int size, const char *url,
5763
               const char *encoding, int options)
5764
0
{
5765
0
    htmlParserCtxtPtr ctxt;
5766
0
    xmlParserInputPtr input;
5767
0
    htmlDocPtr doc = NULL;
5768
5769
0
    if (size < 0)
5770
0
  return(NULL);
5771
5772
0
    ctxt = htmlNewParserCtxt();
5773
0
    if (ctxt == NULL)
5774
0
        return(NULL);
5775
5776
0
    htmlCtxtUseOptions(ctxt, options);
5777
5778
0
    input = xmlCtxtNewInputFromMemory(ctxt, url, buffer, size, encoding,
5779
0
                                      XML_INPUT_BUF_STATIC);
5780
5781
0
    if (input != NULL)
5782
0
        doc = htmlCtxtParseDocument(ctxt, input);
5783
5784
0
    htmlFreeParserCtxt(ctxt);
5785
0
    return(doc);
5786
0
}
5787
5788
/**
5789
 * Convenience function to parse an HTML document from a
5790
 * file descriptor.
5791
 *
5792
 * NOTE that the file descriptor will not be closed when the
5793
 * context is freed or reset.
5794
 *
5795
 * See #htmlCtxtReadFd for details.
5796
 *
5797
 * @param fd  an open file descriptor
5798
 * @param url  only used for error reporting (optional)
5799
 * @param encoding  the document encoding, or NULL
5800
 * @param options  a combination of htmlParserOption values
5801
 * @returns the resulting document tree
5802
 */
5803
xmlDoc *
5804
htmlReadFd(int fd, const char *url, const char *encoding, int options)
5805
0
{
5806
0
    htmlParserCtxtPtr ctxt;
5807
0
    xmlParserInputPtr input;
5808
0
    htmlDocPtr doc = NULL;
5809
5810
0
    ctxt = htmlNewParserCtxt();
5811
0
    if (ctxt == NULL)
5812
0
        return(NULL);
5813
5814
0
    htmlCtxtUseOptions(ctxt, options);
5815
5816
0
    input = xmlCtxtNewInputFromFd(ctxt, url, fd, encoding, 0);
5817
5818
0
    if (input != NULL)
5819
0
        doc = htmlCtxtParseDocument(ctxt, input);
5820
5821
0
    htmlFreeParserCtxt(ctxt);
5822
0
    return(doc);
5823
0
}
5824
5825
/**
5826
 * Convenience function to parse an HTML document from I/O functions
5827
 * and context.
5828
 *
5829
 * See #htmlCtxtReadIO for details.
5830
 *
5831
 * @param ioread  an I/O read function
5832
 * @param ioclose  an I/O close function (optional)
5833
 * @param ioctx  an I/O handler
5834
 * @param url  only used for error reporting (optional)
5835
 * @param encoding  the document encoding (optional)
5836
 * @param options  a combination of htmlParserOption values
5837
 * @returns the resulting document tree
5838
 */
5839
xmlDoc *
5840
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
5841
          void *ioctx, const char *url, const char *encoding, int options)
5842
0
{
5843
0
    htmlParserCtxtPtr ctxt;
5844
0
    xmlParserInputPtr input;
5845
0
    htmlDocPtr doc = NULL;
5846
5847
0
    ctxt = htmlNewParserCtxt();
5848
0
    if (ctxt == NULL)
5849
0
        return (NULL);
5850
5851
0
    htmlCtxtUseOptions(ctxt, options);
5852
5853
0
    input = xmlCtxtNewInputFromIO(ctxt, url, ioread, ioclose, ioctx,
5854
0
                                  encoding, 0);
5855
5856
0
    if (input != NULL)
5857
0
        doc = htmlCtxtParseDocument(ctxt, input);
5858
5859
0
    htmlFreeParserCtxt(ctxt);
5860
0
    return(doc);
5861
0
}
5862
5863
/**
5864
 * Parse an HTML in-memory document and build a tree.
5865
 *
5866
 * See #htmlCtxtUseOptions for details.
5867
 *
5868
 * @param ctxt  an HTML parser context
5869
 * @param str  a pointer to a zero terminated string
5870
 * @param URL  only used for error reporting (optional)
5871
 * @param encoding  the document encoding (optional)
5872
 * @param options  a combination of htmlParserOption values
5873
 * @returns the resulting document tree
5874
 */
5875
xmlDoc *
5876
htmlCtxtReadDoc(xmlParserCtxt *ctxt, const xmlChar *str,
5877
                const char *URL, const char *encoding, int options)
5878
0
{
5879
0
    xmlParserInputPtr input;
5880
5881
0
    if (ctxt == NULL)
5882
0
        return (NULL);
5883
5884
0
    htmlCtxtReset(ctxt);
5885
0
    htmlCtxtUseOptions(ctxt, options);
5886
5887
0
    input = xmlCtxtNewInputFromString(ctxt, URL, (const char *) str,
5888
0
                                      encoding, 0);
5889
0
    if (input == NULL)
5890
0
        return(NULL);
5891
5892
0
    return(htmlCtxtParseDocument(ctxt, input));
5893
0
}
5894
5895
/**
5896
 * Parse an HTML file from the filesystem, the network or a
5897
 * user-defined resource loader.
5898
 *
5899
 * See #htmlCtxtUseOptions for details.
5900
 *
5901
 * @param ctxt  an HTML parser context
5902
 * @param filename  a file or URL
5903
 * @param encoding  the document encoding (optional)
5904
 * @param options  a combination of htmlParserOption values
5905
 * @returns the resulting document tree
5906
 */
5907
xmlDoc *
5908
htmlCtxtReadFile(xmlParserCtxt *ctxt, const char *filename,
5909
                const char *encoding, int options)
5910
0
{
5911
0
    xmlParserInputPtr input;
5912
5913
0
    if (ctxt == NULL)
5914
0
        return (NULL);
5915
5916
0
    htmlCtxtReset(ctxt);
5917
0
    htmlCtxtUseOptions(ctxt, options);
5918
5919
0
    input = xmlCtxtNewInputFromUrl(ctxt, filename, NULL, encoding, 0);
5920
0
    if (input == NULL)
5921
0
        return(NULL);
5922
5923
0
    return(htmlCtxtParseDocument(ctxt, input));
5924
0
}
5925
5926
/**
5927
 * Parse an HTML in-memory document and build a tree. The input buffer must
5928
 * not contain any terminating null bytes.
5929
 *
5930
 * See #htmlCtxtUseOptions for details.
5931
 *
5932
 * @param ctxt  an HTML parser context
5933
 * @param buffer  a pointer to a char array
5934
 * @param size  the size of the array
5935
 * @param URL  only used for error reporting (optional)
5936
 * @param encoding  the document encoding (optinal)
5937
 * @param options  a combination of htmlParserOption values
5938
 * @returns the resulting document tree
5939
 */
5940
xmlDoc *
5941
htmlCtxtReadMemory(xmlParserCtxt *ctxt, const char *buffer, int size,
5942
                  const char *URL, const char *encoding, int options)
5943
0
{
5944
0
    xmlParserInputPtr input;
5945
5946
0
    if ((ctxt == NULL) || (size < 0))
5947
0
        return (NULL);
5948
5949
0
    htmlCtxtReset(ctxt);
5950
0
    htmlCtxtUseOptions(ctxt, options);
5951
5952
0
    input = xmlCtxtNewInputFromMemory(ctxt, URL, buffer, size, encoding,
5953
0
                                      XML_INPUT_BUF_STATIC);
5954
0
    if (input == NULL)
5955
0
        return(NULL);
5956
5957
0
    return(htmlCtxtParseDocument(ctxt, input));
5958
0
}
5959
5960
/**
5961
 * Parse an HTML from a file descriptor and build a tree.
5962
 *
5963
 * See #htmlCtxtUseOptions for details.
5964
 *
5965
 * NOTE that the file descriptor will not be closed when the
5966
 * context is freed or reset.
5967
 *
5968
 * @param ctxt  an HTML parser context
5969
 * @param fd  an open file descriptor
5970
 * @param URL  only used for error reporting (optional)
5971
 * @param encoding  the document encoding (optinal)
5972
 * @param options  a combination of htmlParserOption values
5973
 * @returns the resulting document tree
5974
 */
5975
xmlDoc *
5976
htmlCtxtReadFd(xmlParserCtxt *ctxt, int fd,
5977
              const char *URL, const char *encoding, int options)
5978
0
{
5979
0
    xmlParserInputPtr input;
5980
5981
0
    if (ctxt == NULL)
5982
0
        return(NULL);
5983
5984
0
    htmlCtxtReset(ctxt);
5985
0
    htmlCtxtUseOptions(ctxt, options);
5986
5987
0
    input = xmlCtxtNewInputFromFd(ctxt, URL, fd, encoding, 0);
5988
0
    if (input == NULL)
5989
0
        return(NULL);
5990
5991
0
    return(htmlCtxtParseDocument(ctxt, input));
5992
0
}
5993
5994
/**
5995
 * Parse an HTML document from I/O functions and source and build a tree.
5996
 *
5997
 * See #htmlCtxtUseOptions for details.
5998
 *
5999
 * @param ctxt  an HTML parser context
6000
 * @param ioread  an I/O read function
6001
 * @param ioclose  an I/O close function
6002
 * @param ioctx  an I/O handler
6003
 * @param URL  the base URL to use for the document
6004
 * @param encoding  the document encoding, or NULL
6005
 * @param options  a combination of htmlParserOption values
6006
 * @returns the resulting document tree
6007
 */
6008
xmlDoc *
6009
htmlCtxtReadIO(xmlParserCtxt *ctxt, xmlInputReadCallback ioread,
6010
              xmlInputCloseCallback ioclose, void *ioctx,
6011
        const char *URL,
6012
              const char *encoding, int options)
6013
0
{
6014
0
    xmlParserInputPtr input;
6015
6016
0
    if (ctxt == NULL)
6017
0
        return (NULL);
6018
6019
0
    htmlCtxtReset(ctxt);
6020
0
    htmlCtxtUseOptions(ctxt, options);
6021
6022
0
    input = xmlCtxtNewInputFromIO(ctxt, URL, ioread, ioclose, ioctx,
6023
0
                                  encoding, 0);
6024
0
    if (input == NULL)
6025
0
        return(NULL);
6026
6027
0
    return(htmlCtxtParseDocument(ctxt, input));
6028
0
}
6029
6030
#endif /* LIBXML_HTML_ENABLED */