Coverage Report

Created: 2025-07-11 06:48

/src/tinysparql/subprojects/libxml2-2.13.1/HTMLparser.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLparser.c : an HTML 4.0 non-verifying parser
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
#define IN_LIBXML
10
#include "libxml.h"
11
#ifdef LIBXML_HTML_ENABLED
12
13
#include <string.h>
14
#include <ctype.h>
15
#include <stdlib.h>
16
17
#include <libxml/HTMLparser.h>
18
#include <libxml/xmlmemory.h>
19
#include <libxml/tree.h>
20
#include <libxml/parser.h>
21
#include <libxml/parserInternals.h>
22
#include <libxml/xmlerror.h>
23
#include <libxml/HTMLtree.h>
24
#include <libxml/entities.h>
25
#include <libxml/encoding.h>
26
#include <libxml/xmlIO.h>
27
#include <libxml/uri.h>
28
29
#include "private/buf.h"
30
#include "private/enc.h"
31
#include "private/error.h"
32
#include "private/html.h"
33
#include "private/io.h"
34
#include "private/parser.h"
35
#include "private/tree.h"
36
37
#define HTML_MAX_NAMELEN 1000
38
0
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
39
0
#define HTML_PARSER_BUFFER_SIZE 100
40
41
static int htmlOmittedDefaultValue = 1;
42
43
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44
           xmlChar end, xmlChar  end2, xmlChar end3);
45
static void htmlParseComment(htmlParserCtxtPtr ctxt);
46
47
/************************************************************************
48
 *                  *
49
 *    Some factorized error routines        *
50
 *                  *
51
 ************************************************************************/
52
53
/**
54
 * htmlErrMemory:
55
 * @ctxt:  an HTML parser context
56
 * @extra:  extra information
57
 *
58
 * Handle a redefinition of attribute error
59
 */
60
static void
61
htmlErrMemory(xmlParserCtxtPtr ctxt)
62
0
{
63
0
    xmlCtxtErrMemory(ctxt);
64
0
}
65
66
/**
67
 * htmlParseErr:
68
 * @ctxt:  an HTML parser context
69
 * @error:  the error number
70
 * @msg:  the error message
71
 * @str1:  string infor
72
 * @str2:  string infor
73
 *
74
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
75
 */
76
static void LIBXML_ATTR_FORMAT(3,0)
77
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
78
             const char *msg, const xmlChar *str1, const xmlChar *str2)
79
0
{
80
0
    xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
81
0
               str1, str2, NULL, 0, msg, str1, str2);
82
0
}
83
84
/**
85
 * htmlParseErrInt:
86
 * @ctxt:  an HTML parser context
87
 * @error:  the error number
88
 * @msg:  the error message
89
 * @val:  integer info
90
 *
91
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
92
 */
93
static void LIBXML_ATTR_FORMAT(3,0)
94
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
95
             const char *msg, int val)
96
0
{
97
0
    xmlCtxtErr(ctxt, NULL, XML_FROM_HTML, error, XML_ERR_ERROR,
98
0
               NULL, NULL, NULL, val, msg, val);
99
0
}
100
101
/************************************************************************
102
 *                  *
103
 *  Parser stacks related functions and macros    *
104
 *                  *
105
 ************************************************************************/
106
107
/**
108
 * htmlnamePush:
109
 * @ctxt:  an HTML parser context
110
 * @value:  the element name
111
 *
112
 * Pushes a new element name on top of the name stack
113
 *
114
 * Returns -1 in case of error, the index in the stack otherwise
115
 */
116
static int
117
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
118
0
{
119
0
    if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
120
0
        ctxt->html = 3;
121
0
    if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
122
0
        ctxt->html = 10;
123
0
    if (ctxt->nameNr >= ctxt->nameMax) {
124
0
        size_t newSize = ctxt->nameMax * 2;
125
0
        const xmlChar **tmp;
126
127
0
        tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
128
0
                         newSize * sizeof(ctxt->nameTab[0]));
129
0
        if (tmp == NULL) {
130
0
            htmlErrMemory(ctxt);
131
0
            return (-1);
132
0
        }
133
0
        ctxt->nameTab = tmp;
134
0
        ctxt->nameMax = newSize;
135
0
    }
136
0
    ctxt->nameTab[ctxt->nameNr] = value;
137
0
    ctxt->name = value;
138
0
    return (ctxt->nameNr++);
139
0
}
140
/**
141
 * htmlnamePop:
142
 * @ctxt: an HTML parser context
143
 *
144
 * Pops the top element name from the name stack
145
 *
146
 * Returns the name just removed
147
 */
148
static const xmlChar *
149
htmlnamePop(htmlParserCtxtPtr ctxt)
150
0
{
151
0
    const xmlChar *ret;
152
153
0
    if (ctxt->nameNr <= 0)
154
0
        return (NULL);
155
0
    ctxt->nameNr--;
156
0
    if (ctxt->nameNr < 0)
157
0
        return (NULL);
158
0
    if (ctxt->nameNr > 0)
159
0
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
160
0
    else
161
0
        ctxt->name = NULL;
162
0
    ret = ctxt->nameTab[ctxt->nameNr];
163
0
    ctxt->nameTab[ctxt->nameNr] = NULL;
164
0
    return (ret);
165
0
}
166
167
/**
168
 * htmlNodeInfoPush:
169
 * @ctxt:  an HTML parser context
170
 * @value:  the node info
171
 *
172
 * Pushes a new element name on top of the node info stack
173
 *
174
 * Returns 0 in case of error, the index in the stack otherwise
175
 */
176
static int
177
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
178
0
{
179
0
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
180
0
        if (ctxt->nodeInfoMax == 0)
181
0
                ctxt->nodeInfoMax = 5;
182
0
        ctxt->nodeInfoMax *= 2;
183
0
        ctxt->nodeInfoTab = (htmlParserNodeInfo *)
184
0
                         xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
185
0
                                    ctxt->nodeInfoMax *
186
0
                                    sizeof(ctxt->nodeInfoTab[0]));
187
0
        if (ctxt->nodeInfoTab == NULL) {
188
0
            htmlErrMemory(ctxt);
189
0
            return (0);
190
0
        }
191
0
    }
192
0
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
193
0
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
194
0
    return (ctxt->nodeInfoNr++);
195
0
}
196
197
/**
198
 * htmlNodeInfoPop:
199
 * @ctxt:  an HTML parser context
200
 *
201
 * Pops the top element name from the node info stack
202
 *
203
 * Returns 0 in case of error, the pointer to NodeInfo otherwise
204
 */
205
static htmlParserNodeInfo *
206
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
207
0
{
208
0
    if (ctxt->nodeInfoNr <= 0)
209
0
        return (NULL);
210
0
    ctxt->nodeInfoNr--;
211
0
    if (ctxt->nodeInfoNr < 0)
212
0
        return (NULL);
213
0
    if (ctxt->nodeInfoNr > 0)
214
0
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
215
0
    else
216
0
        ctxt->nodeInfo = NULL;
217
0
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
218
0
}
219
220
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a parser context variable named
 * `ctxt` to be in scope where they are expanded.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use
 * them; they work on raw bytes and are only suitable for comparisons
 * against ASCII values:
 *
 *   CUR_PTR   the current input pointer (ctxt->input->cur).
 *   BASE_PTR  the base pointer of the current input (ctxt->input->base).
 *   CUR, RAW  the byte at the current input pointer.
 *   UPPER     the byte at the current input pointer passed through
 *             toupper().
 *   NXT(n)    the byte n positions after the current input pointer.
 *   UPP(n)    NXT(n) passed through toupper().
 *   SKIP(n)   advance the input pointer and the column counter by n. The
 *             line counter is not touched, so it must not be used across
 *             newlines.
 *
 * Buffer management:
 *
 *   SHRINK    calls xmlParserShrink() when the parser is not progressive
 *             and the consumed/remaining sizes pass the INPUT_CHUNK
 *             thresholds below.
 *   GROW      calls xmlParserGrow() when the parser is not progressive
 *             and fewer than INPUT_CHUNK bytes remain.
 *
 *   Both expand to a bare `if` statement that already ends with a
 *   semicolon; do not use them as the unbraced body of an if/else.
 *
 * Character-level macros:
 *
 *   SKIP_BLANKS  htmlSkipBlankChars(ctxt).
 *   NEXT         xmlNextChar(ctxt).
 *   NEXTL(l)     advance the input pointer by l bytes, updating line/col
 *                from the byte being left (newline starts a new line).
 *   CUR_CHAR(l)  htmlCurrentChar(ctxt, &l): current character, with its
 *                byte length stored in l.
 *   COPY_BUF(l,b,i,v)
 *                append character v (of byte length l) to buffer b at
 *                index i and advance i; a length of 1 is stored directly,
 *                anything else goes through xmlCopyChar(). Expands to an
 *                if/else statement without trailing semicolon.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) (ctxt->input->cur[(val)])

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

#define SHRINK \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
        xmlParserShrink(ctxt);

#define GROW \
    if ((!PARSER_PROGRESSIVE(ctxt)) && \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)) \
        xmlParserGrow(ctxt);

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

#define RAW (*ctxt->input->cur)

#define NEXTL(l) do {                                       \
    if (*(ctxt->input->cur) == '\n') {                      \
        ctxt->input->line++; ctxt->input->col = 1;          \
    } else ctxt->input->col++;                              \
    ctxt->input->cur += (l);                                \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))

#define COPY_BUF(l,b,i,v)                                   \
    if ((l) == 1) (b)[(i)++] = (v);                         \
    else (i) += xmlCopyChar((l),&(b)[(i)],(v))
297
298
/**
299
 * htmlFindEncoding:
300
 * @the HTML parser context
301
 *
302
 * Ty to find and encoding in the current data available in the input
303
 * buffer this is needed to try to switch to the proper encoding when
304
 * one face a character error.
305
 * That's an heuristic, since it's operating outside of parsing it could
306
 * try to use a meta which had been commented out, that's the reason it
307
 * should only be used in case of error, not as a default.
308
 *
309
 * Returns an encoding string or NULL if not found, the string need to
310
 *   be freed
311
 */
312
static xmlChar *
313
0
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
314
0
    const xmlChar *start, *cur, *end;
315
0
    xmlChar *ret;
316
317
0
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
318
0
        (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
319
0
        return(NULL);
320
0
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
321
0
        return(NULL);
322
323
0
    start = ctxt->input->cur;
324
0
    end = ctxt->input->end;
325
    /* we also expect the input buffer to be zero terminated */
326
0
    if (*end != 0)
327
0
        return(NULL);
328
329
0
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
330
0
    if (cur == NULL)
331
0
        return(NULL);
332
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
333
0
    if (cur == NULL)
334
0
        return(NULL);
335
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
336
0
    if (cur == NULL)
337
0
        return(NULL);
338
0
    cur += 8;
339
0
    start = cur;
340
0
    while (((*cur >= 'A') && (*cur <= 'Z')) ||
341
0
           ((*cur >= 'a') && (*cur <= 'z')) ||
342
0
           ((*cur >= '0') && (*cur <= '9')) ||
343
0
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
344
0
           cur++;
345
0
    if (cur == start)
346
0
        return(NULL);
347
0
    ret = xmlStrndup(start, cur - start);
348
0
    if (ret == NULL)
349
0
        htmlErrMemory(ctxt);
350
0
    return(ret);
351
0
}
352
353
/**
354
 * htmlCurrentChar:
355
 * @ctxt:  the HTML parser context
356
 * @len:  pointer to the length of the char read
357
 *
358
 * The current char value, if using UTF-8 this may actually span multiple
359
 * bytes in the input buffer. Implement the end of line normalization:
360
 * 2.11 End-of-Line Handling
361
 * If the encoding is unspecified, in the case we find an ISO-Latin-1
362
 * char, then the encoding converter is plugged in automatically.
363
 *
364
 * Returns the current char value and its length
365
 */
366
367
static int
368
0
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
369
0
    const unsigned char *cur;
370
0
    unsigned char c;
371
0
    unsigned int val;
372
373
0
    if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK)
374
0
        xmlParserGrow(ctxt);
375
376
0
    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
377
0
        xmlChar * guess;
378
379
        /*
380
         * Assume it's a fixed length encoding (1) with
381
         * a compatible encoding for the ASCII set, since
382
         * HTML constructs only use < 128 chars
383
         */
384
0
        if (*ctxt->input->cur < 0x80) {
385
0
            if (*ctxt->input->cur == 0) {
386
0
                if (ctxt->input->cur < ctxt->input->end) {
387
0
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
388
0
                                    "Char 0x%X out of allowed range\n", 0);
389
0
                    *len = 1;
390
0
                    return(' ');
391
0
                } else {
392
0
                    *len = 0;
393
0
                    return(0);
394
0
                }
395
0
            }
396
0
            *len = 1;
397
0
            return(*ctxt->input->cur);
398
0
        }
399
400
        /*
401
         * Humm this is bad, do an automatic flow conversion
402
         */
403
0
        guess = htmlFindEncoding(ctxt);
404
0
        if (guess == NULL) {
405
0
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
406
0
        } else {
407
0
            xmlSwitchEncodingName(ctxt, (const char *) guess);
408
0
            xmlFree(guess);
409
0
        }
410
0
        ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
411
0
    }
412
413
    /*
414
     * We are supposed to handle UTF8, check it's valid
415
     * From rfc2044: encoding of the Unicode values on UTF-8:
416
     *
417
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
418
     * 0000 0000-0000 007F   0xxxxxxx
419
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
420
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
421
     *
422
     * Check for the 0x110000 limit too
423
     */
424
0
    cur = ctxt->input->cur;
425
0
    c = *cur;
426
0
    if (c & 0x80) {
427
0
        size_t avail;
428
429
0
        if ((c & 0x40) == 0)
430
0
            goto encoding_error;
431
432
0
        avail = ctxt->input->end - ctxt->input->cur;
433
434
0
        if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
435
0
            goto encoding_error;
436
0
        if ((c & 0xe0) == 0xe0) {
437
0
            if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
438
0
                goto encoding_error;
439
0
            if ((c & 0xf0) == 0xf0) {
440
0
                if (((c & 0xf8) != 0xf0) ||
441
0
                    (avail < 4) || ((cur[3] & 0xc0) != 0x80))
442
0
                    goto encoding_error;
443
                /* 4-byte code */
444
0
                *len = 4;
445
0
                val = (cur[0] & 0x7) << 18;
446
0
                val |= (cur[1] & 0x3f) << 12;
447
0
                val |= (cur[2] & 0x3f) << 6;
448
0
                val |= cur[3] & 0x3f;
449
0
                if (val < 0x10000)
450
0
                    goto encoding_error;
451
0
            } else {
452
              /* 3-byte code */
453
0
                *len = 3;
454
0
                val = (cur[0] & 0xf) << 12;
455
0
                val |= (cur[1] & 0x3f) << 6;
456
0
                val |= cur[2] & 0x3f;
457
0
                if (val < 0x800)
458
0
                    goto encoding_error;
459
0
            }
460
0
        } else {
461
          /* 2-byte code */
462
0
            *len = 2;
463
0
            val = (cur[0] & 0x1f) << 6;
464
0
            val |= cur[1] & 0x3f;
465
0
            if (val < 0x80)
466
0
                goto encoding_error;
467
0
        }
468
0
        if (!IS_CHAR(val)) {
469
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
470
0
                            "Char 0x%X out of allowed range\n", val);
471
0
        }
472
0
        return(val);
473
0
    } else {
474
0
        if (*ctxt->input->cur == 0) {
475
0
            if (ctxt->input->cur < ctxt->input->end) {
476
0
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
477
0
                                "Char 0x%X out of allowed range\n", 0);
478
0
                *len = 1;
479
0
                return(' ');
480
0
            } else {
481
0
                *len = 0;
482
0
                return(0);
483
0
            }
484
0
        }
485
        /* 1-byte code */
486
0
        *len = 1;
487
0
        return(*ctxt->input->cur);
488
0
    }
489
490
0
encoding_error:
491
0
    xmlCtxtErrIO(ctxt, XML_ERR_INVALID_ENCODING, NULL);
492
493
0
    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
494
0
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
495
0
    *len = 1;
496
0
    return(*ctxt->input->cur);
497
0
}
498
499
/**
500
 * htmlSkipBlankChars:
501
 * @ctxt:  the HTML parser context
502
 *
503
 * skip all blanks character found at that point in the input streams.
504
 *
505
 * Returns the number of space chars skipped
506
 */
507
508
static int
509
0
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
510
0
    int res = 0;
511
512
0
    while (IS_BLANK_CH(*(ctxt->input->cur))) {
513
0
        if (*(ctxt->input->cur) == '\n') {
514
0
            ctxt->input->line++; ctxt->input->col = 1;
515
0
        } else ctxt->input->col++;
516
0
        ctxt->input->cur++;
517
0
        if (*ctxt->input->cur == 0)
518
0
            xmlParserGrow(ctxt);
519
0
  if (res < INT_MAX)
520
0
      res++;
521
0
    }
522
0
    return(res);
523
0
}
524
525
526
527
/************************************************************************
528
 *                  *
529
 *  The list of HTML elements and their properties    *
530
 *                  *
531
 ************************************************************************/
532
533
/*
534
 *  Start Tag: 1 means the start tag can be omitted
535
 *  End Tag:   1 means the end tag can be omitted
536
 *             2 means it's forbidden (empty elements)
537
 *             3 means the tag is stylistic and should be closed easily
538
 *  Depr:      this element is deprecated
539
 *  DTD:       1 means that this element is valid only in the Loose DTD
540
 *             2 means that this element is valid only in the Frameset DTD
541
 *
542
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
543
  , subElements , impliedsubelt , Attributes, userdata
544
 */
545
546
/* Definitions and a couple of vars for HTML Elements */

/*
 * Each group macro expands to a comma separated list of element names
 * and has an NB_<group> companion holding the number of names. Computed
 * counts are parenthesized so they stay correct inside larger
 * expressions.
 */
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
/* PCDATA and MODIFIER expand to nothing and count as zero names */
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


/* NULL terminated name lists built from the groups above */
static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata
579
580
581
/* ... and for HTML Attributes */

/*
 * Each group macro expands to a comma separated list of attribute names
 * and has an NB_<group> companion holding the number of names.
 *
 * NOTE(review): the HTML 4.01 %events set also lists "onmousemove",
 * which is absent from EVENTS; confirm whether that is intentional.
 */
#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

/* NULL terminated name lists built from the groups above */
static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;
600
601
602
/* Other declarations that should go inline ... */
603
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
604
  "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
605
  "tabindex", "onfocus", "onblur", NULL } ;
606
static const char* const target_attr[] = { "target", NULL } ;
607
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
608
static const char* const alt_attr[] = { "alt", NULL } ;
609
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
610
static const char* const href_attrs[] = { "href", NULL } ;
611
static const char* const clear_attrs[] = { "clear", NULL } ;
612
static const char* const inline_p[] = { INLINE, "p", NULL } ;
613
614
static const char* const flow_param[] = { FLOW, "param", NULL } ;
615
static const char* const applet_attrs[] = { COREATTRS , "codebase",
616
    "archive", "alt", "name", "height", "width", "align",
617
    "hspace", "vspace", NULL } ;
618
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
619
  "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
620
static const char* const basefont_attrs[] =
621
  { "id", "size", "color", "face", NULL } ;
622
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
623
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
624
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
625
static const char* const body_depr[] = { "background", "bgcolor", "text",
626
  "link", "vlink", "alink", NULL } ;
627
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
628
  "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
629
630
631
static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
632
static const char* const col_elt[] = { "col", NULL } ;
633
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
634
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
635
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
636
static const char* const compact_attr[] = { "compact", NULL } ;
637
static const char* const label_attr[] = { "label", NULL } ;
638
/* NULL terminated like every other name list in this file, so that code
 * walking the list until NULL does not read past the end of the array. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
639
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
640
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
641
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
642
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
643
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
644
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
645
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
646
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
647
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
648
static const char* const version_attr[] = { "version", NULL } ;
649
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
650
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
651
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
652
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
653
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
654
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
655
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
656
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
657
static const char* const align_attr[] = { "align", NULL } ;
658
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
659
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
660
static const char* const name_attr[] = { "name", NULL } ;
661
static const char* const action_attr[] = { "action", NULL } ;
662
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
663
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
664
static const char* const content_attr[] = { "content", NULL } ;
665
static const char* const type_attr[] = { "type", NULL } ;
666
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
667
static const char* const object_contents[] = { FLOW, "param", NULL } ;
668
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
669
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
670
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
671
static const char* const option_elt[] = { "option", NULL } ;
672
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
673
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
674
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
675
static const char* const width_attr[] = { "width", NULL } ;
676
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
677
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
678
static const char* const language_attr[] = { "language", NULL } ;
679
static const char* const select_content[] = { "optgroup", "option", NULL } ;
680
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
681
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
682
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
683
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
684
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
685
static const char* const tr_elt[] = { "tr", NULL } ;
686
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
687
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
688
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
689
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
690
static const char* const tr_contents[] = { "th", "td", NULL } ;
691
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
692
static const char* const li_elt[] = { "li", NULL } ;
693
static const char* const ul_depr[] = { "type", "compact", NULL} ;
694
static const char* const dir_attr[] = { "dir", NULL} ;
695
696
#define DECL (const char**)
697
698
static const htmlElemDesc
699
html40ElementTable[] = {
700
{ "a",    0, 0, 0, 0, 0, 0, 1, "anchor ",
701
  DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
702
},
703
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
704
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
705
},
706
{ "acronym",  0, 0, 0, 0, 0, 0, 1, "",
707
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
708
},
709
{ "address",  0, 0, 0, 0, 0, 0, 0, "information on author ",
710
  DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
711
},
712
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
713
  DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
714
},
715
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
716
  EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
717
},
718
{ "b",    0, 3, 0, 0, 0, 0, 1, "bold text style",
719
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
720
},
721
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
722
  EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
723
},
724
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
725
  EMPTY , NULL , NULL, DECL basefont_attrs, NULL
726
},
727
{ "bdo",  0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
728
  DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
729
},
730
{ "big",  0, 3, 0, 0, 0, 0, 1, "large text style",
731
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
732
},
733
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
734
  DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
735
},
736
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
737
  DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
738
},
739
{ "br",   0, 2, 2, 1, 0, 0, 1, "forced line break ",
740
  EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
741
},
742
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
743
  DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
744
},
745
{ "caption",  0, 0, 0, 0, 0, 0, 0, "table caption ",
746
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
747
},
748
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
749
  DECL html_flow , NULL , NULL, DECL html_attrs, NULL
750
},
751
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
752
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
753
},
754
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
755
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
756
},
757
{ "col",  0, 2, 2, 1, 0, 0, 0, "table column ",
758
  EMPTY , NULL , DECL col_attrs , NULL, NULL
759
},
760
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
761
  DECL col_elt , "col" , DECL col_attrs , NULL, NULL
762
},
763
{ "dd",   0, 1, 0, 0, 0, 0, 0, "definition description ",
764
  DECL html_flow , NULL , DECL html_attrs, NULL, NULL
765
},
766
{ "del",  0, 0, 0, 0, 0, 0, 2, "deleted text ",
767
  DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
768
},
769
{ "dfn",  0, 0, 0, 0, 0, 0, 1, "instance definition",
770
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
771
},
772
{ "dir",  0, 0, 0, 0, 1, 1, 0, "directory list",
773
  DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
774
},
775
{ "div",  0, 0, 0, 0, 0, 0, 0, "generic language/style container",
776
  DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
777
},
778
{ "dl",   0, 0, 0, 0, 0, 0, 0, "definition list ",
779
  DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
780
},
781
{ "dt",   0, 1, 0, 0, 0, 0, 0, "definition term ",
782
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
783
},
784
{ "em",   0, 3, 0, 0, 0, 0, 1, "emphasis",
785
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
786
},
787
{ "embed",  0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
788
  EMPTY, NULL, DECL embed_attrs, NULL, NULL
789
},
790
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
791
  DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
792
},
793
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
794
  DECL html_inline, NULL, NULL, DECL font_attrs, NULL
795
},
796
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
797
  DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
798
},
799
{ "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
800
  EMPTY, NULL, NULL, DECL frame_attrs, NULL
801
},
802
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
803
  DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
804
},
805
{ "h1",   0, 0, 0, 0, 0, 0, 0, "heading ",
806
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
807
},
808
{ "h2",   0, 0, 0, 0, 0, 0, 0, "heading ",
809
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
810
},
811
{ "h3",   0, 0, 0, 0, 0, 0, 0, "heading ",
812
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
813
},
814
{ "h4",   0, 0, 0, 0, 0, 0, 0, "heading ",
815
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
816
},
817
{ "h5",   0, 0, 0, 0, 0, 0, 0, "heading ",
818
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
819
},
820
{ "h6",   0, 0, 0, 0, 0, 0, 0, "heading ",
821
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
822
},
823
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
824
  DECL head_contents, NULL, DECL head_attrs, NULL, NULL
825
},
826
{ "hr",   0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
827
  EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
828
},
829
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
830
  DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
831
},
832
{ "i",    0, 3, 0, 0, 0, 0, 1, "italic text style",
833
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
834
},
835
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
836
  DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
837
},
838
{ "img",  0, 2, 2, 1, 0, 0, 1, "embedded image ",
839
  EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
840
},
841
{ "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
842
  EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
843
},
844
{ "ins",  0, 0, 0, 0, 0, 0, 2, "inserted text",
845
  DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
846
},
847
{ "isindex",  0, 2, 2, 1, 1, 1, 0, "single line prompt ",
848
  EMPTY, NULL, NULL, DECL prompt_attrs, NULL
849
},
850
{ "kbd",  0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
851
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
852
},
853
{ "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
854
  DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
855
},
856
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
857
  DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
858
},
859
{ "li",   0, 1, 1, 0, 0, 0, 0, "list item ",
860
  DECL html_flow, NULL, DECL html_attrs, NULL, NULL
861
},
862
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
863
  EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
864
},
865
{ "map",  0, 0, 0, 0, 0, 0, 2, "client-side image map ",
866
  DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
867
},
868
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
869
  DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
870
},
871
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
872
  EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
873
},
874
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
875
  DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
876
},
877
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
878
  DECL html_flow, "div", DECL html_attrs, NULL, NULL
879
},
880
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
881
  DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
882
},
883
{ "ol",   0, 0, 0, 0, 0, 0, 0, "ordered list ",
884
  DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
885
},
886
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
887
  DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
888
},
889
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
890
  DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
891
},
892
{ "p",    0, 1, 0, 0, 0, 0, 0, "paragraph ",
893
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894
},
895
{ "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
896
  EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
897
},
898
{ "pre",  0, 0, 0, 0, 0, 0, 0, "preformatted text ",
899
  DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
900
},
901
{ "q",    0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
902
  DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
903
},
904
{ "s",    0, 3, 0, 0, 1, 1, 1, "strike-through text style",
905
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
906
},
907
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
908
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
909
},
910
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
911
  DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
912
},
913
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
914
  DECL select_content, NULL, DECL select_attrs, NULL, NULL
915
},
916
{ "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
917
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
918
},
919
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
920
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
921
},
922
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
923
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
924
},
925
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
926
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
927
},
928
{ "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
929
  DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
930
},
931
{ "sub",  0, 3, 0, 0, 0, 0, 1, "subscript",
932
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
933
},
934
{ "sup",  0, 3, 0, 0, 0, 0, 1, "superscript ",
935
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
936
},
937
{ "table",  0, 0, 0, 0, 0, 0, 0, "",
938
  DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
939
},
940
{ "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
941
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
942
},
943
{ "td",   0, 0, 0, 0, 0, 0, 0, "table data cell",
944
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
945
},
946
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
947
  DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
948
},
949
{ "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
950
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
951
},
952
{ "th",   0, 1, 0, 0, 0, 0, 0, "table header cell",
953
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
954
},
955
{ "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
956
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
957
},
958
{ "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
959
  DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
960
},
961
{ "tr",   0, 0, 0, 0, 0, 0, 0, "table row ",
962
  DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
963
},
964
{ "tt",   0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
965
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
966
},
967
{ "u",    0, 3, 0, 0, 1, 1, 1, "underlined text style",
968
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
969
},
970
{ "ul",   0, 0, 0, 0, 0, 0, 0, "unordered list ",
971
  DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
972
},
973
{ "var",  0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
974
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975
}
976
};
977
978
/*
 * One auto-close rule: a start tag named newTag implies the end of a
 * currently open element named oldTag.
 */
typedef struct {
    const char *oldTag;     /* name of the element being closed */
    const char *newTag;     /* name of the start tag that closes it */
} htmlStartCloseEntry;

/*
 * start tags that imply the end of current element
 *
 * The table is searched with bsearch(), so it MUST stay sorted in
 * strcmp() order on oldTag first and newTag second. Rows are grouped
 * below by oldTag.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    /* big */
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd */
    { "dd", "dt" },
    /* dir */
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    /* dl */
    { "dl", "form" }, { "dl", "li" },
    /* dt */
    { "dt", "dd" }, { "dt", "dl" },
    /* font */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    /* form */
    { "form", "form" },
    /* h1 .. h6 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr */
    { "hr", "form" },
    /* i */
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend */
    { "legend", "fieldset" },
    /* li */
    { "li", "li" },
    /* link */
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    /* ol */
    { "ol", "form" },
    /* option */
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s */
    { "s", "p" },
    /* script */
    { "script", "noscript" },
    /* small */
    { "small", "p" },
    /* span */
    { "span", "td" }, { "span", "th" },
    /* strike */
    { "strike", "p" },
    /* style */
    { "style", "body" }, { "style", "frameset" },
    /* tbody */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    /* td */
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    /* tfoot */
    { "tfoot", "tbody" },
    /* th */
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    /* thead */
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title */
    { "title", "body" }, { "title", "frameset" },
    /* tr */
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt */
    { "tt", "p" },
    /* u */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    /* ul */
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1237
1238
/*
 * NULL-terminated list of HTML elements which are supposed not to have
 * CDATA content and where a p element will be implied.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1250
1251
/*
 * The list of HTML attributes which are of content %Script;
 * The array has no NULL terminator; users derive its length with sizeof.
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover",
    "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload", "onfocus", "onblur", "onsubmit", "onreset",
    "onchange", "onselect"
};
1276
1277
/*
 * This table is used by the htmlparser to know what to do with
 * broken html pages. By assigning different priorities to different
 * elements the parser can decide how to handle extra endtags.
 * Endtags are only allowed to close elements with lower or equal
 * priority.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating row */
    int priority;       /* end-tag priority of that element */
} elementPriority;

/*
 * The last row has a NULL name: it terminates the list and carries the
 * priority used for every element that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
1304
1305
/************************************************************************
1306
 *                  *
1307
 *  functions to handle HTML specific data      *
1308
 *                  *
1309
 ************************************************************************/
1310
1311
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 *
 * The body is empty; calling this function has no effect.
 */
void
htmlInitAutoClose(void)
{
    /* nothing to do */
}
1319
1320
static int
1321
0
htmlCompareTags(const void *key, const void *member) {
1322
0
    const xmlChar *tag = (const xmlChar *) key;
1323
0
    const htmlElemDesc *desc = (const htmlElemDesc *) member;
1324
1325
0
    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1326
0
}
1327
1328
/**
1329
 * htmlTagLookup:
1330
 * @tag:  The tag name in lowercase
1331
 *
1332
 * Lookup the HTML tag in the ElementTable
1333
 *
1334
 * Returns the related htmlElemDescPtr or NULL if not found.
1335
 */
1336
const htmlElemDesc *
1337
0
htmlTagLookup(const xmlChar *tag) {
1338
0
    if (tag == NULL)
1339
0
        return(NULL);
1340
1341
0
    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1342
0
                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1343
0
                sizeof(htmlElemDesc), htmlCompareTags));
1344
0
}
1345
1346
/**
1347
 * htmlGetEndPriority:
1348
 * @name: The name of the element to look up the priority for.
1349
 *
1350
 * Return value: The "endtag" priority.
1351
 **/
1352
static int
1353
0
htmlGetEndPriority (const xmlChar *name) {
1354
0
    int i = 0;
1355
1356
0
    while ((htmlEndPriority[i].name != NULL) &&
1357
0
     (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1358
0
  i++;
1359
1360
0
    return(htmlEndPriority[i].priority);
1361
0
}
1362
1363
1364
static int
1365
0
htmlCompareStartClose(const void *vkey, const void *member) {
1366
0
    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1367
0
    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1368
0
    int ret;
1369
1370
0
    ret = strcmp(key->oldTag, entry->oldTag);
1371
0
    if (ret == 0)
1372
0
        ret = strcmp(key->newTag, entry->newTag);
1373
1374
0
    return(ret);
1375
0
}
1376
1377
/**
1378
 * htmlCheckAutoClose:
1379
 * @newtag:  The new tag name
1380
 * @oldtag:  The old tag name
1381
 *
1382
 * Checks whether the new tag is one of the registered valid tags for
1383
 * closing old.
1384
 *
1385
 * Returns 0 if no, 1 if yes.
1386
 */
1387
static int
1388
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1389
0
{
1390
0
    htmlStartCloseEntry key;
1391
0
    void *res;
1392
1393
0
    key.oldTag = (const char *) oldtag;
1394
0
    key.newTag = (const char *) newtag;
1395
0
    res = bsearch(&key, htmlStartClose,
1396
0
            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1397
0
            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1398
0
    return(res != NULL);
1399
0
}
1400
1401
/**
1402
 * htmlAutoCloseOnClose:
1403
 * @ctxt:  an HTML parser context
1404
 * @newtag:  The new tag name
1405
 * @force:  force the tag closure
1406
 *
1407
 * The HTML DTD allows an ending tag to implicitly close other tags.
1408
 */
1409
static void
1410
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1411
0
{
1412
0
    const htmlElemDesc *info;
1413
0
    int i, priority;
1414
1415
0
    priority = htmlGetEndPriority(newtag);
1416
1417
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1418
1419
0
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1420
0
            break;
1421
        /*
1422
         * A misplaced endtag can only close elements with lower
1423
         * or equal priority, so if we find an element with higher
1424
         * priority before we find an element with
1425
         * matching name, we just ignore this endtag
1426
         */
1427
0
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1428
0
            return;
1429
0
    }
1430
0
    if (i < 0)
1431
0
        return;
1432
1433
0
    while (!xmlStrEqual(newtag, ctxt->name)) {
1434
0
        info = htmlTagLookup(ctxt->name);
1435
0
        if ((info != NULL) && (info->endTag == 3)) {
1436
0
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1437
0
                   "Opening and ending tag mismatch: %s and %s\n",
1438
0
       newtag, ctxt->name);
1439
0
        }
1440
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1441
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1442
0
  htmlnamePop(ctxt);
1443
0
    }
1444
0
}
1445
1446
/**
1447
 * htmlAutoCloseOnEnd:
1448
 * @ctxt:  an HTML parser context
1449
 *
1450
 * Close all remaining tags at the end of the stream
1451
 */
1452
static void
1453
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1454
0
{
1455
0
    int i;
1456
1457
0
    if (ctxt->nameNr == 0)
1458
0
        return;
1459
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1460
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1461
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1462
0
  htmlnamePop(ctxt);
1463
0
    }
1464
0
}
1465
1466
/**
1467
 * htmlAutoClose:
1468
 * @ctxt:  an HTML parser context
1469
 * @newtag:  The new tag name or NULL
1470
 *
1471
 * The HTML DTD allows a tag to implicitly close other tags.
1472
 * The list is kept in htmlStartClose array. This function is
1473
 * called when a new tag has been detected and generates the
1474
 * appropriates closes if possible/needed.
1475
 * If newtag is NULL this mean we are at the end of the resource
1476
 * and we should check
1477
 */
1478
static void
1479
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1480
0
{
1481
0
    if (newtag == NULL)
1482
0
        return;
1483
1484
0
    while ((ctxt->name != NULL) &&
1485
0
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1486
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1487
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1488
0
  htmlnamePop(ctxt);
1489
0
    }
1490
0
}
1491
1492
/**
1493
 * htmlAutoCloseTag:
1494
 * @doc:  the HTML document
1495
 * @name:  The tag name
1496
 * @elem:  the HTML element
1497
 *
1498
 * The HTML DTD allows a tag to implicitly close other tags.
1499
 * The list is kept in htmlStartClose array. This function checks
1500
 * if the element or one of it's children would autoclose the
1501
 * given tag.
1502
 *
1503
 * Returns 1 if autoclose, 0 otherwise
1504
 */
1505
int
1506
0
htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1507
0
    htmlNodePtr child;
1508
1509
0
    if (elem == NULL) return(1);
1510
0
    if (xmlStrEqual(name, elem->name)) return(0);
1511
0
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1512
0
    child = elem->children;
1513
0
    while (child != NULL) {
1514
0
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1515
0
  child = child->next;
1516
0
    }
1517
0
    return(0);
1518
0
}
1519
1520
/**
1521
 * htmlIsAutoClosed:
1522
 * @doc:  the HTML document
1523
 * @elem:  the HTML element
1524
 *
1525
 * The HTML DTD allows a tag to implicitly close other tags.
1526
 * The list is kept in htmlStartClose array. This function checks
1527
 * if a tag is autoclosed by one of it's child
1528
 *
1529
 * Returns 1 if autoclosed, 0 otherwise
1530
 */
1531
int
1532
0
htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1533
0
    htmlNodePtr child;
1534
1535
0
    if (elem == NULL) return(1);
1536
0
    child = elem->children;
1537
0
    while (child != NULL) {
1538
0
  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1539
0
  child = child->next;
1540
0
    }
1541
0
    return(0);
1542
0
}
1543
1544
/**
1545
 * htmlCheckImplied:
1546
 * @ctxt:  an HTML parser context
1547
 * @newtag:  The new tag name
1548
 *
1549
 * The HTML DTD allows a tag to exists only implicitly
1550
 * called when a new tag has been detected and generates the
1551
 * appropriates implicit tags if missing
1552
 */
1553
static void
1554
0
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1555
0
    int i;
1556
1557
0
    if (ctxt->options & HTML_PARSE_NOIMPLIED)
1558
0
        return;
1559
0
    if (!htmlOmittedDefaultValue)
1560
0
  return;
1561
0
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1562
0
  return;
1563
0
    if (ctxt->nameNr <= 0) {
1564
0
  htmlnamePush(ctxt, BAD_CAST"html");
1565
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1567
0
    }
1568
0
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1569
0
        return;
1570
0
    if ((ctxt->nameNr <= 1) &&
1571
0
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1572
0
   (xmlStrEqual(newtag, BAD_CAST"style")) ||
1573
0
   (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1574
0
   (xmlStrEqual(newtag, BAD_CAST"link")) ||
1575
0
   (xmlStrEqual(newtag, BAD_CAST"title")) ||
1576
0
   (xmlStrEqual(newtag, BAD_CAST"base")))) {
1577
0
        if (ctxt->html >= 3) {
1578
            /* we already saw or generated an <head> before */
1579
0
            return;
1580
0
        }
1581
        /*
1582
         * dropped OBJECT ... i you put it first BODY will be
1583
         * assumed !
1584
         */
1585
0
        htmlnamePush(ctxt, BAD_CAST"head");
1586
0
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1587
0
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1588
0
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1589
0
         (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1590
0
         (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1591
0
        if (ctxt->html >= 10) {
1592
            /* we already saw or generated a <body> before */
1593
0
            return;
1594
0
        }
1595
0
  for (i = 0;i < ctxt->nameNr;i++) {
1596
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1597
0
    return;
1598
0
      }
1599
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1600
0
    return;
1601
0
      }
1602
0
  }
1603
1604
0
  htmlnamePush(ctxt, BAD_CAST"body");
1605
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1606
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1607
0
    }
1608
0
}
1609
1610
/**
1611
 * htmlCheckParagraph
1612
 * @ctxt:  an HTML parser context
1613
 *
1614
 * Check whether a p element need to be implied before inserting
1615
 * characters in the current element.
1616
 *
1617
 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1618
 *         in case of error.
1619
 */
1620
1621
static int
1622
0
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1623
0
    const xmlChar *tag;
1624
0
    int i;
1625
1626
0
    if (ctxt == NULL)
1627
0
  return(-1);
1628
0
    tag = ctxt->name;
1629
0
    if (tag == NULL) {
1630
0
  htmlAutoClose(ctxt, BAD_CAST"p");
1631
0
  htmlCheckImplied(ctxt, BAD_CAST"p");
1632
0
  htmlnamePush(ctxt, BAD_CAST"p");
1633
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1634
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1635
0
  return(1);
1636
0
    }
1637
0
    if (!htmlOmittedDefaultValue)
1638
0
  return(0);
1639
0
    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1640
0
  if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1641
0
      htmlAutoClose(ctxt, BAD_CAST"p");
1642
0
      htmlCheckImplied(ctxt, BAD_CAST"p");
1643
0
      htmlnamePush(ctxt, BAD_CAST"p");
1644
0
      if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1645
0
    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1646
0
      return(1);
1647
0
  }
1648
0
    }
1649
0
    return(0);
1650
0
}
1651
1652
/**
1653
 * htmlIsScriptAttribute:
1654
 * @name:  an attribute name
1655
 *
1656
 * Check if an attribute is of content type Script
1657
 *
1658
 * Returns 1 is the attribute is a script 0 otherwise
1659
 */
1660
int
1661
0
htmlIsScriptAttribute(const xmlChar *name) {
1662
0
    unsigned int i;
1663
1664
0
    if (name == NULL)
1665
0
      return(0);
1666
    /*
1667
     * all script attributes start with 'on'
1668
     */
1669
0
    if ((name[0] != 'o') || (name[1] != 'n'))
1670
0
      return(0);
1671
0
    for (i = 0;
1672
0
   i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1673
0
   i++) {
1674
0
  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1675
0
      return(1);
1676
0
    }
1677
0
    return(0);
1678
0
}
1679
1680
/************************************************************************
1681
 *                  *
1682
 *  The list of HTML predefined entities      *
1683
 *                  *
1684
 ************************************************************************/
1685
1686
1687
static const htmlEntityDesc  html40EntitiesTable[] = {
1688
/*
1689
 * the 4 absolute ones, plus apostrophe.
1690
 */
1691
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1692
{ 38, "amp",  "ampersand, U+0026 ISOnum" },
1693
{ 39, "apos", "single quote" },
1694
{ 60, "lt", "less-than sign, U+003C ISOnum" },
1695
{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1696
1697
/*
1698
 * A bunch still in the 128-255 range
1699
 * Replacing them really depends on the charset used.
1700
 */
1701
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1702
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1703
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
1704
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
1705
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
1706
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1707
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1708
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
1709
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1710
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1711
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1712
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1713
{ 172,  "not",  "not sign, U+00AC ISOnum" },
1714
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1715
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1716
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1717
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1718
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1719
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1720
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1721
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1722
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
1723
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1724
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1725
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1726
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1727
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1728
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1729
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1730
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1731
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1732
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1733
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1734
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1735
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1736
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1737
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1738
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1739
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1740
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1741
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1742
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1743
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1744
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1745
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1746
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1747
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1748
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1749
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1750
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1751
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1752
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1753
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1754
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1755
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1756
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
1757
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1758
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1759
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1760
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1761
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1762
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1763
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1764
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1765
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1766
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1767
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1768
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1769
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1770
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1771
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1772
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1773
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1774
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1775
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1776
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1777
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1778
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1779
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1780
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1781
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1782
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1783
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1784
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1785
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1786
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1787
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1788
{ 247,  "divide","division sign, U+00F7 ISOnum" },
1789
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1790
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1791
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1792
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1793
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1794
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1795
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1796
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1797
1798
{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1799
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1800
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1801
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1802
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1803
1804
/*
1805
 * Anything below should really be kept as entity references
1806
 */
1807
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1808
1809
{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1810
{ 732,  "tilde","small tilde, U+02DC ISOdia" },
1811
1812
{ 913,  "Alpha","greek capital letter alpha, U+0391" },
1813
{ 914,  "Beta", "greek capital letter beta, U+0392" },
1814
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1815
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1816
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1817
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
1818
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
1819
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1820
{ 921,  "Iota", "greek capital letter iota, U+0399" },
1821
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
1822
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1823
{ 924,  "Mu", "greek capital letter mu, U+039C" },
1824
{ 925,  "Nu", "greek capital letter nu, U+039D" },
1825
{ 926,  "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1826
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
1827
{ 928,  "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1828
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
1829
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1830
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
1831
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1832
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1833
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
1834
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1835
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1836
1837
{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1838
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1839
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1840
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1841
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1842
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1843
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1844
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1845
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1846
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1847
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1848
{ 956,  "mu", "greek small letter mu, U+03BC ISOgrk3" },
1849
{ 957,  "nu", "greek small letter nu, U+03BD ISOgrk3" },
1850
{ 958,  "xi", "greek small letter xi, U+03BE ISOgrk3" },
1851
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1852
{ 960,  "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1853
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1854
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1855
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1856
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1857
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1858
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1859
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1860
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1861
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1862
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1863
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1864
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1865
1866
{ 8194, "ensp", "en space, U+2002 ISOpub" },
1867
{ 8195, "emsp", "em space, U+2003 ISOpub" },
1868
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1869
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1870
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1871
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1872
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1873
{ 8211, "ndash","en dash, U+2013 ISOpub" },
1874
{ 8212, "mdash","em dash, U+2014 ISOpub" },
1875
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1876
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1877
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1878
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1879
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1880
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1881
{ 8224, "dagger","dagger, U+2020 ISOpub" },
1882
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1883
1884
{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1885
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1886
1887
{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1888
1889
{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1890
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1891
1892
{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1893
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1894
1895
{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1896
{ 8260, "frasl","fraction slash, U+2044 NEW" },
1897
1898
{ 8364, "euro", "euro sign, U+20AC NEW" },
1899
1900
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1901
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1902
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1903
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1904
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1905
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1906
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1907
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1908
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1909
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1910
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1911
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1912
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1913
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1914
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1915
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1916
1917
{ 8704, "forall","for all, U+2200 ISOtech" },
1918
{ 8706, "part", "partial differential, U+2202 ISOtech" },
1919
{ 8707, "exist","there exists, U+2203 ISOtech" },
1920
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1921
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1922
{ 8712, "isin", "element of, U+2208 ISOtech" },
1923
{ 8713, "notin","not an element of, U+2209 ISOtech" },
1924
{ 8715, "ni", "contains as member, U+220B ISOtech" },
1925
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1926
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
1927
{ 8722, "minus","minus sign, U+2212 ISOtech" },
1928
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1929
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1930
{ 8733, "prop", "proportional to, U+221D ISOtech" },
1931
{ 8734, "infin","infinity, U+221E ISOtech" },
1932
{ 8736, "ang",  "angle, U+2220 ISOamso" },
1933
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
1934
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1935
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
1936
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
1937
{ 8747, "int",  "integral, U+222B ISOtech" },
1938
{ 8756, "there4","therefore, U+2234 ISOtech" },
1939
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
1940
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1941
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1942
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1943
{ 8801, "equiv","identical to, U+2261 ISOtech" },
1944
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1945
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
1946
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
1947
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
1948
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
1949
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
1950
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
1951
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
1952
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
1953
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
1954
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
1955
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
1956
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
1957
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
1958
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
1959
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
1960
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
1961
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },
1962
1963
{ 9824, "spades","black spade suit, U+2660 ISOpub" },
1964
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
1965
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
1966
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
1967
1968
};
1969
1970
/************************************************************************
1971
 *                  *
1972
 *    Commodity functions to handle entities      *
1973
 *                  *
1974
 ************************************************************************/
1975
1976
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to the new size.
 * If the reallocation fails, a memory error is reported on the
 * enclosing function's `ctxt`, @buffer is freed and the enclosing
 * function returns NULL. The macro therefore can only be used inside
 * functions that have `ctxt` in scope and return a pointer.
 */
#define growBuffer(buffer) {                                          \
    xmlChar *grown;                                                   \
    buffer##_size *= 2;                                               \
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size);            \
    if (grown != NULL) {                                              \
        buffer = grown;                                               \
    } else {                                                          \
        htmlErrMemory(ctxt);                                          \
        xmlFree(buffer);                                              \
        return(NULL);                                                 \
    }                                                                 \
}
1990
1991
/**
1992
 * htmlEntityLookup:
1993
 * @name: the entity name
1994
 *
1995
 * Lookup the given entity in EntitiesTable
1996
 *
1997
 * TODO: the linear scan is really ugly, an hash table is really needed.
1998
 *
1999
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2000
 */
2001
const htmlEntityDesc *
2002
0
htmlEntityLookup(const xmlChar *name) {
2003
0
    unsigned int i;
2004
2005
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2006
0
                    sizeof(html40EntitiesTable[0]));i++) {
2007
0
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2008
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2009
0
  }
2010
0
    }
2011
0
    return(NULL);
2012
0
}
2013
2014
static int
2015
0
htmlCompareEntityDesc(const void *vkey, const void *vdesc) {
2016
0
    const unsigned *key = vkey;
2017
0
    const htmlEntityDesc *desc = vdesc;
2018
2019
0
    return((int) *key - (int) desc->value);
2020
0
}
2021
2022
/**
2023
 * htmlEntityValueLookup:
2024
 * @value: the entity's unicode value
2025
 *
2026
 * Lookup the given entity in EntitiesTable
2027
 *
2028
 * TODO: the linear scan is really ugly, an hash table is really needed.
2029
 *
2030
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2031
 */
2032
const htmlEntityDesc *
2033
0
htmlEntityValueLookup(unsigned int value) {
2034
0
    const htmlEntityDesc *desc;
2035
0
    size_t nmemb;
2036
2037
0
    nmemb = sizeof(html40EntitiesTable) / sizeof(html40EntitiesTable[0]);
2038
0
    desc = bsearch(&value, html40EntitiesTable, nmemb, sizeof(htmlEntityDesc),
2039
0
                   htmlCompareEntityDesc);
2040
2041
0
    return(desc);
2042
0
}
2043
2044
/**
2045
 * UTF8ToHtml:
2046
 * @out:  a pointer to an array of bytes to store the result
2047
 * @outlen:  the length of @out
2048
 * @in:  a pointer to an array of UTF-8 chars
2049
 * @inlen:  the length of @in
2050
 *
2051
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2052
 * plus HTML entities block of chars out.
2053
 *
2054
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2055
 * The value of @inlen after return is the number of octets consumed
2056
 *     as the return value is positive, else unpredictable.
2057
 * The value of @outlen after return is the number of octets consumed.
2058
 */
2059
int
2060
UTF8ToHtml(unsigned char* out, int *outlen,
2061
0
              const unsigned char* in, int *inlen) {
2062
0
    const unsigned char* processed = in;
2063
0
    const unsigned char* outend;
2064
0
    const unsigned char* outstart = out;
2065
0
    const unsigned char* instart = in;
2066
0
    const unsigned char* inend;
2067
0
    unsigned int c, d;
2068
0
    int trailing;
2069
2070
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2071
0
    if (in == NULL) {
2072
        /*
2073
   * initialization nothing to do
2074
   */
2075
0
  *outlen = 0;
2076
0
  *inlen = 0;
2077
0
  return(0);
2078
0
    }
2079
0
    inend = in + (*inlen);
2080
0
    outend = out + (*outlen);
2081
0
    while (in < inend) {
2082
0
  d = *in++;
2083
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2084
0
  else if (d < 0xC0) {
2085
      /* trailing byte in leading position */
2086
0
      *outlen = out - outstart;
2087
0
      *inlen = processed - instart;
2088
0
      return(-2);
2089
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2090
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2091
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2092
0
  else {
2093
      /* no chance for this in Ascii */
2094
0
      *outlen = out - outstart;
2095
0
      *inlen = processed - instart;
2096
0
      return(-2);
2097
0
  }
2098
2099
0
  if (inend - in < trailing) {
2100
0
      break;
2101
0
  }
2102
2103
0
  for ( ; trailing; trailing--) {
2104
0
      if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2105
0
    break;
2106
0
      c <<= 6;
2107
0
      c |= d & 0x3F;
2108
0
  }
2109
2110
  /* assertion: c is a single UTF-4 value */
2111
0
  if (c < 0x80) {
2112
0
      if (out + 1 >= outend)
2113
0
    break;
2114
0
      *out++ = c;
2115
0
  } else {
2116
0
      int len;
2117
0
      const htmlEntityDesc * ent;
2118
0
      const char *cp;
2119
0
      char nbuf[16];
2120
2121
      /*
2122
       * Try to lookup a predefined HTML entity for it
2123
       */
2124
2125
0
      ent = htmlEntityValueLookup(c);
2126
0
      if (ent == NULL) {
2127
0
        snprintf(nbuf, sizeof(nbuf), "#%u", c);
2128
0
        cp = nbuf;
2129
0
      }
2130
0
      else
2131
0
        cp = ent->name;
2132
0
      len = strlen(cp);
2133
0
      if (out + 2 + len >= outend)
2134
0
    break;
2135
0
      *out++ = '&';
2136
0
      memcpy(out, cp, len);
2137
0
      out += len;
2138
0
      *out++ = ';';
2139
0
  }
2140
0
  processed = in;
2141
0
    }
2142
0
    *outlen = out - outstart;
2143
0
    *inlen = processed - instart;
2144
0
    return(0);
2145
0
}
2146
2147
/**
2148
 * htmlEncodeEntities:
2149
 * @out:  a pointer to an array of bytes to store the result
2150
 * @outlen:  the length of @out
2151
 * @in:  a pointer to an array of UTF-8 chars
2152
 * @inlen:  the length of @in
2153
 * @quoteChar: the quote character to escape (' or ") or zero.
2154
 *
2155
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2156
 * plus HTML entities block of chars out.
2157
 *
2158
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2159
 * The value of @inlen after return is the number of octets consumed
2160
 *     as the return value is positive, else unpredictable.
2161
 * The value of @outlen after return is the number of octets consumed.
2162
 */
2163
int
2164
htmlEncodeEntities(unsigned char* out, int *outlen,
2165
0
       const unsigned char* in, int *inlen, int quoteChar) {
2166
0
    const unsigned char* processed = in;
2167
0
    const unsigned char* outend;
2168
0
    const unsigned char* outstart = out;
2169
0
    const unsigned char* instart = in;
2170
0
    const unsigned char* inend;
2171
0
    unsigned int c, d;
2172
0
    int trailing;
2173
2174
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2175
0
        return(-1);
2176
0
    outend = out + (*outlen);
2177
0
    inend = in + (*inlen);
2178
0
    while (in < inend) {
2179
0
  d = *in++;
2180
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2181
0
  else if (d < 0xC0) {
2182
      /* trailing byte in leading position */
2183
0
      *outlen = out - outstart;
2184
0
      *inlen = processed - instart;
2185
0
      return(-2);
2186
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2187
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2188
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2189
0
  else {
2190
      /* no chance for this in Ascii */
2191
0
      *outlen = out - outstart;
2192
0
      *inlen = processed - instart;
2193
0
      return(-2);
2194
0
  }
2195
2196
0
  if (inend - in < trailing)
2197
0
      break;
2198
2199
0
  while (trailing--) {
2200
0
      if (((d= *in++) & 0xC0) != 0x80) {
2201
0
    *outlen = out - outstart;
2202
0
    *inlen = processed - instart;
2203
0
    return(-2);
2204
0
      }
2205
0
      c <<= 6;
2206
0
      c |= d & 0x3F;
2207
0
  }
2208
2209
  /* assertion: c is a single UTF-4 value */
2210
0
  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2211
0
      (c != '&') && (c != '<') && (c != '>')) {
2212
0
      if (out >= outend)
2213
0
    break;
2214
0
      *out++ = c;
2215
0
  } else {
2216
0
      const htmlEntityDesc * ent;
2217
0
      const char *cp;
2218
0
      char nbuf[16];
2219
0
      int len;
2220
2221
      /*
2222
       * Try to lookup a predefined HTML entity for it
2223
       */
2224
0
      ent = htmlEntityValueLookup(c);
2225
0
      if (ent == NULL) {
2226
0
    snprintf(nbuf, sizeof(nbuf), "#%u", c);
2227
0
    cp = nbuf;
2228
0
      }
2229
0
      else
2230
0
    cp = ent->name;
2231
0
      len = strlen(cp);
2232
0
      if (outend - out < len + 2)
2233
0
    break;
2234
0
      *out++ = '&';
2235
0
      memcpy(out, cp, len);
2236
0
      out += len;
2237
0
      *out++ = ';';
2238
0
  }
2239
0
  processed = in;
2240
0
    }
2241
0
    *outlen = out - outstart;
2242
0
    *inlen = processed - instart;
2243
0
    return(0);
2244
0
}
2245
2246
/************************************************************************
2247
 *                  *
2248
 *    Commodity functions, cleanup needed ?     *
2249
 *                  *
2250
 ************************************************************************/
2251
/*
 * all tags allowing pc data from the html 4.01 loose dtd
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 *
 * The array itself is const as well as the strings it points to, so
 * the whole table is read-only.
 */
static const char * const allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
    "blockquote", "body", "button", "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
};
2265
2266
/**
2267
 * areBlanks:
2268
 * @ctxt:  an HTML parser context
2269
 * @str:  a xmlChar *
2270
 * @len:  the size of @str
2271
 *
2272
 * Is this a sequence of blank chars that one can ignore ?
2273
 *
2274
 * Returns 1 if ignorable 0 otherwise.
2275
 */
2276
2277
0
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2278
0
    unsigned int i;
2279
0
    int j;
2280
0
    xmlNodePtr lastChild;
2281
0
    xmlDtdPtr dtd;
2282
2283
0
    for (j = 0;j < len;j++)
2284
0
        if (!(IS_BLANK_CH(str[j]))) return(0);
2285
2286
0
    if (CUR == 0) return(1);
2287
0
    if (CUR != '<') return(0);
2288
0
    if (ctxt->name == NULL)
2289
0
  return(1);
2290
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2291
0
  return(1);
2292
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2293
0
  return(1);
2294
2295
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2296
0
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2297
0
        dtd = xmlGetIntSubset(ctxt->myDoc);
2298
0
        if (dtd != NULL && dtd->ExternalID != NULL) {
2299
0
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2300
0
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2301
0
                return(1);
2302
0
        }
2303
0
    }
2304
2305
0
    if (ctxt->node == NULL) return(0);
2306
0
    lastChild = xmlGetLastChild(ctxt->node);
2307
0
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2308
0
  lastChild = lastChild->prev;
2309
0
    if (lastChild == NULL) {
2310
0
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2311
0
            (ctxt->node->content != NULL)) return(0);
2312
  /* keep ws in constructs like ...<b> </b>...
2313
     for all tags "b" allowing PCDATA */
2314
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2315
0
      if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2316
0
    return(0);
2317
0
      }
2318
0
  }
2319
0
    } else if (xmlNodeIsText(lastChild)) {
2320
0
        return(0);
2321
0
    } else {
2322
  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2323
     for all tags "p" allowing PCDATA */
2324
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2325
0
      if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2326
0
    return(0);
2327
0
      }
2328
0
  }
2329
0
    }
2330
0
    return(1);
2331
0
}
2332
2333
/**
2334
 * htmlNewDocNoDtD:
2335
 * @URI:  URI for the dtd, or NULL
2336
 * @ExternalID:  the external ID of the DTD, or NULL
2337
 *
2338
 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2339
 * are NULL
2340
 *
2341
 * Returns a new document, do not initialize the DTD if not provided
2342
 */
2343
htmlDocPtr
2344
0
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2345
0
    xmlDocPtr cur;
2346
2347
    /*
2348
     * Allocate a new document and fill the fields.
2349
     */
2350
0
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2351
0
    if (cur == NULL)
2352
0
  return(NULL);
2353
0
    memset(cur, 0, sizeof(xmlDoc));
2354
2355
0
    cur->type = XML_HTML_DOCUMENT_NODE;
2356
0
    cur->version = NULL;
2357
0
    cur->intSubset = NULL;
2358
0
    cur->doc = cur;
2359
0
    cur->name = NULL;
2360
0
    cur->children = NULL;
2361
0
    cur->extSubset = NULL;
2362
0
    cur->oldNs = NULL;
2363
0
    cur->encoding = NULL;
2364
0
    cur->standalone = 1;
2365
0
    cur->compression = 0;
2366
0
    cur->ids = NULL;
2367
0
    cur->refs = NULL;
2368
0
    cur->_private = NULL;
2369
0
    cur->charset = XML_CHAR_ENCODING_UTF8;
2370
0
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2371
0
    if ((ExternalID != NULL) ||
2372
0
  (URI != NULL)) {
2373
0
        xmlDtdPtr intSubset;
2374
2375
0
  intSubset = xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2376
0
        if (intSubset == NULL) {
2377
0
            xmlFree(cur);
2378
0
            return(NULL);
2379
0
        }
2380
0
    }
2381
0
    if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2382
0
  xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2383
0
    return(cur);
2384
0
}
2385
2386
/**
2387
 * htmlNewDoc:
2388
 * @URI:  URI for the dtd, or NULL
2389
 * @ExternalID:  the external ID of the DTD, or NULL
2390
 *
2391
 * Creates a new HTML document
2392
 *
2393
 * Returns a new document
2394
 */
2395
htmlDocPtr
2396
0
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2397
0
    if ((URI == NULL) && (ExternalID == NULL))
2398
0
  return(htmlNewDocNoDtD(
2399
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2400
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2401
2402
0
    return(htmlNewDocNoDtD(URI, ExternalID));
2403
0
}
2404
2405
2406
/************************************************************************
2407
 *                  *
2408
 *      The parser itself       *
2409
 *  Relates to http://www.w3.org/TR/html40        *
2410
 *                  *
2411
 ************************************************************************/
2412
2413
/************************************************************************
2414
 *                  *
2415
 *      The parser itself       *
2416
 *                  *
2417
 ************************************************************************/
2418
2419
/* Forward declaration: htmlParseName() falls back to this routine, which is defined after it. */
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2420
2421
static void
2422
0
htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2423
0
    int c;
2424
2425
0
    htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2426
0
                 "Incorrectly opened comment\n", NULL, NULL);
2427
2428
0
    while (PARSER_STOPPED(ctxt) == 0) {
2429
0
        c = CUR;
2430
0
        if (c == 0)
2431
0
            break;
2432
0
        NEXT;
2433
0
        if (c == '>')
2434
0
            break;
2435
0
    }
2436
0
}
2437
2438
/**
2439
 * htmlParseHTMLName:
2440
 * @ctxt:  an HTML parser context
2441
 *
2442
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2443
 * since HTML names are not case-sensitive.
2444
 *
2445
 * Returns the Tag Name parsed or NULL
2446
 */
2447
2448
static const xmlChar *
2449
0
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2450
0
    const xmlChar *ret;
2451
0
    int i = 0;
2452
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2453
2454
0
    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2455
0
        (CUR != ':') && (CUR != '.')) return(NULL);
2456
2457
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2458
0
           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2459
0
     (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2460
0
           (CUR == '.'))) {
2461
0
  if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2462
0
        else loc[i] = CUR;
2463
0
  i++;
2464
2465
0
  NEXT;
2466
0
    }
2467
2468
0
    ret = xmlDictLookup(ctxt->dict, loc, i);
2469
0
    if (ret == NULL)
2470
0
        htmlErrMemory(ctxt);
2471
2472
0
    return(ret);
2473
0
}
2474
2475
2476
/**
2477
 * htmlParseHTMLName_nonInvasive:
2478
 * @ctxt:  an HTML parser context
2479
 *
2480
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2481
 * since HTML names are not case-sensitive, this doesn't consume the data
2482
 * from the stream, it's a look-ahead
2483
 *
2484
 * Returns the Tag Name parsed or NULL
2485
 */
2486
2487
static const xmlChar *
2488
0
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2489
0
    int i = 0;
2490
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2491
0
    const xmlChar *ret;
2492
2493
0
    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2494
0
        (NXT(1) != ':')) return(NULL);
2495
2496
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2497
0
           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2498
0
     (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2499
0
  if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2500
0
        else loc[i] = NXT(1+i);
2501
0
  i++;
2502
0
    }
2503
2504
0
    ret = xmlDictLookup(ctxt->dict, loc, i);
2505
0
    if (ret == NULL)
2506
0
        htmlErrMemory(ctxt);
2507
2508
0
    return(ret);
2509
0
}
2510
2511
2512
/**
2513
 * htmlParseName:
2514
 * @ctxt:  an HTML parser context
2515
 *
2516
 * parse an HTML name, this routine is case sensitive.
2517
 *
2518
 * Returns the Name parsed or NULL
2519
 */
2520
2521
static const xmlChar *
2522
0
htmlParseName(htmlParserCtxtPtr ctxt) {
2523
0
    const xmlChar *in;
2524
0
    const xmlChar *ret;
2525
0
    int count = 0;
2526
2527
0
    GROW;
2528
2529
    /*
2530
     * Accelerator for simple ASCII names
2531
     */
2532
0
    in = ctxt->input->cur;
2533
0
    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2534
0
  ((*in >= 0x41) && (*in <= 0x5A)) ||
2535
0
  (*in == '_') || (*in == ':')) {
2536
0
  in++;
2537
0
  while (((*in >= 0x61) && (*in <= 0x7A)) ||
2538
0
         ((*in >= 0x41) && (*in <= 0x5A)) ||
2539
0
         ((*in >= 0x30) && (*in <= 0x39)) ||
2540
0
         (*in == '_') || (*in == '-') ||
2541
0
         (*in == ':') || (*in == '.'))
2542
0
      in++;
2543
2544
0
  if (in == ctxt->input->end)
2545
0
      return(NULL);
2546
2547
0
  if ((*in > 0) && (*in < 0x80)) {
2548
0
      count = in - ctxt->input->cur;
2549
0
      ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2550
0
            if (ret == NULL)
2551
0
                htmlErrMemory(ctxt);
2552
0
      ctxt->input->cur = in;
2553
0
      ctxt->input->col += count;
2554
0
      return(ret);
2555
0
  }
2556
0
    }
2557
0
    return(htmlParseNameComplex(ctxt));
2558
0
}
2559
2560
static const xmlChar *
2561
0
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2562
0
    int len = 0, l;
2563
0
    int c;
2564
0
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2565
0
                    XML_MAX_TEXT_LENGTH :
2566
0
                    XML_MAX_NAME_LENGTH;
2567
0
    const xmlChar *base = ctxt->input->base;
2568
0
    const xmlChar *ret;
2569
2570
    /*
2571
     * Handler for more complex cases
2572
     */
2573
0
    c = CUR_CHAR(l);
2574
0
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2575
0
  (!IS_LETTER(c) && (c != '_') &&
2576
0
         (c != ':'))) {
2577
0
  return(NULL);
2578
0
    }
2579
2580
0
    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2581
0
     ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2582
0
            (c == '.') || (c == '-') ||
2583
0
      (c == '_') || (c == ':') ||
2584
0
      (IS_COMBINING(c)) ||
2585
0
      (IS_EXTENDER(c)))) {
2586
0
  len += l;
2587
0
        if (len > maxLength) {
2588
0
            htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2589
0
            return(NULL);
2590
0
        }
2591
0
  NEXTL(l);
2592
0
  c = CUR_CHAR(l);
2593
0
  if (ctxt->input->base != base) {
2594
      /*
2595
       * We changed encoding from an unknown encoding
2596
       * Input buffer changed location, so we better start again
2597
       */
2598
0
      return(htmlParseNameComplex(ctxt));
2599
0
  }
2600
0
    }
2601
2602
0
    if (ctxt->input->cur - ctxt->input->base < len) {
2603
        /* Sanity check */
2604
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2605
0
                     "unexpected change of input buffer", NULL, NULL);
2606
0
        return (NULL);
2607
0
    }
2608
2609
0
    ret = xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len);
2610
0
    if (ret == NULL)
2611
0
        htmlErrMemory(ctxt);
2612
2613
0
    return(ret);
2614
0
}
2615
2616
2617
/**
2618
 * htmlParseHTMLAttribute:
2619
 * @ctxt:  an HTML parser context
2620
 * @stop:  a char stop value
2621
 *
2622
 * parse an HTML attribute value till the stop (quote), if
2623
 * stop is 0 then it stops at the first space
2624
 *
2625
 * Returns the attribute parsed or NULL
2626
 */
2627
2628
static xmlChar *
2629
0
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2630
0
    xmlChar *buffer = NULL;
2631
0
    int buffer_size = 0;
2632
0
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2633
0
                    XML_MAX_HUGE_LENGTH :
2634
0
                    XML_MAX_TEXT_LENGTH;
2635
0
    xmlChar *out = NULL;
2636
0
    const xmlChar *name = NULL;
2637
0
    const xmlChar *cur = NULL;
2638
0
    const htmlEntityDesc * ent;
2639
2640
    /*
2641
     * allocate a translation buffer.
2642
     */
2643
0
    buffer_size = HTML_PARSER_BUFFER_SIZE;
2644
0
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2645
0
    if (buffer == NULL) {
2646
0
  htmlErrMemory(ctxt);
2647
0
  return(NULL);
2648
0
    }
2649
0
    out = buffer;
2650
2651
    /*
2652
     * Ok loop until we reach one of the ending chars
2653
     */
2654
0
    while ((PARSER_STOPPED(ctxt) == 0) &&
2655
0
           (CUR != 0) && (CUR != stop)) {
2656
0
  if ((stop == 0) && (CUR == '>')) break;
2657
0
  if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2658
0
        if (CUR == '&') {
2659
0
      if (NXT(1) == '#') {
2660
0
    unsigned int c;
2661
0
    int bits;
2662
2663
0
    c = htmlParseCharRef(ctxt);
2664
0
    if      (c <    0x80)
2665
0
            { *out++  = c;                bits= -6; }
2666
0
    else if (c <   0x800)
2667
0
            { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2668
0
    else if (c < 0x10000)
2669
0
            { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2670
0
    else
2671
0
            { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2672
2673
0
    for ( ; bits >= 0; bits-= 6) {
2674
0
        *out++  = ((c >> bits) & 0x3F) | 0x80;
2675
0
    }
2676
2677
0
    if (out - buffer > buffer_size - 100) {
2678
0
      int indx = out - buffer;
2679
2680
0
      growBuffer(buffer);
2681
0
      out = &buffer[indx];
2682
0
    }
2683
0
      } else {
2684
0
    ent = htmlParseEntityRef(ctxt, &name);
2685
0
    if (name == NULL) {
2686
0
        *out++ = '&';
2687
0
        if (out - buffer > buffer_size - 100) {
2688
0
      int indx = out - buffer;
2689
2690
0
      growBuffer(buffer);
2691
0
      out = &buffer[indx];
2692
0
        }
2693
0
    } else if (ent == NULL) {
2694
0
        *out++ = '&';
2695
0
        cur = name;
2696
0
        while (*cur != 0) {
2697
0
      if (out - buffer > buffer_size - 100) {
2698
0
          int indx = out - buffer;
2699
2700
0
          growBuffer(buffer);
2701
0
          out = &buffer[indx];
2702
0
      }
2703
0
      *out++ = *cur++;
2704
0
        }
2705
0
    } else {
2706
0
        unsigned int c;
2707
0
        int bits;
2708
2709
0
        if (out - buffer > buffer_size - 100) {
2710
0
      int indx = out - buffer;
2711
2712
0
      growBuffer(buffer);
2713
0
      out = &buffer[indx];
2714
0
        }
2715
0
        c = ent->value;
2716
0
        if      (c <    0x80)
2717
0
      { *out++  = c;                bits= -6; }
2718
0
        else if (c <   0x800)
2719
0
      { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2720
0
        else if (c < 0x10000)
2721
0
      { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2722
0
        else
2723
0
      { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2724
2725
0
        for ( ; bits >= 0; bits-= 6) {
2726
0
      *out++  = ((c >> bits) & 0x3F) | 0x80;
2727
0
        }
2728
0
    }
2729
0
      }
2730
0
  } else {
2731
0
      unsigned int c;
2732
0
      int bits, l;
2733
2734
0
      if (out - buffer > buffer_size - 100) {
2735
0
    int indx = out - buffer;
2736
2737
0
    growBuffer(buffer);
2738
0
    out = &buffer[indx];
2739
0
      }
2740
0
      c = CUR_CHAR(l);
2741
0
      if      (c <    0x80)
2742
0
        { *out++  = c;                bits= -6; }
2743
0
      else if (c <   0x800)
2744
0
        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2745
0
      else if (c < 0x10000)
2746
0
        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2747
0
      else
2748
0
        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2749
2750
0
      for ( ; bits >= 0; bits-= 6) {
2751
0
    *out++  = ((c >> bits) & 0x3F) | 0x80;
2752
0
      }
2753
0
      NEXTL(l);
2754
0
  }
2755
0
        if (out - buffer > maxLength) {
2756
0
            htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2757
0
                         "attribute value too long\n", NULL, NULL);
2758
0
            xmlFree(buffer);
2759
0
            return(NULL);
2760
0
        }
2761
0
    }
2762
0
    *out = 0;
2763
0
    return(buffer);
2764
0
}
2765
2766
/**
2767
 * htmlParseEntityRef:
2768
 * @ctxt:  an HTML parser context
2769
 * @str:  location to store the entity name
2770
 *
2771
 * DEPRECATED: Internal function, don't use.
2772
 *
2773
 * parse an HTML ENTITY references
2774
 *
2775
 * [68] EntityRef ::= '&' Name ';'
2776
 *
2777
 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2778
 *         if non-NULL *str will have to be freed by the caller.
2779
 */
2780
const htmlEntityDesc *
2781
0
htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2782
0
    const xmlChar *name;
2783
0
    const htmlEntityDesc * ent = NULL;
2784
2785
0
    if (str != NULL) *str = NULL;
2786
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2787
2788
0
    if (CUR == '&') {
2789
0
        NEXT;
2790
0
        name = htmlParseName(ctxt);
2791
0
  if (name == NULL) {
2792
0
      htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2793
0
                   "htmlParseEntityRef: no name\n", NULL, NULL);
2794
0
  } else {
2795
0
      GROW;
2796
0
      if (CUR == ';') {
2797
0
          if (str != NULL)
2798
0
        *str = name;
2799
2800
    /*
2801
     * Lookup the entity in the table.
2802
     */
2803
0
    ent = htmlEntityLookup(name);
2804
0
    if (ent != NULL) /* OK that's ugly !!! */
2805
0
        NEXT;
2806
0
      } else {
2807
0
    htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2808
0
                 "htmlParseEntityRef: expecting ';'\n",
2809
0
           NULL, NULL);
2810
0
          if (str != NULL)
2811
0
        *str = name;
2812
0
      }
2813
0
  }
2814
0
    }
2815
0
    return(ent);
2816
0
}
2817
2818
/**
2819
 * htmlParseAttValue:
2820
 * @ctxt:  an HTML parser context
2821
 *
2822
 * parse a value for an attribute
2823
 * Note: the parser won't do substitution of entities here, this
2824
 * will be handled later in xmlStringGetNodeList, unless it was
2825
 * asked for ctxt->replaceEntities != 0
2826
 *
2827
 * Returns the AttValue parsed or NULL.
2828
 */
2829
2830
static xmlChar *
2831
0
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2832
0
    xmlChar *ret = NULL;
2833
2834
0
    if (CUR == '"') {
2835
0
        NEXT;
2836
0
  ret = htmlParseHTMLAttribute(ctxt, '"');
2837
0
        if (CUR != '"') {
2838
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2839
0
                   "AttValue: \" expected\n", NULL, NULL);
2840
0
  } else
2841
0
      NEXT;
2842
0
    } else if (CUR == '\'') {
2843
0
        NEXT;
2844
0
  ret = htmlParseHTMLAttribute(ctxt, '\'');
2845
0
        if (CUR != '\'') {
2846
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2847
0
                   "AttValue: ' expected\n", NULL, NULL);
2848
0
  } else
2849
0
      NEXT;
2850
0
    } else {
2851
        /*
2852
   * That's an HTMLism, the attribute value may not be quoted
2853
   */
2854
0
  ret = htmlParseHTMLAttribute(ctxt, 0);
2855
0
  if (ret == NULL) {
2856
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2857
0
                   "AttValue: no value found\n", NULL, NULL);
2858
0
  }
2859
0
    }
2860
0
    return(ret);
2861
0
}
2862
2863
/**
2864
 * htmlParseSystemLiteral:
2865
 * @ctxt:  an HTML parser context
2866
 *
2867
 * parse an HTML Literal
2868
 *
2869
 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2870
 *
2871
 * Returns the SystemLiteral parsed or NULL
2872
 */
2873
2874
static xmlChar *
2875
0
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2876
0
    size_t len = 0, startPosition = 0;
2877
0
    int err = 0;
2878
0
    int quote;
2879
0
    xmlChar *ret = NULL;
2880
2881
0
    if ((CUR != '"') && (CUR != '\'')) {
2882
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2883
0
               "SystemLiteral \" or ' expected\n", NULL, NULL);
2884
0
        return(NULL);
2885
0
    }
2886
0
    quote = CUR;
2887
0
    NEXT;
2888
2889
0
    if (CUR_PTR < BASE_PTR)
2890
0
        return(ret);
2891
0
    startPosition = CUR_PTR - BASE_PTR;
2892
2893
0
    while ((PARSER_STOPPED(ctxt) == 0) &&
2894
0
           (CUR != 0) && (CUR != quote)) {
2895
        /* TODO: Handle UTF-8 */
2896
0
        if (!IS_CHAR_CH(CUR)) {
2897
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2898
0
                            "Invalid char in SystemLiteral 0x%X\n", CUR);
2899
0
            err = 1;
2900
0
        }
2901
0
        NEXT;
2902
0
        len++;
2903
0
    }
2904
0
    if (CUR != quote) {
2905
0
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2906
0
                     "Unfinished SystemLiteral\n", NULL, NULL);
2907
0
    } else {
2908
0
        if (err == 0) {
2909
0
            ret = xmlStrndup((BASE_PTR+startPosition), len);
2910
0
            if (ret == NULL) {
2911
0
                htmlErrMemory(ctxt);
2912
0
                return(NULL);
2913
0
            }
2914
0
        }
2915
0
        NEXT;
2916
0
    }
2917
2918
0
    return(ret);
2919
0
}
2920
2921
/**
2922
 * htmlParsePubidLiteral:
2923
 * @ctxt:  an HTML parser context
2924
 *
2925
 * parse an HTML public literal
2926
 *
2927
 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2928
 *
2929
 * Returns the PubidLiteral parsed or NULL.
2930
 */
2931
2932
static xmlChar *
2933
0
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2934
0
    size_t len = 0, startPosition = 0;
2935
0
    int err = 0;
2936
0
    int quote;
2937
0
    xmlChar *ret = NULL;
2938
2939
0
    if ((CUR != '"') && (CUR != '\'')) {
2940
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2941
0
               "PubidLiteral \" or ' expected\n", NULL, NULL);
2942
0
        return(NULL);
2943
0
    }
2944
0
    quote = CUR;
2945
0
    NEXT;
2946
2947
    /*
2948
     * Name ::= (Letter | '_') (NameChar)*
2949
     */
2950
0
    if (CUR_PTR < BASE_PTR)
2951
0
        return(ret);
2952
0
    startPosition = CUR_PTR - BASE_PTR;
2953
2954
0
    while ((PARSER_STOPPED(ctxt) == 0) &&
2955
0
           (CUR != 0) && (CUR != quote)) {
2956
0
        if (!IS_PUBIDCHAR_CH(CUR)) {
2957
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2958
0
                            "Invalid char in PubidLiteral 0x%X\n", CUR);
2959
0
            err = 1;
2960
0
        }
2961
0
        len++;
2962
0
        NEXT;
2963
0
    }
2964
2965
0
    if (CUR != quote) {
2966
0
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2967
0
                     "Unfinished PubidLiteral\n", NULL, NULL);
2968
0
    } else {
2969
0
        if (err == 0) {
2970
0
            ret = xmlStrndup((BASE_PTR + startPosition), len);
2971
0
            if (ret == NULL) {
2972
0
                htmlErrMemory(ctxt);
2973
0
                return(NULL);
2974
0
            }
2975
0
        }
2976
0
        NEXT;
2977
0
    }
2978
2979
0
    return(ret);
2980
0
}
2981
2982
/**
2983
 * htmlParseScript:
2984
 * @ctxt:  an HTML parser context
2985
 *
2986
 * parse the content of an HTML SCRIPT or STYLE element
2987
 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2988
 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2989
 * http://www.w3.org/TR/html4/types.html#type-script
2990
 * http://www.w3.org/TR/html4/types.html#h-6.15
2991
 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2992
 *
2993
 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2994
 * element and the value of intrinsic event attributes. User agents must
2995
 * not evaluate script data as HTML markup but instead must pass it on as
2996
 * data to a script engine.
2997
 * NOTES:
2998
 * - The content is passed like CDATA
2999
 * - the attributes for style and scripting "onXXX" are also described
3000
 *   as CDATA but SGML allows entities references in attributes so their
3001
 *   processing is identical as other attributes
3002
 */
3003
static void
3004
0
htmlParseScript(htmlParserCtxtPtr ctxt) {
3005
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3006
0
    int nbchar = 0;
3007
0
    int cur,l;
3008
3009
0
    cur = CUR_CHAR(l);
3010
0
    while (cur != 0) {
3011
0
  if ((cur == '<') && (NXT(1) == '/')) {
3012
            /*
3013
             * One should break here, the specification is clear:
3014
             * Authors should therefore escape "</" within the content.
3015
             * Escape mechanisms are specific to each scripting or
3016
             * style sheet language.
3017
             *
3018
             * In recovery mode, only break if end tag match the
3019
             * current tag, effectively ignoring all tags inside the
3020
             * script/style block and treating the entire block as
3021
             * CDATA.
3022
             */
3023
0
            if (ctxt->recovery) {
3024
0
                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3025
0
           xmlStrlen(ctxt->name)) == 0)
3026
0
                {
3027
0
                    break; /* while */
3028
0
                } else {
3029
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3030
0
         "Element %s embeds close tag\n",
3031
0
                     ctxt->name, NULL);
3032
0
    }
3033
0
            } else {
3034
0
                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3035
0
                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3036
0
                {
3037
0
                    break; /* while */
3038
0
                }
3039
0
            }
3040
0
  }
3041
0
        if (IS_CHAR(cur)) {
3042
0
      COPY_BUF(l,buf,nbchar,cur);
3043
0
        } else {
3044
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3045
0
                            "Invalid char in CDATA 0x%X\n", cur);
3046
0
        }
3047
0
  NEXTL(l);
3048
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3049
0
            buf[nbchar] = 0;
3050
0
      if (ctxt->sax->cdataBlock!= NULL) {
3051
    /*
3052
     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3053
     */
3054
0
    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3055
0
      } else if (ctxt->sax->characters != NULL) {
3056
0
    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3057
0
      }
3058
0
      nbchar = 0;
3059
0
            SHRINK;
3060
0
  }
3061
0
  cur = CUR_CHAR(l);
3062
0
    }
3063
3064
0
    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3065
0
        buf[nbchar] = 0;
3066
0
  if (ctxt->sax->cdataBlock!= NULL) {
3067
      /*
3068
       * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3069
       */
3070
0
      ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3071
0
  } else if (ctxt->sax->characters != NULL) {
3072
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3073
0
  }
3074
0
    }
3075
0
}
3076
3077
3078
/**
3079
 * htmlParseCharDataInternal:
3080
 * @ctxt:  an HTML parser context
3081
 * @readahead: optional read ahead character in ascii range
3082
 *
3083
 * parse a CharData section.
3084
 * if we are within a CDATA section ']]>' marks an end of section.
3085
 *
3086
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3087
 */
3088
3089
static void
3090
0
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3091
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3092
0
    int nbchar = 0;
3093
0
    int cur, l;
3094
3095
0
    if (readahead)
3096
0
        buf[nbchar++] = readahead;
3097
3098
0
    cur = CUR_CHAR(l);
3099
0
    while ((cur != '<') &&
3100
0
           (cur != '&') &&
3101
0
     (cur != 0) &&
3102
0
           (!PARSER_STOPPED(ctxt))) {
3103
0
  if (!(IS_CHAR(cur))) {
3104
0
      htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3105
0
                  "Invalid char in CDATA 0x%X\n", cur);
3106
0
  } else {
3107
0
      COPY_BUF(l,buf,nbchar,cur);
3108
0
  }
3109
0
  NEXTL(l);
3110
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3111
0
            buf[nbchar] = 0;
3112
3113
      /*
3114
       * Ok the segment is to be consumed as chars.
3115
       */
3116
0
      if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3117
0
    if (areBlanks(ctxt, buf, nbchar)) {
3118
0
        if (ctxt->keepBlanks) {
3119
0
      if (ctxt->sax->characters != NULL)
3120
0
          ctxt->sax->characters(ctxt->userData, buf, nbchar);
3121
0
        } else {
3122
0
      if (ctxt->sax->ignorableWhitespace != NULL)
3123
0
          ctxt->sax->ignorableWhitespace(ctxt->userData,
3124
0
                                         buf, nbchar);
3125
0
        }
3126
0
    } else {
3127
0
        htmlCheckParagraph(ctxt);
3128
0
        if (ctxt->sax->characters != NULL)
3129
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3130
0
    }
3131
0
      }
3132
0
      nbchar = 0;
3133
0
            SHRINK;
3134
0
  }
3135
0
  cur = CUR_CHAR(l);
3136
0
    }
3137
0
    if (nbchar != 0) {
3138
0
        buf[nbchar] = 0;
3139
3140
  /*
3141
   * Ok the segment is to be consumed as chars.
3142
   */
3143
0
  if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3144
0
      if (areBlanks(ctxt, buf, nbchar)) {
3145
0
    if (ctxt->keepBlanks) {
3146
0
        if (ctxt->sax->characters != NULL)
3147
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3148
0
    } else {
3149
0
        if (ctxt->sax->ignorableWhitespace != NULL)
3150
0
      ctxt->sax->ignorableWhitespace(ctxt->userData,
3151
0
                                     buf, nbchar);
3152
0
    }
3153
0
      } else {
3154
0
    htmlCheckParagraph(ctxt);
3155
0
    if (ctxt->sax->characters != NULL)
3156
0
        ctxt->sax->characters(ctxt->userData, buf, nbchar);
3157
0
      }
3158
0
  }
3159
0
    }
3160
0
}
3161
3162
/**
3163
 * htmlParseCharData:
3164
 * @ctxt:  an HTML parser context
3165
 *
3166
 * parse a CharData section.
3167
 * if we are within a CDATA section ']]>' marks an end of section.
3168
 *
3169
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3170
 */
3171
3172
static void
3173
0
htmlParseCharData(htmlParserCtxtPtr ctxt) {
3174
0
    htmlParseCharDataInternal(ctxt, 0);
3175
0
}
3176
3177
/**
3178
 * htmlParseExternalID:
3179
 * @ctxt:  an HTML parser context
3180
 * @publicID:  a xmlChar** receiving PubidLiteral
3181
 *
3182
 * Parse an External ID or a Public ID
3183
 *
3184
 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3185
 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3186
 *
3187
 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3188
 *
3189
 * Returns the function returns SystemLiteral and in the second
3190
 *                case publicID receives PubidLiteral, is strict is off
3191
 *                it is possible to return NULL and have publicID set.
3192
 */
3193
3194
static xmlChar *
3195
0
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3196
0
    xmlChar *URI = NULL;
3197
3198
0
    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3199
0
         (UPP(2) == 'S') && (UPP(3) == 'T') &&
3200
0
   (UPP(4) == 'E') && (UPP(5) == 'M')) {
3201
0
        SKIP(6);
3202
0
  if (!IS_BLANK_CH(CUR)) {
3203
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3204
0
                   "Space required after 'SYSTEM'\n", NULL, NULL);
3205
0
  }
3206
0
        SKIP_BLANKS;
3207
0
  URI = htmlParseSystemLiteral(ctxt);
3208
0
  if (URI == NULL) {
3209
0
      htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3210
0
                   "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3211
0
        }
3212
0
    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3213
0
         (UPP(2) == 'B') && (UPP(3) == 'L') &&
3214
0
         (UPP(4) == 'I') && (UPP(5) == 'C')) {
3215
0
        SKIP(6);
3216
0
  if (!IS_BLANK_CH(CUR)) {
3217
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3218
0
                   "Space required after 'PUBLIC'\n", NULL, NULL);
3219
0
  }
3220
0
        SKIP_BLANKS;
3221
0
  *publicID = htmlParsePubidLiteral(ctxt);
3222
0
  if (*publicID == NULL) {
3223
0
      htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3224
0
                   "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3225
0
       NULL, NULL);
3226
0
  }
3227
0
        SKIP_BLANKS;
3228
0
        if ((CUR == '"') || (CUR == '\'')) {
3229
0
      URI = htmlParseSystemLiteral(ctxt);
3230
0
  }
3231
0
    }
3232
0
    return(URI);
3233
0
}
3234
3235
/**
3236
 * htmlParsePI:
3237
 * @ctxt:  an HTML parser context
3238
 *
3239
 * Parse an XML Processing Instruction. HTML5 doesn't allow processing
3240
 * instructions, so this will be removed at some point.
3241
 */
3242
static void
3243
0
htmlParsePI(htmlParserCtxtPtr ctxt) {
3244
0
    xmlChar *buf = NULL;
3245
0
    int len = 0;
3246
0
    int size = HTML_PARSER_BUFFER_SIZE;
3247
0
    int cur, l;
3248
0
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3249
0
                    XML_MAX_HUGE_LENGTH :
3250
0
                    XML_MAX_TEXT_LENGTH;
3251
0
    const xmlChar *target;
3252
0
    xmlParserInputState state;
3253
3254
0
    if ((RAW == '<') && (NXT(1) == '?')) {
3255
0
  state = ctxt->instate;
3256
0
        ctxt->instate = XML_PARSER_PI;
3257
  /*
3258
   * this is a Processing Instruction.
3259
   */
3260
0
  SKIP(2);
3261
3262
  /*
3263
   * Parse the target name and check for special support like
3264
   * namespace.
3265
   */
3266
0
        target = htmlParseName(ctxt);
3267
0
  if (target != NULL) {
3268
0
      if (RAW == '>') {
3269
0
    SKIP(1);
3270
3271
    /*
3272
     * SAX: PI detected.
3273
     */
3274
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3275
0
        (ctxt->sax->processingInstruction != NULL))
3276
0
        ctxt->sax->processingInstruction(ctxt->userData,
3277
0
                                         target, NULL);
3278
0
                goto done;
3279
0
      }
3280
0
      buf = (xmlChar *) xmlMallocAtomic(size);
3281
0
      if (buf == NULL) {
3282
0
    htmlErrMemory(ctxt);
3283
0
    return;
3284
0
      }
3285
0
      cur = CUR;
3286
0
      if (!IS_BLANK(cur)) {
3287
0
    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3288
0
        "ParsePI: PI %s space expected\n", target, NULL);
3289
0
      }
3290
0
            SKIP_BLANKS;
3291
0
      cur = CUR_CHAR(l);
3292
0
      while ((cur != 0) && (cur != '>')) {
3293
0
    if (len + 5 >= size) {
3294
0
        xmlChar *tmp;
3295
3296
0
        size *= 2;
3297
0
        tmp = (xmlChar *) xmlRealloc(buf, size);
3298
0
        if (tmp == NULL) {
3299
0
      htmlErrMemory(ctxt);
3300
0
      xmlFree(buf);
3301
0
      return;
3302
0
        }
3303
0
        buf = tmp;
3304
0
    }
3305
0
                if (IS_CHAR(cur)) {
3306
0
        COPY_BUF(l,buf,len,cur);
3307
0
                } else {
3308
0
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3309
0
                                    "Invalid char in processing instruction "
3310
0
                                    "0x%X\n", cur);
3311
0
                }
3312
0
                if (len > maxLength) {
3313
0
                    htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3314
0
                                 "PI %s too long", target, NULL);
3315
0
                    xmlFree(buf);
3316
0
                    goto done;
3317
0
                }
3318
0
    NEXTL(l);
3319
0
    cur = CUR_CHAR(l);
3320
0
      }
3321
0
      buf[len] = 0;
3322
0
      if (cur != '>') {
3323
0
    htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3324
0
          "ParsePI: PI %s never end ...\n", target, NULL);
3325
0
      } else {
3326
0
    SKIP(1);
3327
3328
    /*
3329
     * SAX: PI detected.
3330
     */
3331
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3332
0
        (ctxt->sax->processingInstruction != NULL))
3333
0
        ctxt->sax->processingInstruction(ctxt->userData,
3334
0
                                         target, buf);
3335
0
      }
3336
0
      xmlFree(buf);
3337
0
  } else {
3338
0
      htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3339
0
                         "PI is not started correctly", NULL, NULL);
3340
0
  }
3341
3342
0
done:
3343
0
  ctxt->instate = state;
3344
0
    }
3345
0
}
3346
3347
/**
3348
 * htmlParseComment:
3349
 * @ctxt:  an HTML parser context
3350
 *
3351
 * Parse an HTML comment
3352
 */
3353
static void
3354
0
htmlParseComment(htmlParserCtxtPtr ctxt) {
3355
0
    xmlChar *buf = NULL;
3356
0
    int len;
3357
0
    int size = HTML_PARSER_BUFFER_SIZE;
3358
0
    int q, ql;
3359
0
    int r, rl;
3360
0
    int cur, l;
3361
0
    int next, nl;
3362
0
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3363
0
                    XML_MAX_HUGE_LENGTH :
3364
0
                    XML_MAX_TEXT_LENGTH;
3365
0
    xmlParserInputState state;
3366
3367
    /*
3368
     * Check that there is a comment right here.
3369
     */
3370
0
    if ((RAW != '<') || (NXT(1) != '!') ||
3371
0
        (NXT(2) != '-') || (NXT(3) != '-')) return;
3372
3373
0
    state = ctxt->instate;
3374
0
    ctxt->instate = XML_PARSER_COMMENT;
3375
0
    SKIP(4);
3376
0
    buf = (xmlChar *) xmlMallocAtomic(size);
3377
0
    if (buf == NULL) {
3378
0
        htmlErrMemory(ctxt);
3379
0
  return;
3380
0
    }
3381
0
    len = 0;
3382
0
    buf[len] = 0;
3383
0
    q = CUR_CHAR(ql);
3384
0
    if (q == 0)
3385
0
        goto unfinished;
3386
0
    if (q == '>') {
3387
0
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3388
0
        cur = '>';
3389
0
        goto finished;
3390
0
    }
3391
0
    NEXTL(ql);
3392
0
    r = CUR_CHAR(rl);
3393
0
    if (r == 0)
3394
0
        goto unfinished;
3395
0
    if (q == '-' && r == '>') {
3396
0
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3397
0
        cur = '>';
3398
0
        goto finished;
3399
0
    }
3400
0
    NEXTL(rl);
3401
0
    cur = CUR_CHAR(l);
3402
0
    while ((cur != 0) &&
3403
0
           ((cur != '>') ||
3404
0
      (r != '-') || (q != '-'))) {
3405
0
  NEXTL(l);
3406
0
  next = CUR_CHAR(nl);
3407
3408
0
  if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3409
0
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3410
0
           "Comment incorrectly closed by '--!>'", NULL, NULL);
3411
0
    cur = '>';
3412
0
    break;
3413
0
  }
3414
3415
0
  if (len + 5 >= size) {
3416
0
      xmlChar *tmp;
3417
3418
0
      size *= 2;
3419
0
      tmp = (xmlChar *) xmlRealloc(buf, size);
3420
0
      if (tmp == NULL) {
3421
0
          xmlFree(buf);
3422
0
          htmlErrMemory(ctxt);
3423
0
    return;
3424
0
      }
3425
0
      buf = tmp;
3426
0
  }
3427
0
        if (IS_CHAR(q)) {
3428
0
      COPY_BUF(ql,buf,len,q);
3429
0
        } else {
3430
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3431
0
                            "Invalid char in comment 0x%X\n", q);
3432
0
        }
3433
0
        if (len > maxLength) {
3434
0
            htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3435
0
                         "comment too long", NULL, NULL);
3436
0
            xmlFree(buf);
3437
0
            ctxt->instate = state;
3438
0
            return;
3439
0
        }
3440
3441
0
  q = r;
3442
0
  ql = rl;
3443
0
  r = cur;
3444
0
  rl = l;
3445
0
  cur = next;
3446
0
  l = nl;
3447
0
    }
3448
0
finished:
3449
0
    buf[len] = 0;
3450
0
    if (cur == '>') {
3451
0
        NEXT;
3452
0
  if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3453
0
      (!ctxt->disableSAX))
3454
0
      ctxt->sax->comment(ctxt->userData, buf);
3455
0
  xmlFree(buf);
3456
0
  ctxt->instate = state;
3457
0
  return;
3458
0
    }
3459
3460
0
unfinished:
3461
0
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3462
0
     "Comment not terminated \n<!--%.50s\n", buf, NULL);
3463
0
    xmlFree(buf);
3464
0
}
3465
3466
/**
3467
 * htmlParseCharRef:
3468
 * @ctxt:  an HTML parser context
3469
 *
3470
 * DEPRECATED: Internal function, don't use.
3471
 *
3472
 * parse Reference declarations
3473
 *
3474
 * [66] CharRef ::= '&#' [0-9]+ ';' |
3475
 *                  '&#x' [0-9a-fA-F]+ ';'
3476
 *
3477
 * Returns the value parsed (as an int)
3478
 */
3479
int
3480
0
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3481
0
    int val = 0;
3482
3483
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
3484
0
        return(0);
3485
0
    if ((CUR == '&') && (NXT(1) == '#') &&
3486
0
        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3487
0
  SKIP(3);
3488
0
  while (CUR != ';') {
3489
0
      if ((CUR >= '0') && (CUR <= '9')) {
3490
0
                if (val < 0x110000)
3491
0
              val = val * 16 + (CUR - '0');
3492
0
            } else if ((CUR >= 'a') && (CUR <= 'f')) {
3493
0
                if (val < 0x110000)
3494
0
              val = val * 16 + (CUR - 'a') + 10;
3495
0
            } else if ((CUR >= 'A') && (CUR <= 'F')) {
3496
0
                if (val < 0x110000)
3497
0
              val = val * 16 + (CUR - 'A') + 10;
3498
0
            } else {
3499
0
          htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3500
0
                 "htmlParseCharRef: missing semicolon\n",
3501
0
           NULL, NULL);
3502
0
    break;
3503
0
      }
3504
0
      NEXT;
3505
0
  }
3506
0
  if (CUR == ';')
3507
0
      NEXT;
3508
0
    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3509
0
  SKIP(2);
3510
0
  while (CUR != ';') {
3511
0
      if ((CUR >= '0') && (CUR <= '9')) {
3512
0
                if (val < 0x110000)
3513
0
              val = val * 10 + (CUR - '0');
3514
0
            } else {
3515
0
          htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3516
0
                 "htmlParseCharRef: missing semicolon\n",
3517
0
           NULL, NULL);
3518
0
    break;
3519
0
      }
3520
0
      NEXT;
3521
0
  }
3522
0
  if (CUR == ';')
3523
0
      NEXT;
3524
0
    } else {
3525
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3526
0
               "htmlParseCharRef: invalid value\n", NULL, NULL);
3527
0
    }
3528
    /*
3529
     * Check the value IS_CHAR ...
3530
     */
3531
0
    if (IS_CHAR(val)) {
3532
0
        return(val);
3533
0
    } else if (val >= 0x110000) {
3534
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3535
0
         "htmlParseCharRef: value too large\n", NULL, NULL);
3536
0
    } else {
3537
0
  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3538
0
      "htmlParseCharRef: invalid xmlChar value %d\n",
3539
0
      val);
3540
0
    }
3541
0
    return(0);
3542
0
}
3543
3544
3545
/**
3546
 * htmlParseDocTypeDecl:
3547
 * @ctxt:  an HTML parser context
3548
 *
3549
 * parse a DOCTYPE declaration
3550
 *
3551
 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3552
 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3553
 */
3554
3555
static void
3556
0
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3557
0
    const xmlChar *name;
3558
0
    xmlChar *ExternalID = NULL;
3559
0
    xmlChar *URI = NULL;
3560
3561
    /*
3562
     * We know that '<!DOCTYPE' has been detected.
3563
     */
3564
0
    SKIP(9);
3565
3566
0
    SKIP_BLANKS;
3567
3568
    /*
3569
     * Parse the DOCTYPE name.
3570
     */
3571
0
    name = htmlParseName(ctxt);
3572
0
    if (name == NULL) {
3573
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3574
0
               "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3575
0
         NULL, NULL);
3576
0
    }
3577
    /*
3578
     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3579
     */
3580
3581
0
    SKIP_BLANKS;
3582
3583
    /*
3584
     * Check for SystemID and ExternalID
3585
     */
3586
0
    URI = htmlParseExternalID(ctxt, &ExternalID);
3587
0
    SKIP_BLANKS;
3588
3589
    /*
3590
     * We should be at the end of the DOCTYPE declaration.
3591
     */
3592
0
    if (CUR != '>') {
3593
0
  htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3594
0
               "DOCTYPE improperly terminated\n", NULL, NULL);
3595
        /* Ignore bogus content */
3596
0
        while ((CUR != 0) && (CUR != '>') &&
3597
0
               (PARSER_STOPPED(ctxt) == 0))
3598
0
            NEXT;
3599
0
    }
3600
0
    if (CUR == '>')
3601
0
        NEXT;
3602
3603
    /*
3604
     * Create or update the document accordingly to the DOCTYPE
3605
     */
3606
0
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3607
0
  (!ctxt->disableSAX))
3608
0
  ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3609
3610
    /*
3611
     * Cleanup, since we don't use all those identifiers
3612
     */
3613
0
    if (URI != NULL) xmlFree(URI);
3614
0
    if (ExternalID != NULL) xmlFree(ExternalID);
3615
0
}
3616
3617
/**
3618
 * htmlParseAttribute:
3619
 * @ctxt:  an HTML parser context
3620
 * @value:  a xmlChar ** used to store the value of the attribute
3621
 *
3622
 * parse an attribute
3623
 *
3624
 * [41] Attribute ::= Name Eq AttValue
3625
 *
3626
 * [25] Eq ::= S? '=' S?
3627
 *
3628
 * With namespace:
3629
 *
3630
 * [NS 11] Attribute ::= QName Eq AttValue
3631
 *
3632
 * Also the case QName == xmlns:??? is handled independently as a namespace
3633
 * definition.
3634
 *
3635
 * Returns the attribute name, and the value in *value.
3636
 */
3637
3638
static const xmlChar *
3639
0
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3640
0
    const xmlChar *name;
3641
0
    xmlChar *val = NULL;
3642
3643
0
    *value = NULL;
3644
0
    name = htmlParseHTMLName(ctxt);
3645
0
    if (name == NULL) {
3646
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3647
0
               "error parsing attribute name\n", NULL, NULL);
3648
0
        return(NULL);
3649
0
    }
3650
3651
    /*
3652
     * read the value
3653
     */
3654
0
    SKIP_BLANKS;
3655
0
    if (CUR == '=') {
3656
0
        NEXT;
3657
0
  SKIP_BLANKS;
3658
0
  val = htmlParseAttValue(ctxt);
3659
0
    }
3660
3661
0
    *value = val;
3662
0
    return(name);
3663
0
}
3664
3665
/**
3666
 * htmlCheckEncoding:
3667
 * @ctxt:  an HTML parser context
3668
 * @attvalue: the attribute value
3669
 *
3670
 * Checks an http-equiv attribute from a Meta tag to detect
3671
 * the encoding
3672
 * If a new encoding is detected the parser is switched to decode
3673
 * it and pass UTF8
3674
 */
3675
static void
3676
0
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3677
0
    const xmlChar *encoding;
3678
0
    xmlChar *copy;
3679
3680
0
    if (!attvalue)
3681
0
  return;
3682
3683
0
    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3684
0
    if (encoding != NULL) {
3685
0
  encoding += 7;
3686
0
    }
3687
    /*
3688
     * skip blank
3689
     */
3690
0
    if (encoding && IS_BLANK_CH(*encoding))
3691
0
  encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3692
0
    if (encoding && *encoding == '=') {
3693
0
  encoding ++;
3694
0
        copy = xmlStrdup(encoding);
3695
0
        if (copy == NULL)
3696
0
            htmlErrMemory(ctxt);
3697
0
  xmlSetDeclaredEncoding(ctxt, copy);
3698
0
    }
3699
0
}
3700
3701
/**
3702
 * htmlCheckMeta:
3703
 * @ctxt:  an HTML parser context
3704
 * @atts:  the attributes values
3705
 *
3706
 * Checks an attributes from a Meta tag
3707
 */
3708
static void
3709
0
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3710
0
    int i;
3711
0
    const xmlChar *att, *value;
3712
0
    int http = 0;
3713
0
    const xmlChar *content = NULL;
3714
3715
0
    if ((ctxt == NULL) || (atts == NULL))
3716
0
  return;
3717
3718
0
    i = 0;
3719
0
    att = atts[i++];
3720
0
    while (att != NULL) {
3721
0
  value = atts[i++];
3722
0
        if (value != NULL) {
3723
0
            if ((!xmlStrcasecmp(att, BAD_CAST "http-equiv")) &&
3724
0
                (!xmlStrcasecmp(value, BAD_CAST "Content-Type"))) {
3725
0
                http = 1;
3726
0
            } else if (!xmlStrcasecmp(att, BAD_CAST "charset")) {
3727
0
                xmlChar *copy;
3728
3729
0
                copy = xmlStrdup(value);
3730
0
                if (copy == NULL)
3731
0
                    htmlErrMemory(ctxt);
3732
0
                xmlSetDeclaredEncoding(ctxt, copy);
3733
0
            } else if (!xmlStrcasecmp(att, BAD_CAST "content")) {
3734
0
                content = value;
3735
0
            }
3736
0
        }
3737
0
  att = atts[i++];
3738
0
    }
3739
0
    if ((http) && (content != NULL))
3740
0
  htmlCheckEncoding(ctxt, content);
3741
3742
0
}
3743
3744
/**
3745
 * htmlParseStartTag:
3746
 * @ctxt:  an HTML parser context
3747
 *
3748
 * parse a start of tag either for rule element or
3749
 * EmptyElement. In both case we don't parse the tag closing chars.
3750
 *
3751
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3752
 *
3753
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3754
 *
3755
 * With namespace:
3756
 *
3757
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3758
 *
3759
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3760
 *
3761
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3762
 */
3763
3764
static int
3765
0
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3766
0
    const xmlChar *name;
3767
0
    const xmlChar *attname;
3768
0
    xmlChar *attvalue;
3769
0
    const xmlChar **atts;
3770
0
    int nbatts = 0;
3771
0
    int maxatts;
3772
0
    int meta = 0;
3773
0
    int i;
3774
0
    int discardtag = 0;
3775
3776
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
3777
0
  return -1;
3778
0
    if (CUR != '<') return -1;
3779
0
    NEXT;
3780
3781
0
    atts = ctxt->atts;
3782
0
    maxatts = ctxt->maxatts;
3783
3784
0
    GROW;
3785
0
    name = htmlParseHTMLName(ctxt);
3786
0
    if (name == NULL) {
3787
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3788
0
               "htmlParseStartTag: invalid element name\n",
3789
0
         NULL, NULL);
3790
  /* Dump the bogus tag like browsers do */
3791
0
  while ((CUR != 0) && (CUR != '>') &&
3792
0
               (PARSER_STOPPED(ctxt) == 0))
3793
0
      NEXT;
3794
0
        return -1;
3795
0
    }
3796
0
    if (xmlStrEqual(name, BAD_CAST"meta"))
3797
0
  meta = 1;
3798
3799
    /*
3800
     * Check for auto-closure of HTML elements.
3801
     */
3802
0
    htmlAutoClose(ctxt, name);
3803
3804
    /*
3805
     * Check for implied HTML elements.
3806
     */
3807
0
    htmlCheckImplied(ctxt, name);
3808
3809
    /*
3810
     * Avoid html at any level > 0, head at any level != 1
3811
     * or any attempt to recurse body
3812
     */
3813
0
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3814
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3815
0
               "htmlParseStartTag: misplaced <html> tag\n",
3816
0
         name, NULL);
3817
0
  discardtag = 1;
3818
0
  ctxt->depth++;
3819
0
    }
3820
0
    if ((ctxt->nameNr != 1) &&
3821
0
  (xmlStrEqual(name, BAD_CAST"head"))) {
3822
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3823
0
               "htmlParseStartTag: misplaced <head> tag\n",
3824
0
         name, NULL);
3825
0
  discardtag = 1;
3826
0
  ctxt->depth++;
3827
0
    }
3828
0
    if (xmlStrEqual(name, BAD_CAST"body")) {
3829
0
  int indx;
3830
0
  for (indx = 0;indx < ctxt->nameNr;indx++) {
3831
0
      if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3832
0
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3833
0
                 "htmlParseStartTag: misplaced <body> tag\n",
3834
0
           name, NULL);
3835
0
    discardtag = 1;
3836
0
    ctxt->depth++;
3837
0
      }
3838
0
  }
3839
0
    }
3840
3841
    /*
3842
     * Now parse the attributes, it ends up with the ending
3843
     *
3844
     * (S Attribute)* S?
3845
     */
3846
0
    SKIP_BLANKS;
3847
0
    while ((CUR != 0) &&
3848
0
           (CUR != '>') &&
3849
0
     ((CUR != '/') || (NXT(1) != '>')) &&
3850
0
           (PARSER_STOPPED(ctxt) == 0)) {
3851
0
  GROW;
3852
0
  attname = htmlParseAttribute(ctxt, &attvalue);
3853
0
        if (attname != NULL) {
3854
3855
      /*
3856
       * Well formedness requires at most one declaration of an attribute
3857
       */
3858
0
      for (i = 0; i < nbatts;i += 2) {
3859
0
          if (xmlStrEqual(atts[i], attname)) {
3860
0
        htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3861
0
                     "Attribute %s redefined\n", attname, NULL);
3862
0
        if (attvalue != NULL)
3863
0
      xmlFree(attvalue);
3864
0
        goto failed;
3865
0
    }
3866
0
      }
3867
3868
      /*
3869
       * Add the pair to atts
3870
       */
3871
0
      if (atts == NULL) {
3872
0
          maxatts = 22; /* allow for 10 attrs by default */
3873
0
          atts = (const xmlChar **)
3874
0
           xmlMalloc(maxatts * sizeof(xmlChar *));
3875
0
    if (atts == NULL) {
3876
0
        htmlErrMemory(ctxt);
3877
0
        if (attvalue != NULL)
3878
0
      xmlFree(attvalue);
3879
0
        goto failed;
3880
0
    }
3881
0
    ctxt->atts = atts;
3882
0
    ctxt->maxatts = maxatts;
3883
0
      } else if (nbatts + 4 > maxatts) {
3884
0
          const xmlChar **n;
3885
3886
0
          maxatts *= 2;
3887
0
          n = (const xmlChar **) xmlRealloc((void *) atts,
3888
0
               maxatts * sizeof(const xmlChar *));
3889
0
    if (n == NULL) {
3890
0
        htmlErrMemory(ctxt);
3891
0
        if (attvalue != NULL)
3892
0
      xmlFree(attvalue);
3893
0
        goto failed;
3894
0
    }
3895
0
    atts = n;
3896
0
    ctxt->atts = atts;
3897
0
    ctxt->maxatts = maxatts;
3898
0
      }
3899
0
      atts[nbatts++] = attname;
3900
0
      atts[nbatts++] = attvalue;
3901
0
      atts[nbatts] = NULL;
3902
0
      atts[nbatts + 1] = NULL;
3903
0
  }
3904
0
  else {
3905
0
      if (attvalue != NULL)
3906
0
          xmlFree(attvalue);
3907
      /* Dump the bogus attribute string up to the next blank or
3908
       * the end of the tag. */
3909
0
      while ((CUR != 0) &&
3910
0
             !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3911
0
       ((CUR != '/') || (NXT(1) != '>')) &&
3912
0
                   (PARSER_STOPPED(ctxt) == 0))
3913
0
    NEXT;
3914
0
  }
3915
3916
0
failed:
3917
0
  SKIP_BLANKS;
3918
0
    }
3919
3920
    /*
3921
     * Handle specific association to the META tag
3922
     */
3923
0
    if (meta && (nbatts != 0))
3924
0
  htmlCheckMeta(ctxt, atts);
3925
3926
    /*
3927
     * SAX: Start of Element !
3928
     */
3929
0
    if (!discardtag) {
3930
0
  htmlnamePush(ctxt, name);
3931
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3932
0
      if (nbatts != 0)
3933
0
    ctxt->sax->startElement(ctxt->userData, name, atts);
3934
0
      else
3935
0
    ctxt->sax->startElement(ctxt->userData, name, NULL);
3936
0
  }
3937
0
    }
3938
3939
0
    if (atts != NULL) {
3940
0
        for (i = 1;i < nbatts;i += 2) {
3941
0
      if (atts[i] != NULL)
3942
0
    xmlFree((xmlChar *) atts[i]);
3943
0
  }
3944
0
    }
3945
3946
0
    return(discardtag);
3947
0
}
3948
3949
/**
3950
 * htmlParseEndTag:
3951
 * @ctxt:  an HTML parser context
3952
 *
3953
 * parse an end of tag
3954
 *
3955
 * [42] ETag ::= '</' Name S? '>'
3956
 *
3957
 * With namespace
3958
 *
3959
 * [NS 9] ETag ::= '</' QName S? '>'
3960
 *
3961
 * Returns 1 if the current level should be closed.
3962
 */
3963
3964
static int
3965
htmlParseEndTag(htmlParserCtxtPtr ctxt)
3966
0
{
3967
0
    const xmlChar *name;
3968
0
    const xmlChar *oldname;
3969
0
    int i, ret;
3970
3971
0
    if ((CUR != '<') || (NXT(1) != '/')) {
3972
0
        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3973
0
               "htmlParseEndTag: '</' not found\n", NULL, NULL);
3974
0
        return (0);
3975
0
    }
3976
0
    SKIP(2);
3977
3978
0
    name = htmlParseHTMLName(ctxt);
3979
0
    if (name == NULL)
3980
0
        return (0);
3981
    /*
3982
     * We should definitely be at the ending "S? '>'" part
3983
     */
3984
0
    SKIP_BLANKS;
3985
0
    if (CUR != '>') {
3986
0
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3987
0
               "End tag : expected '>'\n", NULL, NULL);
3988
        /* Skip to next '>' */
3989
0
        while ((PARSER_STOPPED(ctxt) == 0) &&
3990
0
               (CUR != 0) && (CUR != '>'))
3991
0
            NEXT;
3992
0
    }
3993
0
    if (CUR == '>')
3994
0
        NEXT;
3995
3996
    /*
3997
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
3998
     * out now.
3999
     */
4000
0
    if ((ctxt->depth > 0) &&
4001
0
        (xmlStrEqual(name, BAD_CAST "html") ||
4002
0
         xmlStrEqual(name, BAD_CAST "body") ||
4003
0
   xmlStrEqual(name, BAD_CAST "head"))) {
4004
0
  ctxt->depth--;
4005
0
  return (0);
4006
0
    }
4007
4008
    /*
4009
     * If the name read is not one of the element in the parsing stack
4010
     * then return, it's just an error.
4011
     */
4012
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4013
0
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4014
0
            break;
4015
0
    }
4016
0
    if (i < 0) {
4017
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4018
0
               "Unexpected end tag : %s\n", name, NULL);
4019
0
        return (0);
4020
0
    }
4021
4022
4023
    /*
4024
     * Check for auto-closure of HTML elements.
4025
     */
4026
4027
0
    htmlAutoCloseOnClose(ctxt, name);
4028
4029
    /*
4030
     * Well formedness constraints, opening and closing must match.
4031
     * With the exception that the autoclose may have popped stuff out
4032
     * of the stack.
4033
     */
4034
0
    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4035
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4036
0
                     "Opening and ending tag mismatch: %s and %s\n",
4037
0
                     name, ctxt->name);
4038
0
    }
4039
4040
    /*
4041
     * SAX: End of Tag
4042
     */
4043
0
    oldname = ctxt->name;
4044
0
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4045
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4046
0
            ctxt->sax->endElement(ctxt->userData, name);
4047
0
  htmlNodeInfoPop(ctxt);
4048
0
        htmlnamePop(ctxt);
4049
0
        ret = 1;
4050
0
    } else {
4051
0
        ret = 0;
4052
0
    }
4053
4054
0
    return (ret);
4055
0
}
4056
4057
4058
/**
4059
 * htmlParseReference:
4060
 * @ctxt:  an HTML parser context
4061
 *
4062
 * parse and handle entity references in content,
4063
 * this will end-up in a call to character() since this is either a
4064
 * CharRef, or a predefined entity.
4065
 */
4066
static void
4067
0
htmlParseReference(htmlParserCtxtPtr ctxt) {
4068
0
    const htmlEntityDesc * ent;
4069
0
    xmlChar out[6];
4070
0
    const xmlChar *name;
4071
0
    if (CUR != '&') return;
4072
4073
0
    if (NXT(1) == '#') {
4074
0
  unsigned int c;
4075
0
  int bits, i = 0;
4076
4077
0
  c = htmlParseCharRef(ctxt);
4078
0
  if (c == 0)
4079
0
      return;
4080
4081
0
        if      (c <    0x80) { out[i++]= c;                bits= -6; }
4082
0
        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4083
0
        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4084
0
        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4085
4086
0
        for ( ; bits >= 0; bits-= 6) {
4087
0
            out[i++]= ((c >> bits) & 0x3F) | 0x80;
4088
0
        }
4089
0
  out[i] = 0;
4090
4091
0
  htmlCheckParagraph(ctxt);
4092
0
  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4093
0
      ctxt->sax->characters(ctxt->userData, out, i);
4094
0
    } else {
4095
0
  ent = htmlParseEntityRef(ctxt, &name);
4096
0
  if (name == NULL) {
4097
0
      htmlCheckParagraph(ctxt);
4098
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4099
0
          ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4100
0
      return;
4101
0
  }
4102
0
  if ((ent == NULL) || !(ent->value > 0)) {
4103
0
      htmlCheckParagraph(ctxt);
4104
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4105
0
    ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4106
0
    ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4107
    /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4108
0
      }
4109
0
  } else {
4110
0
      unsigned int c;
4111
0
      int bits, i = 0;
4112
4113
0
      c = ent->value;
4114
0
      if      (c <    0x80)
4115
0
              { out[i++]= c;                bits= -6; }
4116
0
      else if (c <   0x800)
4117
0
              { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4118
0
      else if (c < 0x10000)
4119
0
              { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4120
0
      else
4121
0
              { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4122
4123
0
      for ( ; bits >= 0; bits-= 6) {
4124
0
    out[i++]= ((c >> bits) & 0x3F) | 0x80;
4125
0
      }
4126
0
      out[i] = 0;
4127
4128
0
      htmlCheckParagraph(ctxt);
4129
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4130
0
    ctxt->sax->characters(ctxt->userData, out, i);
4131
0
  }
4132
0
    }
4133
0
}
4134
4135
/**
4136
 * htmlParseContent:
4137
 * @ctxt:  an HTML parser context
4138
 *
4139
 * Parse a content: comment, sub-element, reference or text.
4140
 * Kept for compatibility with old code
4141
 */
4142
4143
static void
4144
0
htmlParseContent(htmlParserCtxtPtr ctxt) {
4145
0
    xmlChar *currentNode;
4146
0
    int depth;
4147
0
    const xmlChar *name;
4148
4149
0
    currentNode = xmlStrdup(ctxt->name);
4150
0
    depth = ctxt->nameNr;
4151
0
    while (!PARSER_STOPPED(ctxt)) {
4152
0
        GROW;
4153
4154
  /*
4155
   * Our tag or one of it's parent or children is ending.
4156
   */
4157
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4158
0
      if (htmlParseEndTag(ctxt) &&
4159
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4160
0
    if (currentNode != NULL)
4161
0
        xmlFree(currentNode);
4162
0
    return;
4163
0
      }
4164
0
      continue; /* while */
4165
0
        }
4166
4167
0
  else if ((CUR == '<') &&
4168
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4169
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4170
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4171
0
      if (name == NULL) {
4172
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4173
0
       "htmlParseStartTag: invalid element name\n",
4174
0
       NULL, NULL);
4175
          /* Dump the bogus tag like browsers do */
4176
0
                while ((CUR != 0) && (CUR != '>'))
4177
0
              NEXT;
4178
4179
0
          if (currentNode != NULL)
4180
0
              xmlFree(currentNode);
4181
0
          return;
4182
0
      }
4183
4184
0
      if (ctxt->name != NULL) {
4185
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4186
0
              htmlAutoClose(ctxt, name);
4187
0
              continue;
4188
0
          }
4189
0
      }
4190
0
  }
4191
4192
  /*
4193
   * Has this node been popped out during parsing of
4194
   * the next element
4195
   */
4196
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4197
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4198
0
       {
4199
0
      if (currentNode != NULL) xmlFree(currentNode);
4200
0
      return;
4201
0
  }
4202
4203
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4204
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4205
      /*
4206
       * Handle SCRIPT/STYLE separately
4207
       */
4208
0
      htmlParseScript(ctxt);
4209
0
  }
4210
4211
0
        else if ((CUR == '<') && (NXT(1) == '!')) {
4212
            /*
4213
             * Sometimes DOCTYPE arrives in the middle of the document
4214
             */
4215
0
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4216
0
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4217
0
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4218
0
                (UPP(8) == 'E')) {
4219
0
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4220
0
                             "Misplaced DOCTYPE declaration\n",
4221
0
                             BAD_CAST "DOCTYPE" , NULL);
4222
0
                htmlParseDocTypeDecl(ctxt);
4223
0
            }
4224
            /*
4225
             * First case :  a comment
4226
             */
4227
0
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4228
0
                htmlParseComment(ctxt);
4229
0
            }
4230
0
            else {
4231
0
                htmlSkipBogusComment(ctxt);
4232
0
            }
4233
0
        }
4234
4235
        /*
4236
         * Second case : a Processing Instruction.
4237
         */
4238
0
        else if ((CUR == '<') && (NXT(1) == '?')) {
4239
0
            htmlParsePI(ctxt);
4240
0
        }
4241
4242
        /*
4243
         * Third case :  a sub-element.
4244
         */
4245
0
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4246
0
            htmlParseElement(ctxt);
4247
0
        }
4248
0
        else if (CUR == '<') {
4249
0
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4250
0
                (ctxt->sax->characters != NULL))
4251
0
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4252
0
            NEXT;
4253
0
        }
4254
4255
        /*
4256
         * Fourth case : a reference. If if has not been resolved,
4257
         *    parsing returns it's Name, create the node
4258
         */
4259
0
        else if (CUR == '&') {
4260
0
            htmlParseReference(ctxt);
4261
0
        }
4262
4263
        /*
4264
         * Fifth case : end of the resource
4265
         */
4266
0
        else if (CUR == 0) {
4267
0
            htmlAutoCloseOnEnd(ctxt);
4268
0
            break;
4269
0
        }
4270
4271
        /*
4272
         * Last case, text. Note that References are handled directly.
4273
         */
4274
0
        else {
4275
0
            htmlParseCharData(ctxt);
4276
0
        }
4277
4278
0
        SHRINK;
4279
0
        GROW;
4280
0
    }
4281
0
    if (currentNode != NULL) xmlFree(currentNode);
4282
0
}
4283
4284
/**
4285
 * htmlParseElement:
4286
 * @ctxt:  an HTML parser context
4287
 *
4288
 * DEPRECATED: Internal function, don't use.
4289
 *
4290
 * parse an HTML element, this is highly recursive
4291
 * this is kept for compatibility with previous code versions
4292
 *
4293
 * [39] element ::= EmptyElemTag | STag content ETag
4294
 *
4295
 * [41] Attribute ::= Name Eq AttValue
4296
 */
4297
4298
void
4299
0
htmlParseElement(htmlParserCtxtPtr ctxt) {
4300
0
    const xmlChar *name;
4301
0
    xmlChar *currentNode = NULL;
4302
0
    const htmlElemDesc * info;
4303
0
    htmlParserNodeInfo node_info;
4304
0
    int failed;
4305
0
    int depth;
4306
0
    const xmlChar *oldptr;
4307
4308
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
4309
0
  return;
4310
4311
    /* Capture start position */
4312
0
    if (ctxt->record_info) {
4313
0
        node_info.begin_pos = ctxt->input->consumed +
4314
0
                          (CUR_PTR - ctxt->input->base);
4315
0
  node_info.begin_line = ctxt->input->line;
4316
0
    }
4317
4318
0
    failed = htmlParseStartTag(ctxt);
4319
0
    name = ctxt->name;
4320
0
    if ((failed == -1) || (name == NULL)) {
4321
0
  if (CUR == '>')
4322
0
      NEXT;
4323
0
        return;
4324
0
    }
4325
4326
    /*
4327
     * Lookup the info for that element.
4328
     */
4329
0
    info = htmlTagLookup(name);
4330
0
    if (info == NULL) {
4331
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4332
0
               "Tag %s invalid\n", name, NULL);
4333
0
    }
4334
4335
    /*
4336
     * Check for an Empty Element labeled the XML/SGML way
4337
     */
4338
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4339
0
        SKIP(2);
4340
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4341
0
      ctxt->sax->endElement(ctxt->userData, name);
4342
0
  htmlnamePop(ctxt);
4343
0
  return;
4344
0
    }
4345
4346
0
    if (CUR == '>') {
4347
0
        NEXT;
4348
0
    } else {
4349
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4350
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4351
4352
  /*
4353
   * end of parsing of this node.
4354
   */
4355
0
  if (xmlStrEqual(name, ctxt->name)) {
4356
0
      nodePop(ctxt);
4357
0
      htmlnamePop(ctxt);
4358
0
  }
4359
4360
  /*
4361
   * Capture end position and add node
4362
   */
4363
0
  if (ctxt->record_info) {
4364
0
     node_info.end_pos = ctxt->input->consumed +
4365
0
            (CUR_PTR - ctxt->input->base);
4366
0
     node_info.end_line = ctxt->input->line;
4367
0
     node_info.node = ctxt->node;
4368
0
     xmlParserAddNodeInfo(ctxt, &node_info);
4369
0
  }
4370
0
  return;
4371
0
    }
4372
4373
    /*
4374
     * Check for an Empty Element from DTD definition
4375
     */
4376
0
    if ((info != NULL) && (info->empty)) {
4377
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4378
0
      ctxt->sax->endElement(ctxt->userData, name);
4379
0
  htmlnamePop(ctxt);
4380
0
  return;
4381
0
    }
4382
4383
    /*
4384
     * Parse the content of the element:
4385
     */
4386
0
    currentNode = xmlStrdup(ctxt->name);
4387
0
    depth = ctxt->nameNr;
4388
0
    while (CUR != 0) {
4389
0
  oldptr = ctxt->input->cur;
4390
0
  htmlParseContent(ctxt);
4391
0
  if (oldptr==ctxt->input->cur) break;
4392
0
  if (ctxt->nameNr < depth) break;
4393
0
    }
4394
4395
    /*
4396
     * Capture end position and add node
4397
     */
4398
0
    if ( currentNode != NULL && ctxt->record_info ) {
4399
0
       node_info.end_pos = ctxt->input->consumed +
4400
0
                          (CUR_PTR - ctxt->input->base);
4401
0
       node_info.end_line = ctxt->input->line;
4402
0
       node_info.node = ctxt->node;
4403
0
       xmlParserAddNodeInfo(ctxt, &node_info);
4404
0
    }
4405
0
    if (CUR == 0) {
4406
0
  htmlAutoCloseOnEnd(ctxt);
4407
0
    }
4408
4409
0
    if (currentNode != NULL)
4410
0
  xmlFree(currentNode);
4411
0
}
4412
4413
static void
4414
0
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4415
    /*
4416
     * Capture end position and add node
4417
     */
4418
0
    if ( ctxt->node != NULL && ctxt->record_info ) {
4419
0
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4420
0
                                (CUR_PTR - ctxt->input->base);
4421
0
       ctxt->nodeInfo->end_line = ctxt->input->line;
4422
0
       ctxt->nodeInfo->node = ctxt->node;
4423
0
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4424
0
       htmlNodeInfoPop(ctxt);
4425
0
    }
4426
0
    if (CUR == 0) {
4427
0
       htmlAutoCloseOnEnd(ctxt);
4428
0
    }
4429
0
}
4430
4431
/**
4432
 * htmlParseElementInternal:
4433
 * @ctxt:  an HTML parser context
4434
 *
4435
 * parse an HTML element, new version, non recursive
4436
 *
4437
 * [39] element ::= EmptyElemTag | STag content ETag
4438
 *
4439
 * [41] Attribute ::= Name Eq AttValue
4440
 */
4441
4442
static void
4443
0
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4444
0
    const xmlChar *name;
4445
0
    const htmlElemDesc * info;
4446
0
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4447
0
    int failed;
4448
4449
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
4450
0
  return;
4451
4452
    /* Capture start position */
4453
0
    if (ctxt->record_info) {
4454
0
        node_info.begin_pos = ctxt->input->consumed +
4455
0
                          (CUR_PTR - ctxt->input->base);
4456
0
  node_info.begin_line = ctxt->input->line;
4457
0
    }
4458
4459
0
    failed = htmlParseStartTag(ctxt);
4460
0
    name = ctxt->name;
4461
0
    if ((failed == -1) || (name == NULL)) {
4462
0
  if (CUR == '>')
4463
0
      NEXT;
4464
0
        return;
4465
0
    }
4466
4467
    /*
4468
     * Lookup the info for that element.
4469
     */
4470
0
    info = htmlTagLookup(name);
4471
0
    if (info == NULL) {
4472
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4473
0
               "Tag %s invalid\n", name, NULL);
4474
0
    }
4475
4476
    /*
4477
     * Check for an Empty Element labeled the XML/SGML way
4478
     */
4479
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4480
0
        SKIP(2);
4481
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4482
0
      ctxt->sax->endElement(ctxt->userData, name);
4483
0
  htmlnamePop(ctxt);
4484
0
  return;
4485
0
    }
4486
4487
0
    if (CUR == '>') {
4488
0
        NEXT;
4489
0
    } else {
4490
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4491
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4492
4493
  /*
4494
   * end of parsing of this node.
4495
   */
4496
0
  if (xmlStrEqual(name, ctxt->name)) {
4497
0
      nodePop(ctxt);
4498
0
      htmlnamePop(ctxt);
4499
0
  }
4500
4501
0
        if (ctxt->record_info)
4502
0
            htmlNodeInfoPush(ctxt, &node_info);
4503
0
        htmlParserFinishElementParsing(ctxt);
4504
0
  return;
4505
0
    }
4506
4507
    /*
4508
     * Check for an Empty Element from DTD definition
4509
     */
4510
0
    if ((info != NULL) && (info->empty)) {
4511
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4512
0
      ctxt->sax->endElement(ctxt->userData, name);
4513
0
  htmlnamePop(ctxt);
4514
0
  return;
4515
0
    }
4516
4517
0
    if (ctxt->record_info)
4518
0
        htmlNodeInfoPush(ctxt, &node_info);
4519
0
}
4520
4521
/**
4522
 * htmlParseContentInternal:
4523
 * @ctxt:  an HTML parser context
4524
 *
4525
 * Parse a content: comment, sub-element, reference or text.
4526
 * New version for non recursive htmlParseElementInternal
4527
 */
4528
4529
static void
4530
0
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4531
0
    xmlChar *currentNode;
4532
0
    int depth;
4533
0
    const xmlChar *name;
4534
4535
0
    depth = ctxt->nameNr;
4536
0
    if (depth <= 0) {
4537
0
        currentNode = NULL;
4538
0
    } else {
4539
0
        currentNode = xmlStrdup(ctxt->name);
4540
0
        if (currentNode == NULL) {
4541
0
            htmlErrMemory(ctxt);
4542
0
            return;
4543
0
        }
4544
0
    }
4545
0
    while (PARSER_STOPPED(ctxt) == 0) {
4546
0
        GROW;
4547
4548
  /*
4549
   * Our tag or one of it's parent or children is ending.
4550
   */
4551
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4552
0
      if (htmlParseEndTag(ctxt) &&
4553
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4554
0
    if (currentNode != NULL)
4555
0
        xmlFree(currentNode);
4556
4557
0
          depth = ctxt->nameNr;
4558
0
                if (depth <= 0) {
4559
0
                    currentNode = NULL;
4560
0
                } else {
4561
0
                    currentNode = xmlStrdup(ctxt->name);
4562
0
                    if (currentNode == NULL) {
4563
0
                        htmlErrMemory(ctxt);
4564
0
                        break;
4565
0
                    }
4566
0
                }
4567
0
      }
4568
0
      continue; /* while */
4569
0
        }
4570
4571
0
  else if ((CUR == '<') &&
4572
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4573
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4574
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4575
0
      if (name == NULL) {
4576
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4577
0
       "htmlParseStartTag: invalid element name\n",
4578
0
       NULL, NULL);
4579
          /* Dump the bogus tag like browsers do */
4580
0
          while ((CUR == 0) && (CUR != '>'))
4581
0
              NEXT;
4582
4583
0
          htmlParserFinishElementParsing(ctxt);
4584
0
          if (currentNode != NULL)
4585
0
              xmlFree(currentNode);
4586
4587
0
                if (ctxt->name == NULL) {
4588
0
                    currentNode = NULL;
4589
0
                } else {
4590
0
                    currentNode = xmlStrdup(ctxt->name);
4591
0
                    if (currentNode == NULL) {
4592
0
                        htmlErrMemory(ctxt);
4593
0
                        break;
4594
0
                    }
4595
0
                }
4596
0
          depth = ctxt->nameNr;
4597
0
          continue;
4598
0
      }
4599
4600
0
      if (ctxt->name != NULL) {
4601
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4602
0
              htmlAutoClose(ctxt, name);
4603
0
              continue;
4604
0
          }
4605
0
      }
4606
0
  }
4607
4608
  /*
4609
   * Has this node been popped out during parsing of
4610
   * the next element
4611
   */
4612
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4613
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4614
0
       {
4615
0
      htmlParserFinishElementParsing(ctxt);
4616
0
      if (currentNode != NULL) xmlFree(currentNode);
4617
4618
0
            if (ctxt->name == NULL) {
4619
0
                currentNode = NULL;
4620
0
            } else {
4621
0
                currentNode = xmlStrdup(ctxt->name);
4622
0
                if (currentNode == NULL) {
4623
0
                    htmlErrMemory(ctxt);
4624
0
                    break;
4625
0
                }
4626
0
            }
4627
0
      depth = ctxt->nameNr;
4628
0
      continue;
4629
0
  }
4630
4631
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4632
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4633
      /*
4634
       * Handle SCRIPT/STYLE separately
4635
       */
4636
0
      htmlParseScript(ctxt);
4637
0
  }
4638
4639
0
        else if ((CUR == '<') && (NXT(1) == '!')) {
4640
            /*
4641
             * Sometimes DOCTYPE arrives in the middle of the document
4642
             */
4643
0
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4644
0
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4645
0
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4646
0
                (UPP(8) == 'E')) {
4647
0
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4648
0
                             "Misplaced DOCTYPE declaration\n",
4649
0
                             BAD_CAST "DOCTYPE" , NULL);
4650
0
                htmlParseDocTypeDecl(ctxt);
4651
0
            }
4652
            /*
4653
             * First case :  a comment
4654
             */
4655
0
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4656
0
                htmlParseComment(ctxt);
4657
0
            }
4658
0
            else {
4659
0
                htmlSkipBogusComment(ctxt);
4660
0
            }
4661
0
        }
4662
4663
        /*
4664
         * Second case : a Processing Instruction.
4665
         */
4666
0
        else if ((CUR == '<') && (NXT(1) == '?')) {
4667
0
            htmlParsePI(ctxt);
4668
0
        }
4669
4670
        /*
4671
         * Third case :  a sub-element.
4672
         */
4673
0
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4674
0
            htmlParseElementInternal(ctxt);
4675
0
            if (currentNode != NULL) xmlFree(currentNode);
4676
4677
0
            if (ctxt->name == NULL) {
4678
0
                currentNode = NULL;
4679
0
            } else {
4680
0
                currentNode = xmlStrdup(ctxt->name);
4681
0
                if (currentNode == NULL) {
4682
0
                    htmlErrMemory(ctxt);
4683
0
                    break;
4684
0
                }
4685
0
            }
4686
0
            depth = ctxt->nameNr;
4687
0
        }
4688
0
        else if (CUR == '<') {
4689
0
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4690
0
                (ctxt->sax->characters != NULL))
4691
0
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4692
0
            NEXT;
4693
0
        }
4694
4695
        /*
4696
         * Fourth case : a reference. If if has not been resolved,
4697
         *    parsing returns it's Name, create the node
4698
         */
4699
0
        else if (CUR == '&') {
4700
0
            htmlParseReference(ctxt);
4701
0
        }
4702
4703
        /*
4704
         * Fifth case : end of the resource
4705
         */
4706
0
        else if (CUR == 0) {
4707
0
            htmlAutoCloseOnEnd(ctxt);
4708
0
            break;
4709
0
        }
4710
4711
        /*
4712
         * Last case, text. Note that References are handled directly.
4713
         */
4714
0
        else {
4715
0
            htmlParseCharData(ctxt);
4716
0
        }
4717
4718
0
        SHRINK;
4719
0
        GROW;
4720
0
    }
4721
0
    if (currentNode != NULL) xmlFree(currentNode);
4722
0
}
4723
4724
/**
4725
 * htmlParseContent:
4726
 * @ctxt:  an HTML parser context
4727
 *
4728
 * Parse a content: comment, sub-element, reference or text.
4729
 * This is the entry point when called from parser.c
4730
 */
4731
4732
void
4733
0
__htmlParseContent(void *ctxt) {
4734
0
    if (ctxt != NULL)
4735
0
  htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4736
0
}
4737
4738
/**
4739
 * htmlParseDocument:
4740
 * @ctxt:  an HTML parser context
4741
 *
4742
 * Parse an HTML document and invoke the SAX handlers. This is useful
4743
 * if you're only interested in custom SAX callbacks. If you want a
4744
 * document tree, use htmlCtxtParseDocument.
4745
 *
4746
 * Returns 0, -1 in case of error.
4747
 */
4748
4749
int
4750
0
htmlParseDocument(htmlParserCtxtPtr ctxt) {
4751
0
    xmlDtdPtr dtd;
4752
4753
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
4754
0
  return(-1);
4755
4756
0
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
4757
0
        ctxt->sax->setDocumentLocator(ctxt->userData,
4758
0
                (xmlSAXLocator *) &xmlDefaultSAXLocator);
4759
0
    }
4760
4761
0
    xmlDetectEncoding(ctxt);
4762
4763
    /*
4764
     * This is wrong but matches long-standing behavior. In most cases,
4765
     * a document starting with an XML declaration will specify UTF-8.
4766
     */
4767
0
    if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4768
0
        (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4769
0
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4770
4771
    /*
4772
     * Wipe out everything which is before the first '<'
4773
     */
4774
0
    SKIP_BLANKS;
4775
0
    if (CUR == 0) {
4776
0
  htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4777
0
               "Document is empty\n", NULL, NULL);
4778
0
    }
4779
4780
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4781
0
  ctxt->sax->startDocument(ctxt->userData);
4782
4783
    /*
4784
     * Parse possible comments and PIs before any content
4785
     */
4786
0
    while (((CUR == '<') && (NXT(1) == '!') &&
4787
0
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4788
0
     ((CUR == '<') && (NXT(1) == '?'))) {
4789
0
        htmlParseComment(ctxt);
4790
0
        htmlParsePI(ctxt);
4791
0
  SKIP_BLANKS;
4792
0
    }
4793
4794
4795
    /*
4796
     * Then possibly doc type declaration(s) and more Misc
4797
     * (doctypedecl Misc*)?
4798
     */
4799
0
    if ((CUR == '<') && (NXT(1) == '!') &&
4800
0
  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4801
0
  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4802
0
  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4803
0
  (UPP(8) == 'E')) {
4804
0
  htmlParseDocTypeDecl(ctxt);
4805
0
    }
4806
0
    SKIP_BLANKS;
4807
4808
    /*
4809
     * Parse possible comments and PIs before any content
4810
     */
4811
0
    while ((PARSER_STOPPED(ctxt) == 0) &&
4812
0
           (((CUR == '<') && (NXT(1) == '!') &&
4813
0
             (NXT(2) == '-') && (NXT(3) == '-')) ||
4814
0
      ((CUR == '<') && (NXT(1) == '?')))) {
4815
0
        htmlParseComment(ctxt);
4816
0
        htmlParsePI(ctxt);
4817
0
  SKIP_BLANKS;
4818
0
    }
4819
4820
    /*
4821
     * Time to start parsing the tree itself
4822
     */
4823
0
    htmlParseContentInternal(ctxt);
4824
4825
    /*
4826
     * autoclose
4827
     */
4828
0
    if (CUR == 0)
4829
0
  htmlAutoCloseOnEnd(ctxt);
4830
4831
4832
    /*
4833
     * SAX: end of the document processing.
4834
     */
4835
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4836
0
        ctxt->sax->endDocument(ctxt->userData);
4837
4838
0
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4839
0
  dtd = xmlGetIntSubset(ctxt->myDoc);
4840
0
  if (dtd == NULL) {
4841
0
      ctxt->myDoc->intSubset =
4842
0
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4843
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4844
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4845
0
            if (ctxt->myDoc->intSubset == NULL)
4846
0
                htmlErrMemory(ctxt);
4847
0
        }
4848
0
    }
4849
0
    if (! ctxt->wellFormed) return(-1);
4850
0
    return(0);
4851
0
}
4852
4853
4854
/************************************************************************
4855
 *                  *
4856
 *      Parser contexts handling      *
4857
 *                  *
4858
 ************************************************************************/
4859
4860
/**
4861
 * htmlInitParserCtxt:
4862
 * @ctxt:  an HTML parser context
4863
 * @sax:  SAX handler
4864
 * @userData:  user data
4865
 *
4866
 * Initialize a parser context
4867
 *
4868
 * Returns 0 in case of success and -1 in case of error
4869
 */
4870
4871
static int
4872
htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4873
                   void *userData)
4874
0
{
4875
0
    if (ctxt == NULL) return(-1);
4876
0
    memset(ctxt, 0, sizeof(htmlParserCtxt));
4877
4878
0
    ctxt->dict = xmlDictCreate();
4879
0
    if (ctxt->dict == NULL)
4880
0
  return(-1);
4881
4882
0
    if (ctxt->sax == NULL)
4883
0
        ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4884
0
    if (ctxt->sax == NULL)
4885
0
  return(-1);
4886
0
    if (sax == NULL) {
4887
0
        memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4888
0
        xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4889
0
        ctxt->userData = ctxt;
4890
0
    } else {
4891
0
        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4892
0
        ctxt->userData = userData ? userData : ctxt;
4893
0
    }
4894
4895
    /* Allocate the Input stack */
4896
0
    ctxt->inputTab = (htmlParserInputPtr *)
4897
0
                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4898
0
    if (ctxt->inputTab == NULL)
4899
0
  return(-1);
4900
0
    ctxt->inputNr = 0;
4901
0
    ctxt->inputMax = 5;
4902
0
    ctxt->input = NULL;
4903
0
    ctxt->version = NULL;
4904
0
    ctxt->encoding = NULL;
4905
0
    ctxt->standalone = -1;
4906
0
    ctxt->instate = XML_PARSER_START;
4907
4908
    /* Allocate the Node stack */
4909
0
    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4910
0
    if (ctxt->nodeTab == NULL)
4911
0
  return(-1);
4912
0
    ctxt->nodeNr = 0;
4913
0
    ctxt->nodeMax = 10;
4914
0
    ctxt->node = NULL;
4915
4916
    /* Allocate the Name stack */
4917
0
    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4918
0
    if (ctxt->nameTab == NULL)
4919
0
  return(-1);
4920
0
    ctxt->nameNr = 0;
4921
0
    ctxt->nameMax = 10;
4922
0
    ctxt->name = NULL;
4923
4924
0
    ctxt->nodeInfoTab = NULL;
4925
0
    ctxt->nodeInfoNr  = 0;
4926
0
    ctxt->nodeInfoMax = 0;
4927
4928
0
    ctxt->myDoc = NULL;
4929
0
    ctxt->wellFormed = 1;
4930
0
    ctxt->replaceEntities = 0;
4931
0
    ctxt->linenumbers = xmlLineNumbersDefaultValue;
4932
0
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4933
0
    ctxt->html = 1;
4934
0
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
4935
0
    ctxt->vctxt.userData = ctxt;
4936
0
    ctxt->vctxt.error = xmlParserValidityError;
4937
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
4938
0
    ctxt->record_info = 0;
4939
0
    ctxt->validate = 0;
4940
0
    ctxt->checkIndex = 0;
4941
0
    ctxt->catalogs = NULL;
4942
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
4943
0
    return(0);
4944
0
}
4945
4946
/**
4947
 * htmlFreeParserCtxt:
4948
 * @ctxt:  an HTML parser context
4949
 *
4950
 * Free all the memory used by a parser context. However the parsed
4951
 * document in ctxt->myDoc is not freed.
4952
 */
4953
4954
void
4955
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4956
0
{
4957
0
    xmlFreeParserCtxt(ctxt);
4958
0
}
4959
4960
/**
4961
 * htmlNewParserCtxt:
4962
 *
4963
 * Allocate and initialize a new HTML parser context.
4964
 *
4965
 * This can be used to parse HTML documents into DOM trees with
4966
 * functions like xmlCtxtReadFile or xmlCtxtReadMemory.
4967
 *
4968
 * See htmlCtxtUseOptions for parser options.
4969
 *
4970
 * See xmlCtxtSetErrorHandler for advanced error handling.
4971
 *
4972
 * See xmlNewInputURL, xmlNewInputMemory, xmlNewInputIO and similar
4973
 * functions for advanced input control.
4974
 *
4975
 * See htmlNewSAXParserCtxt for custom SAX parsers.
4976
 *
4977
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4978
 */
4979
4980
htmlParserCtxtPtr
4981
htmlNewParserCtxt(void)
4982
0
{
4983
0
    return(htmlNewSAXParserCtxt(NULL, NULL));
4984
0
}
4985
4986
/**
4987
 * htmlNewSAXParserCtxt:
4988
 * @sax:  SAX handler
4989
 * @userData:  user data
4990
 *
4991
 * Allocate and initialize a new HTML SAX parser context. If userData
4992
 * is NULL, the parser context will be passed as user data.
4993
 *
4994
 * Available since 2.11.0. If you want support older versions,
4995
 * it's best to invoke htmlNewParserCtxt and set ctxt->sax with
4996
 * struct assignment.
4997
 *
4998
 * Also see htmlNewParserCtxt.
4999
 *
5000
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5001
 */
5002
5003
htmlParserCtxtPtr
5004
htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5005
0
{
5006
0
    xmlParserCtxtPtr ctxt;
5007
5008
0
    xmlInitParser();
5009
5010
0
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5011
0
    if (ctxt == NULL)
5012
0
  return(NULL);
5013
0
    memset(ctxt, 0, sizeof(xmlParserCtxt));
5014
0
    if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5015
0
        htmlFreeParserCtxt(ctxt);
5016
0
  return(NULL);
5017
0
    }
5018
0
    return(ctxt);
5019
0
}
5020
5021
static htmlParserCtxtPtr
5022
htmlCreateMemoryParserCtxtInternal(const char *url,
5023
                                   const char *buffer, size_t size,
5024
0
                                   const char *encoding) {
5025
0
    xmlParserCtxtPtr ctxt;
5026
0
    xmlParserInputPtr input;
5027
5028
0
    if (buffer == NULL)
5029
0
  return(NULL);
5030
5031
0
    ctxt = htmlNewParserCtxt();
5032
0
    if (ctxt == NULL)
5033
0
  return(NULL);
5034
5035
0
    input = xmlNewInputMemory(ctxt, url, buffer, size, encoding, 0);
5036
0
    if (input == NULL) {
5037
0
  xmlFreeParserCtxt(ctxt);
5038
0
        return(NULL);
5039
0
    }
5040
5041
0
    inputPush(ctxt, input);
5042
5043
0
    return(ctxt);
5044
0
}
5045
5046
/**
5047
 * htmlCreateMemoryParserCtxt:
5048
 * @buffer:  a pointer to a char array
5049
 * @size:  the size of the array
5050
 *
5051
 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadMemory.
5052
 *
5053
 * Create a parser context for an HTML in-memory document. The input
5054
 * buffer must not contain any terminating null bytes.
5055
 *
5056
 * Returns the new parser context or NULL
5057
 */
5058
htmlParserCtxtPtr
5059
0
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5060
0
    if (size <= 0)
5061
0
  return(NULL);
5062
5063
0
    return(htmlCreateMemoryParserCtxtInternal(NULL, buffer, size, NULL));
5064
0
}
5065
5066
/**
5067
 * htmlCreateDocParserCtxt:
5068
 * @str:  a pointer to an array of xmlChar
5069
 * @encoding:  encoding (optional)
5070
 *
5071
 * Create a parser context for a null-terminated string.
5072
 *
5073
 * Returns the new parser context or NULL if a memory allocation failed.
5074
 */
5075
static htmlParserCtxtPtr
5076
htmlCreateDocParserCtxt(const xmlChar *str, const char *url,
5077
0
                        const char *encoding) {
5078
0
    xmlParserCtxtPtr ctxt;
5079
0
    xmlParserInputPtr input;
5080
5081
0
    if (str == NULL)
5082
0
  return(NULL);
5083
5084
0
    ctxt = htmlNewParserCtxt();
5085
0
    if (ctxt == NULL)
5086
0
  return(NULL);
5087
5088
0
    input = xmlNewInputString(ctxt, url, (const char *) str, encoding, 0);
5089
0
    if (input == NULL) {
5090
0
  xmlFreeParserCtxt(ctxt);
5091
0
  return(NULL);
5092
0
    }
5093
5094
0
    inputPush(ctxt, input);
5095
5096
0
    return(ctxt);
5097
0
}
5098
5099
#ifdef LIBXML_PUSH_ENABLED
5100
/************************************************************************
5101
 *                  *
5102
 *  Progressive parsing interfaces        *
5103
 *                  *
5104
 ************************************************************************/
5105
5106
/**
5107
 * htmlParseLookupSequence:
5108
 * @ctxt:  an HTML parser context
5109
 * @first:  the first char to lookup
5110
 * @next:  the next char to lookup or zero
5111
 * @third:  the next char to lookup or zero
5112
 * @ignoreattrval: skip over attribute values
5113
 *
5114
 * Try to find if a sequence (first, next, third) or  just (first next) or
5115
 * (first) is available in the input stream.
5116
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5117
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5118
 * parser, do not use liberally.
5119
 * This is basically similar to xmlParseLookupSequence()
5120
 *
5121
 * Returns the index to the current parsing point if the full sequence
5122
 *      is available, -1 otherwise.
5123
 */
5124
static int
5125
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5126
                        xmlChar next, xmlChar third, int ignoreattrval)
5127
0
{
5128
0
    size_t base, len;
5129
0
    htmlParserInputPtr in;
5130
0
    const xmlChar *buf;
5131
0
    int quote;
5132
5133
0
    in = ctxt->input;
5134
0
    if (in == NULL)
5135
0
        return (-1);
5136
5137
0
    base = ctxt->checkIndex;
5138
0
    quote = ctxt->endCheckState;
5139
5140
0
    buf = in->cur;
5141
0
    len = in->end - in->cur;
5142
5143
    /* take into account the sequence length */
5144
0
    if (third)
5145
0
        len -= 2;
5146
0
    else if (next)
5147
0
        len--;
5148
0
    for (; base < len; base++) {
5149
0
        if (base >= INT_MAX / 2) {
5150
0
            ctxt->checkIndex = 0;
5151
0
            ctxt->endCheckState = 0;
5152
0
            return (base - 2);
5153
0
        }
5154
0
        if (ignoreattrval) {
5155
0
            if (quote) {
5156
0
                if (buf[base] == quote)
5157
0
                    quote = 0;
5158
0
                continue;
5159
0
            }
5160
0
            if (buf[base] == '"' || buf[base] == '\'') {
5161
0
                quote = buf[base];
5162
0
                continue;
5163
0
            }
5164
0
        }
5165
0
        if (buf[base] == first) {
5166
0
            if (third != 0) {
5167
0
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
5168
0
                    continue;
5169
0
            } else if (next != 0) {
5170
0
                if (buf[base + 1] != next)
5171
0
                    continue;
5172
0
            }
5173
0
            ctxt->checkIndex = 0;
5174
0
            ctxt->endCheckState = 0;
5175
0
            return (base);
5176
0
        }
5177
0
    }
5178
0
    ctxt->checkIndex = base;
5179
0
    ctxt->endCheckState = quote;
5180
0
    return (-1);
5181
0
}
5182
5183
/**
5184
 * htmlParseLookupCommentEnd:
5185
 * @ctxt: an HTML parser context
5186
 *
5187
 * Try to find a comment end tag in the input stream
5188
 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5189
 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5190
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5191
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5192
 * parser, do not use liberally.
5193
 * This wraps to htmlParseLookupSequence()
5194
 *
5195
 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5196
 */
5197
static int
5198
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5199
0
{
5200
0
    int mark = 0;
5201
0
    int offset;
5202
5203
0
    while (1) {
5204
0
  mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5205
0
  if (mark < 0)
5206
0
            break;
5207
0
        if ((NXT(mark+2) == '>') ||
5208
0
      ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5209
0
            ctxt->checkIndex = 0;
5210
0
      break;
5211
0
  }
5212
0
        offset = (NXT(mark+2) == '!') ? 3 : 2;
5213
0
        if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5214
0
      ctxt->checkIndex = mark;
5215
0
            return(-1);
5216
0
        }
5217
0
  ctxt->checkIndex = mark + 1;
5218
0
    }
5219
0
    return mark;
5220
0
}
5221
5222
5223
/**
5224
 * htmlParseTryOrFinish:
5225
 * @ctxt:  an HTML parser context
5226
 * @terminate:  last chunk indicator
5227
 *
5228
 * Try to progress on parsing
5229
 *
5230
 * Returns zero if no parsing was possible
5231
 */
5232
static int
5233
0
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5234
0
    int ret = 0;
5235
0
    htmlParserInputPtr in;
5236
0
    ptrdiff_t avail = 0;
5237
0
    xmlChar cur, next;
5238
5239
0
    htmlParserNodeInfo node_info;
5240
5241
0
    while (PARSER_STOPPED(ctxt) == 0) {
5242
5243
0
  in = ctxt->input;
5244
0
  if (in == NULL) break;
5245
0
  avail = in->end - in->cur;
5246
0
  if ((avail == 0) && (terminate)) {
5247
0
      htmlAutoCloseOnEnd(ctxt);
5248
0
      if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5249
    /*
5250
     * SAX: end of the document processing.
5251
     */
5252
0
    ctxt->instate = XML_PARSER_EOF;
5253
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5254
0
        ctxt->sax->endDocument(ctxt->userData);
5255
0
      }
5256
0
  }
5257
0
        if (avail < 1)
5258
0
      goto done;
5259
        /*
5260
         * This is done to make progress and avoid an infinite loop
5261
         * if a parsing attempt was aborted by hitting a NUL byte. After
5262
         * changing htmlCurrentChar, this probably isn't necessary anymore.
5263
         * We should consider removing this check.
5264
         */
5265
0
  cur = in->cur[0];
5266
0
  if (cur == 0) {
5267
0
      SKIP(1);
5268
0
      continue;
5269
0
  }
5270
5271
0
        switch (ctxt->instate) {
5272
0
            case XML_PARSER_EOF:
5273
          /*
5274
     * Document parsing is done !
5275
     */
5276
0
          goto done;
5277
0
            case XML_PARSER_START:
5278
                /*
5279
                 * This is wrong but matches long-standing behavior. In most
5280
                 * cases, a document starting with an XML declaration will
5281
                 * specify UTF-8.
5282
                 */
5283
0
                if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
5284
0
                    (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
5285
0
                    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
5286
0
                }
5287
5288
          /*
5289
     * Very first chars read from the document flow.
5290
     */
5291
0
    cur = in->cur[0];
5292
0
    if (IS_BLANK_CH(cur)) {
5293
0
        SKIP_BLANKS;
5294
0
                    avail = in->end - in->cur;
5295
0
    }
5296
0
                if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) {
5297
0
                    ctxt->sax->setDocumentLocator(ctxt->userData,
5298
0
                            (xmlSAXLocator *) &xmlDefaultSAXLocator);
5299
0
                }
5300
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5301
0
              (!ctxt->disableSAX))
5302
0
        ctxt->sax->startDocument(ctxt->userData);
5303
5304
0
    cur = in->cur[0];
5305
0
    next = in->cur[1];
5306
0
    if ((cur == '<') && (next == '!') &&
5307
0
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5308
0
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5309
0
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5310
0
        (UPP(8) == 'E')) {
5311
0
        if ((!terminate) &&
5312
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5313
0
      goto done;
5314
0
        htmlParseDocTypeDecl(ctxt);
5315
0
        ctxt->instate = XML_PARSER_PROLOG;
5316
0
                } else {
5317
0
        ctxt->instate = XML_PARSER_MISC;
5318
0
    }
5319
0
    break;
5320
0
            case XML_PARSER_MISC:
5321
0
    SKIP_BLANKS;
5322
0
                avail = in->end - in->cur;
5323
    /*
5324
     * no chars in buffer
5325
     */
5326
0
    if (avail < 1)
5327
0
        goto done;
5328
    /*
5329
     * not enough chars in buffer
5330
     */
5331
0
    if (avail < 2) {
5332
0
        if (!terminate)
5333
0
      goto done;
5334
0
        else
5335
0
      next = ' ';
5336
0
    } else {
5337
0
        next = in->cur[1];
5338
0
    }
5339
0
    cur = in->cur[0];
5340
0
          if ((cur == '<') && (next == '!') &&
5341
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5342
0
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5343
0
      goto done;
5344
0
        htmlParseComment(ctxt);
5345
0
        ctxt->instate = XML_PARSER_MISC;
5346
0
          } else if ((cur == '<') && (next == '?')) {
5347
0
        if ((!terminate) &&
5348
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5349
0
      goto done;
5350
0
        htmlParsePI(ctxt);
5351
0
        ctxt->instate = XML_PARSER_MISC;
5352
0
    } else if ((cur == '<') && (next == '!') &&
5353
0
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5354
0
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5355
0
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5356
0
        (UPP(8) == 'E')) {
5357
0
        if ((!terminate) &&
5358
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5359
0
      goto done;
5360
0
        htmlParseDocTypeDecl(ctxt);
5361
0
        ctxt->instate = XML_PARSER_PROLOG;
5362
0
    } else if ((cur == '<') && (next == '!') &&
5363
0
               (avail < 9)) {
5364
0
        goto done;
5365
0
    } else {
5366
0
        ctxt->instate = XML_PARSER_CONTENT;
5367
0
    }
5368
0
    break;
5369
0
            case XML_PARSER_PROLOG:
5370
0
    SKIP_BLANKS;
5371
0
                avail = in->end - in->cur;
5372
0
    if (avail < 2)
5373
0
        goto done;
5374
0
    cur = in->cur[0];
5375
0
    next = in->cur[1];
5376
0
    if ((cur == '<') && (next == '!') &&
5377
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5378
0
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5379
0
      goto done;
5380
0
        htmlParseComment(ctxt);
5381
0
        ctxt->instate = XML_PARSER_PROLOG;
5382
0
          } else if ((cur == '<') && (next == '?')) {
5383
0
        if ((!terminate) &&
5384
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5385
0
      goto done;
5386
0
        htmlParsePI(ctxt);
5387
0
        ctxt->instate = XML_PARSER_PROLOG;
5388
0
    } else if ((cur == '<') && (next == '!') &&
5389
0
               (avail < 4)) {
5390
0
        goto done;
5391
0
    } else {
5392
0
        ctxt->instate = XML_PARSER_CONTENT;
5393
0
    }
5394
0
    break;
5395
0
            case XML_PARSER_EPILOG:
5396
0
                avail = in->end - in->cur;
5397
0
    if (avail < 1)
5398
0
        goto done;
5399
0
    cur = in->cur[0];
5400
0
    if (IS_BLANK_CH(cur)) {
5401
0
        htmlParseCharData(ctxt);
5402
0
        goto done;
5403
0
    }
5404
0
    if (avail < 2)
5405
0
        goto done;
5406
0
    next = in->cur[1];
5407
0
          if ((cur == '<') && (next == '!') &&
5408
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5409
0
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5410
0
      goto done;
5411
0
        htmlParseComment(ctxt);
5412
0
        ctxt->instate = XML_PARSER_EPILOG;
5413
0
          } else if ((cur == '<') && (next == '?')) {
5414
0
        if ((!terminate) &&
5415
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5416
0
      goto done;
5417
0
        htmlParsePI(ctxt);
5418
0
        ctxt->instate = XML_PARSER_EPILOG;
5419
0
    } else if ((cur == '<') && (next == '!') &&
5420
0
               (avail < 4)) {
5421
0
        goto done;
5422
0
    } else {
5423
0
        ctxt->errNo = XML_ERR_DOCUMENT_END;
5424
0
        ctxt->wellFormed = 0;
5425
0
        ctxt->instate = XML_PARSER_EOF;
5426
0
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5427
0
      ctxt->sax->endDocument(ctxt->userData);
5428
0
        goto done;
5429
0
    }
5430
0
    break;
5431
0
            case XML_PARSER_START_TAG: {
5432
0
          const xmlChar *name;
5433
0
    int failed;
5434
0
    const htmlElemDesc * info;
5435
5436
    /*
5437
     * no chars in buffer
5438
     */
5439
0
    if (avail < 1)
5440
0
        goto done;
5441
    /*
5442
     * not enough chars in buffer
5443
     */
5444
0
    if (avail < 2) {
5445
0
        if (!terminate)
5446
0
      goto done;
5447
0
        else
5448
0
      next = ' ';
5449
0
    } else {
5450
0
        next = in->cur[1];
5451
0
    }
5452
0
    cur = in->cur[0];
5453
0
          if (cur != '<') {
5454
0
        ctxt->instate = XML_PARSER_CONTENT;
5455
0
        break;
5456
0
    }
5457
0
    if (next == '/') {
5458
0
        ctxt->instate = XML_PARSER_END_TAG;
5459
0
        ctxt->checkIndex = 0;
5460
0
        break;
5461
0
    }
5462
0
    if ((!terminate) &&
5463
0
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5464
0
        goto done;
5465
5466
                /* Capture start position */
5467
0
          if (ctxt->record_info) {
5468
0
               node_info.begin_pos = ctxt->input->consumed +
5469
0
                                  (CUR_PTR - ctxt->input->base);
5470
0
               node_info.begin_line = ctxt->input->line;
5471
0
          }
5472
5473
5474
0
    failed = htmlParseStartTag(ctxt);
5475
0
    name = ctxt->name;
5476
0
    if ((failed == -1) ||
5477
0
        (name == NULL)) {
5478
0
        if (CUR == '>')
5479
0
      NEXT;
5480
0
        break;
5481
0
    }
5482
5483
    /*
5484
     * Lookup the info for that element.
5485
     */
5486
0
    info = htmlTagLookup(name);
5487
0
    if (info == NULL) {
5488
0
        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5489
0
                     "Tag %s invalid\n", name, NULL);
5490
0
    }
5491
5492
    /*
5493
     * Check for an Empty Element labeled the XML/SGML way
5494
     */
5495
0
    if ((CUR == '/') && (NXT(1) == '>')) {
5496
0
        SKIP(2);
5497
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5498
0
      ctxt->sax->endElement(ctxt->userData, name);
5499
0
        htmlnamePop(ctxt);
5500
0
        ctxt->instate = XML_PARSER_CONTENT;
5501
0
        break;
5502
0
    }
5503
5504
0
    if (CUR == '>') {
5505
0
        NEXT;
5506
0
    } else {
5507
0
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5508
0
                     "Couldn't find end of Start Tag %s\n",
5509
0
         name, NULL);
5510
5511
        /*
5512
         * end of parsing of this node.
5513
         */
5514
0
        if (xmlStrEqual(name, ctxt->name)) {
5515
0
      nodePop(ctxt);
5516
0
      htmlnamePop(ctxt);
5517
0
        }
5518
5519
0
        if (ctxt->record_info)
5520
0
            htmlNodeInfoPush(ctxt, &node_info);
5521
5522
0
        ctxt->instate = XML_PARSER_CONTENT;
5523
0
        break;
5524
0
    }
5525
5526
    /*
5527
     * Check for an Empty Element from DTD definition
5528
     */
5529
0
    if ((info != NULL) && (info->empty)) {
5530
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5531
0
      ctxt->sax->endElement(ctxt->userData, name);
5532
0
        htmlnamePop(ctxt);
5533
0
    }
5534
5535
0
                if (ctxt->record_info)
5536
0
              htmlNodeInfoPush(ctxt, &node_info);
5537
5538
0
    ctxt->instate = XML_PARSER_CONTENT;
5539
0
                break;
5540
0
      }
5541
0
            case XML_PARSER_CONTENT: {
5542
0
    xmlChar chr[2] = { 0, 0 };
5543
5544
                /*
5545
     * Handle preparsed entities and charRef
5546
     */
5547
0
    if ((avail == 1) && (terminate)) {
5548
0
        cur = in->cur[0];
5549
0
        if ((cur != '<') && (cur != '&')) {
5550
0
      if (ctxt->sax != NULL) {
5551
0
                            chr[0] = cur;
5552
0
          if (IS_BLANK_CH(cur)) {
5553
0
        if (ctxt->keepBlanks) {
5554
0
            if (ctxt->sax->characters != NULL)
5555
0
          ctxt->sax->characters(
5556
0
            ctxt->userData, chr, 1);
5557
0
        } else {
5558
0
            if (ctxt->sax->ignorableWhitespace != NULL)
5559
0
          ctxt->sax->ignorableWhitespace(
5560
0
            ctxt->userData, chr, 1);
5561
0
        }
5562
0
          } else {
5563
0
        htmlCheckParagraph(ctxt);
5564
0
        if (ctxt->sax->characters != NULL)
5565
0
            ctxt->sax->characters(
5566
0
              ctxt->userData, chr, 1);
5567
0
          }
5568
0
      }
5569
0
      ctxt->checkIndex = 0;
5570
0
      in->cur++;
5571
0
      break;
5572
0
        }
5573
0
    }
5574
0
    if (avail < 2)
5575
0
        goto done;
5576
0
    cur = in->cur[0];
5577
0
    next = in->cur[1];
5578
0
    if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5579
0
        (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5580
        /*
5581
         * Handle SCRIPT/STYLE separately
5582
         */
5583
0
        if (!terminate) {
5584
0
            int idx;
5585
0
      xmlChar val;
5586
5587
0
      idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5588
0
      if (idx < 0)
5589
0
          goto done;
5590
0
            val = in->cur[idx + 2];
5591
0
      if (val == 0) { /* bad cut of input */
5592
                            /*
5593
                             * FIXME: htmlParseScript checks for additional
5594
                             * characters after '</'.
5595
                             */
5596
0
                            ctxt->checkIndex = idx;
5597
0
          goto done;
5598
0
                        }
5599
0
        }
5600
0
        htmlParseScript(ctxt);
5601
0
        if ((cur == '<') && (next == '/')) {
5602
0
      ctxt->instate = XML_PARSER_END_TAG;
5603
0
      ctxt->checkIndex = 0;
5604
0
      break;
5605
0
        }
5606
0
    } else if ((cur == '<') && (next == '!')) {
5607
0
                    if (avail < 4)
5608
0
                        goto done;
5609
                    /*
5610
                     * Sometimes DOCTYPE arrives in the middle of the document
5611
                     */
5612
0
                    if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5613
0
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5614
0
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5615
0
                        (UPP(8) == 'E')) {
5616
0
                        if ((!terminate) &&
5617
0
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5618
0
                            goto done;
5619
0
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5620
0
                                     "Misplaced DOCTYPE declaration\n",
5621
0
                                     BAD_CAST "DOCTYPE" , NULL);
5622
0
                        htmlParseDocTypeDecl(ctxt);
5623
0
                    } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5624
0
                        if ((!terminate) &&
5625
0
                            (htmlParseLookupCommentEnd(ctxt) < 0))
5626
0
                            goto done;
5627
0
                        htmlParseComment(ctxt);
5628
0
                        ctxt->instate = XML_PARSER_CONTENT;
5629
0
                    } else {
5630
0
                        if ((!terminate) &&
5631
0
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5632
0
                            goto done;
5633
0
                        htmlSkipBogusComment(ctxt);
5634
0
                    }
5635
0
                } else if ((cur == '<') && (next == '?')) {
5636
0
                    if ((!terminate) &&
5637
0
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5638
0
                        goto done;
5639
0
                    htmlParsePI(ctxt);
5640
0
                    ctxt->instate = XML_PARSER_CONTENT;
5641
0
                } else if ((cur == '<') && (next == '/')) {
5642
0
                    ctxt->instate = XML_PARSER_END_TAG;
5643
0
                    ctxt->checkIndex = 0;
5644
0
                    break;
5645
0
                } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5646
0
                    if ((!terminate) && (next == 0))
5647
0
                        goto done;
5648
0
                    ctxt->instate = XML_PARSER_START_TAG;
5649
0
                    ctxt->checkIndex = 0;
5650
0
                    break;
5651
0
                } else if (cur == '<') {
5652
0
                    if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5653
0
                        (ctxt->sax->characters != NULL))
5654
0
                        ctxt->sax->characters(ctxt->userData,
5655
0
                                              BAD_CAST "<", 1);
5656
0
                    NEXT;
5657
0
                } else {
5658
                    /*
5659
                     * check that the text sequence is complete
5660
                     * before handing out the data to the parser
5661
                     * to avoid problems with erroneous end of
5662
                     * data detection.
5663
                     */
5664
0
                    if ((!terminate) &&
5665
0
                        (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5666
0
                        goto done;
5667
0
                    ctxt->checkIndex = 0;
5668
0
                    while ((PARSER_STOPPED(ctxt) == 0) &&
5669
0
                           (cur != '<') && (in->cur < in->end)) {
5670
0
                        if (cur == '&') {
5671
0
                            htmlParseReference(ctxt);
5672
0
                        } else {
5673
0
                            htmlParseCharData(ctxt);
5674
0
                        }
5675
0
                        cur = in->cur[0];
5676
0
                    }
5677
0
    }
5678
5679
0
    break;
5680
0
      }
5681
0
            case XML_PARSER_END_TAG:
5682
0
    if (avail < 2)
5683
0
        goto done;
5684
0
    if ((!terminate) &&
5685
0
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5686
0
        goto done;
5687
0
    htmlParseEndTag(ctxt);
5688
0
    if (ctxt->nameNr == 0) {
5689
0
        ctxt->instate = XML_PARSER_EPILOG;
5690
0
    } else {
5691
0
        ctxt->instate = XML_PARSER_CONTENT;
5692
0
    }
5693
0
    ctxt->checkIndex = 0;
5694
0
          break;
5695
0
      default:
5696
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5697
0
           "HPP: internal error\n", NULL, NULL);
5698
0
    ctxt->instate = XML_PARSER_EOF;
5699
0
    break;
5700
0
  }
5701
0
    }
5702
0
done:
5703
0
    if ((avail == 0) && (terminate)) {
5704
0
  htmlAutoCloseOnEnd(ctxt);
5705
0
  if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5706
      /*
5707
       * SAX: end of the document processing.
5708
       */
5709
0
      ctxt->instate = XML_PARSER_EOF;
5710
0
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5711
0
    ctxt->sax->endDocument(ctxt->userData);
5712
0
  }
5713
0
    }
5714
0
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5715
0
  ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5716
0
   (ctxt->instate == XML_PARSER_EPILOG))) {
5717
0
  xmlDtdPtr dtd;
5718
0
  dtd = xmlGetIntSubset(ctxt->myDoc);
5719
0
  if (dtd == NULL) {
5720
0
      ctxt->myDoc->intSubset =
5721
0
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5722
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5723
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5724
0
            if (ctxt->myDoc->intSubset == NULL)
5725
0
                htmlErrMemory(ctxt);
5726
0
        }
5727
0
    }
5728
0
    return(ret);
5729
0
}
5730
5731
/**
5732
 * htmlParseChunk:
5733
 * @ctxt:  an HTML parser context
5734
 * @chunk:  chunk of memory
5735
 * @size:  size of chunk in bytes
5736
 * @terminate:  last chunk indicator
5737
 *
5738
 * Parse a chunk of memory in push parser mode.
5739
 *
5740
 * Assumes that the parser context was initialized with
5741
 * htmlCreatePushParserCtxt.
5742
 *
5743
 * The last chunk, which will often be empty, must be marked with
5744
 * the @terminate flag. With the default SAX callbacks, the resulting
5745
 * document will be available in ctxt->myDoc. This pointer will not
5746
 * be freed by the library.
5747
 *
5748
 * If the document isn't well-formed, ctxt->myDoc is set to NULL.
5749
 *
5750
 * Returns an xmlParserErrors code (0 on success).
5751
 */
5752
int
5753
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5754
0
              int terminate) {
5755
0
    if ((ctxt == NULL) || (ctxt->input == NULL))
5756
0
  return(XML_ERR_ARGUMENT);
5757
0
    if (PARSER_STOPPED(ctxt) != 0)
5758
0
        return(ctxt->errNo);
5759
0
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5760
0
        (ctxt->input->buf != NULL))  {
5761
0
  size_t pos = ctxt->input->cur - ctxt->input->base;
5762
0
  int res;
5763
5764
0
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5765
0
        xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5766
0
  if (res < 0) {
5767
0
            htmlParseErr(ctxt, ctxt->input->buf->error,
5768
0
                         "xmlParserInputBufferPush failed", NULL, NULL);
5769
0
            xmlHaltParser(ctxt);
5770
0
      return (ctxt->errNo);
5771
0
  }
5772
0
    }
5773
0
    htmlParseTryOrFinish(ctxt, terminate);
5774
0
    if (terminate) {
5775
0
  if (ctxt->instate != XML_PARSER_EOF) {
5776
0
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5777
0
    ctxt->sax->endDocument(ctxt->userData);
5778
0
  }
5779
0
  ctxt->instate = XML_PARSER_EOF;
5780
0
    }
5781
0
    return((xmlParserErrors) ctxt->errNo);
5782
0
}
5783
5784
/************************************************************************
5785
 *                  *
5786
 *      User entry points       *
5787
 *                  *
5788
 ************************************************************************/
5789
5790
/**
5791
 * htmlCreatePushParserCtxt:
5792
 * @sax:  a SAX handler (optional)
5793
 * @user_data:  The user data returned on SAX callbacks (optional)
5794
 * @chunk:  a pointer to an array of chars (optional)
5795
 * @size:  number of chars in the array
5796
 * @filename:  only used for error reporting (optional)
5797
 * @enc:  encoding (deprecated, pass XML_CHAR_ENCODING_NONE)
5798
 *
5799
 * Create a parser context for using the HTML parser in push mode.
5800
 *
5801
 * Returns the new parser context or NULL if a memory allocation
5802
 * failed.
5803
 */
5804
htmlParserCtxtPtr
5805
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5806
                         const char *chunk, int size, const char *filename,
5807
0
       xmlCharEncoding enc) {
5808
0
    htmlParserCtxtPtr ctxt;
5809
0
    htmlParserInputPtr input;
5810
0
    const char *encoding;
5811
5812
0
    ctxt = htmlNewSAXParserCtxt(sax, user_data);
5813
0
    if (ctxt == NULL)
5814
0
  return(NULL);
5815
5816
0
    encoding = xmlGetCharEncodingName(enc);
5817
0
    input = xmlNewInputPush(ctxt, filename, chunk, size, encoding);
5818
0
    if (input == NULL) {
5819
0
  htmlFreeParserCtxt(ctxt);
5820
0
  return(NULL);
5821
0
    }
5822
0
    inputPush(ctxt, input);
5823
5824
0
    return(ctxt);
5825
0
}
5826
#endif /* LIBXML_PUSH_ENABLED */
5827
5828
/**
5829
 * htmlSAXParseDoc:
5830
 * @cur:  a pointer to an array of xmlChar
5831
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5832
 * @sax:  the SAX handler block
5833
 * @userData: if using SAX, this pointer will be provided on callbacks.
5834
 *
5835
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
5836
 *
5837
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
5838
 * to handle parse events. If sax is NULL, fallback to the default DOM
5839
 * behavior and return a tree.
5840
 *
5841
 * Returns the resulting document tree unless SAX is NULL or the document is
5842
 *     not well formed.
5843
 */
5844
5845
htmlDocPtr
5846
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
5847
0
                htmlSAXHandlerPtr sax, void *userData) {
5848
0
    htmlDocPtr ret;
5849
0
    htmlParserCtxtPtr ctxt;
5850
5851
0
    if (cur == NULL)
5852
0
        return(NULL);
5853
5854
0
    ctxt = htmlCreateDocParserCtxt(cur, NULL, encoding);
5855
0
    if (ctxt == NULL)
5856
0
        return(NULL);
5857
5858
0
    if (sax != NULL) {
5859
0
        *ctxt->sax = *sax;
5860
0
        ctxt->userData = userData;
5861
0
    }
5862
5863
0
    htmlParseDocument(ctxt);
5864
0
    ret = ctxt->myDoc;
5865
0
    htmlFreeParserCtxt(ctxt);
5866
5867
0
    return(ret);
5868
0
}
5869
5870
/**
5871
 * htmlParseDoc:
5872
 * @cur:  a pointer to an array of xmlChar
5873
 * @encoding:  the encoding (optional)
5874
 *
5875
 * DEPRECATED: Use htmlReadDoc.
5876
 *
5877
 * Parse an HTML in-memory document and build a tree.
5878
 *
5879
 * This function uses deprecated global parser options.
5880
 *
5881
 * Returns the resulting document tree
5882
 */
5883
5884
htmlDocPtr
5885
0
htmlParseDoc(const xmlChar *cur, const char *encoding) {
5886
0
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
5887
0
}
5888
5889
5890
/**
5891
 * htmlCreateFileParserCtxt:
5892
 * @filename:  the filename
5893
 * @encoding:  optional encoding
5894
 *
5895
 * DEPRECATED: Use htmlNewParserCtxt and htmlCtxtReadFile.
5896
 *
5897
 * Create a parser context to read from a file.
5898
 *
5899
 * A non-NULL encoding overrides encoding declarations in the document.
5900
 *
5901
 * Automatic support for ZLIB/Compress compressed document is provided
5902
 * by default if found at compile-time.
5903
 *
5904
 * Returns the new parser context or NULL if a memory allocation failed.
5905
 */
5906
htmlParserCtxtPtr
5907
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
5908
0
{
5909
0
    htmlParserCtxtPtr ctxt;
5910
0
    htmlParserInputPtr input;
5911
5912
0
    if (filename == NULL)
5913
0
        return(NULL);
5914
5915
0
    ctxt = htmlNewParserCtxt();
5916
0
    if (ctxt == NULL) {
5917
0
  return(NULL);
5918
0
    }
5919
5920
0
    input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
5921
0
    if (input == NULL) {
5922
0
  xmlFreeParserCtxt(ctxt);
5923
0
  return(NULL);
5924
0
    }
5925
0
    inputPush(ctxt, input);
5926
5927
0
    return(ctxt);
5928
0
}
5929
5930
/**
5931
 * htmlSAXParseFile:
5932
 * @filename:  the filename
5933
 * @encoding:  encoding (optional)
5934
 * @sax:  the SAX handler block
5935
 * @userData: if using SAX, this pointer will be provided on callbacks.
5936
 *
5937
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
5938
 *
5939
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
5940
 * compressed document is provided by default if found at compile-time.
5941
 * It use the given SAX function block to handle the parsing callback.
5942
 * If sax is NULL, fallback to the default DOM tree building routines.
5943
 *
5944
 * Returns the resulting document tree unless SAX is NULL or the document is
5945
 *     not well formed.
5946
 */
5947
5948
htmlDocPtr
5949
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
5950
0
                 void *userData) {
5951
0
    htmlDocPtr ret;
5952
0
    htmlParserCtxtPtr ctxt;
5953
0
    htmlSAXHandlerPtr oldsax = NULL;
5954
5955
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
5956
0
    if (ctxt == NULL) return(NULL);
5957
0
    if (sax != NULL) {
5958
0
  oldsax = ctxt->sax;
5959
0
        ctxt->sax = sax;
5960
0
        ctxt->userData = userData;
5961
0
    }
5962
5963
0
    htmlParseDocument(ctxt);
5964
5965
0
    ret = ctxt->myDoc;
5966
0
    if (sax != NULL) {
5967
0
        ctxt->sax = oldsax;
5968
0
        ctxt->userData = NULL;
5969
0
    }
5970
0
    htmlFreeParserCtxt(ctxt);
5971
5972
0
    return(ret);
5973
0
}
5974
5975
/**
5976
 * htmlParseFile:
5977
 * @filename:  the filename
5978
 * @encoding:  encoding (optional)
5979
 *
5980
 * Parse an HTML file and build a tree.
5981
 *
5982
 * See xmlNewInputURL for details.
5983
 *
5984
 * Returns the resulting document tree
5985
 */
5986
5987
htmlDocPtr
5988
0
htmlParseFile(const char *filename, const char *encoding) {
5989
0
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
5990
0
}
5991
5992
/**
5993
 * htmlHandleOmittedElem:
5994
 * @val:  int 0 or 1
5995
 *
5996
 * DEPRECATED: Use HTML_PARSE_NOIMPLIED
5997
 *
5998
 * Set and return the previous value for handling HTML omitted tags.
5999
 *
6000
 * Returns the last value for 0 for no handling, 1 for auto insertion.
6001
 */
6002
6003
int
6004
0
htmlHandleOmittedElem(int val) {
6005
0
    int old = htmlOmittedDefaultValue;
6006
6007
0
    htmlOmittedDefaultValue = val;
6008
0
    return(old);
6009
0
}
6010
6011
/**
6012
 * htmlElementAllowedHere:
6013
 * @parent: HTML parent element
6014
 * @elt: HTML element
6015
 *
6016
 * Checks whether an HTML element may be a direct child of a parent element.
6017
 * Note - doesn't check for deprecated elements
6018
 *
6019
 * Returns 1 if allowed; 0 otherwise.
6020
 */
6021
int
6022
0
htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6023
0
  const char** p ;
6024
6025
0
  if ( ! elt || ! parent || ! parent->subelts )
6026
0
  return 0 ;
6027
6028
0
  for ( p = parent->subelts; *p; ++p )
6029
0
    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6030
0
      return 1 ;
6031
6032
0
  return 0 ;
6033
0
}
6034
/**
6035
 * htmlElementStatusHere:
6036
 * @parent: HTML parent element
6037
 * @elt: HTML element
6038
 *
6039
 * Checks whether an HTML element may be a direct child of a parent element.
6040
 * and if so whether it is valid or deprecated.
6041
 *
6042
 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6043
 */
6044
htmlStatus
6045
0
htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6046
0
  if ( ! parent || ! elt )
6047
0
    return HTML_INVALID ;
6048
0
  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6049
0
    return HTML_INVALID ;
6050
6051
0
  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6052
0
}
6053
/**
6054
 * htmlAttrAllowed:
6055
 * @elt: HTML element
6056
 * @attr: HTML attribute
6057
 * @legacy: whether to allow deprecated attributes
6058
 *
6059
 * Checks whether an attribute is valid for an element
6060
 * Has full knowledge of Required and Deprecated attributes
6061
 *
6062
 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6063
 */
6064
htmlStatus
6065
0
htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6066
0
  const char** p ;
6067
6068
0
  if ( !elt || ! attr )
6069
0
  return HTML_INVALID ;
6070
6071
0
  if ( elt->attrs_req )
6072
0
    for ( p = elt->attrs_req; *p; ++p)
6073
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6074
0
        return HTML_REQUIRED ;
6075
6076
0
  if ( elt->attrs_opt )
6077
0
    for ( p = elt->attrs_opt; *p; ++p)
6078
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6079
0
        return HTML_VALID ;
6080
6081
0
  if ( legacy && elt->attrs_depr )
6082
0
    for ( p = elt->attrs_depr; *p; ++p)
6083
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6084
0
        return HTML_DEPRECATED ;
6085
6086
0
  return HTML_INVALID ;
6087
0
}
6088
/**
6089
 * htmlNodeStatus:
6090
 * @node: an htmlNodePtr in a tree
6091
 * @legacy: whether to allow deprecated elements (YES is faster here
6092
 *  for Element nodes)
6093
 *
6094
 * Checks whether the tree node is valid.  Experimental (the author
6095
 *     only uses the HTML enhancements in a SAX parser)
6096
 *
6097
 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6098
 *  legacy allowed) or htmlElementStatusHere (otherwise).
6099
 *  for Attribute nodes, a return from htmlAttrAllowed
6100
 *  for other nodes, HTML_NA (no checks performed)
6101
 */
6102
htmlStatus
6103
0
htmlNodeStatus(htmlNodePtr node, int legacy) {
6104
0
  if ( ! node )
6105
0
    return HTML_INVALID ;
6106
6107
0
  switch ( node->type ) {
6108
0
    case XML_ELEMENT_NODE:
6109
0
      return legacy
6110
0
  ? ( htmlElementAllowedHere (
6111
0
    htmlTagLookup(node->parent->name) , node->name
6112
0
    ) ? HTML_VALID : HTML_INVALID )
6113
0
  : htmlElementStatusHere(
6114
0
    htmlTagLookup(node->parent->name) ,
6115
0
    htmlTagLookup(node->name) )
6116
0
  ;
6117
0
    case XML_ATTRIBUTE_NODE:
6118
0
      return htmlAttrAllowed(
6119
0
  htmlTagLookup(node->parent->name) , node->name, legacy) ;
6120
0
    default: return HTML_NA ;
6121
0
  }
6122
0
}
6123
/************************************************************************
6124
 *                  *
6125
 *  New set (2.6.0) of simpler and more flexible APIs   *
6126
 *                  *
6127
 ************************************************************************/
6128
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored, and a NULL "dict" means the
 * string is always freed.
 *
 * The expansion refers to a variable named "dict" that must be visible
 * where the macro is used, and it evaluates @str more than once, so the
 * argument must not have side effects. The body is wrapped in
 * do { } while (0) so that the macro behaves as a single statement,
 * including in front of an "else".
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6139
6140
/**
6141
 * htmlCtxtReset:
6142
 * @ctxt: an HTML parser context
6143
 *
6144
 * Reset a parser context
6145
 */
6146
void
6147
htmlCtxtReset(htmlParserCtxtPtr ctxt)
6148
0
{
6149
0
    xmlParserInputPtr input;
6150
0
    xmlDictPtr dict;
6151
6152
0
    if (ctxt == NULL)
6153
0
        return;
6154
6155
0
    dict = ctxt->dict;
6156
6157
0
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6158
0
        xmlFreeInputStream(input);
6159
0
    }
6160
0
    ctxt->inputNr = 0;
6161
0
    ctxt->input = NULL;
6162
6163
0
    ctxt->spaceNr = 0;
6164
0
    if (ctxt->spaceTab != NULL) {
6165
0
  ctxt->spaceTab[0] = -1;
6166
0
  ctxt->space = &ctxt->spaceTab[0];
6167
0
    } else {
6168
0
  ctxt->space = NULL;
6169
0
    }
6170
6171
6172
0
    ctxt->nodeNr = 0;
6173
0
    ctxt->node = NULL;
6174
6175
0
    ctxt->nameNr = 0;
6176
0
    ctxt->name = NULL;
6177
6178
0
    ctxt->nsNr = 0;
6179
6180
0
    DICT_FREE(ctxt->version);
6181
0
    ctxt->version = NULL;
6182
0
    DICT_FREE(ctxt->encoding);
6183
0
    ctxt->encoding = NULL;
6184
0
    DICT_FREE(ctxt->extSubURI);
6185
0
    ctxt->extSubURI = NULL;
6186
0
    DICT_FREE(ctxt->extSubSystem);
6187
0
    ctxt->extSubSystem = NULL;
6188
0
    if (ctxt->myDoc != NULL)
6189
0
        xmlFreeDoc(ctxt->myDoc);
6190
0
    ctxt->myDoc = NULL;
6191
6192
0
    ctxt->standalone = -1;
6193
0
    ctxt->hasExternalSubset = 0;
6194
0
    ctxt->hasPErefs = 0;
6195
0
    ctxt->html = 1;
6196
0
    ctxt->instate = XML_PARSER_START;
6197
6198
0
    ctxt->wellFormed = 1;
6199
0
    ctxt->nsWellFormed = 1;
6200
0
    ctxt->disableSAX = 0;
6201
0
    ctxt->valid = 1;
6202
0
    ctxt->vctxt.userData = ctxt;
6203
0
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6204
0
    ctxt->vctxt.error = xmlParserValidityError;
6205
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
6206
0
    ctxt->record_info = 0;
6207
0
    ctxt->checkIndex = 0;
6208
0
    ctxt->endCheckState = 0;
6209
0
    ctxt->inSubset = 0;
6210
0
    ctxt->errNo = XML_ERR_OK;
6211
0
    ctxt->depth = 0;
6212
0
    ctxt->catalogs = NULL;
6213
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
6214
6215
0
    if (ctxt->attsDefault != NULL) {
6216
0
        xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6217
0
        ctxt->attsDefault = NULL;
6218
0
    }
6219
0
    if (ctxt->attsSpecial != NULL) {
6220
0
        xmlHashFree(ctxt->attsSpecial, NULL);
6221
0
        ctxt->attsSpecial = NULL;
6222
0
    }
6223
6224
0
    ctxt->nbErrors = 0;
6225
0
    ctxt->nbWarnings = 0;
6226
0
    if (ctxt->lastError.code != XML_ERR_OK)
6227
0
        xmlResetError(&ctxt->lastError);
6228
0
}
6229
6230
/**
6231
 * htmlCtxtUseOptions:
6232
 * @ctxt: an HTML parser context
6233
 * @options:  a combination of htmlParserOption(s)
6234
 *
6235
 * Applies the options to the parser context
6236
 *
6237
 * Returns 0 in case of success, the set of unknown or unimplemented options
6238
 *         in case of error.
6239
 */
6240
int
6241
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6242
0
{
6243
0
    if (ctxt == NULL)
6244
0
        return(-1);
6245
6246
0
    if (options & HTML_PARSE_NOWARNING) {
6247
0
        ctxt->sax->warning = NULL;
6248
0
        ctxt->vctxt.warning = NULL;
6249
0
        options -= XML_PARSE_NOWARNING;
6250
0
  ctxt->options |= XML_PARSE_NOWARNING;
6251
0
    }
6252
0
    if (options & HTML_PARSE_NOERROR) {
6253
0
        ctxt->sax->error = NULL;
6254
0
        ctxt->vctxt.error = NULL;
6255
0
        ctxt->sax->fatalError = NULL;
6256
0
        options -= XML_PARSE_NOERROR;
6257
0
  ctxt->options |= XML_PARSE_NOERROR;
6258
0
    }
6259
0
    if (options & HTML_PARSE_PEDANTIC) {
6260
0
        ctxt->pedantic = 1;
6261
0
        options -= XML_PARSE_PEDANTIC;
6262
0
  ctxt->options |= XML_PARSE_PEDANTIC;
6263
0
    } else
6264
0
        ctxt->pedantic = 0;
6265
0
    if (options & XML_PARSE_NOBLANKS) {
6266
0
        ctxt->keepBlanks = 0;
6267
0
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6268
0
        options -= XML_PARSE_NOBLANKS;
6269
0
  ctxt->options |= XML_PARSE_NOBLANKS;
6270
0
    } else
6271
0
        ctxt->keepBlanks = 1;
6272
0
    if (options & HTML_PARSE_RECOVER) {
6273
0
        ctxt->recovery = 1;
6274
0
  options -= HTML_PARSE_RECOVER;
6275
0
    } else
6276
0
        ctxt->recovery = 0;
6277
0
    if (options & HTML_PARSE_COMPACT) {
6278
0
  ctxt->options |= HTML_PARSE_COMPACT;
6279
0
        options -= HTML_PARSE_COMPACT;
6280
0
    }
6281
0
    if (options & XML_PARSE_HUGE) {
6282
0
  ctxt->options |= XML_PARSE_HUGE;
6283
0
        options -= XML_PARSE_HUGE;
6284
0
    }
6285
0
    if (options & HTML_PARSE_NODEFDTD) {
6286
0
  ctxt->options |= HTML_PARSE_NODEFDTD;
6287
0
        options -= HTML_PARSE_NODEFDTD;
6288
0
    }
6289
0
    if (options & HTML_PARSE_IGNORE_ENC) {
6290
0
  ctxt->options |= HTML_PARSE_IGNORE_ENC;
6291
0
        options -= HTML_PARSE_IGNORE_ENC;
6292
0
    }
6293
0
    if (options & HTML_PARSE_NOIMPLIED) {
6294
0
        ctxt->options |= HTML_PARSE_NOIMPLIED;
6295
0
        options -= HTML_PARSE_NOIMPLIED;
6296
0
    }
6297
0
    ctxt->dictNames = 0;
6298
0
    ctxt->linenumbers = 1;
6299
0
    return (options);
6300
0
}
6301
6302
/**
6303
 * htmlCtxtParseDocument:
6304
 * @ctxt:  an HTML parser context
6305
 * @input:  parser input
6306
 *
6307
 * Parse an HTML document and return the resulting document tree.
6308
 *
6309
 * Available since 2.13.0.
6310
 *
6311
 * Returns the resulting document tree or NULL
6312
 */
6313
htmlDocPtr
6314
htmlCtxtParseDocument(htmlParserCtxtPtr ctxt, xmlParserInputPtr input)
6315
0
{
6316
0
    htmlDocPtr ret;
6317
6318
0
    if ((ctxt == NULL) || (input == NULL))
6319
0
        return(NULL);
6320
6321
    /* assert(ctxt->inputNr == 0); */
6322
0
    while (ctxt->inputNr > 0)
6323
0
        xmlFreeInputStream(inputPop(ctxt));
6324
6325
0
    if (inputPush(ctxt, input) < 0) {
6326
0
        xmlFreeInputStream(input);
6327
0
        return(NULL);
6328
0
    }
6329
6330
0
    ctxt->html = 1;
6331
0
    htmlParseDocument(ctxt);
6332
6333
0
    if (ctxt->errNo != XML_ERR_NO_MEMORY) {
6334
0
        ret = ctxt->myDoc;
6335
0
    } else {
6336
0
        ret = NULL;
6337
0
        xmlFreeDoc(ctxt->myDoc);
6338
0
    }
6339
0
    ctxt->myDoc = NULL;
6340
6341
    /* assert(ctxt->inputNr == 1); */
6342
0
    while (ctxt->inputNr > 0)
6343
0
        xmlFreeInputStream(inputPop(ctxt));
6344
6345
0
    return(ret);
6346
0
}
6347
6348
/**
6349
 * htmlReadDoc:
6350
 * @str:  a pointer to a zero terminated string
6351
 * @url:  only used for error reporting (optoinal)
6352
 * @encoding:  the document encoding (optional)
6353
 * @options:  a combination of htmlParserOptions
6354
 *
6355
 * Convenience function to parse an HTML document from a zero-terminated
6356
 * string.
6357
 *
6358
 * See htmlCtxtReadDoc for details.
6359
 *
6360
 * Returns the resulting document tree.
6361
 */
6362
htmlDocPtr
6363
htmlReadDoc(const xmlChar *str, const char *url, const char *encoding,
6364
            int options)
6365
0
{
6366
0
    htmlParserCtxtPtr ctxt;
6367
0
    xmlParserInputPtr input;
6368
0
    htmlDocPtr doc;
6369
6370
0
    ctxt = htmlNewParserCtxt();
6371
0
    if (ctxt == NULL)
6372
0
        return(NULL);
6373
6374
0
    htmlCtxtUseOptions(ctxt, options);
6375
6376
0
    input = xmlNewInputString(ctxt, url, (const char *) str, encoding,
6377
0
                              XML_INPUT_BUF_STATIC);
6378
6379
0
    doc = htmlCtxtParseDocument(ctxt, input);
6380
6381
0
    htmlFreeParserCtxt(ctxt);
6382
0
    return(doc);
6383
0
}
6384
6385
/**
6386
 * htmlReadFile:
6387
 * @filename:  a file or URL
6388
 * @encoding:  the document encoding (optional)
6389
 * @options:  a combination of htmlParserOptions
6390
 *
6391
 * Convenience function to parse an HTML file from the filesystem,
6392
 * the network or a global user-defined resource loader.
6393
 *
6394
 * See htmlCtxtReadFile for details.
6395
 *
6396
 * Returns the resulting document tree.
6397
 */
6398
htmlDocPtr
6399
htmlReadFile(const char *filename, const char *encoding, int options)
6400
0
{
6401
0
    htmlParserCtxtPtr ctxt;
6402
0
    xmlParserInputPtr input;
6403
0
    htmlDocPtr doc;
6404
6405
0
    ctxt = htmlNewParserCtxt();
6406
0
    if (ctxt == NULL)
6407
0
        return(NULL);
6408
6409
0
    htmlCtxtUseOptions(ctxt, options);
6410
6411
0
    input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6412
6413
0
    doc = htmlCtxtParseDocument(ctxt, input);
6414
6415
0
    htmlFreeParserCtxt(ctxt);
6416
0
    return(doc);
6417
0
}
6418
6419
/**
6420
 * htmlReadMemory:
6421
 * @buffer:  a pointer to a char array
6422
 * @size:  the size of the array
6423
 * @url:  only used for error reporting (optional)
6424
 * @encoding:  the document encoding, or NULL
6425
 * @options:  a combination of htmlParserOption(s)
6426
 *
6427
 * Convenience function to parse an HTML document from memory.
6428
 * The input buffer must not contain any terminating null bytes.
6429
 *
6430
 * See htmlCtxtReadMemory for details.
6431
 *
6432
 * Returns the resulting document tree
6433
 */
6434
htmlDocPtr
6435
htmlReadMemory(const char *buffer, int size, const char *url,
6436
               const char *encoding, int options)
6437
0
{
6438
0
    htmlParserCtxtPtr ctxt;
6439
0
    xmlParserInputPtr input;
6440
0
    htmlDocPtr doc;
6441
6442
0
    if (size < 0)
6443
0
  return(NULL);
6444
6445
0
    ctxt = htmlNewParserCtxt();
6446
0
    if (ctxt == NULL)
6447
0
        return(NULL);
6448
6449
0
    htmlCtxtUseOptions(ctxt, options);
6450
6451
0
    input = xmlNewInputMemory(ctxt, url, buffer, size, encoding,
6452
0
                              XML_INPUT_BUF_STATIC);
6453
6454
0
    doc = htmlCtxtParseDocument(ctxt, input);
6455
6456
0
    htmlFreeParserCtxt(ctxt);
6457
0
    return(doc);
6458
0
}
6459
6460
/**
6461
 * htmlReadFd:
6462
 * @fd:  an open file descriptor
6463
 * @url:  only used for error reporting (optional)
6464
 * @encoding:  the document encoding, or NULL
6465
 * @options:  a combination of htmlParserOptions
6466
 *
6467
 * Convenience function to parse an HTML document from a
6468
 * file descriptor.
6469
 *
6470
 * NOTE that the file descriptor will not be closed when the
6471
 * context is freed or reset.
6472
 *
6473
 * See htmlCtxtReadFd for details.
6474
 *
6475
 * Returns the resulting document tree
6476
 */
6477
htmlDocPtr
6478
htmlReadFd(int fd, const char *url, const char *encoding, int options)
6479
0
{
6480
0
    htmlParserCtxtPtr ctxt;
6481
0
    xmlParserInputPtr input;
6482
0
    htmlDocPtr doc;
6483
6484
0
    ctxt = htmlNewParserCtxt();
6485
0
    if (ctxt == NULL)
6486
0
        return(NULL);
6487
6488
0
    htmlCtxtUseOptions(ctxt, options);
6489
6490
0
    input = xmlNewInputFd(ctxt, url, fd, encoding, 0);
6491
6492
0
    doc = htmlCtxtParseDocument(ctxt, input);
6493
6494
0
    htmlFreeParserCtxt(ctxt);
6495
0
    return(doc);
6496
0
}
6497
6498
/**
6499
 * htmlReadIO:
6500
 * @ioread:  an I/O read function
6501
 * @ioclose:  an I/O close function (optional)
6502
 * @ioctx:  an I/O handler
6503
 * @url:  only used for error reporting (optional)
6504
 * @encoding:  the document encoding (optional)
6505
 * @options:  a combination of htmlParserOption(s)
6506
 *
6507
 * Convenience function to parse an HTML document from I/O functions
6508
 * and context.
6509
 *
6510
 * See htmlCtxtReadIO for details.
6511
 *
6512
 * Returns the resulting document tree
6513
 */
6514
htmlDocPtr
6515
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6516
          void *ioctx, const char *url, const char *encoding, int options)
6517
0
{
6518
0
    htmlParserCtxtPtr ctxt;
6519
0
    xmlParserInputPtr input;
6520
0
    htmlDocPtr doc;
6521
6522
0
    ctxt = htmlNewParserCtxt();
6523
0
    if (ctxt == NULL)
6524
0
        return (NULL);
6525
6526
0
    htmlCtxtUseOptions(ctxt, options);
6527
6528
0
    input = xmlNewInputIO(ctxt, url, ioread, ioclose, ioctx, encoding, 0);
6529
6530
0
    doc = htmlCtxtParseDocument(ctxt, input);
6531
6532
0
    htmlFreeParserCtxt(ctxt);
6533
0
    return(doc);
6534
0
}
6535
6536
/**
6537
 * htmlCtxtReadDoc:
6538
 * @ctxt:  an HTML parser context
6539
 * @str:  a pointer to a zero terminated string
6540
 * @URL:  only used for error reporting (optional)
6541
 * @encoding:  the document encoding (optional)
6542
 * @options:  a combination of htmlParserOptions
6543
 *
6544
 * Parse an HTML in-memory document and build a tree.
6545
 *
6546
 * See htmlCtxtUseOptions for details.
6547
 *
6548
 * Returns the resulting document tree
6549
 */
6550
htmlDocPtr
6551
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6552
                const char *URL, const char *encoding, int options)
6553
0
{
6554
0
    xmlParserInputPtr input;
6555
6556
0
    if (ctxt == NULL)
6557
0
        return (NULL);
6558
6559
0
    htmlCtxtReset(ctxt);
6560
0
    htmlCtxtUseOptions(ctxt, options);
6561
6562
0
    input = xmlNewInputString(ctxt, URL, (const char *) str, encoding, 0);
6563
6564
0
    return(htmlCtxtParseDocument(ctxt, input));
6565
0
}
6566
6567
/**
6568
 * htmlCtxtReadFile:
6569
 * @ctxt:  an HTML parser context
6570
 * @filename:  a file or URL
6571
 * @encoding:  the document encoding (optional)
6572
 * @options:  a combination of htmlParserOptions
6573
 *
6574
 * Parse an HTML file from the filesystem, the network or a
6575
 * user-defined resource loader.
6576
 *
6577
 * See xmlNewInputURL and htmlCtxtUseOptions for details.
6578
 *
6579
 * Returns the resulting document tree
6580
 */
6581
htmlDocPtr
6582
htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6583
                const char *encoding, int options)
6584
0
{
6585
0
    xmlParserInputPtr input;
6586
6587
0
    if (ctxt == NULL)
6588
0
        return (NULL);
6589
6590
0
    htmlCtxtReset(ctxt);
6591
0
    htmlCtxtUseOptions(ctxt, options);
6592
6593
0
    input = xmlNewInputURL(ctxt, filename, NULL, encoding, 0);
6594
6595
0
    return(htmlCtxtParseDocument(ctxt, input));
6596
0
}
6597
6598
/**
6599
 * htmlCtxtReadMemory:
6600
 * @ctxt:  an HTML parser context
6601
 * @buffer:  a pointer to a char array
6602
 * @size:  the size of the array
6603
 * @URL:  only used for error reporting (optional)
6604
 * @encoding:  the document encoding (optinal)
6605
 * @options:  a combination of htmlParserOptions
6606
 *
6607
 * Parse an HTML in-memory document and build a tree. The input buffer must
6608
 * not contain any terminating null bytes.
6609
 *
6610
 * See htmlCtxtUseOptions for details.
6611
 *
6612
 * Returns the resulting document tree
6613
 */
6614
htmlDocPtr
6615
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6616
                  const char *URL, const char *encoding, int options)
6617
0
{
6618
0
    xmlParserInputPtr input;
6619
6620
0
    if ((ctxt == NULL) || (size < 0))
6621
0
        return (NULL);
6622
6623
0
    htmlCtxtReset(ctxt);
6624
0
    htmlCtxtUseOptions(ctxt, options);
6625
6626
0
    input = xmlNewInputMemory(ctxt, URL, buffer, size, encoding,
6627
0
                              XML_INPUT_BUF_STATIC);
6628
6629
0
    return(htmlCtxtParseDocument(ctxt, input));
6630
0
}
6631
6632
/**
6633
 * htmlCtxtReadFd:
6634
 * @ctxt:  an HTML parser context
6635
 * @fd:  an open file descriptor
6636
 * @URL:  only used for error reporting (optional)
6637
 * @encoding:  the document encoding (optinal)
6638
 * @options:  a combination of htmlParserOptions
6639
 *
6640
 * Parse an HTML from a file descriptor and build a tree.
6641
 *
6642
 * See htmlCtxtUseOptions for details.
6643
 *
6644
 * NOTE that the file descriptor will not be closed when the
6645
 * context is freed or reset.
6646
 *
6647
 * Returns the resulting document tree
6648
 */
6649
htmlDocPtr
6650
htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6651
              const char *URL, const char *encoding, int options)
6652
0
{
6653
0
    xmlParserInputPtr input;
6654
6655
0
    if (ctxt == NULL)
6656
0
        return(NULL);
6657
6658
0
    htmlCtxtReset(ctxt);
6659
0
    htmlCtxtUseOptions(ctxt, options);
6660
6661
0
    input = xmlNewInputFd(ctxt, URL, fd, encoding, 0);
6662
6663
0
    return(htmlCtxtParseDocument(ctxt, input));
6664
0
}
6665
6666
/**
6667
 * htmlCtxtReadIO:
6668
 * @ctxt:  an HTML parser context
6669
 * @ioread:  an I/O read function
6670
 * @ioclose:  an I/O close function
6671
 * @ioctx:  an I/O handler
6672
 * @URL:  the base URL to use for the document
6673
 * @encoding:  the document encoding, or NULL
6674
 * @options:  a combination of htmlParserOption(s)
6675
 *
6676
 * Parse an HTML document from I/O functions and source and build a tree.
6677
 *
6678
 * See xmlNewInputIO and htmlCtxtUseOptions for details.
6679
 *
6680
 * Returns the resulting document tree
6681
 */
6682
htmlDocPtr
6683
htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6684
              xmlInputCloseCallback ioclose, void *ioctx,
6685
        const char *URL,
6686
              const char *encoding, int options)
6687
0
{
6688
0
    xmlParserInputPtr input;
6689
6690
0
    if (ctxt == NULL)
6691
0
        return (NULL);
6692
6693
0
    htmlCtxtReset(ctxt);
6694
0
    htmlCtxtUseOptions(ctxt, options);
6695
6696
0
    input = xmlNewInputIO(ctxt, URL, ioread, ioclose, ioctx, encoding, 0);
6697
6698
0
    return(htmlCtxtParseDocument(ctxt, input));
6699
0
}
6700
6701
#endif /* LIBXML_HTML_ENABLED */