Coverage Report

Created: 2022-02-21 09:34

/src/libxml2/HTMLparser.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLparser.c : an HTML 4.0 non-verifying parser
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
#define IN_LIBXML
10
#include "libxml.h"
11
#ifdef LIBXML_HTML_ENABLED
12
13
#include <string.h>
14
#ifdef HAVE_CTYPE_H
15
#include <ctype.h>
16
#endif
17
#ifdef HAVE_STDLIB_H
18
#include <stdlib.h>
19
#endif
20
#ifdef HAVE_SYS_STAT_H
21
#include <sys/stat.h>
22
#endif
23
#ifdef HAVE_FCNTL_H
24
#include <fcntl.h>
25
#endif
26
#ifdef HAVE_UNISTD_H
27
#include <unistd.h>
28
#endif
29
#ifdef LIBXML_ZLIB_ENABLED
30
#include <zlib.h>
31
#endif
32
33
#include <libxml/xmlmemory.h>
34
#include <libxml/tree.h>
35
#include <libxml/parser.h>
36
#include <libxml/parserInternals.h>
37
#include <libxml/xmlerror.h>
38
#include <libxml/HTMLparser.h>
39
#include <libxml/HTMLtree.h>
40
#include <libxml/entities.h>
41
#include <libxml/encoding.h>
42
#include <libxml/valid.h>
43
#include <libxml/xmlIO.h>
44
#include <libxml/globals.h>
45
#include <libxml/uri.h>
46
47
#include "buf.h"
48
#include "enc.h"
49
50
#define HTML_MAX_NAMELEN 1000
51
579M
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52
536M
#define HTML_PARSER_BUFFER_SIZE 100
53
54
/* #define DEBUG */
55
/* #define DEBUG_PUSH */
56
57
static int htmlOmittedDefaultValue = 1;
58
59
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60
           xmlChar end, xmlChar  end2, xmlChar end3);
61
static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63
/************************************************************************
64
 *                  *
65
 *    Some factorized error routines        *
66
 *                  *
67
 ************************************************************************/
68
69
/**
70
 * htmlErrMemory:
71
 * @ctxt:  an HTML parser context
72
 * @extra:  extra information
73
 *
74
 * Handle a redefinition of attribute error
75
 */
76
static void
77
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78
0
{
79
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80
0
        (ctxt->instate == XML_PARSER_EOF))
81
0
  return;
82
0
    if (ctxt != NULL) {
83
0
        ctxt->errNo = XML_ERR_NO_MEMORY;
84
0
        ctxt->instate = XML_PARSER_EOF;
85
0
        ctxt->disableSAX = 1;
86
0
    }
87
0
    if (extra)
88
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90
0
                        NULL, NULL, 0, 0,
91
0
                        "Memory allocation failed : %s\n", extra);
92
0
    else
93
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95
0
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
96
0
}
97
98
/**
99
 * htmlParseErr:
100
 * @ctxt:  an HTML parser context
101
 * @error:  the error number
102
 * @msg:  the error message
103
 * @str1:  string infor
104
 * @str2:  string infor
105
 *
106
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107
 */
108
static void LIBXML_ATTR_FORMAT(3,0)
109
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110
             const char *msg, const xmlChar *str1, const xmlChar *str2)
111
2.33M
{
112
2.33M
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113
2.33M
        (ctxt->instate == XML_PARSER_EOF))
114
87
  return;
115
2.33M
    if (ctxt != NULL)
116
2.33M
  ctxt->errNo = error;
117
2.33M
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118
2.33M
                    XML_ERR_ERROR, NULL, 0,
119
2.33M
        (const char *) str1, (const char *) str2,
120
2.33M
        NULL, 0, 0,
121
2.33M
        msg, str1, str2);
122
2.33M
    if (ctxt != NULL)
123
2.33M
  ctxt->wellFormed = 0;
124
2.33M
}
125
126
/**
127
 * htmlParseErrInt:
128
 * @ctxt:  an HTML parser context
129
 * @error:  the error number
130
 * @msg:  the error message
131
 * @val:  integer info
132
 *
133
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134
 */
135
static void LIBXML_ATTR_FORMAT(3,0)
136
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137
             const char *msg, int val)
138
4.12M
{
139
4.12M
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140
4.12M
        (ctxt->instate == XML_PARSER_EOF))
141
0
  return;
142
4.12M
    if (ctxt != NULL)
143
4.12M
  ctxt->errNo = error;
144
4.12M
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145
4.12M
                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
146
4.12M
        NULL, val, 0, msg, val);
147
4.12M
    if (ctxt != NULL)
148
4.12M
  ctxt->wellFormed = 0;
149
4.12M
}
150
151
/************************************************************************
152
 *                  *
153
 *  Parser stacks related functions and macros    *
154
 *                  *
155
 ************************************************************************/
156
157
/**
158
 * htmlnamePush:
159
 * @ctxt:  an HTML parser context
160
 * @value:  the element name
161
 *
162
 * Pushes a new element name on top of the name stack
163
 *
164
 * Returns 0 in case of error, the index in the stack otherwise
165
 */
166
static int
167
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168
17.7M
{
169
17.7M
    if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170
3.96k
        ctxt->html = 3;
171
17.7M
    if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172
8.04k
        ctxt->html = 10;
173
17.7M
    if (ctxt->nameNr >= ctxt->nameMax) {
174
6.20k
        ctxt->nameMax *= 2;
175
6.20k
        ctxt->nameTab = (const xmlChar * *)
176
6.20k
                         xmlRealloc((xmlChar * *)ctxt->nameTab,
177
6.20k
                                    ctxt->nameMax *
178
6.20k
                                    sizeof(ctxt->nameTab[0]));
179
6.20k
        if (ctxt->nameTab == NULL) {
180
0
            htmlErrMemory(ctxt, NULL);
181
0
            return (0);
182
0
        }
183
6.20k
    }
184
17.7M
    ctxt->nameTab[ctxt->nameNr] = value;
185
17.7M
    ctxt->name = value;
186
17.7M
    return (ctxt->nameNr++);
187
17.7M
}
188
/**
189
 * htmlnamePop:
190
 * @ctxt: an HTML parser context
191
 *
192
 * Pops the top element name from the name stack
193
 *
194
 * Returns the name just removed
195
 */
196
static const xmlChar *
197
htmlnamePop(htmlParserCtxtPtr ctxt)
198
17.0M
{
199
17.0M
    const xmlChar *ret;
200
201
17.0M
    if (ctxt->nameNr <= 0)
202
0
        return (NULL);
203
17.0M
    ctxt->nameNr--;
204
17.0M
    if (ctxt->nameNr < 0)
205
0
        return (NULL);
206
17.0M
    if (ctxt->nameNr > 0)
207
16.9M
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208
29.5k
    else
209
29.5k
        ctxt->name = NULL;
210
17.0M
    ret = ctxt->nameTab[ctxt->nameNr];
211
17.0M
    ctxt->nameTab[ctxt->nameNr] = NULL;
212
17.0M
    return (ret);
213
17.0M
}
214
215
/**
216
 * htmlNodeInfoPush:
217
 * @ctxt:  an HTML parser context
218
 * @value:  the node info
219
 *
220
 * Pushes a new element name on top of the node info stack
221
 *
222
 * Returns 0 in case of error, the index in the stack otherwise
223
 */
224
static int
225
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226
0
{
227
0
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228
0
        if (ctxt->nodeInfoMax == 0)
229
0
                ctxt->nodeInfoMax = 5;
230
0
        ctxt->nodeInfoMax *= 2;
231
0
        ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232
0
                         xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233
0
                                    ctxt->nodeInfoMax *
234
0
                                    sizeof(ctxt->nodeInfoTab[0]));
235
0
        if (ctxt->nodeInfoTab == NULL) {
236
0
            htmlErrMemory(ctxt, NULL);
237
0
            return (0);
238
0
        }
239
0
    }
240
0
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241
0
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242
0
    return (ctxt->nodeInfoNr++);
243
0
}
244
245
/**
246
 * htmlNodeInfoPop:
247
 * @ctxt:  an HTML parser context
248
 *
249
 * Pops the top element name from the node info stack
250
 *
251
 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252
 */
253
static htmlParserNodeInfo *
254
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255
37.8k
{
256
37.8k
    if (ctxt->nodeInfoNr <= 0)
257
37.8k
        return (NULL);
258
0
    ctxt->nodeInfoNr--;
259
0
    if (ctxt->nodeInfoNr < 0)
260
0
        return (NULL);
261
0
    if (ctxt->nodeInfoNr > 0)
262
0
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263
0
    else
264
0
        ctxt->nodeInfo = NULL;
265
0
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266
0
}
267
268
/*
269
 * Macros for accessing the content. Those should be used only by the parser,
270
 * and not exported.
271
 *
272
 * Dirty macros, i.e. one needs to make assumptions about the context to use them
273
 *
274
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
275
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
276
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
277
 *           in UNICODE mode. This should be used internally by the parser
278
 *           only to compare to ASCII values otherwise it would break when
279
 *           running with UTF-8 encoding.
280
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
281
 *           to compare on ASCII based substring.
282
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
283
 *           it should be used only to compare on ASCII based substring.
284
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
285
 *           strings without newlines within the parser.
286
 *
287
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
288
 *
289
 *   CURRENT Returns the current char value, with the full decoding of
290
 *           UTF-8 if we are using this mode. It returns an int.
291
 *   NEXT    Skip to the next character, this does the proper decoding
292
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
293
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
294
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
295
 */
296
297
21.0k
#define UPPER (toupper(*ctxt->input->cur))
298
299
1.46M
#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
300
301
91.7M
#define NXT(val) ctxt->input->cur[(val)]
302
303
607k
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
304
305
289k
#define CUR_PTR ctxt->input->cur
306
291k
#define BASE_PTR ctxt->input->base
307
308
5.41M
/*
 * SHRINK calls xmlParserInputShrink() on the current input when more than
 * 2 * INPUT_CHUNK bytes lie between base and cur and fewer than
 * 2 * INPUT_CHUNK bytes remain between cur and end.
 * Wrapped in do/while (0) so that the macro is one statement and cannot
 * capture an "else" that follows it at the call site.
 */
#define SHRINK do {							\
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&	\
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))	\
        xmlParserInputShrink(ctxt->input);				\
  } while (0)
311
312
152M
/*
 * GROW calls xmlParserInputGrow() on the current input when the parser is
 * not in progressive mode and fewer than INPUT_CHUNK bytes remain between
 * cur and end.
 * Wrapped in do/while (0) so that the macro is one statement and cannot
 * capture an "else" that follows it at the call site.
 */
#define GROW do {							\
    if ((ctxt->progressive == 0) &&					\
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))		\
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);			\
  } while (0)
315
316
#define CURRENT ((int) (*ctxt->input->cur))
317
318
19.2M
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
319
320
/* Imported from XML */
321
322
/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
323
978M
#define CUR ((int) (*ctxt->input->cur))
324
261M
#define NEXT xmlNextChar(ctxt)
325
326
286k
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
327
328
329
955M
#define NEXTL(l) do {             \
330
955M
    if (*(ctxt->input->cur) == '\n') {         \
331
14.7M
  ctxt->input->line++; ctxt->input->col = 1;      \
332
940M
    } else ctxt->input->col++;           \
333
955M
    ctxt->token = 0; ctxt->input->cur += l;       \
334
955M
  } while (0)
335
336
/************
337
    \
338
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
339
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
340
 ************/
341
342
1.13G
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
343
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
344
345
#define COPY_BUF(l,b,i,v)           \
346
760M
    if (l == 1) b[i++] = (xmlChar) v;         \
347
760M
    else i += xmlCopyChar(l,&b[i],v)
348
349
/**
350
 * htmlFindEncoding:
351
 * @the HTML parser context
352
 *
353
 * Ty to find and encoding in the current data available in the input
354
 * buffer this is needed to try to switch to the proper encoding when
355
 * one face a character error.
356
 * That's an heuristic, since it's operating outside of parsing it could
357
 * try to use a meta which had been commented out, that's the reason it
358
 * should only be used in case of error, not as a default.
359
 *
360
 * Returns an encoding string or NULL if not found, the string need to
361
 *   be freed
362
 */
363
static xmlChar *
364
7.27k
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365
7.27k
    const xmlChar *start, *cur, *end;
366
367
7.27k
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
368
7.27k
        (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369
7.27k
        (ctxt->input->buf->encoder != NULL))
370
1.93k
        return(NULL);
371
5.34k
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372
0
        return(NULL);
373
374
5.34k
    start = ctxt->input->cur;
375
5.34k
    end = ctxt->input->end;
376
    /* we also expect the input buffer to be zero terminated */
377
5.34k
    if (*end != 0)
378
0
        return(NULL);
379
380
5.34k
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381
5.34k
    if (cur == NULL)
382
4.21k
        return(NULL);
383
1.12k
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
384
1.12k
    if (cur == NULL)
385
26
        return(NULL);
386
1.09k
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
387
1.09k
    if (cur == NULL)
388
47
        return(NULL);
389
1.04k
    cur += 8;
390
1.04k
    start = cur;
391
117k
    while (((*cur >= 'A') && (*cur <= 'Z')) ||
392
117k
           ((*cur >= 'a') && (*cur <= 'z')) ||
393
117k
           ((*cur >= '0') && (*cur <= '9')) ||
394
117k
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395
116k
           cur++;
396
1.04k
    if (cur == start)
397
11
        return(NULL);
398
1.03k
    return(xmlStrndup(start, cur - start));
399
1.04k
}
400
401
/**
402
 * htmlCurrentChar:
403
 * @ctxt:  the HTML parser context
404
 * @len:  pointer to the length of the char read
405
 *
406
 * The current char value, if using UTF-8 this may actually span multiple
407
 * bytes in the input buffer. Implement the end of line normalization:
408
 * 2.11 End-of-Line Handling
409
 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410
 * char, then the encoding converter is plugged in automatically.
411
 *
412
 * Returns the current char value and its length
413
 */
414
415
static int
416
1.13G
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417
1.13G
    const unsigned char *cur;
418
1.13G
    unsigned char c;
419
1.13G
    unsigned int val;
420
421
1.13G
    if (ctxt->instate == XML_PARSER_EOF)
422
62
  return(0);
423
424
1.13G
    if (ctxt->token != 0) {
425
0
  *len = 0;
426
0
  return(ctxt->token);
427
0
    }
428
1.13G
    if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
429
777k
        xmlChar * guess;
430
777k
        xmlCharEncodingHandlerPtr handler;
431
432
        /*
433
         * Assume it's a fixed length encoding (1) with
434
         * a compatible encoding for the ASCII set, since
435
         * HTML constructs only use < 128 chars
436
         */
437
777k
        if ((int) *ctxt->input->cur < 0x80) {
438
770k
            *len = 1;
439
770k
            if ((*ctxt->input->cur == 0) &&
440
770k
                (ctxt->input->cur < ctxt->input->end)) {
441
197k
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
442
197k
                                "Char 0x%X out of allowed range\n", 0);
443
197k
                return(' ');
444
197k
            }
445
573k
            return((int) *ctxt->input->cur);
446
770k
        }
447
448
        /*
449
         * Humm this is bad, do an automatic flow conversion
450
         */
451
7.27k
        guess = htmlFindEncoding(ctxt);
452
7.27k
        if (guess == NULL) {
453
6.23k
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
454
6.23k
        } else {
455
1.03k
            if (ctxt->input->encoding != NULL)
456
0
                xmlFree((xmlChar *) ctxt->input->encoding);
457
1.03k
            ctxt->input->encoding = guess;
458
1.03k
            handler = xmlFindCharEncodingHandler((const char *) guess);
459
1.03k
            if (handler != NULL) {
460
                /*
461
                 * Don't use UTF-8 encoder which isn't required and
462
                 * can produce invalid UTF-8.
463
                 */
464
876
                if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
465
871
                    xmlSwitchToEncoding(ctxt, handler);
466
876
            } else {
467
162
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
468
162
                             "Unsupported encoding %s", guess, NULL);
469
162
            }
470
1.03k
        }
471
7.27k
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
472
7.27k
    }
473
474
    /*
475
     * We are supposed to handle UTF8, check it's valid
476
     * From rfc2044: encoding of the Unicode values on UTF-8:
477
     *
478
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
479
     * 0000 0000-0000 007F   0xxxxxxx
480
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
481
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
482
     *
483
     * Check for the 0x110000 limit too
484
     */
485
1.13G
    cur = ctxt->input->cur;
486
1.13G
    c = *cur;
487
1.13G
    if (c & 0x80) {
488
1.10G
        if ((c & 0x40) == 0)
489
13.3k
            goto encoding_error;
490
1.10G
        if (cur[1] == 0) {
491
226
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
492
226
            cur = ctxt->input->cur;
493
226
        }
494
1.10G
        if ((cur[1] & 0xc0) != 0x80)
495
1.61k
            goto encoding_error;
496
1.10G
        if ((c & 0xe0) == 0xe0) {
497
498
1.06G
            if (cur[2] == 0) {
499
22
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
500
22
                cur = ctxt->input->cur;
501
22
            }
502
1.06G
            if ((cur[2] & 0xc0) != 0x80)
503
60
                goto encoding_error;
504
1.06G
            if ((c & 0xf0) == 0xf0) {
505
5.17k
                if (cur[3] == 0) {
506
7
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
507
7
                    cur = ctxt->input->cur;
508
7
                }
509
5.17k
                if (((c & 0xf8) != 0xf0) ||
510
5.17k
                    ((cur[3] & 0xc0) != 0x80))
511
2.21k
                    goto encoding_error;
512
                /* 4-byte code */
513
2.96k
                *len = 4;
514
2.96k
                val = (cur[0] & 0x7) << 18;
515
2.96k
                val |= (cur[1] & 0x3f) << 12;
516
2.96k
                val |= (cur[2] & 0x3f) << 6;
517
2.96k
                val |= cur[3] & 0x3f;
518
2.96k
                if (val < 0x10000)
519
18
                    goto encoding_error;
520
1.06G
            } else {
521
              /* 3-byte code */
522
1.06G
                *len = 3;
523
1.06G
                val = (cur[0] & 0xf) << 12;
524
1.06G
                val |= (cur[1] & 0x3f) << 6;
525
1.06G
                val |= cur[2] & 0x3f;
526
1.06G
                if (val < 0x800)
527
10
                    goto encoding_error;
528
1.06G
            }
529
1.06G
        } else {
530
          /* 2-byte code */
531
39.7M
            *len = 2;
532
39.7M
            val = (cur[0] & 0x1f) << 6;
533
39.7M
            val |= cur[1] & 0x3f;
534
39.7M
            if (val < 0x80)
535
12
                goto encoding_error;
536
39.7M
        }
537
1.10G
        if (!IS_CHAR(val)) {
538
7.54k
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
539
7.54k
                            "Char 0x%X out of allowed range\n", val);
540
7.54k
        }
541
1.10G
        return(val);
542
1.10G
    } else {
543
31.0M
        if ((*ctxt->input->cur == 0) &&
544
31.0M
            (ctxt->input->cur < ctxt->input->end)) {
545
3.35M
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
546
3.35M
                            "Char 0x%X out of allowed range\n", 0);
547
3.35M
            *len = 1;
548
3.35M
            return(' ');
549
3.35M
        }
550
        /* 1-byte code */
551
27.7M
        *len = 1;
552
27.7M
        return((int) *ctxt->input->cur);
553
31.0M
    }
554
555
17.2k
encoding_error:
556
    /*
557
     * If we detect an UTF8 error that probably mean that the
558
     * input encoding didn't get properly advertised in the
559
     * declaration header. Report the error and switch the encoding
560
     * to ISO-Latin-1 (if you don't like this policy, just declare the
561
     * encoding !)
562
     */
563
17.2k
    {
564
17.2k
        char buffer[150];
565
566
17.2k
  if (ctxt->input->end - ctxt->input->cur >= 4) {
567
15.6k
      snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
568
15.6k
          ctxt->input->cur[0], ctxt->input->cur[1],
569
15.6k
          ctxt->input->cur[2], ctxt->input->cur[3]);
570
15.6k
  } else {
571
1.55k
      snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
572
1.55k
  }
573
17.2k
  htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
574
17.2k
         "Input is not proper UTF-8, indicate encoding !\n",
575
17.2k
         BAD_CAST buffer, NULL);
576
17.2k
    }
577
578
    /*
579
     * Don't switch encodings twice. Note that if there's an encoder, we
580
     * shouldn't receive invalid UTF-8 anyway.
581
     *
582
     * Note that if ctxt->input->buf == NULL, switching encodings is
583
     * impossible, see Gitlab issue #34.
584
     */
585
17.2k
    if ((ctxt->input->buf != NULL) &&
586
17.2k
        (ctxt->input->buf->encoder == NULL))
587
2.97k
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
588
17.2k
    *len = 1;
589
17.2k
    return((int) *ctxt->input->cur);
590
1.13G
}
591
592
/**
593
 * htmlSkipBlankChars:
594
 * @ctxt:  the HTML parser context
595
 *
596
 * skip all blanks character found at that point in the input streams.
597
 *
598
 * Returns the number of space chars skipped
599
 */
600
601
static int
602
19.2M
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
603
19.2M
    int res = 0;
604
605
19.2M
    while (IS_BLANK_CH(*(ctxt->input->cur))) {
606
2.63M
  if ((*ctxt->input->cur == 0) &&
607
2.63M
      (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
608
0
    xmlPopInput(ctxt);
609
2.63M
  } else {
610
2.63M
      if (*(ctxt->input->cur) == '\n') {
611
2.39M
    ctxt->input->line++; ctxt->input->col = 1;
612
2.39M
      } else ctxt->input->col++;
613
2.63M
      ctxt->input->cur++;
614
2.63M
      if (*ctxt->input->cur == 0)
615
1.23k
    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
616
2.63M
  }
617
2.63M
  res++;
618
2.63M
    }
619
19.2M
    return(res);
620
19.2M
}
621
622
623
624
/************************************************************************
625
 *                  *
626
 *  The list of HTML elements and their properties    *
627
 *                  *
628
 ************************************************************************/
629
630
/*
631
 *  Start Tag: 1 means the start tag can be omitted
632
 *  End Tag:   1 means the end tag can be omitted
633
 *             2 means it's forbidden (empty elements)
634
 *             3 means the tag is stylistic and should be closed easily
635
 *  Depr:      this element is deprecated
636
 *  DTD:       1 means that this element is valid only in the Loose DTD
637
 *             2 means that this element is valid only in the Frameset DTD
638
 *
639
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
640
  , subElements , impliedsubelt , Attributes, userdata
641
 */
642
643
/* Definitions and a couple of vars for HTML Elements */
644
645
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
646
#define NB_FONTSTYLE 8
647
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
648
#define NB_PHRASE 10
649
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
650
#define NB_SPECIAL 16
651
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
652
#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
653
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
654
#define NB_BLOCK NB_HEADING + NB_LIST + 14
655
#define FORMCTRL "input", "select", "textarea", "label", "button"
656
#define NB_FORMCTRL 5
657
#define PCDATA
658
#define NB_PCDATA 0
659
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
660
#define NB_HEADING 6
661
#define LIST "ul", "ol", "dir", "menu"
662
#define NB_LIST 4
663
#define MODIFIER
664
#define NB_MODIFIER 0
665
#define FLOW BLOCK,INLINE
666
#define NB_FLOW NB_BLOCK + NB_INLINE
667
#define EMPTY NULL
668
669
670
static const char* const html_flow[] = { FLOW, NULL } ;
671
static const char* const html_inline[] = { INLINE, NULL } ;
672
673
/* placeholders: elts with content but no subelements */
674
static const char* const html_pcdata[] = { NULL } ;
675
#define html_cdata html_pcdata
676
677
678
/* ... and for HTML Attributes */
679
680
#define COREATTRS "id", "class", "style", "title"
681
#define NB_COREATTRS 4
682
#define I18N "lang", "dir"
683
#define NB_I18N 2
684
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
685
#define NB_EVENTS 9
686
#define ATTRS COREATTRS,I18N,EVENTS
687
/* Number of entries in ATTRS: core, i18n and event attributes together. */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
688
#define CELLHALIGN "align", "char", "charoff"
689
#define NB_CELLHALIGN 3
690
#define CELLVALIGN "valign"
691
#define NB_CELLVALIGN 1
692
693
static const char* const html_attrs[] = { ATTRS, NULL } ;
694
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
695
static const char* const core_attrs[] = { COREATTRS, NULL } ;
696
static const char* const i18n_attrs[] = { I18N, NULL } ;
697
698
699
/* Other declarations that should go inline ... */
700
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
701
  "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
702
  "tabindex", "onfocus", "onblur", NULL } ;
703
static const char* const target_attr[] = { "target", NULL } ;
704
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
705
static const char* const alt_attr[] = { "alt", NULL } ;
706
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
707
static const char* const href_attrs[] = { "href", NULL } ;
708
static const char* const clear_attrs[] = { "clear", NULL } ;
709
static const char* const inline_p[] = { INLINE, "p", NULL } ;
710
711
static const char* const flow_param[] = { FLOW, "param", NULL } ;
712
static const char* const applet_attrs[] = { COREATTRS , "codebase",
713
    "archive", "alt", "name", "height", "width", "align",
714
    "hspace", "vspace", NULL } ;
715
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
716
  "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
717
static const char* const basefont_attrs[] =
718
  { "id", "size", "color", "face", NULL } ;
719
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
720
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
721
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
722
static const char* const body_depr[] = { "background", "bgcolor", "text",
723
  "link", "vlink", "alink", NULL } ;
724
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
725
  "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
726
727
728
static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
729
static const char* const col_elt[] = { "col", NULL } ;
730
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
731
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
732
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
733
static const char* const compact_attr[] = { "compact", NULL } ;
734
static const char* const label_attr[] = { "label", NULL } ;
735
static const char* const fieldset_contents[] = { FLOW, "legend" } ;
736
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
737
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
738
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
739
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
740
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
741
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
742
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
743
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
744
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
745
static const char* const version_attr[] = { "version", NULL } ;
746
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
747
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
748
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
749
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
750
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
751
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
752
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
753
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
754
static const char* const align_attr[] = { "align", NULL } ;
755
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
756
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
757
static const char* const name_attr[] = { "name", NULL } ;
758
static const char* const action_attr[] = { "action", NULL } ;
759
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
760
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
761
static const char* const content_attr[] = { "content", NULL } ;
762
static const char* const type_attr[] = { "type", NULL } ;
763
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
764
static const char* const object_contents[] = { FLOW, "param", NULL } ;
765
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
766
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
767
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
768
static const char* const option_elt[] = { "option", NULL } ;
769
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
770
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
771
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
772
static const char* const width_attr[] = { "width", NULL } ;
773
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
774
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
775
static const char* const language_attr[] = { "language", NULL } ;
776
static const char* const select_content[] = { "optgroup", "option", NULL } ;
777
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
778
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
779
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
780
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
781
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
782
static const char* const tr_elt[] = { "tr", NULL } ;
783
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
784
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
785
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
786
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
787
static const char* const tr_contents[] = { "th", "td", NULL } ;
788
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
789
static const char* const li_elt[] = { "li", NULL } ;
790
static const char* const ul_depr[] = { "type", "compact", NULL} ;
791
static const char* const dir_attr[] = { "dir", NULL} ;
792
793
#define DECL (const char**)
794
795
static const htmlElemDesc
796
html40ElementTable[] = {
797
{ "a",    0, 0, 0, 0, 0, 0, 1, "anchor ",
798
  DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
799
},
800
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
801
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
802
},
803
{ "acronym",  0, 0, 0, 0, 0, 0, 1, "",
804
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
805
},
806
{ "address",  0, 0, 0, 0, 0, 0, 0, "information on author ",
807
  DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
808
},
809
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
810
  DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
811
},
812
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
813
  EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
814
},
815
{ "b",    0, 3, 0, 0, 0, 0, 1, "bold text style",
816
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
817
},
818
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
819
  EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
820
},
821
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
822
  EMPTY , NULL , NULL, DECL basefont_attrs, NULL
823
},
824
{ "bdo",  0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
825
  DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
826
},
827
{ "big",  0, 3, 0, 0, 0, 0, 1, "large text style",
828
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
829
},
830
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
831
  DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
832
},
833
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
834
  DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
835
},
836
{ "br",   0, 2, 2, 1, 0, 0, 1, "forced line break ",
837
  EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
838
},
839
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
840
  DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
841
},
842
{ "caption",  0, 0, 0, 0, 0, 0, 0, "table caption ",
843
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
844
},
845
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
846
  DECL html_flow , NULL , NULL, DECL html_attrs, NULL
847
},
848
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
849
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
850
},
851
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
852
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
853
},
854
{ "col",  0, 2, 2, 1, 0, 0, 0, "table column ",
855
  EMPTY , NULL , DECL col_attrs , NULL, NULL
856
},
857
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
858
  DECL col_elt , "col" , DECL col_attrs , NULL, NULL
859
},
860
{ "dd",   0, 1, 0, 0, 0, 0, 0, "definition description ",
861
  DECL html_flow , NULL , DECL html_attrs, NULL, NULL
862
},
863
{ "del",  0, 0, 0, 0, 0, 0, 2, "deleted text ",
864
  DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
865
},
866
{ "dfn",  0, 0, 0, 0, 0, 0, 1, "instance definition",
867
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
868
},
869
{ "dir",  0, 0, 0, 0, 1, 1, 0, "directory list",
870
  DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
871
},
872
{ "div",  0, 0, 0, 0, 0, 0, 0, "generic language/style container",
873
  DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
874
},
875
{ "dl",   0, 0, 0, 0, 0, 0, 0, "definition list ",
876
  DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
877
},
878
{ "dt",   0, 1, 0, 0, 0, 0, 0, "definition term ",
879
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
880
},
881
{ "em",   0, 3, 0, 0, 0, 0, 1, "emphasis",
882
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
883
},
884
{ "embed",  0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
885
  EMPTY, NULL, DECL embed_attrs, NULL, NULL
886
},
887
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
888
  DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
889
},
890
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
891
  DECL html_inline, NULL, NULL, DECL font_attrs, NULL
892
},
893
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
894
  DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
895
},
896
{ "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
897
  EMPTY, NULL, NULL, DECL frame_attrs, NULL
898
},
899
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
900
  DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
901
},
902
{ "h1",   0, 0, 0, 0, 0, 0, 0, "heading ",
903
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
904
},
905
{ "h2",   0, 0, 0, 0, 0, 0, 0, "heading ",
906
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
907
},
908
{ "h3",   0, 0, 0, 0, 0, 0, 0, "heading ",
909
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
910
},
911
{ "h4",   0, 0, 0, 0, 0, 0, 0, "heading ",
912
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
913
},
914
{ "h5",   0, 0, 0, 0, 0, 0, 0, "heading ",
915
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
916
},
917
{ "h6",   0, 0, 0, 0, 0, 0, 0, "heading ",
918
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
919
},
920
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
921
  DECL head_contents, NULL, DECL head_attrs, NULL, NULL
922
},
923
{ "hr",   0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
924
  EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
925
},
926
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
927
  DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
928
},
929
{ "i",    0, 3, 0, 0, 0, 0, 1, "italic text style",
930
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
931
},
932
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
933
  DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
934
},
935
{ "img",  0, 2, 2, 1, 0, 0, 1, "embedded image ",
936
  EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
937
},
938
{ "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
939
  EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
940
},
941
{ "ins",  0, 0, 0, 0, 0, 0, 2, "inserted text",
942
  DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
943
},
944
{ "isindex",  0, 2, 2, 1, 1, 1, 0, "single line prompt ",
945
  EMPTY, NULL, NULL, DECL prompt_attrs, NULL
946
},
947
{ "kbd",  0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
948
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
949
},
950
{ "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
951
  DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
952
},
953
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
954
  DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
955
},
956
{ "li",   0, 1, 1, 0, 0, 0, 0, "list item ",
957
  DECL html_flow, NULL, DECL html_attrs, NULL, NULL
958
},
959
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
960
  EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
961
},
962
{ "map",  0, 0, 0, 0, 0, 0, 2, "client-side image map ",
963
  DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
964
},
965
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
966
  DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
967
},
968
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
969
  EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
970
},
971
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
972
  DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
973
},
974
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
975
  DECL html_flow, "div", DECL html_attrs, NULL, NULL
976
},
977
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
978
  DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
979
},
980
{ "ol",   0, 0, 0, 0, 0, 0, 0, "ordered list ",
981
  DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
982
},
983
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
984
  DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
985
},
986
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
987
  DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
988
},
989
{ "p",    0, 1, 0, 0, 0, 0, 0, "paragraph ",
990
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
991
},
992
{ "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
993
  EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
994
},
995
{ "pre",  0, 0, 0, 0, 0, 0, 0, "preformatted text ",
996
  DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
997
},
998
{ "q",    0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
999
  DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
1000
},
1001
{ "s",    0, 3, 0, 0, 1, 1, 1, "strike-through text style",
1002
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1003
},
1004
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
1005
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1006
},
1007
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
1008
  DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
1009
},
1010
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
1011
  DECL select_content, NULL, DECL select_attrs, NULL, NULL
1012
},
1013
{ "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
1014
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1015
},
1016
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1017
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1018
},
1019
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1020
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1021
},
1022
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1023
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1024
},
1025
{ "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
1026
  DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1027
},
1028
{ "sub",  0, 3, 0, 0, 0, 0, 1, "subscript",
1029
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1030
},
1031
{ "sup",  0, 3, 0, 0, 0, 0, 1, "superscript ",
1032
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1033
},
1034
{ "table",  0, 0, 0, 0, 0, 0, 0, "",
1035
  DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1036
},
1037
{ "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
1038
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1039
},
1040
{ "td",   0, 0, 0, 0, 0, 0, 0, "table data cell",
1041
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1042
},
1043
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1044
  DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1045
},
1046
{ "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
1047
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1048
},
1049
{ "th",   0, 1, 0, 0, 0, 0, 0, "table header cell",
1050
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1051
},
1052
{ "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
1053
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1054
},
1055
{ "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
1056
  DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1057
},
1058
{ "tr",   0, 0, 0, 0, 0, 0, 0, "table row ",
1059
  DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1060
},
1061
{ "tt",   0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1062
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1063
},
1064
{ "u",    0, 3, 0, 0, 1, 1, 1, "underlined text style",
1065
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1066
},
1067
{ "ul",   0, 0, 0, 0, 0, 0, 0, "unordered list ",
1068
  DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1069
},
1070
{ "var",  0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1071
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1072
}
1073
};
1074
1075
/*
 * One auto-close rule: opening a <newTag> element implies the end of a
 * currently open <oldTag> element.
 */
typedef struct {
    const char *oldTag;
    const char *newTag;
} htmlStartCloseEntry;

/*
 * Start tags that imply the end of the current element.
 *
 * The table MUST stay sorted by (oldTag, newTag) in strcmp() order:
 * htmlCheckAutoClose() looks rules up with bsearch().
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    { "big", "p" },
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    { "dd", "dt" },
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    { "dl", "form" }, { "dl", "li" },
    { "dt", "dd" }, { "dt", "dl" },
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    { "form", "form" },
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    { "hr", "form" },
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    { "legend", "fieldset" },
    { "li", "li" },
    { "link", "body" }, { "link", "frameset" },
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    { "ol", "form" }, { "ol", "ul" },
    { "option", "optgroup" }, { "option", "option" },
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" }, { "p", "div" },
    { "p", "dl" }, { "p", "dt" }, { "p", "fieldset" }, { "p", "form" },
    { "p", "frameset" }, { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" }, { "p", "head" },
    { "p", "hr" }, { "p", "li" }, { "p", "listing" }, { "p", "menu" },
    { "p", "ol" }, { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" }, { "p", "th" },
    { "p", "title" }, { "p", "tr" }, { "p", "ul" }, { "p", "xmp" },
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" }, { "pre", "fieldset" },
    { "pre", "form" }, { "pre", "li" }, { "pre", "table" }, { "pre", "ul" },
    { "s", "p" },
    { "script", "noscript" },
    { "small", "p" },
    { "span", "td" }, { "span", "th" },
    { "strike", "p" },
    { "style", "body" }, { "style", "frameset" },
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    { "tfoot", "tbody" },
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    { "thead", "tbody" }, { "thead", "tfoot" },
    { "title", "body" }, { "title", "frameset" },
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    { "tt", "p" },
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "ol" },
    { "ul", "pre" },
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" }, { "xmp", "fieldset" },
    { "xmp", "form" }, { "xmp", "li" }, { "xmp", "table" }, { "xmp", "ul" }
};
1336
1337
/*
 * Elements that are not supposed to carry CDATA content directly and
 * for which a p element will be implied. NULL-terminated.
 *
 * TODO: extend this list by reading the HTML SGML DTD on
 *       implied paragraphs
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1349
1350
/*
 * Attributes whose content is of type %Script;.
 * NOTE: before adding an entry, check htmlIsScriptAttribute(): it
 *       assumes that every name here starts with 'on'.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    "onkeypress", "onkeydown", "onkeyup",
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset",
    "onchange", "onselect"
};
1375
1376
/*
 * Priorities used to recover from broken HTML pages: they let the
 * parser decide what to do with an unexpected end tag. An end tag may
 * only close open elements whose priority is lower than or equal to
 * its own.
 */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The list ends with a NULL-named sentinel whose priority is the
 * default for every element not named explicitly.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 } /* Default priority */
};
1403
1404
/************************************************************************
1405
 *                  *
1406
 *  functions to handle HTML specific data      *
1407
 *                  *
1408
 ************************************************************************/
1409
1410
/**
 * htmlInitAutoClose:
 *
 * Does nothing; the function body is empty.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1418
1419
static int
1420
178M
htmlCompareTags(const void *key, const void *member) {
1421
178M
    const xmlChar *tag = (const xmlChar *) key;
1422
178M
    const htmlElemDesc *desc = (const htmlElemDesc *) member;
1423
1424
178M
    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1425
178M
}
1426
1427
/**
1428
 * htmlTagLookup:
1429
 * @tag:  The tag name in lowercase
1430
 *
1431
 * Lookup the HTML tag in the ElementTable
1432
 *
1433
 * Returns the related htmlElemDescPtr or NULL if not found.
1434
 */
1435
const htmlElemDesc *
1436
29.6M
htmlTagLookup(const xmlChar *tag) {
1437
29.6M
    if (tag == NULL)
1438
0
        return(NULL);
1439
1440
29.6M
    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1441
29.6M
                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1442
29.6M
                sizeof(htmlElemDesc), htmlCompareTags));
1443
29.6M
}
1444
1445
/**
1446
 * htmlGetEndPriority:
1447
 * @name: The name of the element to look up the priority for.
1448
 *
1449
 * Return value: The "endtag" priority.
1450
 **/
1451
static int
1452
96.1M
htmlGetEndPriority (const xmlChar *name) {
1453
96.1M
    int i = 0;
1454
1455
1.15G
    while ((htmlEndPriority[i].name != NULL) &&
1456
1.15G
     (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1457
1.05G
  i++;
1458
1459
96.1M
    return(htmlEndPriority[i].priority);
1460
96.1M
}
1461
1462
1463
static int
1464
224M
htmlCompareStartClose(const void *vkey, const void *member) {
1465
224M
    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1466
224M
    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1467
224M
    int ret;
1468
1469
224M
    ret = strcmp(key->oldTag, entry->oldTag);
1470
224M
    if (ret == 0)
1471
53.7M
        ret = strcmp(key->newTag, entry->newTag);
1472
1473
224M
    return(ret);
1474
224M
}
1475
1476
/**
1477
 * htmlCheckAutoClose:
1478
 * @newtag:  The new tag name
1479
 * @oldtag:  The old tag name
1480
 *
1481
 * Checks whether the new tag is one of the registered valid tags for
1482
 * closing old.
1483
 *
1484
 * Returns 0 if no, 1 if yes.
1485
 */
1486
static int
1487
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1488
28.0M
{
1489
28.0M
    htmlStartCloseEntry key;
1490
28.0M
    void *res;
1491
1492
28.0M
    key.oldTag = (const char *) oldtag;
1493
28.0M
    key.newTag = (const char *) newtag;
1494
28.0M
    res = bsearch(&key, htmlStartClose,
1495
28.0M
            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1496
28.0M
            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1497
28.0M
    return(res != NULL);
1498
28.0M
}
1499
1500
/**
1501
 * htmlAutoCloseOnClose:
1502
 * @ctxt:  an HTML parser context
1503
 * @newtag:  The new tag name
1504
 * @force:  force the tag closure
1505
 *
1506
 * The HTML DTD allows an ending tag to implicitly close other tags.
1507
 */
1508
static void
1509
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1510
40.0k
{
1511
40.0k
    const htmlElemDesc *info;
1512
40.0k
    int i, priority;
1513
1514
40.0k
    priority = htmlGetEndPriority(newtag);
1515
1516
96.1M
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1517
1518
96.1M
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1519
37.8k
            break;
1520
        /*
1521
         * A misplaced endtag can only close elements with lower
1522
         * or equal priority, so if we find an element with higher
1523
         * priority before we find an element with
1524
         * matching name, we just ignore this endtag
1525
         */
1526
96.0M
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1527
2.20k
            return;
1528
96.0M
    }
1529
37.8k
    if (i < 0)
1530
0
        return;
1531
1532
258k
    while (!xmlStrEqual(newtag, ctxt->name)) {
1533
221k
        info = htmlTagLookup(ctxt->name);
1534
221k
        if ((info != NULL) && (info->endTag == 3)) {
1535
210k
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1536
210k
                   "Opening and ending tag mismatch: %s and %s\n",
1537
210k
       newtag, ctxt->name);
1538
210k
        }
1539
221k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1540
221k
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1541
221k
  htmlnamePop(ctxt);
1542
221k
    }
1543
37.8k
}
1544
1545
/**
1546
 * htmlAutoCloseOnEnd:
1547
 * @ctxt:  an HTML parser context
1548
 *
1549
 * Close all remaining tags at the end of the stream
1550
 */
1551
static void
1552
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1553
49.8k
{
1554
49.8k
    int i;
1555
1556
49.8k
    if (ctxt->nameNr == 0)
1557
33.8k
        return;
1558
15.3M
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1559
15.3M
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1560
15.3M
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1561
15.3M
  htmlnamePop(ctxt);
1562
15.3M
    }
1563
15.9k
}
1564
1565
/**
1566
 * htmlAutoClose:
1567
 * @ctxt:  an HTML parser context
1568
 * @newtag:  The new tag name or NULL
1569
 *
1570
 * The HTML DTD allows a tag to implicitly close other tags.
1571
 * The list is kept in htmlStartClose array. This function is
1572
 * called when a new tag has been detected and generates the
1573
 * appropriates closes if possible/needed.
1574
 * If newtag is NULL this mean we are at the end of the resource
1575
 * and we should check
1576
 */
1577
static void
1578
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1579
17.9M
{
1580
19.3M
    while ((newtag != NULL) && (ctxt->name != NULL) &&
1581
19.3M
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1582
1.42M
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1583
1.42M
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1584
1.42M
  htmlnamePop(ctxt);
1585
1.42M
    }
1586
17.9M
    if (newtag == NULL) {
1587
0
        htmlAutoCloseOnEnd(ctxt);
1588
0
        return;
1589
0
    }
1590
17.9M
    while ((newtag == NULL) && (ctxt->name != NULL) &&
1591
17.9M
           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1592
0
            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1593
0
            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1594
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1595
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1596
0
  htmlnamePop(ctxt);
1597
0
    }
1598
17.9M
}
1599
1600
/**
1601
 * htmlAutoCloseTag:
1602
 * @doc:  the HTML document
1603
 * @name:  The tag name
1604
 * @elem:  the HTML element
1605
 *
1606
 * The HTML DTD allows a tag to implicitly close other tags.
1607
 * The list is kept in htmlStartClose array. This function checks
1608
 * if the element or one of it's children would autoclose the
1609
 * given tag.
1610
 *
1611
 * Returns 1 if autoclose, 0 otherwise
1612
 */
1613
int
1614
0
htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1615
0
    htmlNodePtr child;
1616
1617
0
    if (elem == NULL) return(1);
1618
0
    if (xmlStrEqual(name, elem->name)) return(0);
1619
0
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1620
0
    child = elem->children;
1621
0
    while (child != NULL) {
1622
0
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1623
0
  child = child->next;
1624
0
    }
1625
0
    return(0);
1626
0
}
1627
1628
/**
1629
 * htmlIsAutoClosed:
1630
 * @doc:  the HTML document
1631
 * @elem:  the HTML element
1632
 *
1633
 * The HTML DTD allows a tag to implicitly close other tags.
1634
 * The list is kept in htmlStartClose array. This function checks
1635
 * if a tag is autoclosed by one of it's child
1636
 *
1637
 * Returns 1 if autoclosed, 0 otherwise
1638
 */
1639
int
1640
0
htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1641
0
    htmlNodePtr child;
1642
1643
0
    if (elem == NULL) return(1);
1644
0
    child = elem->children;
1645
0
    while (child != NULL) {
1646
0
  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1647
0
  child = child->next;
1648
0
    }
1649
0
    return(0);
1650
0
}
1651
1652
/**
1653
 * htmlCheckImplied:
1654
 * @ctxt:  an HTML parser context
1655
 * @newtag:  The new tag name
1656
 *
1657
 * The HTML DTD allows a tag to exists only implicitly
1658
 * called when a new tag has been detected and generates the
1659
 * appropriates implicit tags if missing
1660
 */
1661
static void
1662
17.9M
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1663
17.9M
    int i;
1664
1665
17.9M
    if (ctxt->options & HTML_PARSE_NOIMPLIED)
1666
14.2M
        return;
1667
3.62M
    if (!htmlOmittedDefaultValue)
1668
0
  return;
1669
3.62M
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1670
470
  return;
1671
3.62M
    if (ctxt->nameNr <= 0) {
1672
11.6k
  htmlnamePush(ctxt, BAD_CAST"html");
1673
11.6k
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1674
11.6k
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1675
11.6k
    }
1676
3.62M
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1677
187k
        return;
1678
3.43M
    if ((ctxt->nameNr <= 1) &&
1679
3.43M
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1680
17.3k
   (xmlStrEqual(newtag, BAD_CAST"style")) ||
1681
17.3k
   (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1682
17.3k
   (xmlStrEqual(newtag, BAD_CAST"link")) ||
1683
17.3k
   (xmlStrEqual(newtag, BAD_CAST"title")) ||
1684
17.3k
   (xmlStrEqual(newtag, BAD_CAST"base")))) {
1685
5.37k
        if (ctxt->html >= 3) {
1686
            /* we already saw or generated an <head> before */
1687
1.57k
            return;
1688
1.57k
        }
1689
        /*
1690
         * dropped OBJECT ... i you put it first BODY will be
1691
         * assumed !
1692
         */
1693
3.79k
        htmlnamePush(ctxt, BAD_CAST"head");
1694
3.79k
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1695
3.79k
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1696
3.43M
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1697
3.43M
         (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1698
3.43M
         (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1699
3.43M
        if (ctxt->html >= 10) {
1700
            /* we already saw or generated a <body> before */
1701
3.34M
            return;
1702
3.34M
        }
1703
186k
  for (i = 0;i < ctxt->nameNr;i++) {
1704
178k
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1705
0
    return;
1706
0
      }
1707
178k
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1708
85.1k
    return;
1709
85.1k
      }
1710
178k
  }
1711
1712
7.86k
  htmlnamePush(ctxt, BAD_CAST"body");
1713
7.86k
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1714
7.86k
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1715
7.86k
    }
1716
3.43M
}
1717
1718
/**
1719
 * htmlCheckParagraph
1720
 * @ctxt:  an HTML parser context
1721
 *
1722
 * Check whether a p element need to be implied before inserting
1723
 * characters in the current element.
1724
 *
1725
 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1726
 *         in case of error.
1727
 */
1728
1729
static int
1730
2.33M
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1731
2.33M
    const xmlChar *tag;
1732
2.33M
    int i;
1733
1734
2.33M
    if (ctxt == NULL)
1735
0
  return(-1);
1736
2.33M
    tag = ctxt->name;
1737
2.33M
    if (tag == NULL) {
1738
12.1k
  htmlAutoClose(ctxt, BAD_CAST"p");
1739
12.1k
  htmlCheckImplied(ctxt, BAD_CAST"p");
1740
12.1k
  htmlnamePush(ctxt, BAD_CAST"p");
1741
12.1k
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1742
12.1k
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1743
12.1k
  return(1);
1744
12.1k
    }
1745
2.32M
    if (!htmlOmittedDefaultValue)
1746
0
  return(0);
1747
6.95M
    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1748
4.64M
  if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1749
4.07k
      htmlAutoClose(ctxt, BAD_CAST"p");
1750
4.07k
      htmlCheckImplied(ctxt, BAD_CAST"p");
1751
4.07k
      htmlnamePush(ctxt, BAD_CAST"p");
1752
4.07k
      if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1753
4.07k
    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1754
4.07k
      return(1);
1755
4.07k
  }
1756
4.64M
    }
1757
2.31M
    return(0);
1758
2.32M
}
1759
1760
/**
1761
 * htmlIsScriptAttribute:
1762
 * @name:  an attribute name
1763
 *
1764
 * Check if an attribute is of content type Script
1765
 *
1766
 * Returns 1 is the attribute is a script 0 otherwise
1767
 */
1768
int
1769
0
htmlIsScriptAttribute(const xmlChar *name) {
1770
0
    unsigned int i;
1771
1772
0
    if (name == NULL)
1773
0
      return(0);
1774
    /*
1775
     * all script attributes start with 'on'
1776
     */
1777
0
    if ((name[0] != 'o') || (name[1] != 'n'))
1778
0
      return(0);
1779
0
    for (i = 0;
1780
0
   i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1781
0
   i++) {
1782
0
  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1783
0
      return(1);
1784
0
    }
1785
0
    return(0);
1786
0
}
1787
1788
/************************************************************************
1789
 *                  *
1790
 *  The list of HTML predefined entities      *
1791
 *                  *
1792
 ************************************************************************/
1793
1794
1795
static const htmlEntityDesc  html40EntitiesTable[] = {
1796
/*
1797
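 * NOTE: entries must stay sorted by ascending value, because
 * htmlEntityValueLookup() stops scanning at the first larger value.
 *
 * The 4 absolute ones, plus apostrophe.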
1798
 */
1799
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1800
{ 38, "amp",  "ampersand, U+0026 ISOnum" },
1801
{ 39, "apos", "single quote" },
1802
{ 60, "lt", "less-than sign, U+003C ISOnum" },
1803
{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1804
1805
/*
1806
 * A bunch still in the 128-255 range
1807
 * Replacing them depend really on the charset used.
1808
 */
1809
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1810
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1811
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
1812
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
1813
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
1814
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1815
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1816
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
1817
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1818
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1819
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1820
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1821
{ 172,  "not",  "not sign, U+00AC ISOnum" },
1822
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1823
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1824
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1825
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1826
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1827
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1828
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1829
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1830
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
1831
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1832
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1833
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1834
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1835
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1836
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1837
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1838
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1839
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1840
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1841
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1842
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1843
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1844
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1845
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1846
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1847
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1848
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1849
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1850
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1851
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1852
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1853
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1854
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1855
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1856
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1857
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1858
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1859
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1860
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1861
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1862
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1863
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1864
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
1865
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1866
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1867
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1868
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1869
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1870
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1871
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1872
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1873
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1874
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1875
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1876
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1877
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1878
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1879
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1880
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1881
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1882
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1883
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1884
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1885
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1886
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1887
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1888
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1889
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1890
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1891
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1892
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1893
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1894
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1895
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1896
{ 247,  "divide","division sign, U+00F7 ISOnum" },
1897
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1898
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1899
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1900
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1901
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1902
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1903
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1904
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1905
1906
{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1907
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1908
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1909
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1910
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1911
1912
/*
1913
 * Anything below should really be kept as entity references
1914
 */
1915
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1916
1917
{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1918
{ 732,  "tilde","small tilde, U+02DC ISOdia" },
1919
1920
{ 913,  "Alpha","greek capital letter alpha, U+0391" },
1921
{ 914,  "Beta", "greek capital letter beta, U+0392" },
1922
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1923
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1924
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1925
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
1926
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
1927
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1928
{ 921,  "Iota", "greek capital letter iota, U+0399" },
1929
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
1930
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1931
{ 924,  "Mu", "greek capital letter mu, U+039C" },
1932
{ 925,  "Nu", "greek capital letter nu, U+039D" },
1933
{ 926,  "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1934
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
1935
{ 928,  "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1936
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
1937
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1938
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
1939
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1940
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1941
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
1942
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1943
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1944
1945
{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1946
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1947
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1948
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1949
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1950
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1951
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1952
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1953
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1954
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1955
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1956
{ 956,  "mu", "greek small letter mu, U+03BC ISOgrk3" },
1957
{ 957,  "nu", "greek small letter nu, U+03BD ISOgrk3" },
1958
{ 958,  "xi", "greek small letter xi, U+03BE ISOgrk3" },
1959
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1960
{ 960,  "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1961
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1962
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1963
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1964
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1965
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1966
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1967
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1968
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1969
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1970
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1971
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1972
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1973
1974
{ 8194, "ensp", "en space, U+2002 ISOpub" },
1975
{ 8195, "emsp", "em space, U+2003 ISOpub" },
1976
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1977
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1978
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1979
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1980
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1981
{ 8211, "ndash","en dash, U+2013 ISOpub" },
1982
{ 8212, "mdash","em dash, U+2014 ISOpub" },
1983
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1984
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1985
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1986
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1987
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1988
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1989
{ 8224, "dagger","dagger, U+2020 ISOpub" },
1990
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1991
1992
{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1993
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1994
1995
{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1996
1997
{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1998
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1999
2000
{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
2001
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
2002
2003
{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
2004
{ 8260, "frasl","fraction slash, U+2044 NEW" },
2005
2006
{ 8364, "euro", "euro sign, U+20AC NEW" },
2007
2008
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
2009
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
2010
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
2011
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
2012
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2013
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
2014
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
2015
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
2016
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
2017
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
2018
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2019
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
2020
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
2021
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
2022
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
2023
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
2024
2025
{ 8704, "forall","for all, U+2200 ISOtech" },
2026
{ 8706, "part", "partial differential, U+2202 ISOtech" },
2027
{ 8707, "exist","there exists, U+2203 ISOtech" },
2028
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
2029
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
2030
{ 8712, "isin", "element of, U+2208 ISOtech" },
2031
{ 8713, "notin","not an element of, U+2209 ISOtech" },
2032
{ 8715, "ni", "contains as member, U+220B ISOtech" },
2033
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
2034
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
2035
{ 8722, "minus","minus sign, U+2212 ISOtech" },
2036
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
2037
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
2038
{ 8733, "prop", "proportional to, U+221D ISOtech" },
2039
{ 8734, "infin","infinity, U+221E ISOtech" },
2040
{ 8736, "ang",  "angle, U+2220 ISOamso" },
2041
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
2042
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
2043
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
2044
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
2045
{ 8747, "int",  "integral, U+222B ISOtech" },
2046
{ 8756, "there4","therefore, U+2234 ISOtech" },
2047
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
2048
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
2049
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2050
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
2051
{ 8801, "equiv","identical to, U+2261 ISOtech" },
2052
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
2053
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2054
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
2055
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
2056
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2057
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2058
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2059
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2060
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2061
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2062
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2063
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2064
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2065
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2066
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
2067
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2068
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2069
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },
2070
2071
{ 9824, "spades","black spade suit, U+2660 ISOpub" },
2072
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2073
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2074
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
2075
2076
};
2077
2078
/************************************************************************
2079
 *                  *
2080
 *    Commodity functions to handle entities      *
2081
 *                  *
2082
 ************************************************************************/
2083
2084
/*
2085
 * Macro used to grow the current buffer.
 *
 * Doubles the companion variable <buffer>_size and reallocates @buffer
 * to that many xmlChar with xmlRealloc(). On allocation failure it
 * reports the error through htmlErrMemory(), frees the old buffer and
 * executes return(NULL) in the *enclosing* function.
 *
 * Requirements at the expansion site, all implied by the macro body:
 *  - a variable named ctxt must be in scope for htmlErrMemory();
 *  - a variable named <buffer>_size must exist next to @buffer;
 *  - the enclosing function must be able to return NULL.
 *
 * @buffer is expanded several times, so it has to be a plain variable
 * name. Pointers into the old buffer are not adjusted by the macro.
 * NOTE(review): the size doubling is not checked for overflow.
2086
 */
2087
149k
#define growBuffer(buffer) {           \
2088
149k
    xmlChar *tmp;             \
2089
149k
    buffer##_size *= 2;             \
2090
149k
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
2091
149k
    if (tmp == NULL) {           \
2092
0
  htmlErrMemory(ctxt, "growing buffer\n");      \
2093
0
  xmlFree(buffer);            \
2094
0
  return(NULL);             \
2095
0
    }                  \
2096
149k
    buffer = tmp;             \
2097
149k
}
2098
2099
/**
2100
 * htmlEntityLookup:
2101
 * @name: the entity name
2102
 *
2103
 * Lookup the given entity in EntitiesTable
2104
 *
2105
 * TODO: the linear scan is really ugly, an hash table is really needed.
2106
 *
2107
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2108
 */
2109
const htmlEntityDesc *
2110
11.6k
htmlEntityLookup(const xmlChar *name) {
2111
11.6k
    unsigned int i;
2112
2113
900k
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2114
900k
                    sizeof(html40EntitiesTable[0]));i++) {
2115
897k
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2116
9.29k
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2117
9.29k
  }
2118
897k
    }
2119
2.39k
    return(NULL);
2120
11.6k
}
2121
2122
/**
2123
 * htmlEntityValueLookup:
2124
 * @value: the entity's unicode value
2125
 *
2126
 * Lookup the given entity in EntitiesTable
2127
 *
2128
 * TODO: the linear scan is really ugly, an hash table is really needed.
2129
 *
2130
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2131
 */
2132
const htmlEntityDesc *
2133
0
htmlEntityValueLookup(unsigned int value) {
2134
0
    unsigned int i;
2135
2136
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2137
0
                    sizeof(html40EntitiesTable[0]));i++) {
2138
0
        if (html40EntitiesTable[i].value >= value) {
2139
0
      if (html40EntitiesTable[i].value > value)
2140
0
    break;
2141
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2142
0
  }
2143
0
    }
2144
0
    return(NULL);
2145
0
}
2146
2147
/**
2148
 * UTF8ToHtml:
2149
 * @out:  a pointer to an array of bytes to store the result
2150
 * @outlen:  the length of @out
2151
 * @in:  a pointer to an array of UTF-8 chars
2152
 * @inlen:  the length of @in
2153
 *
2154
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2155
 * plus HTML entities block of chars out.
2156
 *
2157
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2158
 * The value of @inlen after return is the number of octets consumed
2159
 *     as the return value is positive, else unpredictable.
2160
 * The value of @outlen after return is the number of octets consumed.
2161
 */
2162
int
2163
UTF8ToHtml(unsigned char* out, int *outlen,
2164
0
              const unsigned char* in, int *inlen) {
2165
0
    const unsigned char* processed = in;
2166
0
    const unsigned char* outend;
2167
0
    const unsigned char* outstart = out;
2168
0
    const unsigned char* instart = in;
2169
0
    const unsigned char* inend;
2170
0
    unsigned int c, d;
2171
0
    int trailing;
2172
2173
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2174
0
    if (in == NULL) {
2175
        /*
2176
   * initialization nothing to do
2177
   */
2178
0
  *outlen = 0;
2179
0
  *inlen = 0;
2180
0
  return(0);
2181
0
    }
2182
0
    inend = in + (*inlen);
2183
0
    outend = out + (*outlen);
2184
0
    while (in < inend) {
2185
0
  d = *in++;
2186
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2187
0
  else if (d < 0xC0) {
2188
      /* trailing byte in leading position */
2189
0
      *outlen = out - outstart;
2190
0
      *inlen = processed - instart;
2191
0
      return(-2);
2192
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2193
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2194
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2195
0
  else {
2196
      /* no chance for this in Ascii */
2197
0
      *outlen = out - outstart;
2198
0
      *inlen = processed - instart;
2199
0
      return(-2);
2200
0
  }
2201
2202
0
  if (inend - in < trailing) {
2203
0
      break;
2204
0
  }
2205
2206
0
  for ( ; trailing; trailing--) {
2207
0
      if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2208
0
    break;
2209
0
      c <<= 6;
2210
0
      c |= d & 0x3F;
2211
0
  }
2212
2213
  /* assertion: c is a single UTF-4 value */
2214
0
  if (c < 0x80) {
2215
0
      if (out + 1 >= outend)
2216
0
    break;
2217
0
      *out++ = c;
2218
0
  } else {
2219
0
      int len;
2220
0
      const htmlEntityDesc * ent;
2221
0
      const char *cp;
2222
0
      char nbuf[16];
2223
2224
      /*
2225
       * Try to lookup a predefined HTML entity for it
2226
       */
2227
2228
0
      ent = htmlEntityValueLookup(c);
2229
0
      if (ent == NULL) {
2230
0
        snprintf(nbuf, sizeof(nbuf), "#%u", c);
2231
0
        cp = nbuf;
2232
0
      }
2233
0
      else
2234
0
        cp = ent->name;
2235
0
      len = strlen(cp);
2236
0
      if (out + 2 + len >= outend)
2237
0
    break;
2238
0
      *out++ = '&';
2239
0
      memcpy(out, cp, len);
2240
0
      out += len;
2241
0
      *out++ = ';';
2242
0
  }
2243
0
  processed = in;
2244
0
    }
2245
0
    *outlen = out - outstart;
2246
0
    *inlen = processed - instart;
2247
0
    return(0);
2248
0
}
2249
2250
/**
2251
 * htmlEncodeEntities:
2252
 * @out:  a pointer to an array of bytes to store the result
2253
 * @outlen:  the length of @out
2254
 * @in:  a pointer to an array of UTF-8 chars
2255
 * @inlen:  the length of @in
2256
 * @quoteChar: the quote character to escape (' or ") or zero.
2257
 *
2258
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2259
 * plus HTML entities block of chars out.
2260
 *
2261
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2262
 * The value of @inlen after return is the number of octets consumed
2263
 *     as the return value is positive, else unpredictable.
2264
 * The value of @outlen after return is the number of octets consumed.
2265
 */
2266
int
2267
htmlEncodeEntities(unsigned char* out, int *outlen,
2268
0
       const unsigned char* in, int *inlen, int quoteChar) {
2269
0
    const unsigned char* processed = in;
2270
0
    const unsigned char* outend;
2271
0
    const unsigned char* outstart = out;
2272
0
    const unsigned char* instart = in;
2273
0
    const unsigned char* inend;
2274
0
    unsigned int c, d;
2275
0
    int trailing;
2276
2277
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2278
0
        return(-1);
2279
0
    outend = out + (*outlen);
2280
0
    inend = in + (*inlen);
2281
0
    while (in < inend) {
2282
0
  d = *in++;
2283
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2284
0
  else if (d < 0xC0) {
2285
      /* trailing byte in leading position */
2286
0
      *outlen = out - outstart;
2287
0
      *inlen = processed - instart;
2288
0
      return(-2);
2289
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2290
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2291
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2292
0
  else {
2293
      /* no chance for this in Ascii */
2294
0
      *outlen = out - outstart;
2295
0
      *inlen = processed - instart;
2296
0
      return(-2);
2297
0
  }
2298
2299
0
  if (inend - in < trailing)
2300
0
      break;
2301
2302
0
  while (trailing--) {
2303
0
      if (((d= *in++) & 0xC0) != 0x80) {
2304
0
    *outlen = out - outstart;
2305
0
    *inlen = processed - instart;
2306
0
    return(-2);
2307
0
      }
2308
0
      c <<= 6;
2309
0
      c |= d & 0x3F;
2310
0
  }
2311
2312
  /* assertion: c is a single UTF-4 value */
2313
0
  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2314
0
      (c != '&') && (c != '<') && (c != '>')) {
2315
0
      if (out >= outend)
2316
0
    break;
2317
0
      *out++ = c;
2318
0
  } else {
2319
0
      const htmlEntityDesc * ent;
2320
0
      const char *cp;
2321
0
      char nbuf[16];
2322
0
      int len;
2323
2324
      /*
2325
       * Try to lookup a predefined HTML entity for it
2326
       */
2327
0
      ent = htmlEntityValueLookup(c);
2328
0
      if (ent == NULL) {
2329
0
    snprintf(nbuf, sizeof(nbuf), "#%u", c);
2330
0
    cp = nbuf;
2331
0
      }
2332
0
      else
2333
0
    cp = ent->name;
2334
0
      len = strlen(cp);
2335
0
      if (out + 2 + len > outend)
2336
0
    break;
2337
0
      *out++ = '&';
2338
0
      memcpy(out, cp, len);
2339
0
      out += len;
2340
0
      *out++ = ';';
2341
0
  }
2342
0
  processed = in;
2343
0
    }
2344
0
    *outlen = out - outstart;
2345
0
    *inlen = processed - instart;
2346
0
    return(0);
2347
0
}
2348
2349
/************************************************************************
2350
 *                  *
2351
 *    Commodity functions to handle streams     *
2352
 *                  *
2353
 ************************************************************************/
2354
2355
#ifdef LIBXML_PUSH_ENABLED
2356
/**
2357
 * htmlNewInputStream:
2358
 * @ctxt:  an HTML parser context
2359
 *
2360
 * Create a new input stream structure
2361
 * Returns the new input stream or NULL
2362
 */
2363
static htmlParserInputPtr
2364
12.0k
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2365
12.0k
    htmlParserInputPtr input;
2366
2367
12.0k
    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2368
12.0k
    if (input == NULL) {
2369
0
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2370
0
  return(NULL);
2371
0
    }
2372
12.0k
    memset(input, 0, sizeof(htmlParserInput));
2373
12.0k
    input->filename = NULL;
2374
12.0k
    input->directory = NULL;
2375
12.0k
    input->base = NULL;
2376
12.0k
    input->cur = NULL;
2377
12.0k
    input->buf = NULL;
2378
12.0k
    input->line = 1;
2379
12.0k
    input->col = 1;
2380
12.0k
    input->buf = NULL;
2381
12.0k
    input->free = NULL;
2382
12.0k
    input->version = NULL;
2383
12.0k
    input->consumed = 0;
2384
12.0k
    input->length = 0;
2385
12.0k
    return(input);
2386
12.0k
}
2387
#endif
2388
2389
2390
/************************************************************************
2391
 *                  *
2392
 *    Commodity functions, cleanup needed ?     *
2393
 *                  *
2394
 ************************************************************************/
2395
/*
 * Names of all the elements allowing PCDATA in the HTML 4.01 loose DTD,
 * one group per initial letter. The table is scanned linearly by
 * areBlanks().
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2409
2410
/**
2411
 * areBlanks:
2412
 * @ctxt:  an HTML parser context
2413
 * @str:  a xmlChar *
2414
 * @len:  the size of @str
2415
 *
2416
 * Is this a sequence of blank chars that one can ignore ?
2417
 *
2418
 * Returns 1 if ignorable 0 otherwise.
2419
 */
2420
2421
1.82M
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2422
1.82M
    unsigned int i;
2423
1.82M
    int j;
2424
1.82M
    xmlNodePtr lastChild;
2425
1.82M
    xmlDtdPtr dtd;
2426
2427
11.4M
    for (j = 0;j < len;j++)
2428
11.4M
        if (!(IS_BLANK_CH(str[j]))) return(0);
2429
2430
44.5k
    if (CUR == 0) return(1);
2431
40.6k
    if (CUR != '<') return(0);
2432
32.8k
    if (ctxt->name == NULL)
2433
2.92k
  return(1);
2434
29.9k
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2435
206
  return(1);
2436
29.7k
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2437
1.08k
  return(1);
2438
2439
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2440
28.6k
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2441
2.17k
        dtd = xmlGetIntSubset(ctxt->myDoc);
2442
2.17k
        if (dtd != NULL && dtd->ExternalID != NULL) {
2443
606
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2444
606
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2445
392
                return(1);
2446
606
        }
2447
2.17k
    }
2448
2449
28.2k
    if (ctxt->node == NULL) return(0);
2450
28.2k
    lastChild = xmlGetLastChild(ctxt->node);
2451
45.4k
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2452
17.2k
  lastChild = lastChild->prev;
2453
28.2k
    if (lastChild == NULL) {
2454
4.67k
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2455
4.67k
            (ctxt->node->content != NULL)) return(0);
2456
  /* keep ws in constructs like ...<b> </b>...
2457
     for all tags "b" allowing PCDATA */
2458
240k
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2459
236k
      if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2460
549
    return(0);
2461
549
      }
2462
236k
  }
2463
23.5k
    } else if (xmlNodeIsText(lastChild)) {
2464
20.6k
        return(0);
2465
20.6k
    } else {
2466
  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2467
     for all tags "p" allowing PCDATA */
2468
138k
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2469
136k
      if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2470
1.23k
    return(0);
2471
1.23k
      }
2472
136k
  }
2473
2.95k
    }
2474
5.84k
    return(1);
2475
28.2k
}
2476
2477
/**
2478
 * htmlNewDocNoDtD:
2479
 * @URI:  URI for the dtd, or NULL
2480
 * @ExternalID:  the external ID of the DTD, or NULL
2481
 *
2482
 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2483
 * are NULL
2484
 *
2485
 * Returns a new document, do not initialize the DTD if not provided
2486
 */
2487
htmlDocPtr
2488
24.0k
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2489
24.0k
    xmlDocPtr cur;
2490
2491
    /*
2492
     * Allocate a new document and fill the fields.
2493
     */
2494
24.0k
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2495
24.0k
    if (cur == NULL) {
2496
0
  htmlErrMemory(NULL, "HTML document creation failed\n");
2497
0
  return(NULL);
2498
0
    }
2499
24.0k
    memset(cur, 0, sizeof(xmlDoc));
2500
2501
24.0k
    cur->type = XML_HTML_DOCUMENT_NODE;
2502
24.0k
    cur->version = NULL;
2503
24.0k
    cur->intSubset = NULL;
2504
24.0k
    cur->doc = cur;
2505
24.0k
    cur->name = NULL;
2506
24.0k
    cur->children = NULL;
2507
24.0k
    cur->extSubset = NULL;
2508
24.0k
    cur->oldNs = NULL;
2509
24.0k
    cur->encoding = NULL;
2510
24.0k
    cur->standalone = 1;
2511
24.0k
    cur->compression = 0;
2512
24.0k
    cur->ids = NULL;
2513
24.0k
    cur->refs = NULL;
2514
24.0k
    cur->_private = NULL;
2515
24.0k
    cur->charset = XML_CHAR_ENCODING_UTF8;
2516
24.0k
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2517
24.0k
    if ((ExternalID != NULL) ||
2518
24.0k
  (URI != NULL))
2519
0
  xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2520
24.0k
    if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2521
0
  xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2522
24.0k
    return(cur);
2523
24.0k
}
2524
2525
/**
2526
 * htmlNewDoc:
2527
 * @URI:  URI for the dtd, or NULL
2528
 * @ExternalID:  the external ID of the DTD, or NULL
2529
 *
2530
 * Creates a new HTML document
2531
 *
2532
 * Returns a new document
2533
 */
2534
htmlDocPtr
2535
0
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2536
0
    if ((URI == NULL) && (ExternalID == NULL))
2537
0
  return(htmlNewDocNoDtD(
2538
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2539
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2540
2541
0
    return(htmlNewDocNoDtD(URI, ExternalID));
2542
0
}
2543
2544
2545
/************************************************************************
2546
 *                  *
2547
 *      The parser itself       *
2548
 *  Relates to http://www.w3.org/TR/html40        *
2549
 *                  *
2550
 ************************************************************************/
2551
2552
/************************************************************************
2553
 *                  *
2554
 *      The parser itself       *
2555
 *                  *
2556
 ************************************************************************/
2557
2558
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2559
2560
/**
2561
 * htmlParseHTMLName:
2562
 * @ctxt:  an HTML parser context
2563
 *
2564
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2565
 * since HTML names are not case-sensitive.
2566
 *
2567
 * Returns the Tag Name parsed or NULL
2568
 */
2569
2570
static const xmlChar *
2571
18.5M
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2572
18.5M
    int i = 0;
2573
18.5M
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574
2575
18.5M
    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2576
18.5M
        (CUR != ':') && (CUR != '.')) return(NULL);
2577
2578
44.3M
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579
44.3M
           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2580
44.3M
     (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2581
44.3M
           (CUR == '.'))) {
2582
26.1M
  if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2583
20.5M
        else loc[i] = CUR;
2584
26.1M
  i++;
2585
2586
26.1M
  NEXT;
2587
26.1M
    }
2588
2589
18.2M
    return(xmlDictLookup(ctxt->dict, loc, i));
2590
18.5M
}
2591
2592
2593
/**
2594
 * htmlParseHTMLName_nonInvasive:
2595
 * @ctxt:  an HTML parser context
2596
 *
2597
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2598
 * since HTML names are not case-sensitive, this doesn't consume the data
2599
 * from the stream, it's a look-ahead
2600
 *
2601
 * Returns the Tag Name parsed or NULL
2602
 */
2603
2604
static const xmlChar *
2605
8.70M
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2606
8.70M
    int i = 0;
2607
8.70M
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2608
2609
8.70M
    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2610
8.70M
        (NXT(1) != ':')) return(NULL);
2611
2612
18.0M
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2613
18.0M
           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2614
18.0M
     (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2615
9.36M
  if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2616
9.24M
        else loc[i] = NXT(1+i);
2617
9.36M
  i++;
2618
9.36M
    }
2619
2620
8.70M
    return(xmlDictLookup(ctxt->dict, loc, i));
2621
8.70M
}
2622
2623
2624
/**
2625
 * htmlParseName:
2626
 * @ctxt:  an HTML parser context
2627
 *
2628
 * parse an HTML name, this routine is case sensitive.
2629
 *
2630
 * Returns the Name parsed or NULL
2631
 */
2632
2633
static const xmlChar *
2634
596k
htmlParseName(htmlParserCtxtPtr ctxt) {
2635
596k
    const xmlChar *in;
2636
596k
    const xmlChar *ret;
2637
596k
    int count = 0;
2638
2639
596k
    GROW;
2640
2641
    /*
2642
     * Accelerator for simple ASCII names
2643
     */
2644
596k
    in = ctxt->input->cur;
2645
596k
    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2646
596k
  ((*in >= 0x41) && (*in <= 0x5A)) ||
2647
596k
  (*in == '_') || (*in == ':')) {
2648
456k
  in++;
2649
566k
  while (((*in >= 0x61) && (*in <= 0x7A)) ||
2650
566k
         ((*in >= 0x41) && (*in <= 0x5A)) ||
2651
566k
         ((*in >= 0x30) && (*in <= 0x39)) ||
2652
566k
         (*in == '_') || (*in == '-') ||
2653
566k
         (*in == ':') || (*in == '.'))
2654
109k
      in++;
2655
2656
456k
  if (in == ctxt->input->end)
2657
423
      return(NULL);
2658
2659
456k
  if ((*in > 0) && (*in < 0x80)) {
2660
449k
      count = in - ctxt->input->cur;
2661
449k
      ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2662
449k
      ctxt->input->cur = in;
2663
449k
      ctxt->input->col += count;
2664
449k
      return(ret);
2665
449k
  }
2666
456k
    }
2667
146k
    return(htmlParseNameComplex(ctxt));
2668
596k
}
2669
2670
static const xmlChar *
2671
148k
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2672
148k
    int len = 0, l;
2673
148k
    int c;
2674
148k
    int count = 0;
2675
148k
    const xmlChar *base = ctxt->input->base;
2676
2677
    /*
2678
     * Handler for more complex cases
2679
     */
2680
148k
    GROW;
2681
148k
    c = CUR_CHAR(l);
2682
148k
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2683
148k
  (!IS_LETTER(c) && (c != '_') &&
2684
141k
         (c != ':'))) {
2685
86.4k
  return(NULL);
2686
86.4k
    }
2687
2688
193M
    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2689
193M
     ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2690
193M
            (c == '.') || (c == '-') ||
2691
193M
      (c == '_') || (c == ':') ||
2692
193M
      (IS_COMBINING(c)) ||
2693
193M
      (IS_EXTENDER(c)))) {
2694
193M
  if (count++ > 100) {
2695
1.88M
      count = 0;
2696
1.88M
      GROW;
2697
1.88M
  }
2698
193M
  len += l;
2699
193M
  NEXTL(l);
2700
193M
  c = CUR_CHAR(l);
2701
193M
  if (ctxt->input->base != base) {
2702
      /*
2703
       * We changed encoding from an unknown encoding
2704
       * Input buffer changed location, so we better start again
2705
       */
2706
1.27k
      return(htmlParseNameComplex(ctxt));
2707
1.27k
  }
2708
193M
    }
2709
2710
60.3k
    if (ctxt->input->cur - ctxt->input->base < len) {
2711
        /* Sanity check */
2712
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2713
0
                     "unexpected change of input buffer", NULL, NULL);
2714
0
        return (NULL);
2715
0
    }
2716
2717
60.3k
    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2718
60.3k
}
2719
2720
2721
/**
2722
 * htmlParseHTMLAttribute:
2723
 * @ctxt:  an HTML parser context
2724
 * @stop:  a char stop value
2725
 *
2726
 * parse an HTML attribute value till the stop (quote), if
2727
 * stop is 0 then it stops at the first space
2728
 *
2729
 * Returns the attribute parsed or NULL
2730
 */
2731
2732
static xmlChar *
2733
160k
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2734
160k
    xmlChar *buffer = NULL;
2735
160k
    int buffer_size = 0;
2736
160k
    xmlChar *out = NULL;
2737
160k
    const xmlChar *name = NULL;
2738
160k
    const xmlChar *cur = NULL;
2739
160k
    const htmlEntityDesc * ent;
2740
2741
    /*
2742
     * allocate a translation buffer.
2743
     */
2744
160k
    buffer_size = HTML_PARSER_BUFFER_SIZE;
2745
160k
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2746
160k
    if (buffer == NULL) {
2747
0
  htmlErrMemory(ctxt, "buffer allocation failed\n");
2748
0
  return(NULL);
2749
0
    }
2750
160k
    out = buffer;
2751
2752
    /*
2753
     * Ok loop until we reach one of the ending chars
2754
     */
2755
177M
    while ((CUR != 0) && (CUR != stop)) {
2756
177M
  if ((stop == 0) && (CUR == '>')) break;
2757
177M
  if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2758
177M
        if (CUR == '&') {
2759
41.6k
      if (NXT(1) == '#') {
2760
7.66k
    unsigned int c;
2761
7.66k
    int bits;
2762
2763
7.66k
    c = htmlParseCharRef(ctxt);
2764
7.66k
    if      (c <    0x80)
2765
3.86k
            { *out++  = c;                bits= -6; }
2766
3.80k
    else if (c <   0x800)
2767
465
            { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2768
3.34k
    else if (c < 0x10000)
2769
1.92k
            { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2770
1.41k
    else
2771
1.41k
            { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2772
2773
16.2k
    for ( ; bits >= 0; bits-= 6) {
2774
8.56k
        *out++  = ((c >> bits) & 0x3F) | 0x80;
2775
8.56k
    }
2776
2777
7.66k
    if (out - buffer > buffer_size - 100) {
2778
1.90k
      int indx = out - buffer;
2779
2780
1.90k
      growBuffer(buffer);
2781
1.90k
      out = &buffer[indx];
2782
1.90k
    }
2783
34.0k
      } else {
2784
34.0k
    ent = htmlParseEntityRef(ctxt, &name);
2785
34.0k
    if (name == NULL) {
2786
18.0k
        *out++ = '&';
2787
18.0k
        if (out - buffer > buffer_size - 100) {
2788
1.34k
      int indx = out - buffer;
2789
2790
1.34k
      growBuffer(buffer);
2791
1.34k
      out = &buffer[indx];
2792
1.34k
        }
2793
18.0k
    } else if (ent == NULL) {
2794
11.1k
        *out++ = '&';
2795
11.1k
        cur = name;
2796
60.7M
        while (*cur != 0) {
2797
60.7M
      if (out - buffer > buffer_size - 100) {
2798
1.80k
          int indx = out - buffer;
2799
2800
1.80k
          growBuffer(buffer);
2801
1.80k
          out = &buffer[indx];
2802
1.80k
      }
2803
60.7M
      *out++ = *cur++;
2804
60.7M
        }
2805
11.1k
    } else {
2806
4.75k
        unsigned int c;
2807
4.75k
        int bits;
2808
2809
4.75k
        if (out - buffer > buffer_size - 100) {
2810
259
      int indx = out - buffer;
2811
2812
259
      growBuffer(buffer);
2813
259
      out = &buffer[indx];
2814
259
        }
2815
4.75k
        c = ent->value;
2816
4.75k
        if      (c <    0x80)
2817
3.78k
      { *out++  = c;                bits= -6; }
2818
970
        else if (c <   0x800)
2819
520
      { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2820
450
        else if (c < 0x10000)
2821
450
      { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2822
0
        else
2823
0
      { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2824
2825
6.17k
        for ( ; bits >= 0; bits-= 6) {
2826
1.42k
      *out++  = ((c >> bits) & 0x3F) | 0x80;
2827
1.42k
        }
2828
4.75k
    }
2829
34.0k
      }
2830
177M
  } else {
2831
177M
      unsigned int c;
2832
177M
      int bits, l;
2833
2834
177M
      if (out - buffer > buffer_size - 100) {
2835
144k
    int indx = out - buffer;
2836
2837
144k
    growBuffer(buffer);
2838
144k
    out = &buffer[indx];
2839
144k
      }
2840
177M
      c = CUR_CHAR(l);
2841
177M
      if      (c <    0x80)
2842
3.22M
        { *out++  = c;                bits= -6; }
2843
174M
      else if (c <   0x800)
2844
8.78M
        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2845
165M
      else if (c < 0x10000)
2846
165M
        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2847
366
      else
2848
366
        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2849
2850
517M
      for ( ; bits >= 0; bits-= 6) {
2851
340M
    *out++  = ((c >> bits) & 0x3F) | 0x80;
2852
340M
      }
2853
177M
      NEXT;
2854
177M
  }
2855
177M
    }
2856
160k
    *out = 0;
2857
160k
    return(buffer);
2858
160k
}
2859
2860
/**
2861
 * htmlParseEntityRef:
2862
 * @ctxt:  an HTML parser context
2863
 * @str:  location to store the entity name
2864
 *
2865
 * parse an HTML ENTITY references
2866
 *
2867
 * [68] EntityRef ::= '&' Name ';'
2868
 *
2869
 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2870
 *         if non-NULL *str will have to be freed by the caller.
2871
 */
2872
const htmlEntityDesc *
2873
549k
htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2874
549k
    const xmlChar *name;
2875
549k
    const htmlEntityDesc * ent = NULL;
2876
2877
549k
    if (str != NULL) *str = NULL;
2878
549k
    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2879
2880
549k
    if (CUR == '&') {
2881
549k
        NEXT;
2882
549k
        name = htmlParseName(ctxt);
2883
549k
  if (name == NULL) {
2884
66.7k
      htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2885
66.7k
                   "htmlParseEntityRef: no name\n", NULL, NULL);
2886
482k
  } else {
2887
482k
      GROW;
2888
482k
      if (CUR == ';') {
2889
11.6k
          if (str != NULL)
2890
11.6k
        *str = name;
2891
2892
    /*
2893
     * Lookup the entity in the table.
2894
     */
2895
11.6k
    ent = htmlEntityLookup(name);
2896
11.6k
    if (ent != NULL) /* OK that's ugly !!! */
2897
9.29k
        NEXT;
2898
471k
      } else {
2899
471k
    htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2900
471k
                 "htmlParseEntityRef: expecting ';'\n",
2901
471k
           NULL, NULL);
2902
471k
          if (str != NULL)
2903
471k
        *str = name;
2904
471k
      }
2905
482k
  }
2906
549k
    }
2907
549k
    return(ent);
2908
549k
}
2909
2910
/**
2911
 * htmlParseAttValue:
2912
 * @ctxt:  an HTML parser context
2913
 *
2914
 * parse a value for an attribute
2915
 * Note: the parser won't do substitution of entities here, this
2916
 * will be handled later in xmlStringGetNodeList, unless it was
2917
 * asked for ctxt->replaceEntities != 0
2918
 *
2919
 * Returns the AttValue parsed or NULL.
2920
 */
2921
2922
static xmlChar *
2923
160k
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2924
160k
    xmlChar *ret = NULL;
2925
2926
160k
    if (CUR == '"') {
2927
39.4k
        NEXT;
2928
39.4k
  ret = htmlParseHTMLAttribute(ctxt, '"');
2929
39.4k
        if (CUR != '"') {
2930
936
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2931
936
                   "AttValue: \" expected\n", NULL, NULL);
2932
936
  } else
2933
38.4k
      NEXT;
2934
120k
    } else if (CUR == '\'') {
2935
1.36k
        NEXT;
2936
1.36k
  ret = htmlParseHTMLAttribute(ctxt, '\'');
2937
1.36k
        if (CUR != '\'') {
2938
780
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2939
780
                   "AttValue: ' expected\n", NULL, NULL);
2940
780
  } else
2941
582
      NEXT;
2942
119k
    } else {
2943
        /*
2944
   * That's an HTMLism, the attribute value may not be quoted
2945
   */
2946
119k
  ret = htmlParseHTMLAttribute(ctxt, 0);
2947
119k
  if (ret == NULL) {
2948
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2949
0
                   "AttValue: no value found\n", NULL, NULL);
2950
0
  }
2951
119k
    }
2952
160k
    return(ret);
2953
160k
}
2954
2955
/**
2956
 * htmlParseSystemLiteral:
2957
 * @ctxt:  an HTML parser context
2958
 *
2959
 * parse an HTML Literal
2960
 *
2961
 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2962
 *
2963
 * Returns the SystemLiteral parsed or NULL
2964
 */
2965
2966
static xmlChar *
2967
3.13k
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2968
3.13k
    size_t len = 0, startPosition = 0;
2969
3.13k
    int err = 0;
2970
3.13k
    int quote;
2971
3.13k
    xmlChar *ret = NULL;
2972
2973
3.13k
    if ((CUR != '"') && (CUR != '\'')) {
2974
1.00k
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2975
1.00k
               "SystemLiteral \" or ' expected\n", NULL, NULL);
2976
1.00k
        return(NULL);
2977
1.00k
    }
2978
2.12k
    quote = CUR;
2979
2.12k
    NEXT;
2980
2981
2.12k
    if (CUR_PTR < BASE_PTR)
2982
0
        return(ret);
2983
2.12k
    startPosition = CUR_PTR - BASE_PTR;
2984
2985
370k
    while ((CUR != 0) && (CUR != quote)) {
2986
        /* TODO: Handle UTF-8 */
2987
368k
        if (!IS_CHAR_CH(CUR)) {
2988
1.01k
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2989
1.01k
                            "Invalid char in SystemLiteral 0x%X\n", CUR);
2990
1.01k
            err = 1;
2991
1.01k
        }
2992
368k
        NEXT;
2993
368k
        len++;
2994
368k
    }
2995
2.12k
    if (CUR != quote) {
2996
526
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2997
526
                     "Unfinished SystemLiteral\n", NULL, NULL);
2998
1.60k
    } else {
2999
1.60k
        NEXT;
3000
1.60k
        if (err == 0)
3001
1.33k
            ret = xmlStrndup((BASE_PTR+startPosition), len);
3002
1.60k
    }
3003
3004
2.12k
    return(ret);
3005
2.12k
}
3006
3007
/**
3008
 * htmlParsePubidLiteral:
3009
 * @ctxt:  an HTML parser context
3010
 *
3011
 * parse an HTML public literal
3012
 *
3013
 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3014
 *
3015
 * Returns the PubidLiteral parsed or NULL.
3016
 */
3017
3018
static xmlChar *
3019
3.39k
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3020
3.39k
    size_t len = 0, startPosition = 0;
3021
3.39k
    int err = 0;
3022
3.39k
    int quote;
3023
3.39k
    xmlChar *ret = NULL;
3024
3025
3.39k
    if ((CUR != '"') && (CUR != '\'')) {
3026
1.20k
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3027
1.20k
               "PubidLiteral \" or ' expected\n", NULL, NULL);
3028
1.20k
        return(NULL);
3029
1.20k
    }
3030
2.19k
    quote = CUR;
3031
2.19k
    NEXT;
3032
3033
    /*
3034
     * Name ::= (Letter | '_') (NameChar)*
3035
     */
3036
2.19k
    if (CUR_PTR < BASE_PTR)
3037
0
        return(ret);
3038
2.19k
    startPosition = CUR_PTR - BASE_PTR;
3039
3040
16.6k
    while ((CUR != 0) && (CUR != quote)) {
3041
14.4k
        if (!IS_PUBIDCHAR_CH(CUR)) {
3042
1.77k
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3043
1.77k
                            "Invalid char in PubidLiteral 0x%X\n", CUR);
3044
1.77k
            err = 1;
3045
1.77k
        }
3046
14.4k
        len++;
3047
14.4k
        NEXT;
3048
14.4k
    }
3049
3050
2.19k
    if (CUR != quote) {
3051
496
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3052
496
                     "Unfinished PubidLiteral\n", NULL, NULL);
3053
1.69k
    } else {
3054
1.69k
        NEXT;
3055
1.69k
        if (err == 0)
3056
917
            ret = xmlStrndup((BASE_PTR + startPosition), len);
3057
1.69k
    }
3058
3059
2.19k
    return(ret);
3060
2.19k
}
3061
3062
/**
3063
 * htmlParseScript:
3064
 * @ctxt:  an HTML parser context
3065
 *
3066
 * parse the content of an HTML SCRIPT or STYLE element
3067
 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3068
 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3069
 * http://www.w3.org/TR/html4/types.html#type-script
3070
 * http://www.w3.org/TR/html4/types.html#h-6.15
3071
 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3072
 *
3073
 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3074
 * element and the value of intrinsic event attributes. User agents must
3075
 * not evaluate script data as HTML markup but instead must pass it on as
3076
 * data to a script engine.
3077
 * NOTES:
3078
 * - The content is passed like CDATA
3079
 * - the attributes for style and scripting "onXXX" are also described
3080
 *   as CDATA but SGML allows entities references in attributes so their
3081
 *   processing is identical as other attributes
3082
 */
3083
static void
3084
28.6k
htmlParseScript(htmlParserCtxtPtr ctxt) {
3085
28.6k
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3086
28.6k
    int nbchar = 0;
3087
28.6k
    int cur,l;
3088
3089
28.6k
    SHRINK;
3090
28.6k
    cur = CUR_CHAR(l);
3091
106M
    while (cur != 0) {
3092
106M
  if ((cur == '<') && (NXT(1) == '/')) {
3093
            /*
3094
             * One should break here, the specification is clear:
3095
             * Authors should therefore escape "</" within the content.
3096
             * Escape mechanisms are specific to each scripting or
3097
             * style sheet language.
3098
             *
3099
             * In recovery mode, only break if end tag match the
3100
             * current tag, effectively ignoring all tags inside the
3101
             * script/style block and treating the entire block as
3102
             * CDATA.
3103
             */
3104
32.9k
            if (ctxt->recovery) {
3105
5.07k
                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3106
5.07k
           xmlStrlen(ctxt->name)) == 0)
3107
2.41k
                {
3108
2.41k
                    break; /* while */
3109
2.66k
                } else {
3110
2.66k
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3111
2.66k
         "Element %s embeds close tag\n",
3112
2.66k
                     ctxt->name, NULL);
3113
2.66k
    }
3114
27.8k
            } else {
3115
27.8k
                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3116
27.8k
                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3117
22.7k
                {
3118
22.7k
                    break; /* while */
3119
22.7k
                }
3120
27.8k
            }
3121
32.9k
  }
3122
106M
        if (IS_CHAR(cur)) {
3123
106M
      COPY_BUF(l,buf,nbchar,cur);
3124
106M
        } else {
3125
29.9k
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3126
29.9k
                            "Invalid char in CDATA 0x%X\n", cur);
3127
29.9k
        }
3128
106M
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3129
308k
            buf[nbchar] = 0;
3130
308k
      if (ctxt->sax->cdataBlock!= NULL) {
3131
    /*
3132
     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3133
     */
3134
308k
    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3135
308k
      } else if (ctxt->sax->characters != NULL) {
3136
0
    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3137
0
      }
3138
308k
      nbchar = 0;
3139
308k
  }
3140
106M
  GROW;
3141
106M
  NEXTL(l);
3142
106M
  cur = CUR_CHAR(l);
3143
106M
    }
3144
3145
28.6k
    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3146
10.5k
        buf[nbchar] = 0;
3147
10.5k
  if (ctxt->sax->cdataBlock!= NULL) {
3148
      /*
3149
       * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3150
       */
3151
10.5k
      ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3152
10.5k
  } else if (ctxt->sax->characters != NULL) {
3153
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3154
0
  }
3155
10.5k
    }
3156
28.6k
}
3157
3158
3159
/**
3160
 * htmlParseCharDataInternal:
3161
 * @ctxt:  an HTML parser context
3162
 * @readahead: optional read ahead character in ascii range
3163
 *
3164
 * parse a CharData section.
3165
 * if we are within a CDATA section ']]>' marks an end of section.
3166
 *
3167
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3168
 */
3169
3170
static void
3171
516k
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3172
516k
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3173
516k
    int nbchar = 0;
3174
516k
    int cur, l;
3175
516k
    int chunk = 0;
3176
3177
516k
    if (readahead)
3178
0
        buf[nbchar++] = readahead;
3179
3180
516k
    SHRINK;
3181
516k
    cur = CUR_CHAR(l);
3182
473M
    while (((cur != '<') || (ctxt->token == '<')) &&
3183
473M
           ((cur != '&') || (ctxt->token == '&')) &&
3184
473M
     (cur != 0)) {
3185
473M
  if (!(IS_CHAR(cur))) {
3186
315k
      htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3187
315k
                  "Invalid char in CDATA 0x%X\n", cur);
3188
472M
  } else {
3189
472M
      COPY_BUF(l,buf,nbchar,cur);
3190
472M
  }
3191
473M
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3192
1.35M
            buf[nbchar] = 0;
3193
3194
      /*
3195
       * Ok the segment is to be consumed as chars.
3196
       */
3197
1.35M
      if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3198
1.33M
    if (areBlanks(ctxt, buf, nbchar)) {
3199
681
        if (ctxt->keepBlanks) {
3200
479
      if (ctxt->sax->characters != NULL)
3201
479
          ctxt->sax->characters(ctxt->userData, buf, nbchar);
3202
479
        } else {
3203
202
      if (ctxt->sax->ignorableWhitespace != NULL)
3204
202
          ctxt->sax->ignorableWhitespace(ctxt->userData,
3205
202
                                         buf, nbchar);
3206
202
        }
3207
1.33M
    } else {
3208
1.33M
        htmlCheckParagraph(ctxt);
3209
1.33M
        if (ctxt->sax->characters != NULL)
3210
1.33M
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3211
1.33M
    }
3212
1.33M
      }
3213
1.35M
      nbchar = 0;
3214
1.35M
  }
3215
473M
  NEXTL(l);
3216
473M
        chunk++;
3217
473M
        if (chunk > HTML_PARSER_BUFFER_SIZE) {
3218
4.65M
            chunk = 0;
3219
4.65M
            SHRINK;
3220
4.65M
            GROW;
3221
4.65M
        }
3222
473M
  cur = CUR_CHAR(l);
3223
473M
  if (cur == 0) {
3224
8.50k
      SHRINK;
3225
8.50k
      GROW;
3226
8.50k
      cur = CUR_CHAR(l);
3227
8.50k
  }
3228
473M
    }
3229
516k
    if (nbchar != 0) {
3230
512k
        buf[nbchar] = 0;
3231
3232
  /*
3233
   * Ok the segment is to be consumed as chars.
3234
   */
3235
512k
  if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3236
490k
      if (areBlanks(ctxt, buf, nbchar)) {
3237
13.6k
    if (ctxt->keepBlanks) {
3238
10.0k
        if (ctxt->sax->characters != NULL)
3239
10.0k
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3240
10.0k
    } else {
3241
3.58k
        if (ctxt->sax->ignorableWhitespace != NULL)
3242
3.58k
      ctxt->sax->ignorableWhitespace(ctxt->userData,
3243
3.58k
                                     buf, nbchar);
3244
3.58k
    }
3245
476k
      } else {
3246
476k
    htmlCheckParagraph(ctxt);
3247
476k
    if (ctxt->sax->characters != NULL)
3248
476k
        ctxt->sax->characters(ctxt->userData, buf, nbchar);
3249
476k
      }
3250
490k
  }
3251
512k
    } else {
3252
  /*
3253
   * Loop detection
3254
   */
3255
4.25k
  if (cur == 0)
3256
345
      ctxt->instate = XML_PARSER_EOF;
3257
4.25k
    }
3258
516k
}
3259
3260
/**
3261
 * htmlParseCharData:
3262
 * @ctxt:  an HTML parser context
3263
 *
3264
 * parse a CharData section.
3265
 * if we are within a CDATA section ']]>' marks an end of section.
3266
 *
3267
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3268
 */
3269
3270
static void
3271
516k
htmlParseCharData(htmlParserCtxtPtr ctxt) {
3272
516k
    htmlParseCharDataInternal(ctxt, 0);
3273
516k
}
3274
3275
/**
3276
 * htmlParseExternalID:
3277
 * @ctxt:  an HTML parser context
3278
 * @publicID:  a xmlChar** receiving PubidLiteral
3279
 *
3280
 * Parse an External ID or a Public ID
3281
 *
3282
 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3283
 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3284
 *
3285
 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3286
 *
3287
 * Returns the function returns SystemLiteral and in the second
3288
 *                case publicID receives PubidLiteral, is strict is off
3289
 *                it is possible to return NULL and have publicID set.
3290
 */
3291
3292
static xmlChar *
3293
11.7k
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3294
11.7k
    xmlChar *URI = NULL;
3295
3296
11.7k
    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3297
11.7k
         (UPP(2) == 'S') && (UPP(3) == 'T') &&
3298
11.7k
   (UPP(4) == 'E') && (UPP(5) == 'M')) {
3299
2.35k
        SKIP(6);
3300
2.35k
  if (!IS_BLANK_CH(CUR)) {
3301
1.40k
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3302
1.40k
                   "Space required after 'SYSTEM'\n", NULL, NULL);
3303
1.40k
  }
3304
2.35k
        SKIP_BLANKS;
3305
2.35k
  URI = htmlParseSystemLiteral(ctxt);
3306
2.35k
  if (URI == NULL) {
3307
1.75k
      htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3308
1.75k
                   "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3309
1.75k
        }
3310
9.35k
    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3311
9.35k
         (UPP(2) == 'B') && (UPP(3) == 'L') &&
3312
9.35k
         (UPP(4) == 'I') && (UPP(5) == 'C')) {
3313
3.39k
        SKIP(6);
3314
3.39k
  if (!IS_BLANK_CH(CUR)) {
3315
1.71k
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3316
1.71k
                   "Space required after 'PUBLIC'\n", NULL, NULL);
3317
1.71k
  }
3318
3.39k
        SKIP_BLANKS;
3319
3.39k
  *publicID = htmlParsePubidLiteral(ctxt);
3320
3.39k
  if (*publicID == NULL) {
3321
2.48k
      htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3322
2.48k
                   "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3323
2.48k
       NULL, NULL);
3324
2.48k
  }
3325
3.39k
        SKIP_BLANKS;
3326
3.39k
        if ((CUR == '"') || (CUR == '\'')) {
3327
778
      URI = htmlParseSystemLiteral(ctxt);
3328
778
  }
3329
3.39k
    }
3330
11.7k
    return(URI);
3331
11.7k
}
3332
3333
/**
3334
 * xmlParsePI:
3335
 * @ctxt:  an XML parser context
3336
 *
3337
 * parse an XML Processing Instruction.
3338
 *
3339
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3340
 */
3341
static void
3342
90.3k
htmlParsePI(htmlParserCtxtPtr ctxt) {
3343
90.3k
    xmlChar *buf = NULL;
3344
90.3k
    int len = 0;
3345
90.3k
    int size = HTML_PARSER_BUFFER_SIZE;
3346
90.3k
    int cur, l;
3347
90.3k
    const xmlChar *target;
3348
90.3k
    xmlParserInputState state;
3349
90.3k
    int count = 0;
3350
3351
90.3k
    if ((RAW == '<') && (NXT(1) == '?')) {
3352
35.2k
  state = ctxt->instate;
3353
35.2k
        ctxt->instate = XML_PARSER_PI;
3354
  /*
3355
   * this is a Processing Instruction.
3356
   */
3357
35.2k
  SKIP(2);
3358
35.2k
  SHRINK;
3359
3360
  /*
3361
   * Parse the target name and check for special support like
3362
   * namespace.
3363
   */
3364
35.2k
        target = htmlParseName(ctxt);
3365
35.2k
  if (target != NULL) {
3366
17.6k
      if (RAW == '>') {
3367
4.50k
    SKIP(1);
3368
3369
    /*
3370
     * SAX: PI detected.
3371
     */
3372
4.50k
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3373
4.50k
        (ctxt->sax->processingInstruction != NULL))
3374
4.15k
        ctxt->sax->processingInstruction(ctxt->userData,
3375
4.15k
                                         target, NULL);
3376
4.50k
    ctxt->instate = state;
3377
4.50k
    return;
3378
4.50k
      }
3379
13.1k
      buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3380
13.1k
      if (buf == NULL) {
3381
0
    htmlErrMemory(ctxt, NULL);
3382
0
    ctxt->instate = state;
3383
0
    return;
3384
0
      }
3385
13.1k
      cur = CUR;
3386
13.1k
      if (!IS_BLANK(cur)) {
3387
9.19k
    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3388
9.19k
        "ParsePI: PI %s space expected\n", target, NULL);
3389
9.19k
      }
3390
13.1k
            SKIP_BLANKS;
3391
13.1k
      cur = CUR_CHAR(l);
3392
37.6M
      while ((cur != 0) && (cur != '>')) {
3393
37.6M
    if (len + 5 >= size) {
3394
6.00k
        xmlChar *tmp;
3395
3396
6.00k
        size *= 2;
3397
6.00k
        tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3398
6.00k
        if (tmp == NULL) {
3399
0
      htmlErrMemory(ctxt, NULL);
3400
0
      xmlFree(buf);
3401
0
      ctxt->instate = state;
3402
0
      return;
3403
0
        }
3404
6.00k
        buf = tmp;
3405
6.00k
    }
3406
37.6M
    count++;
3407
37.6M
    if (count > 50) {
3408
735k
        GROW;
3409
735k
        count = 0;
3410
735k
    }
3411
37.6M
                if (IS_CHAR(cur)) {
3412
37.6M
        COPY_BUF(l,buf,len,cur);
3413
37.6M
                } else {
3414
7.48k
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3415
7.48k
                                    "Invalid char in processing instruction "
3416
7.48k
                                    "0x%X\n", cur);
3417
7.48k
                }
3418
37.6M
    NEXTL(l);
3419
37.6M
    cur = CUR_CHAR(l);
3420
37.6M
    if (cur == 0) {
3421
694
        SHRINK;
3422
694
        GROW;
3423
694
        cur = CUR_CHAR(l);
3424
694
    }
3425
37.6M
      }
3426
13.1k
      buf[len] = 0;
3427
13.1k
      if (cur != '>') {
3428
947
    htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3429
947
          "ParsePI: PI %s never end ...\n", target, NULL);
3430
12.2k
      } else {
3431
12.2k
    SKIP(1);
3432
3433
    /*
3434
     * SAX: PI detected.
3435
     */
3436
12.2k
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3437
12.2k
        (ctxt->sax->processingInstruction != NULL))
3438
11.6k
        ctxt->sax->processingInstruction(ctxt->userData,
3439
11.6k
                                         target, buf);
3440
12.2k
      }
3441
13.1k
      xmlFree(buf);
3442
17.5k
  } else {
3443
17.5k
      htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3444
17.5k
                         "PI is not started correctly", NULL, NULL);
3445
17.5k
  }
3446
30.7k
  ctxt->instate = state;
3447
30.7k
    }
3448
90.3k
}
3449
3450
/**
3451
 * htmlParseComment:
3452
 * @ctxt:  an HTML parser context
3453
 *
3454
 * Parse an XML (SGML) comment <!-- .... -->
3455
 *
3456
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3457
 */
3458
static void
3459
166k
htmlParseComment(htmlParserCtxtPtr ctxt) {
3460
166k
    xmlChar *buf = NULL;
3461
166k
    int len;
3462
166k
    int size = HTML_PARSER_BUFFER_SIZE;
3463
166k
    int q, ql;
3464
166k
    int r, rl;
3465
166k
    int cur, l;
3466
166k
    int next, nl;
3467
166k
    xmlParserInputState state;
3468
3469
    /*
3470
     * Check that there is a comment right here.
3471
     */
3472
166k
    if ((RAW != '<') || (NXT(1) != '!') ||
3473
166k
        (NXT(2) != '-') || (NXT(3) != '-')) return;
3474
3475
165k
    state = ctxt->instate;
3476
165k
    ctxt->instate = XML_PARSER_COMMENT;
3477
165k
    SHRINK;
3478
165k
    SKIP(4);
3479
165k
    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3480
165k
    if (buf == NULL) {
3481
0
        htmlErrMemory(ctxt, "buffer allocation failed\n");
3482
0
  ctxt->instate = state;
3483
0
  return;
3484
0
    }
3485
165k
    len = 0;
3486
165k
    buf[len] = 0;
3487
165k
    q = CUR_CHAR(ql);
3488
165k
    if (q == 0)
3489
90
        goto unfinished;
3490
165k
    NEXTL(ql);
3491
165k
    r = CUR_CHAR(rl);
3492
165k
    if (r == 0)
3493
333
        goto unfinished;
3494
164k
    NEXTL(rl);
3495
164k
    cur = CUR_CHAR(l);
3496
144M
    while ((cur != 0) &&
3497
144M
           ((cur != '>') ||
3498
144M
      (r != '-') || (q != '-'))) {
3499
143M
  NEXTL(l);
3500
143M
  next = CUR_CHAR(nl);
3501
143M
  if (next == 0) {
3502
3.46k
      SHRINK;
3503
3.46k
      GROW;
3504
3.46k
      next = CUR_CHAR(nl);
3505
3.46k
  }
3506
3507
143M
  if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3508
853
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3509
853
           "Comment incorrectly closed by '--!>'", NULL, NULL);
3510
853
    cur = '>';
3511
853
    break;
3512
853
  }
3513
3514
143M
  if (len + 5 >= size) {
3515
14.8k
      xmlChar *tmp;
3516
3517
14.8k
      size *= 2;
3518
14.8k
      tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3519
14.8k
      if (tmp == NULL) {
3520
0
          xmlFree(buf);
3521
0
          htmlErrMemory(ctxt, "growing buffer failed\n");
3522
0
    ctxt->instate = state;
3523
0
    return;
3524
0
      }
3525
14.8k
      buf = tmp;
3526
14.8k
  }
3527
143M
        if (IS_CHAR(q)) {
3528
143M
      COPY_BUF(ql,buf,len,q);
3529
143M
        } else {
3530
205k
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3531
205k
                            "Invalid char in comment 0x%X\n", q);
3532
205k
        }
3533
3534
143M
  q = r;
3535
143M
  ql = rl;
3536
143M
  r = cur;
3537
143M
  rl = l;
3538
143M
  cur = next;
3539
143M
  l = nl;
3540
143M
    }
3541
164k
    buf[len] = 0;
3542
164k
    if (cur == '>') {
3543
161k
        NEXT;
3544
161k
  if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3545
161k
      (!ctxt->disableSAX))
3546
159k
      ctxt->sax->comment(ctxt->userData, buf);
3547
161k
  xmlFree(buf);
3548
161k
  ctxt->instate = state;
3549
161k
  return;
3550
161k
    }
3551
3552
3.90k
unfinished:
3553
3.90k
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3554
3.90k
     "Comment not terminated \n<!--%.50s\n", buf, NULL);
3555
3.90k
    xmlFree(buf);
3556
3.90k
}
3557
3558
/**
3559
 * htmlParseCharRef:
3560
 * @ctxt:  an HTML parser context
3561
 *
3562
 * parse Reference declarations
3563
 *
3564
 * [66] CharRef ::= '&#' [0-9]+ ';' |
3565
 *                  '&#x' [0-9a-fA-F]+ ';'
3566
 *
3567
 * Returns the value parsed (as an int)
3568
 */
3569
int
3570
16.2k
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3571
16.2k
    int val = 0;
3572
3573
16.2k
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3574
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3575
0
         "htmlParseCharRef: context error\n",
3576
0
         NULL, NULL);
3577
0
        return(0);
3578
0
    }
3579
16.2k
    if ((CUR == '&') && (NXT(1) == '#') &&
3580
16.2k
        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3581
7.19k
  SKIP(3);
3582
22.1k
  while (CUR != ';') {
3583
18.5k
      if ((CUR >= '0') && (CUR <= '9')) {
3584
2.63k
                if (val < 0x110000)
3585
2.36k
              val = val * 16 + (CUR - '0');
3586
15.9k
            } else if ((CUR >= 'a') && (CUR <= 'f')) {
3587
7.66k
                if (val < 0x110000)
3588
7.38k
              val = val * 16 + (CUR - 'a') + 10;
3589
8.28k
            } else if ((CUR >= 'A') && (CUR <= 'F')) {
3590
4.67k
                if (val < 0x110000)
3591
4.21k
              val = val * 16 + (CUR - 'A') + 10;
3592
4.67k
            } else {
3593
3.60k
          htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3594
3.60k
                 "htmlParseCharRef: missing semicolon\n",
3595
3.60k
           NULL, NULL);
3596
3.60k
    break;
3597
3.60k
      }
3598
14.9k
      NEXT;
3599
14.9k
  }
3600
7.19k
  if (CUR == ';')
3601
3.58k
      NEXT;
3602
9.01k
    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3603
9.01k
  SKIP(2);
3604
34.7k
  while (CUR != ';') {
3605
34.1k
      if ((CUR >= '0') && (CUR <= '9')) {
3606
25.7k
                if (val < 0x110000)
3607
25.5k
              val = val * 10 + (CUR - '0');
3608
25.7k
            } else {
3609
8.38k
          htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3610
8.38k
                 "htmlParseCharRef: missing semicolon\n",
3611
8.38k
           NULL, NULL);
3612
8.38k
    break;
3613
8.38k
      }
3614
25.7k
      NEXT;
3615
25.7k
  }
3616
9.01k
  if (CUR == ';')
3617
627
      NEXT;
3618
9.01k
    } else {
3619
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3620
0
               "htmlParseCharRef: invalid value\n", NULL, NULL);
3621
0
    }
3622
    /*
3623
     * Check the value IS_CHAR ...
3624
     */
3625
16.2k
    if (IS_CHAR(val)) {
3626
11.2k
        return(val);
3627
11.2k
    } else if (val >= 0x110000) {
3628
783
  htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3629
783
         "htmlParseCharRef: value too large\n", NULL, NULL);
3630
4.18k
    } else {
3631
4.18k
  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3632
4.18k
      "htmlParseCharRef: invalid xmlChar value %d\n",
3633
4.18k
      val);
3634
4.18k
    }
3635
4.97k
    return(0);
3636
16.2k
}
3637
3638
3639
/**
3640
 * htmlParseDocTypeDecl:
3641
 * @ctxt:  an HTML parser context
3642
 *
3643
 * parse a DOCTYPE declaration
3644
 *
3645
 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3646
 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3647
 */
3648
3649
static void
3650
11.7k
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3651
11.7k
    const xmlChar *name;
3652
11.7k
    xmlChar *ExternalID = NULL;
3653
11.7k
    xmlChar *URI = NULL;
3654
3655
    /*
3656
     * We know that '<!DOCTYPE' has been detected.
3657
     */
3658
11.7k
    SKIP(9);
3659
3660
11.7k
    SKIP_BLANKS;
3661
3662
    /*
3663
     * Parse the DOCTYPE name.
3664
     */
3665
11.7k
    name = htmlParseName(ctxt);
3666
11.7k
    if (name == NULL) {
3667
2.61k
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3668
2.61k
               "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3669
2.61k
         NULL, NULL);
3670
2.61k
    }
3671
    /*
3672
     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3673
     */
3674
3675
11.7k
    SKIP_BLANKS;
3676
3677
    /*
3678
     * Check for SystemID and ExternalID
3679
     */
3680
11.7k
    URI = htmlParseExternalID(ctxt, &ExternalID);
3681
11.7k
    SKIP_BLANKS;
3682
3683
    /*
3684
     * We should be at the end of the DOCTYPE declaration.
3685
     */
3686
11.7k
    if (CUR != '>') {
3687
8.69k
  htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3688
8.69k
               "DOCTYPE improperly terminated\n", NULL, NULL);
3689
        /* Ignore bogus content */
3690
256k
        while ((CUR != 0) && (CUR != '>'))
3691
247k
            NEXT;
3692
8.69k
    }
3693
11.7k
    if (CUR == '>')
3694
8.67k
        NEXT;
3695
3696
    /*
3697
     * Create or update the document accordingly to the DOCTYPE
3698
     */
3699
11.7k
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3700
11.7k
  (!ctxt->disableSAX))
3701
8.82k
  ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3702
3703
    /*
3704
     * Cleanup, since we don't use all those identifiers
3705
     */
3706
11.7k
    if (URI != NULL) xmlFree(URI);
3707
11.7k
    if (ExternalID != NULL) xmlFree(ExternalID);
3708
11.7k
}
3709
3710
/**
3711
 * htmlParseAttribute:
3712
 * @ctxt:  an HTML parser context
3713
 * @value:  a xmlChar ** used to store the value of the attribute
3714
 *
3715
 * parse an attribute
3716
 *
3717
 * [41] Attribute ::= Name Eq AttValue
3718
 *
3719
 * [25] Eq ::= S? '=' S?
3720
 *
3721
 * With namespace:
3722
 *
3723
 * [NS 11] Attribute ::= QName Eq AttValue
3724
 *
3725
 * Also the case QName == xmlns:??? is handled independently as a namespace
3726
 * definition.
3727
 *
3728
 * Returns the attribute name, and the value in *value.
3729
 */
3730
3731
static const xmlChar *
3732
494k
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3733
494k
    const xmlChar *name;
3734
494k
    xmlChar *val = NULL;
3735
3736
494k
    *value = NULL;
3737
494k
    name = htmlParseHTMLName(ctxt);
3738
494k
    if (name == NULL) {
3739
259k
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3740
259k
               "error parsing attribute name\n", NULL, NULL);
3741
259k
        return(NULL);
3742
259k
    }
3743
3744
    /*
3745
     * read the value
3746
     */
3747
235k
    SKIP_BLANKS;
3748
235k
    if (CUR == '=') {
3749
160k
        NEXT;
3750
160k
  SKIP_BLANKS;
3751
160k
  val = htmlParseAttValue(ctxt);
3752
160k
    }
3753
3754
235k
    *value = val;
3755
235k
    return(name);
3756
494k
}
3757
3758
/**
3759
 * htmlCheckEncodingDirect:
3760
 * @ctxt:  an HTML parser context
3761
 * @attvalue: the attribute value
3762
 *
3763
 * Checks an attribute value to detect
3764
 * the encoding
3765
 * If a new encoding is detected the parser is switched to decode
3766
 * it and pass UTF8
3767
 */
3768
static void
3769
6.44k
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3770
3771
6.44k
    if ((ctxt == NULL) || (encoding == NULL) ||
3772
6.44k
        (ctxt->options & HTML_PARSE_IGNORE_ENC))
3773
676
  return;
3774
3775
    /* do not change encoding */
3776
5.76k
    if (ctxt->input->encoding != NULL)
3777
1.22k
        return;
3778
3779
4.54k
    if (encoding != NULL) {
3780
4.54k
  xmlCharEncoding enc;
3781
4.54k
  xmlCharEncodingHandlerPtr handler;
3782
3783
4.93k
  while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3784
3785
4.54k
  if (ctxt->input->encoding != NULL)
3786
0
      xmlFree((xmlChar *) ctxt->input->encoding);
3787
4.54k
  ctxt->input->encoding = xmlStrdup(encoding);
3788
3789
4.54k
  enc = xmlParseCharEncoding((const char *) encoding);
3790
  /*
3791
   * registered set of known encodings
3792
   */
3793
4.54k
  if (enc != XML_CHAR_ENCODING_ERROR) {
3794
287
      if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3795
287
           (enc == XML_CHAR_ENCODING_UTF16BE) ||
3796
287
     (enc == XML_CHAR_ENCODING_UCS4LE) ||
3797
287
     (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3798
287
    (ctxt->input->buf != NULL) &&
3799
287
    (ctxt->input->buf->encoder == NULL)) {
3800
18
    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3801
18
                 "htmlCheckEncoding: wrong encoding meta\n",
3802
18
           NULL, NULL);
3803
269
      } else {
3804
269
    xmlSwitchEncoding(ctxt, enc);
3805
269
      }
3806
287
      ctxt->charset = XML_CHAR_ENCODING_UTF8;
3807
4.26k
  } else {
3808
      /*
3809
       * fallback for unknown encodings
3810
       */
3811
4.26k
      handler = xmlFindCharEncodingHandler((const char *) encoding);
3812
4.26k
      if (handler != NULL) {
3813
3.33k
    xmlSwitchToEncoding(ctxt, handler);
3814
3.33k
    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3815
3.33k
      } else {
3816
923
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3817
923
                 "htmlCheckEncoding: unknown encoding %s\n",
3818
923
           encoding, NULL);
3819
923
      }
3820
4.26k
  }
3821
3822
4.54k
  if ((ctxt->input->buf != NULL) &&
3823
4.54k
      (ctxt->input->buf->encoder != NULL) &&
3824
4.54k
      (ctxt->input->buf->raw != NULL) &&
3825
4.54k
      (ctxt->input->buf->buffer != NULL)) {
3826
3.74k
      int nbchars;
3827
3.74k
      int processed;
3828
3829
      /*
3830
       * convert as much as possible to the parser reading buffer.
3831
       */
3832
3.74k
      processed = ctxt->input->cur - ctxt->input->base;
3833
3.74k
      xmlBufShrink(ctxt->input->buf->buffer, processed);
3834
3.74k
      nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3835
3.74k
            xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3836
3.74k
      if (nbchars < 0) {
3837
94
    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3838
94
                 "htmlCheckEncoding: encoder error\n",
3839
94
           NULL, NULL);
3840
94
      }
3841
3.74k
  }
3842
4.54k
    }
3843
4.54k
}
3844
3845
/**
3846
 * htmlCheckEncoding:
3847
 * @ctxt:  an HTML parser context
3848
 * @attvalue: the attribute value
3849
 *
3850
 * Checks an http-equiv attribute from a Meta tag to detect
3851
 * the encoding
3852
 * If a new encoding is detected the parser is switched to decode
3853
 * it and pass UTF8
3854
 */
3855
static void
3856
1.77k
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3857
1.77k
    const xmlChar *encoding;
3858
3859
1.77k
    if (!attvalue)
3860
0
  return;
3861
3862
1.77k
    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3863
1.77k
    if (encoding != NULL) {
3864
1.43k
  encoding += 7;
3865
1.43k
    }
3866
    /*
3867
     * skip blank
3868
     */
3869
1.77k
    if (encoding && IS_BLANK_CH(*encoding))
3870
938
  encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3871
1.77k
    if (encoding && *encoding == '=') {
3872
323
  encoding ++;
3873
323
  htmlCheckEncodingDirect(ctxt, encoding);
3874
323
    }
3875
1.77k
}
3876
3877
/**
3878
 * htmlCheckMeta:
3879
 * @ctxt:  an HTML parser context
3880
 * @atts:  the attributes values
3881
 *
3882
 * Checks an attributes from a Meta tag
3883
 */
3884
static void
3885
9.85k
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3886
9.85k
    int i;
3887
9.85k
    const xmlChar *att, *value;
3888
9.85k
    int http = 0;
3889
9.85k
    const xmlChar *content = NULL;
3890
3891
9.85k
    if ((ctxt == NULL) || (atts == NULL))
3892
0
  return;
3893
3894
9.85k
    i = 0;
3895
9.85k
    att = atts[i++];
3896
24.2k
    while (att != NULL) {
3897
14.4k
  value = atts[i++];
3898
14.4k
  if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3899
14.4k
   && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3900
2.05k
      http = 1;
3901
12.3k
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3902
6.12k
      htmlCheckEncodingDirect(ctxt, value);
3903
6.22k
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3904
2.25k
      content = value;
3905
14.4k
  att = atts[i++];
3906
14.4k
    }
3907
9.85k
    if ((http) && (content != NULL))
3908
1.77k
  htmlCheckEncoding(ctxt, content);
3909
3910
9.85k
}
3911
3912
/**
3913
 * htmlParseStartTag:
3914
 * @ctxt:  an HTML parser context
3915
 *
3916
 * parse a start of tag either for rule element or
3917
 * EmptyElement. In both case we don't parse the tag closing chars.
3918
 *
3919
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3920
 *
3921
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3922
 *
3923
 * With namespace:
3924
 *
3925
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3926
 *
3927
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3928
 *
3929
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3930
 */
3931
3932
static int
3933
17.9M
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3934
17.9M
    const xmlChar *name;
3935
17.9M
    const xmlChar *attname;
3936
17.9M
    xmlChar *attvalue;
3937
17.9M
    const xmlChar **atts;
3938
17.9M
    int nbatts = 0;
3939
17.9M
    int maxatts;
3940
17.9M
    int meta = 0;
3941
17.9M
    int i;
3942
17.9M
    int discardtag = 0;
3943
3944
17.9M
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3945
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3946
0
         "htmlParseStartTag: context error\n", NULL, NULL);
3947
0
  return -1;
3948
0
    }
3949
17.9M
    if (ctxt->instate == XML_PARSER_EOF)
3950
0
        return(-1);
3951
17.9M
    if (CUR != '<') return -1;
3952
17.9M
    NEXT;
3953
3954
17.9M
    atts = ctxt->atts;
3955
17.9M
    maxatts = ctxt->maxatts;
3956
3957
17.9M
    GROW;
3958
17.9M
    name = htmlParseHTMLName(ctxt);
3959
17.9M
    if (name == NULL) {
3960
67.8k
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3961
67.8k
               "htmlParseStartTag: invalid element name\n",
3962
67.8k
         NULL, NULL);
3963
        /*
3964
         * The recovery code is disabled for now as it can result in
3965
         * quadratic behavior with the push parser. htmlParseStartTag
3966
         * must consume all content up to the final '>' in order to avoid
3967
         * rescanning for this terminator.
3968
         *
3969
         * For a proper fix in line with HTML5, htmlParseStartTag and
3970
         * htmlParseElement should only be called when there's an ASCII
3971
         * alpha character following the initial '<'. Otherwise, the '<'
3972
         * should be emitted as text (unless followed by '!', '/' or '?').
3973
         */
3974
#if 0
3975
  /* if recover preserve text on classic misconstructs */
3976
  if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3977
      (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3978
      htmlParseCharDataInternal(ctxt, '<');
3979
      return(-1);
3980
  }
3981
#endif
3982
3983
  /* Dump the bogus tag like browsers do */
3984
8.11M
  while ((CUR != 0) && (CUR != '>') &&
3985
8.11M
               (ctxt->instate != XML_PARSER_EOF))
3986
8.05M
      NEXT;
3987
67.8k
        return -1;
3988
67.8k
    }
3989
17.8M
    if (xmlStrEqual(name, BAD_CAST"meta"))
3990
10.6k
  meta = 1;
3991
3992
    /*
3993
     * Check for auto-closure of HTML elements.
3994
     */
3995
17.8M
    htmlAutoClose(ctxt, name);
3996
3997
    /*
3998
     * Check for implied HTML elements.
3999
     */
4000
17.8M
    htmlCheckImplied(ctxt, name);
4001
4002
    /*
4003
     * Avoid html at any level > 0, head at any level != 1
4004
     * or any attempt to recurse body
4005
     */
4006
17.8M
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
4007
4.15k
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4008
4.15k
               "htmlParseStartTag: misplaced <html> tag\n",
4009
4.15k
         name, NULL);
4010
4.15k
  discardtag = 1;
4011
4.15k
  ctxt->depth++;
4012
4.15k
    }
4013
17.8M
    if ((ctxt->nameNr != 1) &&
4014
17.8M
  (xmlStrEqual(name, BAD_CAST"head"))) {
4015
192k
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4016
192k
               "htmlParseStartTag: misplaced <head> tag\n",
4017
192k
         name, NULL);
4018
192k
  discardtag = 1;
4019
192k
  ctxt->depth++;
4020
192k
    }
4021
17.8M
    if (xmlStrEqual(name, BAD_CAST"body")) {
4022
1.65k
  int indx;
4023
7.33M
  for (indx = 0;indx < ctxt->nameNr;indx++) {
4024
7.33M
      if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4025
1.21k
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4026
1.21k
                 "htmlParseStartTag: misplaced <body> tag\n",
4027
1.21k
           name, NULL);
4028
1.21k
    discardtag = 1;
4029
1.21k
    ctxt->depth++;
4030
1.21k
      }
4031
7.33M
  }
4032
1.65k
    }
4033
4034
    /*
4035
     * Now parse the attributes, it ends up with the ending
4036
     *
4037
     * (S Attribute)* S?
4038
     */
4039
17.8M
    SKIP_BLANKS;
4040
18.3M
    while ((CUR != 0) &&
4041
18.3M
           (CUR != '>') &&
4042
18.3M
     ((CUR != '/') || (NXT(1) != '>'))) {
4043
494k
  GROW;
4044
494k
  attname = htmlParseAttribute(ctxt, &attvalue);
4045
494k
        if (attname != NULL) {
4046
4047
      /*
4048
       * Well formedness requires at most one declaration of an attribute
4049
       */
4050
28.6M
      for (i = 0; i < nbatts;i += 2) {
4051
28.4M
          if (xmlStrEqual(atts[i], attname)) {
4052
24.8k
        htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4053
24.8k
                     "Attribute %s redefined\n", attname, NULL);
4054
24.8k
        if (attvalue != NULL)
4055
2.24k
      xmlFree(attvalue);
4056
24.8k
        goto failed;
4057
24.8k
    }
4058
28.4M
      }
4059
4060
      /*
4061
       * Add the pair to atts
4062
       */
4063
210k
      if (atts == NULL) {
4064
9.40k
          maxatts = 22; /* allow for 10 attrs by default */
4065
9.40k
          atts = (const xmlChar **)
4066
9.40k
           xmlMalloc(maxatts * sizeof(xmlChar *));
4067
9.40k
    if (atts == NULL) {
4068
0
        htmlErrMemory(ctxt, NULL);
4069
0
        if (attvalue != NULL)
4070
0
      xmlFree(attvalue);
4071
0
        goto failed;
4072
0
    }
4073
9.40k
    ctxt->atts = atts;
4074
9.40k
    ctxt->maxatts = maxatts;
4075
201k
      } else if (nbatts + 4 > maxatts) {
4076
438
          const xmlChar **n;
4077
4078
438
          maxatts *= 2;
4079
438
          n = (const xmlChar **) xmlRealloc((void *) atts,
4080
438
               maxatts * sizeof(const xmlChar *));
4081
438
    if (n == NULL) {
4082
0
        htmlErrMemory(ctxt, NULL);
4083
0
        if (attvalue != NULL)
4084
0
      xmlFree(attvalue);
4085
0
        goto failed;
4086
0
    }
4087
438
    atts = n;
4088
438
    ctxt->atts = atts;
4089
438
    ctxt->maxatts = maxatts;
4090
438
      }
4091
210k
      atts[nbatts++] = attname;
4092
210k
      atts[nbatts++] = attvalue;
4093
210k
      atts[nbatts] = NULL;
4094
210k
      atts[nbatts + 1] = NULL;
4095
210k
  }
4096
259k
  else {
4097
259k
      if (attvalue != NULL)
4098
0
          xmlFree(attvalue);
4099
      /* Dump the bogus attribute string up to the next blank or
4100
       * the end of the tag. */
4101
6.15M
      while ((CUR != 0) &&
4102
6.15M
             !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4103
6.15M
       ((CUR != '/') || (NXT(1) != '>')))
4104
5.89M
    NEXT;
4105
259k
  }
4106
4107
494k
failed:
4108
494k
  SKIP_BLANKS;
4109
494k
    }
4110
4111
    /*
4112
     * Handle specific association to the META tag
4113
     */
4114
17.8M
    if (meta && (nbatts != 0))
4115
9.85k
  htmlCheckMeta(ctxt, atts);
4116
4117
    /*
4118
     * SAX: Start of Element !
4119
     */
4120
17.8M
    if (!discardtag) {
4121
17.7M
  htmlnamePush(ctxt, name);
4122
17.7M
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4123
17.7M
      if (nbatts != 0)
4124
99.4k
    ctxt->sax->startElement(ctxt->userData, name, atts);
4125
17.6M
      else
4126
17.6M
    ctxt->sax->startElement(ctxt->userData, name, NULL);
4127
17.7M
  }
4128
17.7M
    }
4129
4130
17.8M
    if (atts != NULL) {
4131
10.9M
        for (i = 1;i < nbatts;i += 2) {
4132
210k
      if (atts[i] != NULL)
4133
157k
    xmlFree((xmlChar *) atts[i]);
4134
210k
  }
4135
10.7M
    }
4136
4137
17.8M
    return(discardtag);
4138
17.8M
}
4139
4140
/**
4141
 * htmlParseEndTag:
4142
 * @ctxt:  an HTML parser context
4143
 *
4144
 * parse an end of tag
4145
 *
4146
 * [42] ETag ::= '</' Name S? '>'
4147
 *
4148
 * With namespace
4149
 *
4150
 * [NS 9] ETag ::= '</' QName S? '>'
4151
 *
4152
 * Returns 1 if the current level should be closed.
4153
 */
4154
4155
static int
4156
htmlParseEndTag(htmlParserCtxtPtr ctxt)
4157
108k
{
4158
108k
    const xmlChar *name;
4159
108k
    const xmlChar *oldname;
4160
108k
    int i, ret;
4161
4162
108k
    if ((CUR != '<') || (NXT(1) != '/')) {
4163
1.93k
        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4164
1.93k
               "htmlParseEndTag: '</' not found\n", NULL, NULL);
4165
1.93k
        return (0);
4166
1.93k
    }
4167
106k
    SKIP(2);
4168
4169
106k
    name = htmlParseHTMLName(ctxt);
4170
106k
    if (name == NULL)
4171
23.3k
        return (0);
4172
    /*
4173
     * We should definitely be at the ending "S? '>'" part
4174
     */
4175
83.1k
    SKIP_BLANKS;
4176
83.1k
    if (CUR != '>') {
4177
9.44k
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4178
9.44k
               "End tag : expected '>'\n", NULL, NULL);
4179
        /* Skip to next '>' */
4180
5.61M
        while ((CUR != 0) && (CUR != '>'))
4181
5.60M
            NEXT;
4182
9.44k
    }
4183
83.1k
    if (CUR == '>')
4184
80.1k
        NEXT;
4185
4186
    /*
4187
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
4188
     * out now.
4189
     */
4190
83.1k
    if ((ctxt->depth > 0) &&
4191
83.1k
        (xmlStrEqual(name, BAD_CAST "html") ||
4192
15.6k
         xmlStrEqual(name, BAD_CAST "body") ||
4193
15.6k
   xmlStrEqual(name, BAD_CAST "head"))) {
4194
1.54k
  ctxt->depth--;
4195
1.54k
  return (0);
4196
1.54k
    }
4197
4198
    /*
4199
     * If the name read is not one of the element in the parsing stack
4200
     * then return, it's just an error.
4201
     */
4202
799M
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4203
799M
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4204
40.0k
            break;
4205
799M
    }
4206
81.5k
    if (i < 0) {
4207
41.5k
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4208
41.5k
               "Unexpected end tag : %s\n", name, NULL);
4209
41.5k
        return (0);
4210
41.5k
    }
4211
4212
4213
    /*
4214
     * Check for auto-closure of HTML elements.
4215
     */
4216
4217
40.0k
    htmlAutoCloseOnClose(ctxt, name);
4218
4219
    /*
4220
     * Well formedness constraints, opening and closing must match.
4221
     * With the exception that the autoclose may have popped stuff out
4222
     * of the stack.
4223
     */
4224
40.0k
    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4225
2.20k
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4226
2.20k
                     "Opening and ending tag mismatch: %s and %s\n",
4227
2.20k
                     name, ctxt->name);
4228
2.20k
    }
4229
4230
    /*
4231
     * SAX: End of Tag
4232
     */
4233
40.0k
    oldname = ctxt->name;
4234
40.0k
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4235
37.8k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4236
37.8k
            ctxt->sax->endElement(ctxt->userData, name);
4237
37.8k
  htmlNodeInfoPop(ctxt);
4238
37.8k
        htmlnamePop(ctxt);
4239
37.8k
        ret = 1;
4240
37.8k
    } else {
4241
2.20k
        ret = 0;
4242
2.20k
    }
4243
4244
40.0k
    return (ret);
4245
81.5k
}
4246
4247
4248
/**
4249
 * htmlParseReference:
4250
 * @ctxt:  an HTML parser context
4251
 *
4252
 * parse and handle entity references in content,
4253
 * this will end-up in a call to character() since this is either a
4254
 * CharRef, or a predefined entity.
4255
 */
4256
static void
4257
524k
htmlParseReference(htmlParserCtxtPtr ctxt) {
4258
524k
    const htmlEntityDesc * ent;
4259
524k
    xmlChar out[6];
4260
524k
    const xmlChar *name;
4261
524k
    if (CUR != '&') return;
4262
4263
524k
    if (NXT(1) == '#') {
4264
8.53k
  unsigned int c;
4265
8.53k
  int bits, i = 0;
4266
4267
8.53k
  c = htmlParseCharRef(ctxt);
4268
8.53k
  if (c == 0)
4269
4.17k
      return;
4270
4271
4.36k
        if      (c <    0x80) { out[i++]= c;                bits= -6; }
4272
1.98k
        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4273
1.28k
        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4274
663
        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4275
4276
8.29k
        for ( ; bits >= 0; bits-= 6) {
4277
3.93k
            out[i++]= ((c >> bits) & 0x3F) | 0x80;
4278
3.93k
        }
4279
4.36k
  out[i] = 0;
4280
4281
4.36k
  htmlCheckParagraph(ctxt);
4282
4.36k
  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4283
4.36k
      ctxt->sax->characters(ctxt->userData, out, i);
4284
515k
    } else {
4285
515k
  ent = htmlParseEntityRef(ctxt, &name);
4286
515k
  if (name == NULL) {
4287
48.6k
      htmlCheckParagraph(ctxt);
4288
48.6k
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4289
48.6k
          ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4290
48.6k
      return;
4291
48.6k
  }
4292
466k
  if ((ent == NULL) || !(ent->value > 0)) {
4293
462k
      htmlCheckParagraph(ctxt);
4294
462k
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4295
462k
    ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4296
462k
    ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4297
    /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4298
462k
      }
4299
462k
  } else {
4300
4.54k
      unsigned int c;
4301
4.54k
      int bits, i = 0;
4302
4303
4.54k
      c = ent->value;
4304
4.54k
      if      (c <    0x80)
4305
4.05k
              { out[i++]= c;                bits= -6; }
4306
487
      else if (c <   0x800)
4307
279
              { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4308
208
      else if (c < 0x10000)
4309
208
              { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4310
0
      else
4311
0
              { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4312
4313
5.23k
      for ( ; bits >= 0; bits-= 6) {
4314
695
    out[i++]= ((c >> bits) & 0x3F) | 0x80;
4315
695
      }
4316
4.54k
      out[i] = 0;
4317
4318
4.54k
      htmlCheckParagraph(ctxt);
4319
4.54k
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4320
4.54k
    ctxt->sax->characters(ctxt->userData, out, i);
4321
4.54k
  }
4322
466k
    }
4323
524k
}
4324
4325
/**
4326
 * htmlParseContent:
4327
 * @ctxt:  an HTML parser context
4328
 *
4329
 * Parse a content: comment, sub-element, reference or text.
4330
 * Kept for compatibility with old code
4331
 */
4332
4333
static void
4334
0
htmlParseContent(htmlParserCtxtPtr ctxt) {
4335
0
    xmlChar *currentNode;
4336
0
    int depth;
4337
0
    const xmlChar *name;
4338
4339
0
    currentNode = xmlStrdup(ctxt->name);
4340
0
    depth = ctxt->nameNr;
4341
0
    while (1) {
4342
0
        GROW;
4343
4344
0
        if (ctxt->instate == XML_PARSER_EOF)
4345
0
            break;
4346
4347
  /*
4348
   * Our tag or one of it's parent or children is ending.
4349
   */
4350
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4351
0
      if (htmlParseEndTag(ctxt) &&
4352
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4353
0
    if (currentNode != NULL)
4354
0
        xmlFree(currentNode);
4355
0
    return;
4356
0
      }
4357
0
      continue; /* while */
4358
0
        }
4359
4360
0
  else if ((CUR == '<') &&
4361
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4362
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4363
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4364
0
      if (name == NULL) {
4365
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4366
0
       "htmlParseStartTag: invalid element name\n",
4367
0
       NULL, NULL);
4368
          /* Dump the bogus tag like browsers do */
4369
0
                while ((CUR != 0) && (CUR != '>'))
4370
0
              NEXT;
4371
4372
0
          if (currentNode != NULL)
4373
0
              xmlFree(currentNode);
4374
0
          return;
4375
0
      }
4376
4377
0
      if (ctxt->name != NULL) {
4378
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4379
0
              htmlAutoClose(ctxt, name);
4380
0
              continue;
4381
0
          }
4382
0
      }
4383
0
  }
4384
4385
  /*
4386
   * Has this node been popped out during parsing of
4387
   * the next element
4388
   */
4389
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4390
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4391
0
       {
4392
0
      if (currentNode != NULL) xmlFree(currentNode);
4393
0
      return;
4394
0
  }
4395
4396
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4397
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4398
      /*
4399
       * Handle SCRIPT/STYLE separately
4400
       */
4401
0
      htmlParseScript(ctxt);
4402
0
  } else {
4403
      /*
4404
       * Sometimes DOCTYPE arrives in the middle of the document
4405
       */
4406
0
      if ((CUR == '<') && (NXT(1) == '!') &&
4407
0
    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4408
0
    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4409
0
    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4410
0
    (UPP(8) == 'E')) {
4411
0
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4412
0
                 "Misplaced DOCTYPE declaration\n",
4413
0
           BAD_CAST "DOCTYPE" , NULL);
4414
0
    htmlParseDocTypeDecl(ctxt);
4415
0
      }
4416
4417
      /*
4418
       * First case :  a comment
4419
       */
4420
0
      if ((CUR == '<') && (NXT(1) == '!') &&
4421
0
    (NXT(2) == '-') && (NXT(3) == '-')) {
4422
0
    htmlParseComment(ctxt);
4423
0
      }
4424
4425
      /*
4426
       * Second case : a Processing Instruction.
4427
       */
4428
0
      else if ((CUR == '<') && (NXT(1) == '?')) {
4429
0
    htmlParsePI(ctxt);
4430
0
      }
4431
4432
      /*
4433
       * Third case :  a sub-element.
4434
       */
4435
0
      else if (CUR == '<') {
4436
0
    htmlParseElement(ctxt);
4437
0
      }
4438
4439
      /*
4440
       * Fourth case : a reference. If if has not been resolved,
4441
       *    parsing returns it's Name, create the node
4442
       */
4443
0
      else if (CUR == '&') {
4444
0
    htmlParseReference(ctxt);
4445
0
      }
4446
4447
      /*
4448
       * Fifth case : end of the resource
4449
       */
4450
0
      else if (CUR == 0) {
4451
0
    htmlAutoCloseOnEnd(ctxt);
4452
0
    break;
4453
0
      }
4454
4455
      /*
4456
       * Last case, text. Note that References are handled directly.
4457
       */
4458
0
      else {
4459
0
    htmlParseCharData(ctxt);
4460
0
      }
4461
0
  }
4462
0
        GROW;
4463
0
    }
4464
0
    if (currentNode != NULL) xmlFree(currentNode);
4465
0
}
4466
4467
/**
4468
 * htmlParseElement:
4469
 * @ctxt:  an HTML parser context
4470
 *
4471
 * parse an HTML element, this is highly recursive
4472
 * this is kept for compatibility with previous code versions
4473
 *
4474
 * [39] element ::= EmptyElemTag | STag content ETag
4475
 *
4476
 * [41] Attribute ::= Name Eq AttValue
4477
 */
4478
4479
void
4480
0
htmlParseElement(htmlParserCtxtPtr ctxt) {
4481
0
    const xmlChar *name;
4482
0
    xmlChar *currentNode = NULL;
4483
0
    const htmlElemDesc * info;
4484
0
    htmlParserNodeInfo node_info;
4485
0
    int failed;
4486
0
    int depth;
4487
0
    const xmlChar *oldptr;
4488
4489
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4490
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4491
0
         "htmlParseElement: context error\n", NULL, NULL);
4492
0
  return;
4493
0
    }
4494
4495
0
    if (ctxt->instate == XML_PARSER_EOF)
4496
0
        return;
4497
4498
    /* Capture start position */
4499
0
    if (ctxt->record_info) {
4500
0
        node_info.begin_pos = ctxt->input->consumed +
4501
0
                          (CUR_PTR - ctxt->input->base);
4502
0
  node_info.begin_line = ctxt->input->line;
4503
0
    }
4504
4505
0
    failed = htmlParseStartTag(ctxt);
4506
0
    name = ctxt->name;
4507
0
    if ((failed == -1) || (name == NULL)) {
4508
0
  if (CUR == '>')
4509
0
      NEXT;
4510
0
        return;
4511
0
    }
4512
4513
    /*
4514
     * Lookup the info for that element.
4515
     */
4516
0
    info = htmlTagLookup(name);
4517
0
    if (info == NULL) {
4518
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4519
0
               "Tag %s invalid\n", name, NULL);
4520
0
    }
4521
4522
    /*
4523
     * Check for an Empty Element labeled the XML/SGML way
4524
     */
4525
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4526
0
        SKIP(2);
4527
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4528
0
      ctxt->sax->endElement(ctxt->userData, name);
4529
0
  htmlnamePop(ctxt);
4530
0
  return;
4531
0
    }
4532
4533
0
    if (CUR == '>') {
4534
0
        NEXT;
4535
0
    } else {
4536
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4537
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4538
4539
  /*
4540
   * end of parsing of this node.
4541
   */
4542
0
  if (xmlStrEqual(name, ctxt->name)) {
4543
0
      nodePop(ctxt);
4544
0
      htmlnamePop(ctxt);
4545
0
  }
4546
4547
  /*
4548
   * Capture end position and add node
4549
   */
4550
0
  if (ctxt->record_info) {
4551
0
     node_info.end_pos = ctxt->input->consumed +
4552
0
            (CUR_PTR - ctxt->input->base);
4553
0
     node_info.end_line = ctxt->input->line;
4554
0
     node_info.node = ctxt->node;
4555
0
     xmlParserAddNodeInfo(ctxt, &node_info);
4556
0
  }
4557
0
  return;
4558
0
    }
4559
4560
    /*
4561
     * Check for an Empty Element from DTD definition
4562
     */
4563
0
    if ((info != NULL) && (info->empty)) {
4564
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4565
0
      ctxt->sax->endElement(ctxt->userData, name);
4566
0
  htmlnamePop(ctxt);
4567
0
  return;
4568
0
    }
4569
4570
    /*
4571
     * Parse the content of the element:
4572
     */
4573
0
    currentNode = xmlStrdup(ctxt->name);
4574
0
    depth = ctxt->nameNr;
4575
0
    while (CUR != 0) {
4576
0
  oldptr = ctxt->input->cur;
4577
0
  htmlParseContent(ctxt);
4578
0
  if (oldptr==ctxt->input->cur) break;
4579
0
  if (ctxt->nameNr < depth) break;
4580
0
    }
4581
4582
    /*
4583
     * Capture end position and add node
4584
     */
4585
0
    if ( currentNode != NULL && ctxt->record_info ) {
4586
0
       node_info.end_pos = ctxt->input->consumed +
4587
0
                          (CUR_PTR - ctxt->input->base);
4588
0
       node_info.end_line = ctxt->input->line;
4589
0
       node_info.node = ctxt->node;
4590
0
       xmlParserAddNodeInfo(ctxt, &node_info);
4591
0
    }
4592
0
    if (CUR == 0) {
4593
0
  htmlAutoCloseOnEnd(ctxt);
4594
0
    }
4595
4596
0
    if (currentNode != NULL)
4597
0
  xmlFree(currentNode);
4598
0
}
4599
4600
static void
4601
16.9k
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4602
    /*
4603
     * Capture end position and add node
4604
     */
4605
16.9k
    if ( ctxt->node != NULL && ctxt->record_info ) {
4606
0
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4607
0
                                (CUR_PTR - ctxt->input->base);
4608
0
       ctxt->nodeInfo->end_line = ctxt->input->line;
4609
0
       ctxt->nodeInfo->node = ctxt->node;
4610
0
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4611
0
       htmlNodeInfoPop(ctxt);
4612
0
    }
4613
16.9k
    if (CUR == 0) {
4614
4.17k
       htmlAutoCloseOnEnd(ctxt);
4615
4.17k
    }
4616
16.9k
}
4617
4618
/**
4619
 * htmlParseElementInternal:
4620
 * @ctxt:  an HTML parser context
4621
 *
4622
 * parse an HTML element, new version, non recursive
4623
 *
4624
 * [39] element ::= EmptyElemTag | STag content ETag
4625
 *
4626
 * [41] Attribute ::= Name Eq AttValue
4627
 */
4628
4629
static void
4630
8.69M
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4631
8.69M
    const xmlChar *name;
4632
8.69M
    const htmlElemDesc * info;
4633
8.69M
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4634
8.69M
    int failed;
4635
4636
8.69M
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4637
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4638
0
         "htmlParseElementInternal: context error\n", NULL, NULL);
4639
0
  return;
4640
0
    }
4641
4642
8.69M
    if (ctxt->instate == XML_PARSER_EOF)
4643
0
        return;
4644
4645
    /* Capture start position */
4646
8.69M
    if (ctxt->record_info) {
4647
0
        node_info.begin_pos = ctxt->input->consumed +
4648
0
                          (CUR_PTR - ctxt->input->base);
4649
0
  node_info.begin_line = ctxt->input->line;
4650
0
    }
4651
4652
8.69M
    failed = htmlParseStartTag(ctxt);
4653
8.69M
    name = ctxt->name;
4654
8.69M
    if ((failed == -1) || (name == NULL)) {
4655
24.4k
  if (CUR == '>')
4656
22.9k
      NEXT;
4657
24.4k
        return;
4658
24.4k
    }
4659
4660
    /*
4661
     * Lookup the info for that element.
4662
     */
4663
8.67M
    info = htmlTagLookup(name);
4664
8.67M
    if (info == NULL) {
4665
321k
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4666
321k
               "Tag %s invalid\n", name, NULL);
4667
321k
    }
4668
4669
    /*
4670
     * Check for an Empty Element labeled the XML/SGML way
4671
     */
4672
8.67M
    if ((CUR == '/') && (NXT(1) == '>')) {
4673
6.90k
        SKIP(2);
4674
6.90k
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4675
6.90k
      ctxt->sax->endElement(ctxt->userData, name);
4676
6.90k
  htmlnamePop(ctxt);
4677
6.90k
  return;
4678
6.90k
    }
4679
4680
8.66M
    if (CUR == '>') {
4681
8.65M
        NEXT;
4682
8.65M
    } else {
4683
4.20k
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4684
4.20k
               "Couldn't find end of Start Tag %s\n", name, NULL);
4685
4686
  /*
4687
   * end of parsing of this node.
4688
   */
4689
4.20k
  if (xmlStrEqual(name, ctxt->name)) {
4690
4.20k
      nodePop(ctxt);
4691
4.20k
      htmlnamePop(ctxt);
4692
4.20k
  }
4693
4694
4.20k
        if (ctxt->record_info)
4695
0
            htmlNodeInfoPush(ctxt, &node_info);
4696
4.20k
        htmlParserFinishElementParsing(ctxt);
4697
4.20k
  return;
4698
4.20k
    }
4699
4700
    /*
4701
     * Check for an Empty Element from DTD definition
4702
     */
4703
8.65M
    if ((info != NULL) && (info->empty)) {
4704
6.87k
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4705
6.87k
      ctxt->sax->endElement(ctxt->userData, name);
4706
6.87k
  htmlnamePop(ctxt);
4707
6.87k
  return;
4708
6.87k
    }
4709
4710
8.65M
    if (ctxt->record_info)
4711
0
        htmlNodeInfoPush(ctxt, &node_info);
4712
8.65M
}
4713
4714
/**
4715
 * htmlParseContentInternal:
4716
 * @ctxt:  an HTML parser context
4717
 *
4718
 * Parse a content: comment, sub-element, reference or text.
4719
 * New version for non recursive htmlParseElementInternal
4720
 */
4721
4722
static void
4723
12.0k
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4724
12.0k
    xmlChar *currentNode;
4725
12.0k
    int depth;
4726
12.0k
    const xmlChar *name;
4727
4728
12.0k
    currentNode = xmlStrdup(ctxt->name);
4729
12.0k
    depth = ctxt->nameNr;
4730
9.26M
    while (1) {
4731
9.26M
        GROW;
4732
4733
9.26M
        if (ctxt->instate == XML_PARSER_EOF)
4734
301
            break;
4735
4736
  /*
4737
   * Our tag or one of it's parent or children is ending.
4738
   */
4739
9.26M
        if ((CUR == '<') && (NXT(1) == '/')) {
4740
44.0k
      if (htmlParseEndTag(ctxt) &&
4741
44.0k
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4742
15.9k
    if (currentNode != NULL)
4743
14.2k
        xmlFree(currentNode);
4744
4745
15.9k
          currentNode = xmlStrdup(ctxt->name);
4746
15.9k
          depth = ctxt->nameNr;
4747
15.9k
      }
4748
44.0k
      continue; /* while */
4749
44.0k
        }
4750
4751
9.22M
  else if ((CUR == '<') &&
4752
9.22M
           ((IS_ASCII_LETTER(NXT(1))) ||
4753
8.75M
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4754
8.70M
      name = htmlParseHTMLName_nonInvasive(ctxt);
4755
8.70M
      if (name == NULL) {
4756
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4757
0
       "htmlParseStartTag: invalid element name\n",
4758
0
       NULL, NULL);
4759
          /* Dump the bogus tag like browsers do */
4760
0
          while ((CUR == 0) && (CUR != '>'))
4761
0
              NEXT;
4762
4763
0
          htmlParserFinishElementParsing(ctxt);
4764
0
          if (currentNode != NULL)
4765
0
              xmlFree(currentNode);
4766
4767
0
          currentNode = xmlStrdup(ctxt->name);
4768
0
          depth = ctxt->nameNr;
4769
0
          continue;
4770
0
      }
4771
4772
8.70M
      if (ctxt->name != NULL) {
4773
8.69M
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4774
15.8k
              htmlAutoClose(ctxt, name);
4775
15.8k
              continue;
4776
15.8k
          }
4777
8.69M
      }
4778
8.70M
  }
4779
4780
  /*
4781
   * Has this node been popped out during parsing of
4782
   * the next element
4783
   */
4784
9.20M
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4785
9.20M
      (!xmlStrEqual(currentNode, ctxt->name)))
4786
12.7k
       {
4787
12.7k
      htmlParserFinishElementParsing(ctxt);
4788
12.7k
      if (currentNode != NULL) xmlFree(currentNode);
4789
4790
12.7k
      currentNode = xmlStrdup(ctxt->name);
4791
12.7k
      depth = ctxt->nameNr;
4792
12.7k
      continue;
4793
12.7k
  }
4794
4795
9.19M
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4796
9.18M
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4797
      /*
4798
       * Handle SCRIPT/STYLE separately
4799
       */
4800
4.68k
      htmlParseScript(ctxt);
4801
9.18M
  } else {
4802
      /*
4803
       * Sometimes DOCTYPE arrives in the middle of the document
4804
       */
4805
9.18M
      if ((CUR == '<') && (NXT(1) == '!') &&
4806
9.18M
    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4807
9.18M
    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4808
9.18M
    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4809
9.18M
    (UPP(8) == 'E')) {
4810
3.99k
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4811
3.99k
                 "Misplaced DOCTYPE declaration\n",
4812
3.99k
           BAD_CAST "DOCTYPE" , NULL);
4813
3.99k
    htmlParseDocTypeDecl(ctxt);
4814
3.99k
      }
4815
4816
      /*
4817
       * First case :  a comment
4818
       */
4819
9.18M
      if ((CUR == '<') && (NXT(1) == '!') &&
4820
9.18M
    (NXT(2) == '-') && (NXT(3) == '-')) {
4821
16.9k
    htmlParseComment(ctxt);
4822
16.9k
      }
4823
4824
      /*
4825
       * Second case : a Processing Instruction.
4826
       */
4827
9.17M
      else if ((CUR == '<') && (NXT(1) == '?')) {
4828
12.7k
    htmlParsePI(ctxt);
4829
12.7k
      }
4830
4831
      /*
4832
       * Third case :  a sub-element.
4833
       */
4834
9.15M
      else if (CUR == '<') {
4835
8.69M
    htmlParseElementInternal(ctxt);
4836
8.69M
    if (currentNode != NULL) xmlFree(currentNode);
4837
4838
8.69M
    currentNode = xmlStrdup(ctxt->name);
4839
8.69M
    depth = ctxt->nameNr;
4840
8.69M
      }
4841
4842
      /*
4843
       * Fourth case : a reference. If if has not been resolved,
4844
       *    parsing returns it's Name, create the node
4845
       */
4846
464k
      else if (CUR == '&') {
4847
241k
    htmlParseReference(ctxt);
4848
241k
      }
4849
4850
      /*
4851
       * Fifth case : end of the resource
4852
       */
4853
223k
      else if (CUR == 0) {
4854
11.7k
    htmlAutoCloseOnEnd(ctxt);
4855
11.7k
    break;
4856
11.7k
      }
4857
4858
      /*
4859
       * Last case, text. Note that References are handled directly.
4860
       */
4861
211k
      else {
4862
211k
    htmlParseCharData(ctxt);
4863
211k
      }
4864
9.18M
  }
4865
9.18M
        GROW;
4866
9.18M
    }
4867
12.0k
    if (currentNode != NULL) xmlFree(currentNode);
4868
12.0k
}
4869
4870
/**
4871
 * htmlParseContent:
4872
 * @ctxt:  an HTML parser context
4873
 *
4874
 * Parse a content: comment, sub-element, reference or text.
4875
 * This is the entry point when called from parser.c
4876
 */
4877
4878
void
4879
0
__htmlParseContent(void *ctxt) {
4880
0
    if (ctxt != NULL)
4881
0
  htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4882
0
}
4883
4884
/**
4885
 * htmlParseDocument:
4886
 * @ctxt:  an HTML parser context
4887
 *
4888
 * parse an HTML document (and build a tree if using the standard SAX
4889
 * interface).
4890
 *
4891
 * Returns 0, -1 in case of error. the parser context is augmented
4892
 *                as a result of the parsing.
4893
 */
4894
4895
int
4896
12.0k
htmlParseDocument(htmlParserCtxtPtr ctxt) {
4897
12.0k
    xmlChar start[4];
4898
12.0k
    xmlCharEncoding enc;
4899
12.0k
    xmlDtdPtr dtd;
4900
4901
12.0k
    xmlInitParser();
4902
4903
12.0k
    htmlDefaultSAXHandlerInit();
4904
4905
12.0k
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4906
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4907
0
         "htmlParseDocument: context error\n", NULL, NULL);
4908
0
  return(XML_ERR_INTERNAL_ERROR);
4909
0
    }
4910
12.0k
    ctxt->html = 1;
4911
12.0k
    ctxt->linenumbers = 1;
4912
12.0k
    GROW;
4913
    /*
4914
     * SAX: beginning of the document processing.
4915
     */
4916
12.0k
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4917
12.0k
        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4918
4919
12.0k
    if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4920
12.0k
        ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4921
  /*
4922
   * Get the 4 first bytes and decode the charset
4923
   * if enc != XML_CHAR_ENCODING_NONE
4924
   * plug some encoding conversion routines.
4925
   */
4926
11.4k
  start[0] = RAW;
4927
11.4k
  start[1] = NXT(1);
4928
11.4k
  start[2] = NXT(2);
4929
11.4k
  start[3] = NXT(3);
4930
11.4k
  enc = xmlDetectCharEncoding(&start[0], 4);
4931
11.4k
  if (enc != XML_CHAR_ENCODING_NONE) {
4932
391
      xmlSwitchEncoding(ctxt, enc);
4933
391
  }
4934
11.4k
    }
4935
4936
    /*
4937
     * Wipe out everything which is before the first '<'
4938
     */
4939
12.0k
    SKIP_BLANKS;
4940
12.0k
    if (CUR == 0) {
4941
643
  htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4942
643
               "Document is empty\n", NULL, NULL);
4943
643
    }
4944
4945
12.0k
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4946
12.0k
  ctxt->sax->startDocument(ctxt->userData);
4947
4948
4949
    /*
4950
     * Parse possible comments and PIs before any content
4951
     */
4952
55.6k
    while (((CUR == '<') && (NXT(1) == '!') &&
4953
55.6k
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4954
55.6k
     ((CUR == '<') && (NXT(1) == '?'))) {
4955
43.6k
        htmlParseComment(ctxt);
4956
43.6k
        htmlParsePI(ctxt);
4957
43.6k
  SKIP_BLANKS;
4958
43.6k
    }
4959
4960
4961
    /*
4962
     * Then possibly doc type declaration(s) and more Misc
4963
     * (doctypedecl Misc*)?
4964
     */
4965
12.0k
    if ((CUR == '<') && (NXT(1) == '!') &&
4966
12.0k
  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4967
12.0k
  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4968
12.0k
  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4969
12.0k
  (UPP(8) == 'E')) {
4970
663
  htmlParseDocTypeDecl(ctxt);
4971
663
    }
4972
12.0k
    SKIP_BLANKS;
4973
4974
    /*
4975
     * Parse possible comments and PIs before any content
4976
     */
4977
24.9k
    while (((CUR == '<') && (NXT(1) == '!') &&
4978
24.9k
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4979
24.9k
     ((CUR == '<') && (NXT(1) == '?'))) {
4980
12.9k
        htmlParseComment(ctxt);
4981
12.9k
        htmlParsePI(ctxt);
4982
12.9k
  SKIP_BLANKS;
4983
12.9k
    }
4984
4985
    /*
4986
     * Time to start parsing the tree itself
4987
     */
4988
12.0k
    htmlParseContentInternal(ctxt);
4989
4990
    /*
4991
     * autoclose
4992
     */
4993
12.0k
    if (CUR == 0)
4994
12.0k
  htmlAutoCloseOnEnd(ctxt);
4995
4996
4997
    /*
4998
     * SAX: end of the document processing.
4999
     */
5000
12.0k
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5001
12.0k
        ctxt->sax->endDocument(ctxt->userData);
5002
5003
12.0k
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5004
7.07k
  dtd = xmlGetIntSubset(ctxt->myDoc);
5005
7.07k
  if (dtd == NULL)
5006
6.54k
      ctxt->myDoc->intSubset =
5007
6.54k
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5008
6.54k
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5009
6.54k
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5010
7.07k
    }
5011
12.0k
    if (! ctxt->wellFormed) return(-1);
5012
615
    return(0);
5013
12.0k
}
5014
5015
5016
/************************************************************************
5017
 *                  *
5018
 *      Parser contexts handling      *
5019
 *                  *
5020
 ************************************************************************/
5021
5022
/**
5023
 * htmlInitParserCtxt:
5024
 * @ctxt:  an HTML parser context
5025
 *
5026
 * Initialize a parser context
5027
 *
5028
 * Returns 0 in case of success and -1 in case of error
5029
 */
5030
5031
static int
5032
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
5033
12.0k
{
5034
12.0k
    htmlSAXHandler *sax;
5035
5036
12.0k
    if (ctxt == NULL) return(-1);
5037
12.0k
    memset(ctxt, 0, sizeof(htmlParserCtxt));
5038
5039
12.0k
    ctxt->dict = xmlDictCreate();
5040
12.0k
    if (ctxt->dict == NULL) {
5041
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5042
0
  return(-1);
5043
0
    }
5044
12.0k
    sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5045
12.0k
    if (sax == NULL) {
5046
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5047
0
  return(-1);
5048
0
    }
5049
12.0k
    else
5050
12.0k
        memset(sax, 0, sizeof(htmlSAXHandler));
5051
5052
    /* Allocate the Input stack */
5053
12.0k
    ctxt->inputTab = (htmlParserInputPtr *)
5054
12.0k
                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
5055
12.0k
    if (ctxt->inputTab == NULL) {
5056
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5057
0
  ctxt->inputNr = 0;
5058
0
  ctxt->inputMax = 0;
5059
0
  ctxt->input = NULL;
5060
0
  return(-1);
5061
0
    }
5062
12.0k
    ctxt->inputNr = 0;
5063
12.0k
    ctxt->inputMax = 5;
5064
12.0k
    ctxt->input = NULL;
5065
12.0k
    ctxt->version = NULL;
5066
12.0k
    ctxt->encoding = NULL;
5067
12.0k
    ctxt->standalone = -1;
5068
12.0k
    ctxt->instate = XML_PARSER_START;
5069
5070
    /* Allocate the Node stack */
5071
12.0k
    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5072
12.0k
    if (ctxt->nodeTab == NULL) {
5073
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5074
0
  ctxt->nodeNr = 0;
5075
0
  ctxt->nodeMax = 0;
5076
0
  ctxt->node = NULL;
5077
0
  ctxt->inputNr = 0;
5078
0
  ctxt->inputMax = 0;
5079
0
  ctxt->input = NULL;
5080
0
  return(-1);
5081
0
    }
5082
12.0k
    ctxt->nodeNr = 0;
5083
12.0k
    ctxt->nodeMax = 10;
5084
12.0k
    ctxt->node = NULL;
5085
5086
    /* Allocate the Name stack */
5087
12.0k
    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5088
12.0k
    if (ctxt->nameTab == NULL) {
5089
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5090
0
  ctxt->nameNr = 0;
5091
0
  ctxt->nameMax = 0;
5092
0
  ctxt->name = NULL;
5093
0
  ctxt->nodeNr = 0;
5094
0
  ctxt->nodeMax = 0;
5095
0
  ctxt->node = NULL;
5096
0
  ctxt->inputNr = 0;
5097
0
  ctxt->inputMax = 0;
5098
0
  ctxt->input = NULL;
5099
0
  return(-1);
5100
0
    }
5101
12.0k
    ctxt->nameNr = 0;
5102
12.0k
    ctxt->nameMax = 10;
5103
12.0k
    ctxt->name = NULL;
5104
5105
12.0k
    ctxt->nodeInfoTab = NULL;
5106
12.0k
    ctxt->nodeInfoNr  = 0;
5107
12.0k
    ctxt->nodeInfoMax = 0;
5108
5109
12.0k
    if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
5110
12.0k
    else {
5111
12.0k
        ctxt->sax = sax;
5112
12.0k
  memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
5113
12.0k
    }
5114
12.0k
    ctxt->userData = ctxt;
5115
12.0k
    ctxt->myDoc = NULL;
5116
12.0k
    ctxt->wellFormed = 1;
5117
12.0k
    ctxt->replaceEntities = 0;
5118
12.0k
    ctxt->linenumbers = xmlLineNumbersDefaultValue;
5119
12.0k
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5120
12.0k
    ctxt->html = 1;
5121
12.0k
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5122
12.0k
    ctxt->vctxt.userData = ctxt;
5123
12.0k
    ctxt->vctxt.error = xmlParserValidityError;
5124
12.0k
    ctxt->vctxt.warning = xmlParserValidityWarning;
5125
12.0k
    ctxt->record_info = 0;
5126
12.0k
    ctxt->validate = 0;
5127
12.0k
    ctxt->checkIndex = 0;
5128
12.0k
    ctxt->catalogs = NULL;
5129
12.0k
    xmlInitNodeInfoSeq(&ctxt->node_seq);
5130
12.0k
    return(0);
5131
12.0k
}
5132
5133
/**
5134
 * htmlFreeParserCtxt:
5135
 * @ctxt:  an HTML parser context
5136
 *
5137
 * Free all the memory used by a parser context. However the parsed
5138
 * document in ctxt->myDoc is not freed.
5139
 */
5140
5141
void
5142
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5143
12.0k
{
5144
12.0k
    xmlFreeParserCtxt(ctxt);
5145
12.0k
}
5146
5147
/**
5148
 * htmlNewParserCtxt:
5149
 *
5150
 * Allocate and initialize a new parser context.
5151
 *
5152
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5153
 */
5154
5155
htmlParserCtxtPtr
5156
htmlNewParserCtxt(void)
5157
12.0k
{
5158
12.0k
    xmlParserCtxtPtr ctxt;
5159
5160
12.0k
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5161
12.0k
    if (ctxt == NULL) {
5162
0
        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5163
0
  return(NULL);
5164
0
    }
5165
12.0k
    memset(ctxt, 0, sizeof(xmlParserCtxt));
5166
12.0k
    if (htmlInitParserCtxt(ctxt) < 0) {
5167
0
        htmlFreeParserCtxt(ctxt);
5168
0
  return(NULL);
5169
0
    }
5170
12.0k
    return(ctxt);
5171
12.0k
}
5172
5173
/**
5174
 * htmlCreateMemoryParserCtxt:
5175
 * @buffer:  a pointer to a char array
5176
 * @size:  the size of the array
5177
 *
5178
 * Create a parser context for an HTML in-memory document.
5179
 *
5180
 * Returns the new parser context or NULL
5181
 */
5182
htmlParserCtxtPtr
5183
0
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5184
0
    xmlParserCtxtPtr ctxt;
5185
0
    xmlParserInputPtr input;
5186
0
    xmlParserInputBufferPtr buf;
5187
5188
0
    if (buffer == NULL)
5189
0
  return(NULL);
5190
0
    if (size <= 0)
5191
0
  return(NULL);
5192
5193
0
    ctxt = htmlNewParserCtxt();
5194
0
    if (ctxt == NULL)
5195
0
  return(NULL);
5196
5197
0
    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5198
0
    if (buf == NULL) return(NULL);
5199
5200
0
    input = xmlNewInputStream(ctxt);
5201
0
    if (input == NULL) {
5202
0
  xmlFreeParserInputBuffer(buf);
5203
0
  xmlFreeParserCtxt(ctxt);
5204
0
  return(NULL);
5205
0
    }
5206
5207
0
    input->filename = NULL;
5208
0
    input->buf = buf;
5209
0
    xmlBufResetInput(buf->buffer, input);
5210
5211
0
    inputPush(ctxt, input);
5212
0
    return(ctxt);
5213
0
}
5214
5215
/**
5216
 * htmlCreateDocParserCtxt:
5217
 * @cur:  a pointer to an array of xmlChar
5218
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5219
 *
5220
 * Create a parser context for an HTML document.
5221
 *
5222
 * TODO: check the need to add encoding handling there
5223
 *
5224
 * Returns the new parser context or NULL
5225
 */
5226
static htmlParserCtxtPtr
5227
0
htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5228
0
    int len;
5229
0
    htmlParserCtxtPtr ctxt;
5230
5231
0
    if (cur == NULL)
5232
0
  return(NULL);
5233
0
    len = xmlStrlen(cur);
5234
0
    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5235
0
    if (ctxt == NULL)
5236
0
  return(NULL);
5237
5238
0
    if (encoding != NULL) {
5239
0
  xmlCharEncoding enc;
5240
0
  xmlCharEncodingHandlerPtr handler;
5241
5242
0
  if (ctxt->input->encoding != NULL)
5243
0
      xmlFree((xmlChar *) ctxt->input->encoding);
5244
0
  ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5245
5246
0
  enc = xmlParseCharEncoding(encoding);
5247
  /*
5248
   * registered set of known encodings
5249
   */
5250
0
  if (enc != XML_CHAR_ENCODING_ERROR) {
5251
0
      xmlSwitchEncoding(ctxt, enc);
5252
0
      if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5253
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5254
0
                 "Unsupported encoding %s\n",
5255
0
           (const xmlChar *) encoding, NULL);
5256
0
      }
5257
0
  } else {
5258
      /*
5259
       * fallback for unknown encodings
5260
       */
5261
0
      handler = xmlFindCharEncodingHandler((const char *) encoding);
5262
0
      if (handler != NULL) {
5263
0
    xmlSwitchToEncoding(ctxt, handler);
5264
0
      } else {
5265
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5266
0
                 "Unsupported encoding %s\n",
5267
0
           (const xmlChar *) encoding, NULL);
5268
0
      }
5269
0
  }
5270
0
    }
5271
0
    return(ctxt);
5272
0
}
5273
5274
#ifdef LIBXML_PUSH_ENABLED
5275
/************************************************************************
5276
 *                  *
5277
 *  Progressive parsing interfaces        *
5278
 *                  *
5279
 ************************************************************************/
5280
5281
/**
5282
 * htmlParseLookupSequence:
5283
 * @ctxt:  an HTML parser context
5284
 * @first:  the first char to lookup
5285
 * @next:  the next char to lookup or zero
5286
 * @third:  the next char to lookup or zero
5287
 * @ignoreattrval: skip over attribute values
5288
 *
5289
 * Try to find if a sequence (first, next, third) or  just (first next) or
5290
 * (first) is available in the input stream.
5291
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5292
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5293
 * parser, do not use liberally.
5294
 * This is basically similar to xmlParseLookupSequence()
5295
 *
5296
 * Returns the index to the current parsing point if the full sequence
5297
 *      is available, -1 otherwise.
5298
 */
5299
static int
5300
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5301
                        xmlChar next, xmlChar third, int ignoreattrval)
5302
10.7M
{
5303
10.7M
    int base, len;
5304
10.7M
    htmlParserInputPtr in;
5305
10.7M
    const xmlChar *buf;
5306
10.7M
    int invalue = 0;
5307
10.7M
    char valdellim = 0x0;
5308
5309
10.7M
    in = ctxt->input;
5310
10.7M
    if (in == NULL)
5311
0
        return (-1);
5312
5313
10.7M
    base = in->cur - in->base;
5314
10.7M
    if (base < 0)
5315
0
        return (-1);
5316
5317
10.7M
    if (ctxt->checkIndex > base) {
5318
1.95M
        base = ctxt->checkIndex;
5319
        /* Abuse hasPErefs member to restore current state. */
5320
1.95M
        invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5321
1.95M
    }
5322
5323
10.7M
    if (in->buf == NULL) {
5324
0
        buf = in->base;
5325
0
        len = in->length;
5326
10.7M
    } else {
5327
10.7M
        buf = xmlBufContent(in->buf->buffer);
5328
10.7M
        len = xmlBufUse(in->buf->buffer);
5329
10.7M
    }
5330
5331
    /* take into account the sequence length */
5332
10.7M
    if (third)
5333
0
        len -= 2;
5334
10.7M
    else if (next)
5335
600k
        len--;
5336
2.38G
    for (; base < len; base++) {
5337
2.38G
        if (ignoreattrval) {
5338
860M
            if (buf[base] == '"' || buf[base] == '\'') {
5339
191k
                if (invalue) {
5340
186k
                    if (buf[base] == valdellim) {
5341
3.24k
                        invalue = 0;
5342
3.24k
                        continue;
5343
3.24k
                    }
5344
186k
                } else {
5345
4.31k
                    valdellim = buf[base];
5346
4.31k
                    invalue = 1;
5347
4.31k
                    continue;
5348
4.31k
                }
5349
860M
            } else if (invalue) {
5350
532M
                continue;
5351
532M
            }
5352
860M
        }
5353
1.85G
        if (buf[base] == first) {
5354
9.04M
            if (third != 0) {
5355
0
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
5356
0
                    continue;
5357
9.04M
            } else if (next != 0) {
5358
348k
                if (buf[base + 1] != next)
5359
74.8k
                    continue;
5360
348k
            }
5361
8.96M
            ctxt->checkIndex = 0;
5362
#ifdef DEBUG_PUSH
5363
            if (next == 0)
5364
                xmlGenericError(xmlGenericErrorContext,
5365
                                "HPP: lookup '%c' found at %d\n",
5366
                                first, base);
5367
            else if (third == 0)
5368
                xmlGenericError(xmlGenericErrorContext,
5369
                                "HPP: lookup '%c%c' found at %d\n",
5370
                                first, next, base);
5371
            else
5372
                xmlGenericError(xmlGenericErrorContext,
5373
                                "HPP: lookup '%c%c%c' found at %d\n",
5374
                                first, next, third, base);
5375
#endif
5376
8.96M
            return (base - (in->cur - in->base));
5377
9.04M
        }
5378
1.85G
    }
5379
1.80M
    ctxt->checkIndex = base;
5380
    /* Abuse hasPErefs member to track current state. */
5381
1.80M
    if (invalue)
5382
508k
        ctxt->hasPErefs |= 1;
5383
1.29M
    else
5384
1.29M
        ctxt->hasPErefs &= ~1;
5385
#ifdef DEBUG_PUSH
5386
    if (next == 0)
5387
        xmlGenericError(xmlGenericErrorContext,
5388
                        "HPP: lookup '%c' failed\n", first);
5389
    else if (third == 0)
5390
        xmlGenericError(xmlGenericErrorContext,
5391
                        "HPP: lookup '%c%c' failed\n", first, next);
5392
    else
5393
        xmlGenericError(xmlGenericErrorContext,
5394
                        "HPP: lookup '%c%c%c' failed\n", first, next,
5395
                        third);
5396
#endif
5397
1.80M
    return (-1);
5398
10.7M
}
5399
5400
/**
5401
 * htmlParseLookupCommentEnd:
5402
 * @ctxt: an HTML parser context
5403
 *
5404
 * Try to find a comment end tag in the input stream
5405
 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5406
 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5407
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5408
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5409
 * parser, do not use liberally.
5410
 * This wraps to htmlParseLookupSequence()
5411
 *
5412
 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5413
 */
5414
static int
5415
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5416
281k
{
5417
281k
    int mark = 0;
5418
281k
    int cur = CUR_PTR - BASE_PTR;
5419
5420
437k
    while (mark >= 0) {
5421
437k
  mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5422
437k
  if ((mark < 0) ||
5423
437k
      (NXT(mark+2) == '>') ||
5424
437k
      ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5425
281k
      return mark;
5426
281k
  }
5427
156k
  ctxt->checkIndex = cur + mark + 1;
5428
156k
    }
5429
0
    return mark;
5430
281k
}
5431
5432
5433
/**
5434
 * htmlParseTryOrFinish:
5435
 * @ctxt:  an HTML parser context
5436
 * @terminate:  last chunk indicator
5437
 *
5438
 * Try to progress on parsing
5439
 *
5440
 * Returns zero if no parsing was possible
5441
 */
5442
static int
5443
2.06M
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5444
2.06M
    int ret = 0;
5445
2.06M
    htmlParserInputPtr in;
5446
2.06M
    ptrdiff_t avail = 0;
5447
2.06M
    xmlChar cur, next;
5448
5449
2.06M
    htmlParserNodeInfo node_info;
5450
5451
#ifdef DEBUG_PUSH
5452
    switch (ctxt->instate) {
5453
  case XML_PARSER_EOF:
5454
      xmlGenericError(xmlGenericErrorContext,
5455
        "HPP: try EOF\n"); break;
5456
  case XML_PARSER_START:
5457
      xmlGenericError(xmlGenericErrorContext,
5458
        "HPP: try START\n"); break;
5459
  case XML_PARSER_MISC:
5460
      xmlGenericError(xmlGenericErrorContext,
5461
        "HPP: try MISC\n");break;
5462
  case XML_PARSER_COMMENT:
5463
      xmlGenericError(xmlGenericErrorContext,
5464
        "HPP: try COMMENT\n");break;
5465
  case XML_PARSER_PROLOG:
5466
      xmlGenericError(xmlGenericErrorContext,
5467
        "HPP: try PROLOG\n");break;
5468
  case XML_PARSER_START_TAG:
5469
      xmlGenericError(xmlGenericErrorContext,
5470
        "HPP: try START_TAG\n");break;
5471
  case XML_PARSER_CONTENT:
5472
      xmlGenericError(xmlGenericErrorContext,
5473
        "HPP: try CONTENT\n");break;
5474
  case XML_PARSER_CDATA_SECTION:
5475
      xmlGenericError(xmlGenericErrorContext,
5476
        "HPP: try CDATA_SECTION\n");break;
5477
  case XML_PARSER_END_TAG:
5478
      xmlGenericError(xmlGenericErrorContext,
5479
        "HPP: try END_TAG\n");break;
5480
  case XML_PARSER_ENTITY_DECL:
5481
      xmlGenericError(xmlGenericErrorContext,
5482
        "HPP: try ENTITY_DECL\n");break;
5483
  case XML_PARSER_ENTITY_VALUE:
5484
      xmlGenericError(xmlGenericErrorContext,
5485
        "HPP: try ENTITY_VALUE\n");break;
5486
  case XML_PARSER_ATTRIBUTE_VALUE:
5487
      xmlGenericError(xmlGenericErrorContext,
5488
        "HPP: try ATTRIBUTE_VALUE\n");break;
5489
  case XML_PARSER_DTD:
5490
      xmlGenericError(xmlGenericErrorContext,
5491
        "HPP: try DTD\n");break;
5492
  case XML_PARSER_EPILOG:
5493
      xmlGenericError(xmlGenericErrorContext,
5494
        "HPP: try EPILOG\n");break;
5495
  case XML_PARSER_PI:
5496
      xmlGenericError(xmlGenericErrorContext,
5497
        "HPP: try PI\n");break;
5498
  case XML_PARSER_SYSTEM_LITERAL:
5499
      xmlGenericError(xmlGenericErrorContext,
5500
        "HPP: try SYSTEM_LITERAL\n");break;
5501
    }
5502
#endif
5503
5504
22.2M
    while (1) {
5505
5506
22.2M
  in = ctxt->input;
5507
22.2M
  if (in == NULL) break;
5508
22.2M
  if (in->buf == NULL)
5509
887
      avail = in->length - (in->cur - in->base);
5510
22.2M
  else
5511
22.2M
      avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5512
22.2M
                    (in->cur - in->base);
5513
22.2M
  if ((avail == 0) && (terminate)) {
5514
10.9k
      htmlAutoCloseOnEnd(ctxt);
5515
10.9k
      if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5516
    /*
5517
     * SAX: end of the document processing.
5518
     */
5519
10.8k
    ctxt->instate = XML_PARSER_EOF;
5520
10.8k
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5521
10.8k
        ctxt->sax->endDocument(ctxt->userData);
5522
10.8k
      }
5523
10.9k
  }
5524
22.2M
        if (avail < 1)
5525
85.6k
      goto done;
5526
        /*
5527
         * This is done to make progress and avoid an infinite loop
5528
         * if a parsing attempt was aborted by hitting a NUL byte. After
5529
         * changing htmlCurrentChar, this probably isn't necessary anymore.
5530
         * We should consider removing this check.
5531
         */
5532
22.1M
  cur = in->cur[0];
5533
22.1M
  if (cur == 0) {
5534
1.09M
      SKIP(1);
5535
1.09M
      continue;
5536
1.09M
  }
5537
5538
21.0M
        switch (ctxt->instate) {
5539
59.6k
            case XML_PARSER_EOF:
5540
          /*
5541
     * Document parsing is done !
5542
     */
5543
59.6k
          goto done;
5544
27.5k
            case XML_PARSER_START:
5545
          /*
5546
     * Very first chars read from the document flow.
5547
     */
5548
27.5k
    cur = in->cur[0];
5549
27.5k
    if (IS_BLANK_CH(cur)) {
5550
198
        SKIP_BLANKS;
5551
198
        if (in->buf == NULL)
5552
0
      avail = in->length - (in->cur - in->base);
5553
198
        else
5554
198
      avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5555
198
                                (in->cur - in->base);
5556
198
    }
5557
27.5k
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5558
27.5k
        ctxt->sax->setDocumentLocator(ctxt->userData,
5559
27.5k
              &xmlDefaultSAXLocator);
5560
27.5k
    if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5561
27.5k
              (!ctxt->disableSAX))
5562
27.5k
        ctxt->sax->startDocument(ctxt->userData);
5563
5564
27.5k
    cur = in->cur[0];
5565
27.5k
    next = in->cur[1];
5566
27.5k
    if ((cur == '<') && (next == '!') &&
5567
27.5k
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5568
27.5k
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5569
27.5k
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5570
27.5k
        (UPP(8) == 'E')) {
5571
16.2k
        if ((!terminate) &&
5572
16.2k
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5573
15.5k
      goto done;
5574
#ifdef DEBUG_PUSH
5575
        xmlGenericError(xmlGenericErrorContext,
5576
          "HPP: Parsing internal subset\n");
5577
#endif
5578
664
        htmlParseDocTypeDecl(ctxt);
5579
664
        ctxt->instate = XML_PARSER_PROLOG;
5580
#ifdef DEBUG_PUSH
5581
        xmlGenericError(xmlGenericErrorContext,
5582
          "HPP: entering PROLOG\n");
5583
#endif
5584
11.3k
                } else {
5585
11.3k
        ctxt->instate = XML_PARSER_MISC;
5586
#ifdef DEBUG_PUSH
5587
        xmlGenericError(xmlGenericErrorContext,
5588
          "HPP: entering MISC\n");
5589
#endif
5590
11.3k
    }
5591
12.0k
    break;
5592
114k
            case XML_PARSER_MISC:
5593
114k
    SKIP_BLANKS;
5594
114k
    if (in->buf == NULL)
5595
0
        avail = in->length - (in->cur - in->base);
5596
114k
    else
5597
114k
        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5598
114k
                            (in->cur - in->base);
5599
    /*
5600
     * no chars in buffer
5601
     */
5602
114k
    if (avail < 1)
5603
502
        goto done;
5604
    /*
5605
     * not enough chars in buffer
5606
     */
5607
113k
    if (avail < 2) {
5608
586
        if (!terminate)
5609
478
      goto done;
5610
108
        else
5611
108
      next = ' ';
5612
113k
    } else {
5613
113k
        next = in->cur[1];
5614
113k
    }
5615
113k
    cur = in->cur[0];
5616
113k
          if ((cur == '<') && (next == '!') &&
5617
113k
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5618
77.2k
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5619
34.6k
      goto done;
5620
#ifdef DEBUG_PUSH
5621
        xmlGenericError(xmlGenericErrorContext,
5622
          "HPP: Parsing Comment\n");
5623
#endif
5624
42.5k
        htmlParseComment(ctxt);
5625
42.5k
        ctxt->instate = XML_PARSER_MISC;
5626
42.5k
          } else if ((cur == '<') && (next == '?')) {
5627
8.21k
        if ((!terminate) &&
5628
8.21k
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5629
6.97k
      goto done;
5630
#ifdef DEBUG_PUSH
5631
        xmlGenericError(xmlGenericErrorContext,
5632
          "HPP: Parsing PI\n");
5633
#endif
5634
1.24k
        htmlParsePI(ctxt);
5635
1.24k
        ctxt->instate = XML_PARSER_MISC;
5636
28.0k
    } else if ((cur == '<') && (next == '!') &&
5637
28.0k
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5638
28.0k
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5639
28.0k
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5640
28.0k
        (UPP(8) == 'E')) {
5641
16.4k
        if ((!terminate) &&
5642
16.4k
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5643
16.3k
      goto done;
5644
#ifdef DEBUG_PUSH
5645
        xmlGenericError(xmlGenericErrorContext,
5646
          "HPP: Parsing internal subset\n");
5647
#endif
5648
50
        htmlParseDocTypeDecl(ctxt);
5649
50
        ctxt->instate = XML_PARSER_PROLOG;
5650
#ifdef DEBUG_PUSH
5651
        xmlGenericError(xmlGenericErrorContext,
5652
          "HPP: entering PROLOG\n");
5653
#endif
5654
11.5k
    } else if ((cur == '<') && (next == '!') &&
5655
11.5k
               (avail < 9)) {
5656
903
        goto done;
5657
10.6k
    } else {
5658
10.6k
        ctxt->instate = XML_PARSER_CONTENT;
5659
#ifdef DEBUG_PUSH
5660
        xmlGenericError(xmlGenericErrorContext,
5661
          "HPP: entering START_TAG\n");
5662
#endif
5663
10.6k
    }
5664
54.5k
    break;
5665
99.4k
            case XML_PARSER_PROLOG:
5666
99.4k
    SKIP_BLANKS;
5667
99.4k
    if (in->buf == NULL)
5668
0
        avail = in->length - (in->cur - in->base);
5669
99.4k
    else
5670
99.4k
        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5671
99.4k
                            (in->cur - in->base);
5672
99.4k
    if (avail < 2)
5673
271
        goto done;
5674
99.2k
    cur = in->cur[0];
5675
99.2k
    next = in->cur[1];
5676
99.2k
    if ((cur == '<') && (next == '!') &&
5677
99.2k
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5678
85.7k
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5679
73.6k
      goto done;
5680
#ifdef DEBUG_PUSH
5681
        xmlGenericError(xmlGenericErrorContext,
5682
          "HPP: Parsing Comment\n");
5683
#endif
5684
12.0k
        htmlParseComment(ctxt);
5685
12.0k
        ctxt->instate = XML_PARSER_PROLOG;
5686
13.4k
          } else if ((cur == '<') && (next == '?')) {
5687
12.6k
        if ((!terminate) &&
5688
12.6k
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5689
12.0k
      goto done;
5690
#ifdef DEBUG_PUSH
5691
        xmlGenericError(xmlGenericErrorContext,
5692
          "HPP: Parsing PI\n");
5693
#endif
5694
630
        htmlParsePI(ctxt);
5695
630
        ctxt->instate = XML_PARSER_PROLOG;
5696
784
    } else if ((cur == '<') && (next == '!') &&
5697
784
               (avail < 4)) {
5698
452
        goto done;
5699
452
    } else {
5700
332
        ctxt->instate = XML_PARSER_CONTENT;
5701
#ifdef DEBUG_PUSH
5702
        xmlGenericError(xmlGenericErrorContext,
5703
          "HPP: entering START_TAG\n");
5704
#endif
5705
332
    }
5706
13.0k
    break;
5707
44.9k
            case XML_PARSER_EPILOG:
5708
44.9k
    if (in->buf == NULL)
5709
0
        avail = in->length - (in->cur - in->base);
5710
44.9k
    else
5711
44.9k
        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5712
44.9k
                            (in->cur - in->base);
5713
44.9k
    if (avail < 1)
5714
0
        goto done;
5715
44.9k
    cur = in->cur[0];
5716
44.9k
    if (IS_BLANK_CH(cur)) {
5717
4.22k
        htmlParseCharData(ctxt);
5718
4.22k
        goto done;
5719
4.22k
    }
5720
40.7k
    if (avail < 2)
5721
1.33k
        goto done;
5722
39.4k
    next = in->cur[1];
5723
39.4k
          if ((cur == '<') && (next == '!') &&
5724
39.4k
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5725
38.0k
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5726
37.0k
      goto done;
5727
#ifdef DEBUG_PUSH
5728
        xmlGenericError(xmlGenericErrorContext,
5729
          "HPP: Parsing Comment\n");
5730
#endif
5731
961
        htmlParseComment(ctxt);
5732
961
        ctxt->instate = XML_PARSER_EPILOG;
5733
1.42k
          } else if ((cur == '<') && (next == '?')) {
5734
764
        if ((!terminate) &&
5735
764
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5736
211
      goto done;
5737
#ifdef DEBUG_PUSH
5738
        xmlGenericError(xmlGenericErrorContext,
5739
          "HPP: Parsing PI\n");
5740
#endif
5741
553
        htmlParsePI(ctxt);
5742
553
        ctxt->instate = XML_PARSER_EPILOG;
5743
664
    } else if ((cur == '<') && (next == '!') &&
5744
664
               (avail < 4)) {
5745
396
        goto done;
5746
396
    } else {
5747
268
        ctxt->errNo = XML_ERR_DOCUMENT_END;
5748
268
        ctxt->wellFormed = 0;
5749
268
        ctxt->instate = XML_PARSER_EOF;
5750
#ifdef DEBUG_PUSH
5751
        xmlGenericError(xmlGenericErrorContext,
5752
          "HPP: entering EOF\n");
5753
#endif
5754
268
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5755
268
      ctxt->sax->endDocument(ctxt->userData);
5756
268
        goto done;
5757
268
    }
5758
1.51k
    break;
5759
10.0M
            case XML_PARSER_START_TAG: {
5760
10.0M
          const xmlChar *name;
5761
10.0M
    int failed;
5762
10.0M
    const htmlElemDesc * info;
5763
5764
    /*
5765
     * no chars in buffer
5766
     */
5767
10.0M
    if (avail < 1)
5768
0
        goto done;
5769
    /*
5770
     * not enough chars in buffer
5771
     */
5772
10.0M
    if (avail < 2) {
5773
512
        if (!terminate)
5774
421
      goto done;
5775
91
        else
5776
91
      next = ' ';
5777
10.0M
    } else {
5778
10.0M
        next = in->cur[1];
5779
10.0M
    }
5780
10.0M
    cur = in->cur[0];
5781
10.0M
          if (cur != '<') {
5782
16.4k
        ctxt->instate = XML_PARSER_CONTENT;
5783
#ifdef DEBUG_PUSH
5784
        xmlGenericError(xmlGenericErrorContext,
5785
          "HPP: entering CONTENT\n");
5786
#endif
5787
16.4k
        break;
5788
16.4k
    }
5789
10.0M
    if (next == '/') {
5790
4.35k
        ctxt->instate = XML_PARSER_END_TAG;
5791
4.35k
        ctxt->checkIndex = 0;
5792
#ifdef DEBUG_PUSH
5793
        xmlGenericError(xmlGenericErrorContext,
5794
          "HPP: entering END_TAG\n");
5795
#endif
5796
4.35k
        break;
5797
4.35k
    }
5798
10.0M
    if ((!terminate) &&
5799
10.0M
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5800
727k
        goto done;
5801
5802
                /* Capture start position */
5803
9.27M
          if (ctxt->record_info) {
5804
0
               node_info.begin_pos = ctxt->input->consumed +
5805
0
                                  (CUR_PTR - ctxt->input->base);
5806
0
               node_info.begin_line = ctxt->input->line;
5807
0
          }
5808
5809
5810
9.27M
    failed = htmlParseStartTag(ctxt);
5811
9.27M
    name = ctxt->name;
5812
9.27M
    if ((failed == -1) ||
5813
9.27M
        (name == NULL)) {
5814
44.6k
        if (CUR == '>')
5815
36.5k
      NEXT;
5816
44.6k
        break;
5817
44.6k
    }
5818
5819
    /*
5820
     * Lookup the info for that element.
5821
     */
5822
9.22M
    info = htmlTagLookup(name);
5823
9.22M
    if (info == NULL) {
5824
544k
        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5825
544k
                     "Tag %s invalid\n", name, NULL);
5826
544k
    }
5827
5828
    /*
5829
     * Check for an Empty Element labeled the XML/SGML way
5830
     */
5831
9.22M
    if ((CUR == '/') && (NXT(1) == '>')) {
5832
6.18k
        SKIP(2);
5833
6.18k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5834
6.18k
      ctxt->sax->endElement(ctxt->userData, name);
5835
6.18k
        htmlnamePop(ctxt);
5836
6.18k
        ctxt->instate = XML_PARSER_CONTENT;
5837
#ifdef DEBUG_PUSH
5838
        xmlGenericError(xmlGenericErrorContext,
5839
          "HPP: entering CONTENT\n");
5840
#endif
5841
6.18k
        break;
5842
6.18k
    }
5843
5844
9.22M
    if (CUR == '>') {
5845
9.21M
        NEXT;
5846
9.21M
    } else {
5847
8.99k
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5848
8.99k
                     "Couldn't find end of Start Tag %s\n",
5849
8.99k
         name, NULL);
5850
5851
        /*
5852
         * end of parsing of this node.
5853
         */
5854
8.99k
        if (xmlStrEqual(name, ctxt->name)) {
5855
8.99k
      nodePop(ctxt);
5856
8.99k
      htmlnamePop(ctxt);
5857
8.99k
        }
5858
5859
8.99k
        if (ctxt->record_info)
5860
0
            htmlNodeInfoPush(ctxt, &node_info);
5861
5862
8.99k
        ctxt->instate = XML_PARSER_CONTENT;
5863
#ifdef DEBUG_PUSH
5864
        xmlGenericError(xmlGenericErrorContext,
5865
          "HPP: entering CONTENT\n");
5866
#endif
5867
8.99k
        break;
5868
8.99k
    }
5869
5870
    /*
5871
     * Check for an Empty Element from DTD definition
5872
     */
5873
9.21M
    if ((info != NULL) && (info->empty)) {
5874
7.75k
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5875
7.75k
      ctxt->sax->endElement(ctxt->userData, name);
5876
7.75k
        htmlnamePop(ctxt);
5877
7.75k
    }
5878
5879
9.21M
                if (ctxt->record_info)
5880
0
              htmlNodeInfoPush(ctxt, &node_info);
5881
5882
9.21M
    ctxt->instate = XML_PARSER_CONTENT;
5883
#ifdef DEBUG_PUSH
5884
    xmlGenericError(xmlGenericErrorContext,
5885
      "HPP: entering CONTENT\n");
5886
#endif
5887
9.21M
                break;
5888
9.22M
      }
5889
10.6M
            case XML_PARSER_CONTENT: {
5890
10.6M
    xmlChar chr[2] = { 0, 0 };
5891
5892
                /*
5893
     * Handle preparsed entities and charRef
5894
     */
5895
10.6M
    if (ctxt->token != 0) {
5896
0
        chr[0] = (xmlChar) ctxt->token;
5897
0
        htmlCheckParagraph(ctxt);
5898
0
        if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5899
0
      ctxt->sax->characters(ctxt->userData, chr, 1);
5900
0
        ctxt->token = 0;
5901
0
        ctxt->checkIndex = 0;
5902
0
    }
5903
10.6M
    if ((avail == 1) && (terminate)) {
5904
725
        cur = in->cur[0];
5905
725
        if ((cur != '<') && (cur != '&')) {
5906
418
      if (ctxt->sax != NULL) {
5907
418
                            chr[0] = cur;
5908
418
          if (IS_BLANK_CH(cur)) {
5909
33
        if (ctxt->keepBlanks) {
5910
21
            if (ctxt->sax->characters != NULL)
5911
21
          ctxt->sax->characters(
5912
21
            ctxt->userData, chr, 1);
5913
21
        } else {
5914
12
            if (ctxt->sax->ignorableWhitespace != NULL)
5915
12
          ctxt->sax->ignorableWhitespace(
5916
12
            ctxt->userData, chr, 1);
5917
12
        }
5918
385
          } else {
5919
385
        htmlCheckParagraph(ctxt);
5920
385
        if (ctxt->sax->characters != NULL)
5921
385
            ctxt->sax->characters(
5922
385
              ctxt->userData, chr, 1);
5923
385
          }
5924
418
      }
5925
418
      ctxt->token = 0;
5926
418
      ctxt->checkIndex = 0;
5927
418
      in->cur++;
5928
418
      break;
5929
418
        }
5930
725
    }
5931
10.6M
    if (avail < 2)
5932
68.4k
        goto done;
5933
10.5M
    cur = in->cur[0];
5934
10.5M
    next = in->cur[1];
5935
10.5M
    if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5936
10.5M
        (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5937
        /*
5938
         * Handle SCRIPT/STYLE separately
5939
         */
5940
166k
        if (!terminate) {
5941
162k
            int idx;
5942
162k
      xmlChar val;
5943
5944
162k
      idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5945
162k
      if (idx < 0)
5946
132k
          goto done;
5947
29.7k
            val = in->cur[idx + 2];
5948
29.7k
      if (val == 0) /* bad cut of input */
5949
10.0k
          goto done;
5950
29.7k
        }
5951
23.9k
        htmlParseScript(ctxt);
5952
23.9k
        if ((cur == '<') && (next == '/')) {
5953
15.4k
      ctxt->instate = XML_PARSER_END_TAG;
5954
15.4k
      ctxt->checkIndex = 0;
5955
#ifdef DEBUG_PUSH
5956
      xmlGenericError(xmlGenericErrorContext,
5957
        "HPP: entering END_TAG\n");
5958
#endif
5959
15.4k
      break;
5960
15.4k
        }
5961
10.3M
    } else {
5962
        /*
5963
         * Sometimes DOCTYPE arrives in the middle of the document
5964
         */
5965
10.3M
        if ((cur == '<') && (next == '!') &&
5966
10.3M
      (UPP(2) == 'D') && (UPP(3) == 'O') &&
5967
10.3M
      (UPP(4) == 'C') && (UPP(5) == 'T') &&
5968
10.3M
      (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5969
10.3M
      (UPP(8) == 'E')) {
5970
25.5k
      if ((!terminate) &&
5971
25.5k
          (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5972
19.1k
          goto done;
5973
6.34k
      htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5974
6.34k
                   "Misplaced DOCTYPE declaration\n",
5975
6.34k
             BAD_CAST "DOCTYPE" , NULL);
5976
6.34k
      htmlParseDocTypeDecl(ctxt);
5977
10.3M
        } else if ((cur == '<') && (next == '!') &&
5978
10.3M
      (in->cur[2] == '-') && (in->cur[3] == '-')) {
5979
85.9k
      if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5980
48.4k
          goto done;
5981
#ifdef DEBUG_PUSH
5982
      xmlGenericError(xmlGenericErrorContext,
5983
        "HPP: Parsing Comment\n");
5984
#endif
5985
37.4k
      htmlParseComment(ctxt);
5986
37.4k
      ctxt->instate = XML_PARSER_CONTENT;
5987
10.2M
        } else if ((cur == '<') && (next == '?')) {
5988
97.6k
      if ((!terminate) &&
5989
97.6k
          (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5990
78.9k
          goto done;
5991
#ifdef DEBUG_PUSH
5992
      xmlGenericError(xmlGenericErrorContext,
5993
        "HPP: Parsing PI\n");
5994
#endif
5995
18.6k
      htmlParsePI(ctxt);
5996
18.6k
      ctxt->instate = XML_PARSER_CONTENT;
5997
10.1M
        } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5998
892
      goto done;
5999
10.1M
        } else if ((cur == '<') && (next == '/')) {
6000
44.5k
      ctxt->instate = XML_PARSER_END_TAG;
6001
44.5k
      ctxt->checkIndex = 0;
6002
#ifdef DEBUG_PUSH
6003
      xmlGenericError(xmlGenericErrorContext,
6004
        "HPP: entering END_TAG\n");
6005
#endif
6006
44.5k
      break;
6007
10.1M
        } else if (cur == '<') {
6008
9.26M
                        if ((!terminate) && (next == 0))
6009
13.5k
                            goto done;
6010
9.24M
                        ctxt->instate = XML_PARSER_START_TAG;
6011
9.24M
                        ctxt->checkIndex = 0;
6012
#ifdef DEBUG_PUSH
6013
                        xmlGenericError(xmlGenericErrorContext,
6014
                                "HPP: entering START_TAG\n");
6015
#endif
6016
9.24M
      break;
6017
9.26M
        } else {
6018
            /*
6019
       * check that the text sequence is complete
6020
       * before handing out the data to the parser
6021
       * to avoid problems with erroneous end of
6022
       * data detection.
6023
       */
6024
850k
      if ((!terminate) &&
6025
850k
                            (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6026
596k
          goto done;
6027
253k
      ctxt->checkIndex = 0;
6028
#ifdef DEBUG_PUSH
6029
      xmlGenericError(xmlGenericErrorContext,
6030
        "HPP: Parsing char data\n");
6031
#endif
6032
837k
                        while ((ctxt->instate != XML_PARSER_EOF) &&
6033
837k
                               (cur != '<') && (in->cur < in->end)) {
6034
583k
                            if (cur == '&') {
6035
282k
              htmlParseReference(ctxt);
6036
301k
                            } else {
6037
301k
              htmlParseCharData(ctxt);
6038
301k
                            }
6039
583k
                            cur = in->cur[0];
6040
583k
                        }
6041
253k
        }
6042
10.3M
    }
6043
6044
324k
    break;
6045
10.5M
      }
6046
324k
            case XML_PARSER_END_TAG:
6047
81.0k
    if (avail < 2)
6048
10.2k
        goto done;
6049
70.8k
    if ((!terminate) &&
6050
70.8k
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6051
6.45k
        goto done;
6052
64.3k
    htmlParseEndTag(ctxt);
6053
64.3k
    if (ctxt->nameNr == 0) {
6054
519
        ctxt->instate = XML_PARSER_EPILOG;
6055
63.8k
    } else {
6056
63.8k
        ctxt->instate = XML_PARSER_CONTENT;
6057
63.8k
    }
6058
64.3k
    ctxt->checkIndex = 0;
6059
#ifdef DEBUG_PUSH
6060
    xmlGenericError(xmlGenericErrorContext,
6061
      "HPP: entering CONTENT\n");
6062
#endif
6063
64.3k
          break;
6064
0
            case XML_PARSER_CDATA_SECTION:
6065
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6066
0
      "HPP: internal error, state == CDATA\n",
6067
0
           NULL, NULL);
6068
0
    ctxt->instate = XML_PARSER_CONTENT;
6069
0
    ctxt->checkIndex = 0;
6070
#ifdef DEBUG_PUSH
6071
    xmlGenericError(xmlGenericErrorContext,
6072
      "HPP: entering CONTENT\n");
6073
#endif
6074
0
    break;
6075
0
            case XML_PARSER_DTD:
6076
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6077
0
      "HPP: internal error, state == DTD\n",
6078
0
           NULL, NULL);
6079
0
    ctxt->instate = XML_PARSER_CONTENT;
6080
0
    ctxt->checkIndex = 0;
6081
#ifdef DEBUG_PUSH
6082
    xmlGenericError(xmlGenericErrorContext,
6083
      "HPP: entering CONTENT\n");
6084
#endif
6085
0
    break;
6086
0
            case XML_PARSER_COMMENT:
6087
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6088
0
      "HPP: internal error, state == COMMENT\n",
6089
0
           NULL, NULL);
6090
0
    ctxt->instate = XML_PARSER_CONTENT;
6091
0
    ctxt->checkIndex = 0;
6092
#ifdef DEBUG_PUSH
6093
    xmlGenericError(xmlGenericErrorContext,
6094
      "HPP: entering CONTENT\n");
6095
#endif
6096
0
    break;
6097
0
            case XML_PARSER_PI:
6098
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6099
0
      "HPP: internal error, state == PI\n",
6100
0
           NULL, NULL);
6101
0
    ctxt->instate = XML_PARSER_CONTENT;
6102
0
    ctxt->checkIndex = 0;
6103
#ifdef DEBUG_PUSH
6104
    xmlGenericError(xmlGenericErrorContext,
6105
      "HPP: entering CONTENT\n");
6106
#endif
6107
0
    break;
6108
0
            case XML_PARSER_ENTITY_DECL:
6109
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6110
0
      "HPP: internal error, state == ENTITY_DECL\n",
6111
0
           NULL, NULL);
6112
0
    ctxt->instate = XML_PARSER_CONTENT;
6113
0
    ctxt->checkIndex = 0;
6114
#ifdef DEBUG_PUSH
6115
    xmlGenericError(xmlGenericErrorContext,
6116
      "HPP: entering CONTENT\n");
6117
#endif
6118
0
    break;
6119
0
            case XML_PARSER_ENTITY_VALUE:
6120
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6121
0
      "HPP: internal error, state == ENTITY_VALUE\n",
6122
0
           NULL, NULL);
6123
0
    ctxt->instate = XML_PARSER_CONTENT;
6124
0
    ctxt->checkIndex = 0;
6125
#ifdef DEBUG_PUSH
6126
    xmlGenericError(xmlGenericErrorContext,
6127
      "HPP: entering DTD\n");
6128
#endif
6129
0
    break;
6130
0
            case XML_PARSER_ATTRIBUTE_VALUE:
6131
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6132
0
      "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6133
0
           NULL, NULL);
6134
0
    ctxt->instate = XML_PARSER_START_TAG;
6135
0
    ctxt->checkIndex = 0;
6136
#ifdef DEBUG_PUSH
6137
    xmlGenericError(xmlGenericErrorContext,
6138
      "HPP: entering START_TAG\n");
6139
#endif
6140
0
    break;
6141
0
      case XML_PARSER_SYSTEM_LITERAL:
6142
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6143
0
        "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6144
0
           NULL, NULL);
6145
0
    ctxt->instate = XML_PARSER_CONTENT;
6146
0
    ctxt->checkIndex = 0;
6147
#ifdef DEBUG_PUSH
6148
    xmlGenericError(xmlGenericErrorContext,
6149
      "HPP: entering CONTENT\n");
6150
#endif
6151
0
    break;
6152
0
      case XML_PARSER_IGNORE:
6153
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6154
0
      "HPP: internal error, state == XML_PARSER_IGNORE\n",
6155
0
           NULL, NULL);
6156
0
    ctxt->instate = XML_PARSER_CONTENT;
6157
0
    ctxt->checkIndex = 0;
6158
#ifdef DEBUG_PUSH
6159
    xmlGenericError(xmlGenericErrorContext,
6160
      "HPP: entering CONTENT\n");
6161
#endif
6162
0
    break;
6163
0
      case XML_PARSER_PUBLIC_LITERAL:
6164
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6165
0
      "HPP: internal error, state == XML_PARSER_LITERAL\n",
6166
0
           NULL, NULL);
6167
0
    ctxt->instate = XML_PARSER_CONTENT;
6168
0
    ctxt->checkIndex = 0;
6169
#ifdef DEBUG_PUSH
6170
    xmlGenericError(xmlGenericErrorContext,
6171
      "HPP: entering CONTENT\n");
6172
#endif
6173
0
    break;
6174
6175
21.0M
  }
6176
21.0M
    }
6177
2.06M
done:
6178
2.06M
    if ((avail == 0) && (terminate)) {
6179
10.9k
  htmlAutoCloseOnEnd(ctxt);
6180
10.9k
  if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6181
      /*
6182
       * SAX: end of the document processing.
6183
       */
6184
3
      ctxt->instate = XML_PARSER_EOF;
6185
3
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6186
3
    ctxt->sax->endDocument(ctxt->userData);
6187
3
  }
6188
10.9k
    }
6189
2.06M
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6190
2.06M
  ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6191
1.18M
   (ctxt->instate == XML_PARSER_EPILOG))) {
6192
60.1k
  xmlDtdPtr dtd;
6193
60.1k
  dtd = xmlGetIntSubset(ctxt->myDoc);
6194
60.1k
  if (dtd == NULL)
6195
6.24k
      ctxt->myDoc->intSubset =
6196
6.24k
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6197
6.24k
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6198
6.24k
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6199
60.1k
    }
6200
#ifdef DEBUG_PUSH
6201
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6202
#endif
6203
2.06M
    return(ret);
6204
2.06M
}
6205
6206
/**
6207
 * htmlParseChunk:
6208
 * @ctxt:  an HTML parser context
6209
 * @chunk:  an char array
6210
 * @size:  the size in byte of the chunk
6211
 * @terminate:  last chunk indicator
6212
 *
6213
 * Parse a Chunk of memory
6214
 *
6215
 * Returns zero if no error, the xmlParserErrors otherwise.
6216
 */
6217
int
6218
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6219
2.14M
              int terminate) {
6220
2.14M
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
6221
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6222
0
         "htmlParseChunk: context error\n", NULL, NULL);
6223
0
  return(XML_ERR_INTERNAL_ERROR);
6224
0
    }
6225
2.14M
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6226
2.14M
        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6227
2.07M
  size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6228
2.07M
  size_t cur = ctxt->input->cur - ctxt->input->base;
6229
2.07M
  int res;
6230
6231
2.07M
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6232
2.07M
        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6233
2.07M
  if (res < 0) {
6234
78.5k
      ctxt->errNo = XML_PARSER_EOF;
6235
78.5k
      ctxt->disableSAX = 1;
6236
78.5k
      return (XML_PARSER_EOF);
6237
78.5k
  }
6238
#ifdef DEBUG_PUSH
6239
  xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6240
#endif
6241
6242
#if 0
6243
  if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6244
      htmlParseTryOrFinish(ctxt, terminate);
6245
#endif
6246
2.07M
    } else if (ctxt->instate != XML_PARSER_EOF) {
6247
12.6k
  if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6248
11.8k
      xmlParserInputBufferPtr in = ctxt->input->buf;
6249
11.8k
      if ((in->encoder != NULL) && (in->buffer != NULL) &&
6250
11.8k
        (in->raw != NULL)) {
6251
3.40k
    int nbchars;
6252
3.40k
    size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6253
3.40k
    size_t current = ctxt->input->cur - ctxt->input->base;
6254
6255
3.40k
    nbchars = xmlCharEncInput(in, terminate);
6256
3.40k
    xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6257
3.40k
    if (nbchars < 0) {
6258
240
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6259
240
               "encoder error\n", NULL, NULL);
6260
240
        return(XML_ERR_INVALID_ENCODING);
6261
240
    }
6262
3.40k
      }
6263
11.8k
  }
6264
12.6k
    }
6265
2.06M
    htmlParseTryOrFinish(ctxt, terminate);
6266
2.06M
    if (terminate) {
6267
11.8k
  if ((ctxt->instate != XML_PARSER_EOF) &&
6268
11.8k
      (ctxt->instate != XML_PARSER_EPILOG) &&
6269
11.8k
      (ctxt->instate != XML_PARSER_MISC)) {
6270
385
      ctxt->errNo = XML_ERR_DOCUMENT_END;
6271
385
      ctxt->wellFormed = 0;
6272
385
  }
6273
11.8k
  if (ctxt->instate != XML_PARSER_EOF) {
6274
570
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6275
570
    ctxt->sax->endDocument(ctxt->userData);
6276
570
  }
6277
11.8k
  ctxt->instate = XML_PARSER_EOF;
6278
11.8k
    }
6279
2.06M
    return((xmlParserErrors) ctxt->errNo);
6280
2.14M
}
6281
6282
/************************************************************************
6283
 *                  *
6284
 *      User entry points       *
6285
 *                  *
6286
 ************************************************************************/
6287
6288
/**
6289
 * htmlCreatePushParserCtxt:
6290
 * @sax:  a SAX handler
6291
 * @user_data:  The user data returned on SAX callbacks
6292
 * @chunk:  a pointer to an array of chars
6293
 * @size:  number of chars in the array
6294
 * @filename:  an optional file name or URI
6295
 * @enc:  an optional encoding
6296
 *
6297
 * Create a parser context for using the HTML parser in push mode
6298
 * The value of @filename is used for fetching external entities
6299
 * and error/warning reports.
6300
 *
6301
 * Returns the new parser context or NULL
6302
 */
6303
htmlParserCtxtPtr
6304
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6305
                         const char *chunk, int size, const char *filename,
6306
12.0k
       xmlCharEncoding enc) {
6307
12.0k
    htmlParserCtxtPtr ctxt;
6308
12.0k
    htmlParserInputPtr inputStream;
6309
12.0k
    xmlParserInputBufferPtr buf;
6310
6311
12.0k
    xmlInitParser();
6312
6313
12.0k
    buf = xmlAllocParserInputBuffer(enc);
6314
12.0k
    if (buf == NULL) return(NULL);
6315
6316
12.0k
    ctxt = htmlNewParserCtxt();
6317
12.0k
    if (ctxt == NULL) {
6318
0
  xmlFreeParserInputBuffer(buf);
6319
0
  return(NULL);
6320
0
    }
6321
12.0k
    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6322
0
  ctxt->charset=XML_CHAR_ENCODING_UTF8;
6323
12.0k
    if (sax != NULL) {
6324
0
  if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6325
0
      xmlFree(ctxt->sax);
6326
0
  ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6327
0
  if (ctxt->sax == NULL) {
6328
0
      xmlFree(buf);
6329
0
      xmlFree(ctxt);
6330
0
      return(NULL);
6331
0
  }
6332
0
  memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6333
0
  if (user_data != NULL)
6334
0
      ctxt->userData = user_data;
6335
0
    }
6336
12.0k
    if (filename == NULL) {
6337
12.0k
  ctxt->directory = NULL;
6338
12.0k
    } else {
6339
0
        ctxt->directory = xmlParserGetDirectory(filename);
6340
0
    }
6341
6342
12.0k
    inputStream = htmlNewInputStream(ctxt);
6343
12.0k
    if (inputStream == NULL) {
6344
0
  xmlFreeParserCtxt(ctxt);
6345
0
  xmlFree(buf);
6346
0
  return(NULL);
6347
0
    }
6348
6349
12.0k
    if (filename == NULL)
6350
12.0k
  inputStream->filename = NULL;
6351
0
    else
6352
0
  inputStream->filename = (char *)
6353
0
      xmlCanonicPath((const xmlChar *) filename);
6354
12.0k
    inputStream->buf = buf;
6355
12.0k
    xmlBufResetInput(buf->buffer, inputStream);
6356
6357
12.0k
    inputPush(ctxt, inputStream);
6358
6359
12.0k
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6360
12.0k
        (ctxt->input->buf != NULL))  {
6361
0
  size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6362
0
  size_t cur = ctxt->input->cur - ctxt->input->base;
6363
6364
0
  xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6365
6366
0
        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6367
#ifdef DEBUG_PUSH
6368
  xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6369
#endif
6370
0
    }
6371
12.0k
    ctxt->progressive = 1;
6372
6373
12.0k
    return(ctxt);
6374
12.0k
}
6375
#endif /* LIBXML_PUSH_ENABLED */
6376
6377
/**
6378
 * htmlSAXParseDoc:
6379
 * @cur:  a pointer to an array of xmlChar
6380
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6381
 * @sax:  the SAX handler block
6382
 * @userData: if using SAX, this pointer will be provided on callbacks.
6383
 *
6384
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6385
 * to handle parse events. If sax is NULL, fallback to the default DOM
6386
 * behavior and return a tree.
6387
 *
6388
 * Returns the resulting document tree unless SAX is NULL or the document is
6389
 *     not well formed.
6390
 */
6391
6392
htmlDocPtr
6393
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6394
0
                htmlSAXHandlerPtr sax, void *userData) {
6395
0
    htmlDocPtr ret;
6396
0
    htmlParserCtxtPtr ctxt;
6397
6398
0
    xmlInitParser();
6399
6400
0
    if (cur == NULL) return(NULL);
6401
6402
6403
0
    ctxt = htmlCreateDocParserCtxt(cur, encoding);
6404
0
    if (ctxt == NULL) return(NULL);
6405
0
    if (sax != NULL) {
6406
0
        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6407
0
        ctxt->sax = sax;
6408
0
        ctxt->userData = userData;
6409
0
    }
6410
6411
0
    htmlParseDocument(ctxt);
6412
0
    ret = ctxt->myDoc;
6413
0
    if (sax != NULL) {
6414
0
  ctxt->sax = NULL;
6415
0
  ctxt->userData = NULL;
6416
0
    }
6417
0
    htmlFreeParserCtxt(ctxt);
6418
6419
0
    return(ret);
6420
0
}
6421
6422
/**
6423
 * htmlParseDoc:
6424
 * @cur:  a pointer to an array of xmlChar
6425
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6426
 *
6427
 * parse an HTML in-memory document and build a tree.
6428
 *
6429
 * Returns the resulting document tree
6430
 */
6431
6432
htmlDocPtr
6433
0
htmlParseDoc(const xmlChar *cur, const char *encoding) {
6434
0
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6435
0
}
6436
6437
6438
/**
6439
 * htmlCreateFileParserCtxt:
6440
 * @filename:  the filename
6441
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6442
 *
6443
 * Create a parser context for a file content.
6444
 * Automatic support for ZLIB/Compress compressed document is provided
6445
 * by default if found at compile-time.
6446
 *
6447
 * Returns the new parser context or NULL
6448
 */
6449
htmlParserCtxtPtr
6450
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6451
0
{
6452
0
    htmlParserCtxtPtr ctxt;
6453
0
    htmlParserInputPtr inputStream;
6454
0
    char *canonicFilename;
6455
    /* htmlCharEncoding enc; */
6456
0
    xmlChar *content, *content_line = (xmlChar *) "charset=";
6457
6458
0
    if (filename == NULL)
6459
0
        return(NULL);
6460
6461
0
    ctxt = htmlNewParserCtxt();
6462
0
    if (ctxt == NULL) {
6463
0
  return(NULL);
6464
0
    }
6465
0
    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6466
0
    if (canonicFilename == NULL) {
6467
0
#ifdef LIBXML_SAX1_ENABLED
6468
0
  if (xmlDefaultSAXHandler.error != NULL) {
6469
0
      xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6470
0
  }
6471
0
#endif
6472
0
  xmlFreeParserCtxt(ctxt);
6473
0
  return(NULL);
6474
0
    }
6475
6476
0
    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6477
0
    xmlFree(canonicFilename);
6478
0
    if (inputStream == NULL) {
6479
0
  xmlFreeParserCtxt(ctxt);
6480
0
  return(NULL);
6481
0
    }
6482
6483
0
    inputPush(ctxt, inputStream);
6484
6485
    /* set encoding */
6486
0
    if (encoding) {
6487
0
        size_t l = strlen(encoding);
6488
6489
0
  if (l < 1000) {
6490
0
      content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6491
0
      if (content) {
6492
0
    strcpy ((char *)content, (char *)content_line);
6493
0
    strcat ((char *)content, (char *)encoding);
6494
0
    htmlCheckEncoding (ctxt, content);
6495
0
    xmlFree (content);
6496
0
      }
6497
0
  }
6498
0
    }
6499
6500
0
    return(ctxt);
6501
0
}
6502
6503
/**
6504
 * htmlSAXParseFile:
6505
 * @filename:  the filename
6506
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6507
 * @sax:  the SAX handler block
6508
 * @userData: if using SAX, this pointer will be provided on callbacks.
6509
 *
6510
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6511
 * compressed document is provided by default if found at compile-time.
6512
 * It use the given SAX function block to handle the parsing callback.
6513
 * If sax is NULL, fallback to the default DOM tree building routines.
6514
 *
6515
 * Returns the resulting document tree unless SAX is NULL or the document is
6516
 *     not well formed.
6517
 */
6518
6519
htmlDocPtr
6520
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6521
0
                 void *userData) {
6522
0
    htmlDocPtr ret;
6523
0
    htmlParserCtxtPtr ctxt;
6524
0
    htmlSAXHandlerPtr oldsax = NULL;
6525
6526
0
    xmlInitParser();
6527
6528
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6529
0
    if (ctxt == NULL) return(NULL);
6530
0
    if (sax != NULL) {
6531
0
  oldsax = ctxt->sax;
6532
0
        ctxt->sax = sax;
6533
0
        ctxt->userData = userData;
6534
0
    }
6535
6536
0
    htmlParseDocument(ctxt);
6537
6538
0
    ret = ctxt->myDoc;
6539
0
    if (sax != NULL) {
6540
0
        ctxt->sax = oldsax;
6541
0
        ctxt->userData = NULL;
6542
0
    }
6543
0
    htmlFreeParserCtxt(ctxt);
6544
6545
0
    return(ret);
6546
0
}
6547
6548
/**
6549
 * htmlParseFile:
6550
 * @filename:  the filename
6551
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6552
 *
6553
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6554
 * compressed document is provided by default if found at compile-time.
6555
 *
6556
 * Returns the resulting document tree
6557
 */
6558
6559
htmlDocPtr
6560
0
htmlParseFile(const char *filename, const char *encoding) {
6561
0
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6562
0
}
6563
6564
/**
6565
 * htmlHandleOmittedElem:
6566
 * @val:  int 0 or 1
6567
 *
6568
 * Set and return the previous value for handling HTML omitted tags.
6569
 *
6570
 * Returns the last value for 0 for no handling, 1 for auto insertion.
6571
 */
6572
6573
int
6574
0
htmlHandleOmittedElem(int val) {
6575
0
    int old = htmlOmittedDefaultValue;
6576
6577
0
    htmlOmittedDefaultValue = val;
6578
0
    return(old);
6579
0
}
6580
6581
/**
6582
 * htmlElementAllowedHere:
6583
 * @parent: HTML parent element
6584
 * @elt: HTML element
6585
 *
6586
 * Checks whether an HTML element may be a direct child of a parent element.
6587
 * Note - doesn't check for deprecated elements
6588
 *
6589
 * Returns 1 if allowed; 0 otherwise.
6590
 */
6591
int
6592
0
htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6593
0
  const char** p ;
6594
6595
0
  if ( ! elt || ! parent || ! parent->subelts )
6596
0
  return 0 ;
6597
6598
0
  for ( p = parent->subelts; *p; ++p )
6599
0
    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6600
0
      return 1 ;
6601
6602
0
  return 0 ;
6603
0
}
6604
/**
6605
 * htmlElementStatusHere:
6606
 * @parent: HTML parent element
6607
 * @elt: HTML element
6608
 *
6609
 * Checks whether an HTML element may be a direct child of a parent element.
6610
 * and if so whether it is valid or deprecated.
6611
 *
6612
 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6613
 */
6614
htmlStatus
6615
0
htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6616
0
  if ( ! parent || ! elt )
6617
0
    return HTML_INVALID ;
6618
0
  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6619
0
    return HTML_INVALID ;
6620
6621
0
  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6622
0
}
6623
/**
6624
 * htmlAttrAllowed:
6625
 * @elt: HTML element
6626
 * @attr: HTML attribute
6627
 * @legacy: whether to allow deprecated attributes
6628
 *
6629
 * Checks whether an attribute is valid for an element
6630
 * Has full knowledge of Required and Deprecated attributes
6631
 *
6632
 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6633
 */
6634
htmlStatus
6635
0
htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6636
0
  const char** p ;
6637
6638
0
  if ( !elt || ! attr )
6639
0
  return HTML_INVALID ;
6640
6641
0
  if ( elt->attrs_req )
6642
0
    for ( p = elt->attrs_req; *p; ++p)
6643
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6644
0
        return HTML_REQUIRED ;
6645
6646
0
  if ( elt->attrs_opt )
6647
0
    for ( p = elt->attrs_opt; *p; ++p)
6648
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6649
0
        return HTML_VALID ;
6650
6651
0
  if ( legacy && elt->attrs_depr )
6652
0
    for ( p = elt->attrs_depr; *p; ++p)
6653
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6654
0
        return HTML_DEPRECATED ;
6655
6656
0
  return HTML_INVALID ;
6657
0
}
6658
/**
6659
 * htmlNodeStatus:
6660
 * @node: an htmlNodePtr in a tree
6661
 * @legacy: whether to allow deprecated elements (YES is faster here
6662
 *  for Element nodes)
6663
 *
6664
 * Checks whether the tree node is valid.  Experimental (the author
6665
 *     only uses the HTML enhancements in a SAX parser)
6666
 *
6667
 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6668
 *  legacy allowed) or htmlElementStatusHere (otherwise).
6669
 *  for Attribute nodes, a return from htmlAttrAllowed
6670
 *  for other nodes, HTML_NA (no checks performed)
6671
 */
6672
htmlStatus
6673
0
htmlNodeStatus(const htmlNodePtr node, int legacy) {
6674
0
  if ( ! node )
6675
0
    return HTML_INVALID ;
6676
6677
0
  switch ( node->type ) {
6678
0
    case XML_ELEMENT_NODE:
6679
0
      return legacy
6680
0
  ? ( htmlElementAllowedHere (
6681
0
    htmlTagLookup(node->parent->name) , node->name
6682
0
    ) ? HTML_VALID : HTML_INVALID )
6683
0
  : htmlElementStatusHere(
6684
0
    htmlTagLookup(node->parent->name) ,
6685
0
    htmlTagLookup(node->name) )
6686
0
  ;
6687
0
    case XML_ATTRIBUTE_NODE:
6688
0
      return htmlAttrAllowed(
6689
0
  htmlTagLookup(node->parent->name) , node->name, legacy) ;
6690
0
    default: return HTML_NA ;
6691
0
  }
6692
0
}
6693
/************************************************************************
6694
 *                  *
6695
 *  New set (2.6.0) of simpler and more flexible APIs   *
6696
 *                  *
6697
 ************************************************************************/
6698
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. Nothing is done for a NULL string.
 *
 * Expands to a single statement (the caller supplies the trailing
 * semicolon), so it can be used safely as the body of an if/else.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6709
6710
/**
6711
 * htmlCtxtReset:
6712
 * @ctxt: an HTML parser context
6713
 *
6714
 * Reset a parser context
6715
 */
6716
void
6717
htmlCtxtReset(htmlParserCtxtPtr ctxt)
6718
0
{
6719
0
    xmlParserInputPtr input;
6720
0
    xmlDictPtr dict;
6721
6722
0
    if (ctxt == NULL)
6723
0
        return;
6724
6725
0
    xmlInitParser();
6726
0
    dict = ctxt->dict;
6727
6728
0
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6729
0
        xmlFreeInputStream(input);
6730
0
    }
6731
0
    ctxt->inputNr = 0;
6732
0
    ctxt->input = NULL;
6733
6734
0
    ctxt->spaceNr = 0;
6735
0
    if (ctxt->spaceTab != NULL) {
6736
0
  ctxt->spaceTab[0] = -1;
6737
0
  ctxt->space = &ctxt->spaceTab[0];
6738
0
    } else {
6739
0
  ctxt->space = NULL;
6740
0
    }
6741
6742
6743
0
    ctxt->nodeNr = 0;
6744
0
    ctxt->node = NULL;
6745
6746
0
    ctxt->nameNr = 0;
6747
0
    ctxt->name = NULL;
6748
6749
0
    DICT_FREE(ctxt->version);
6750
0
    ctxt->version = NULL;
6751
0
    DICT_FREE(ctxt->encoding);
6752
0
    ctxt->encoding = NULL;
6753
0
    DICT_FREE(ctxt->directory);
6754
0
    ctxt->directory = NULL;
6755
0
    DICT_FREE(ctxt->extSubURI);
6756
0
    ctxt->extSubURI = NULL;
6757
0
    DICT_FREE(ctxt->extSubSystem);
6758
0
    ctxt->extSubSystem = NULL;
6759
0
    if (ctxt->myDoc != NULL)
6760
0
        xmlFreeDoc(ctxt->myDoc);
6761
0
    ctxt->myDoc = NULL;
6762
6763
0
    ctxt->standalone = -1;
6764
0
    ctxt->hasExternalSubset = 0;
6765
0
    ctxt->hasPErefs = 0;
6766
0
    ctxt->html = 1;
6767
0
    ctxt->external = 0;
6768
0
    ctxt->instate = XML_PARSER_START;
6769
0
    ctxt->token = 0;
6770
6771
0
    ctxt->wellFormed = 1;
6772
0
    ctxt->nsWellFormed = 1;
6773
0
    ctxt->disableSAX = 0;
6774
0
    ctxt->valid = 1;
6775
0
    ctxt->vctxt.userData = ctxt;
6776
0
    ctxt->vctxt.error = xmlParserValidityError;
6777
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
6778
0
    ctxt->record_info = 0;
6779
0
    ctxt->checkIndex = 0;
6780
0
    ctxt->inSubset = 0;
6781
0
    ctxt->errNo = XML_ERR_OK;
6782
0
    ctxt->depth = 0;
6783
0
    ctxt->charset = XML_CHAR_ENCODING_NONE;
6784
0
    ctxt->catalogs = NULL;
6785
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
6786
6787
0
    if (ctxt->attsDefault != NULL) {
6788
0
        xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6789
0
        ctxt->attsDefault = NULL;
6790
0
    }
6791
0
    if (ctxt->attsSpecial != NULL) {
6792
0
        xmlHashFree(ctxt->attsSpecial, NULL);
6793
0
        ctxt->attsSpecial = NULL;
6794
0
    }
6795
0
}
6796
6797
/**
6798
 * htmlCtxtUseOptions:
6799
 * @ctxt: an HTML parser context
6800
 * @options:  a combination of htmlParserOption(s)
6801
 *
6802
 * Applies the options to the parser context
6803
 *
6804
 * Returns 0 in case of success, the set of unknown or unimplemented options
6805
 *         in case of error.
6806
 */
6807
int
6808
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6809
24.0k
{
6810
24.0k
    if (ctxt == NULL)
6811
0
        return(-1);
6812
6813
24.0k
    if (options & HTML_PARSE_NOWARNING) {
6814
11.0k
        ctxt->sax->warning = NULL;
6815
11.0k
        ctxt->vctxt.warning = NULL;
6816
11.0k
        options -= XML_PARSE_NOWARNING;
6817
11.0k
  ctxt->options |= XML_PARSE_NOWARNING;
6818
11.0k
    }
6819
24.0k
    if (options & HTML_PARSE_NOERROR) {
6820
14.6k
        ctxt->sax->error = NULL;
6821
14.6k
        ctxt->vctxt.error = NULL;
6822
14.6k
        ctxt->sax->fatalError = NULL;
6823
14.6k
        options -= XML_PARSE_NOERROR;
6824
14.6k
  ctxt->options |= XML_PARSE_NOERROR;
6825
14.6k
    }
6826
24.0k
    if (options & HTML_PARSE_PEDANTIC) {
6827
4.45k
        ctxt->pedantic = 1;
6828
4.45k
        options -= XML_PARSE_PEDANTIC;
6829
4.45k
  ctxt->options |= XML_PARSE_PEDANTIC;
6830
4.45k
    } else
6831
19.6k
        ctxt->pedantic = 0;
6832
24.0k
    if (options & XML_PARSE_NOBLANKS) {
6833
8.98k
        ctxt->keepBlanks = 0;
6834
8.98k
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6835
8.98k
        options -= XML_PARSE_NOBLANKS;
6836
8.98k
  ctxt->options |= XML_PARSE_NOBLANKS;
6837
8.98k
    } else
6838
15.0k
        ctxt->keepBlanks = 1;
6839
24.0k
    if (options & HTML_PARSE_RECOVER) {
6840
8.19k
        ctxt->recovery = 1;
6841
8.19k
  options -= HTML_PARSE_RECOVER;
6842
8.19k
    } else
6843
15.8k
        ctxt->recovery = 0;
6844
24.0k
    if (options & HTML_PARSE_COMPACT) {
6845
7.70k
  ctxt->options |= HTML_PARSE_COMPACT;
6846
7.70k
        options -= HTML_PARSE_COMPACT;
6847
7.70k
    }
6848
24.0k
    if (options & XML_PARSE_HUGE) {
6849
9.28k
  ctxt->options |= XML_PARSE_HUGE;
6850
9.28k
        options -= XML_PARSE_HUGE;
6851
9.28k
    }
6852
24.0k
    if (options & HTML_PARSE_NODEFDTD) {
6853
9.89k
  ctxt->options |= HTML_PARSE_NODEFDTD;
6854
9.89k
        options -= HTML_PARSE_NODEFDTD;
6855
9.89k
    }
6856
24.0k
    if (options & HTML_PARSE_IGNORE_ENC) {
6857
12.1k
  ctxt->options |= HTML_PARSE_IGNORE_ENC;
6858
12.1k
        options -= HTML_PARSE_IGNORE_ENC;
6859
12.1k
    }
6860
24.0k
    if (options & HTML_PARSE_NOIMPLIED) {
6861
12.1k
        ctxt->options |= HTML_PARSE_NOIMPLIED;
6862
12.1k
        options -= HTML_PARSE_NOIMPLIED;
6863
12.1k
    }
6864
24.0k
    ctxt->dictNames = 0;
6865
24.0k
    return (options);
6866
24.0k
}
6867
6868
/**
6869
 * htmlDoRead:
6870
 * @ctxt:  an HTML parser context
6871
 * @URL:  the base URL to use for the document
6872
 * @encoding:  the document encoding, or NULL
6873
 * @options:  a combination of htmlParserOption(s)
6874
 * @reuse:  keep the context for reuse
6875
 *
6876
 * Common front-end for the htmlRead functions
6877
 *
6878
 * Returns the resulting document tree or NULL
6879
 */
6880
static htmlDocPtr
6881
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6882
          int options, int reuse)
6883
12.0k
{
6884
12.0k
    htmlDocPtr ret;
6885
6886
12.0k
    htmlCtxtUseOptions(ctxt, options);
6887
12.0k
    ctxt->html = 1;
6888
12.0k
    if (encoding != NULL) {
6889
0
        xmlCharEncodingHandlerPtr hdlr;
6890
6891
0
  hdlr = xmlFindCharEncodingHandler(encoding);
6892
0
  if (hdlr != NULL) {
6893
0
      xmlSwitchToEncoding(ctxt, hdlr);
6894
0
      if (ctxt->input->encoding != NULL)
6895
0
        xmlFree((xmlChar *) ctxt->input->encoding);
6896
0
            ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6897
0
        }
6898
0
    }
6899
12.0k
    if ((URL != NULL) && (ctxt->input != NULL) &&
6900
12.0k
        (ctxt->input->filename == NULL))
6901
0
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6902
12.0k
    htmlParseDocument(ctxt);
6903
12.0k
    ret = ctxt->myDoc;
6904
12.0k
    ctxt->myDoc = NULL;
6905
12.0k
    if (!reuse) {
6906
12.0k
        if ((ctxt->dictNames) &&
6907
12.0k
      (ret != NULL) &&
6908
12.0k
      (ret->dict == ctxt->dict))
6909
0
      ctxt->dict = NULL;
6910
12.0k
  xmlFreeParserCtxt(ctxt);
6911
12.0k
    }
6912
12.0k
    return (ret);
6913
12.0k
}
6914
6915
/**
6916
 * htmlReadDoc:
6917
 * @cur:  a pointer to a zero terminated string
6918
 * @URL:  the base URL to use for the document
6919
 * @encoding:  the document encoding, or NULL
6920
 * @options:  a combination of htmlParserOption(s)
6921
 *
6922
 * parse an XML in-memory document and build a tree.
6923
 *
6924
 * Returns the resulting document tree
6925
 */
6926
htmlDocPtr
6927
htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6928
0
{
6929
0
    htmlParserCtxtPtr ctxt;
6930
6931
0
    if (cur == NULL)
6932
0
        return (NULL);
6933
6934
0
    xmlInitParser();
6935
0
    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6936
0
    if (ctxt == NULL)
6937
0
        return (NULL);
6938
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6939
0
}
6940
6941
/**
6942
 * htmlReadFile:
6943
 * @filename:  a file or URL
6944
 * @encoding:  the document encoding, or NULL
6945
 * @options:  a combination of htmlParserOption(s)
6946
 *
6947
 * parse an XML file from the filesystem or the network.
6948
 *
6949
 * Returns the resulting document tree
6950
 */
6951
htmlDocPtr
6952
htmlReadFile(const char *filename, const char *encoding, int options)
6953
0
{
6954
0
    htmlParserCtxtPtr ctxt;
6955
6956
0
    xmlInitParser();
6957
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6958
0
    if (ctxt == NULL)
6959
0
        return (NULL);
6960
0
    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6961
0
}
6962
6963
/**
6964
 * htmlReadMemory:
6965
 * @buffer:  a pointer to a char array
6966
 * @size:  the size of the array
6967
 * @URL:  the base URL to use for the document
6968
 * @encoding:  the document encoding, or NULL
6969
 * @options:  a combination of htmlParserOption(s)
6970
 *
6971
 * parse an XML in-memory document and build a tree.
6972
 *
6973
 * Returns the resulting document tree
6974
 */
6975
htmlDocPtr
6976
htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6977
12.0k
{
6978
12.0k
    htmlParserCtxtPtr ctxt;
6979
6980
12.0k
    xmlInitParser();
6981
12.0k
    ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6982
12.0k
    if (ctxt == NULL)
6983
7
        return (NULL);
6984
12.0k
    htmlDefaultSAXHandlerInit();
6985
12.0k
    if (ctxt->sax != NULL)
6986
12.0k
        memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6987
12.0k
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6988
12.0k
}
6989
6990
/**
6991
 * htmlReadFd:
6992
 * @fd:  an open file descriptor
6993
 * @URL:  the base URL to use for the document
6994
 * @encoding:  the document encoding, or NULL
6995
 * @options:  a combination of htmlParserOption(s)
6996
 *
6997
 * parse an HTML from a file descriptor and build a tree.
6998
 * NOTE that the file descriptor will not be closed when the
6999
 *      reader is closed or reset.
7000
 *
7001
 * Returns the resulting document tree
7002
 */
7003
htmlDocPtr
7004
htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7005
0
{
7006
0
    htmlParserCtxtPtr ctxt;
7007
0
    xmlParserInputBufferPtr input;
7008
0
    htmlParserInputPtr stream;
7009
7010
0
    if (fd < 0)
7011
0
        return (NULL);
7012
7013
0
    xmlInitParser();
7014
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7015
0
    if (input == NULL)
7016
0
        return (NULL);
7017
0
    input->closecallback = NULL;
7018
0
    ctxt = htmlNewParserCtxt();
7019
0
    if (ctxt == NULL) {
7020
0
        xmlFreeParserInputBuffer(input);
7021
0
        return (NULL);
7022
0
    }
7023
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7024
0
    if (stream == NULL) {
7025
0
        xmlFreeParserInputBuffer(input);
7026
0
  htmlFreeParserCtxt(ctxt);
7027
0
        return (NULL);
7028
0
    }
7029
0
    inputPush(ctxt, stream);
7030
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
7031
0
}
7032
7033
/**
7034
 * htmlReadIO:
7035
 * @ioread:  an I/O read function
7036
 * @ioclose:  an I/O close function
7037
 * @ioctx:  an I/O handler
7038
 * @URL:  the base URL to use for the document
7039
 * @encoding:  the document encoding, or NULL
7040
 * @options:  a combination of htmlParserOption(s)
7041
 *
7042
 * parse an HTML document from I/O functions and source and build a tree.
7043
 *
7044
 * Returns the resulting document tree
7045
 */
7046
htmlDocPtr
7047
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7048
          void *ioctx, const char *URL, const char *encoding, int options)
7049
0
{
7050
0
    htmlParserCtxtPtr ctxt;
7051
0
    xmlParserInputBufferPtr input;
7052
0
    xmlParserInputPtr stream;
7053
7054
0
    if (ioread == NULL)
7055
0
        return (NULL);
7056
0
    xmlInitParser();
7057
7058
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7059
0
                                         XML_CHAR_ENCODING_NONE);
7060
0
    if (input == NULL) {
7061
0
        if (ioclose != NULL)
7062
0
            ioclose(ioctx);
7063
0
        return (NULL);
7064
0
    }
7065
0
    ctxt = htmlNewParserCtxt();
7066
0
    if (ctxt == NULL) {
7067
0
        xmlFreeParserInputBuffer(input);
7068
0
        return (NULL);
7069
0
    }
7070
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7071
0
    if (stream == NULL) {
7072
0
        xmlFreeParserInputBuffer(input);
7073
0
  xmlFreeParserCtxt(ctxt);
7074
0
        return (NULL);
7075
0
    }
7076
0
    inputPush(ctxt, stream);
7077
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
7078
0
}
7079
7080
/**
7081
 * htmlCtxtReadDoc:
7082
 * @ctxt:  an HTML parser context
7083
 * @cur:  a pointer to a zero terminated string
7084
 * @URL:  the base URL to use for the document
7085
 * @encoding:  the document encoding, or NULL
7086
 * @options:  a combination of htmlParserOption(s)
7087
 *
7088
 * parse an XML in-memory document and build a tree.
7089
 * This reuses the existing @ctxt parser context
7090
 *
7091
 * Returns the resulting document tree
7092
 */
7093
htmlDocPtr
7094
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7095
               const char *URL, const char *encoding, int options)
7096
0
{
7097
0
    xmlParserInputPtr stream;
7098
7099
0
    if (cur == NULL)
7100
0
        return (NULL);
7101
0
    if (ctxt == NULL)
7102
0
        return (NULL);
7103
0
    xmlInitParser();
7104
7105
0
    htmlCtxtReset(ctxt);
7106
7107
0
    stream = xmlNewStringInputStream(ctxt, cur);
7108
0
    if (stream == NULL) {
7109
0
        return (NULL);
7110
0
    }
7111
0
    inputPush(ctxt, stream);
7112
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7113
0
}
7114
7115
/**
7116
 * htmlCtxtReadFile:
7117
 * @ctxt:  an HTML parser context
7118
 * @filename:  a file or URL
7119
 * @encoding:  the document encoding, or NULL
7120
 * @options:  a combination of htmlParserOption(s)
7121
 *
7122
 * parse an XML file from the filesystem or the network.
7123
 * This reuses the existing @ctxt parser context
7124
 *
7125
 * Returns the resulting document tree
7126
 */
7127
htmlDocPtr
7128
htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7129
                const char *encoding, int options)
7130
0
{
7131
0
    xmlParserInputPtr stream;
7132
7133
0
    if (filename == NULL)
7134
0
        return (NULL);
7135
0
    if (ctxt == NULL)
7136
0
        return (NULL);
7137
0
    xmlInitParser();
7138
7139
0
    htmlCtxtReset(ctxt);
7140
7141
0
    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7142
0
    if (stream == NULL) {
7143
0
        return (NULL);
7144
0
    }
7145
0
    inputPush(ctxt, stream);
7146
0
    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7147
0
}
7148
7149
/**
7150
 * htmlCtxtReadMemory:
7151
 * @ctxt:  an HTML parser context
7152
 * @buffer:  a pointer to a char array
7153
 * @size:  the size of the array
7154
 * @URL:  the base URL to use for the document
7155
 * @encoding:  the document encoding, or NULL
7156
 * @options:  a combination of htmlParserOption(s)
7157
 *
7158
 * parse an XML in-memory document and build a tree.
7159
 * This reuses the existing @ctxt parser context
7160
 *
7161
 * Returns the resulting document tree
7162
 */
7163
htmlDocPtr
7164
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7165
                  const char *URL, const char *encoding, int options)
7166
0
{
7167
0
    xmlParserInputBufferPtr input;
7168
0
    xmlParserInputPtr stream;
7169
7170
0
    if (ctxt == NULL)
7171
0
        return (NULL);
7172
0
    if (buffer == NULL)
7173
0
        return (NULL);
7174
0
    xmlInitParser();
7175
7176
0
    htmlCtxtReset(ctxt);
7177
7178
0
    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7179
0
    if (input == NULL) {
7180
0
  return(NULL);
7181
0
    }
7182
7183
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7184
0
    if (stream == NULL) {
7185
0
  xmlFreeParserInputBuffer(input);
7186
0
  return(NULL);
7187
0
    }
7188
7189
0
    inputPush(ctxt, stream);
7190
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7191
0
}
7192
7193
/**
7194
 * htmlCtxtReadFd:
7195
 * @ctxt:  an HTML parser context
7196
 * @fd:  an open file descriptor
7197
 * @URL:  the base URL to use for the document
7198
 * @encoding:  the document encoding, or NULL
7199
 * @options:  a combination of htmlParserOption(s)
7200
 *
7201
 * parse an XML from a file descriptor and build a tree.
7202
 * This reuses the existing @ctxt parser context
7203
 *
7204
 * Returns the resulting document tree
7205
 */
7206
htmlDocPtr
7207
htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7208
              const char *URL, const char *encoding, int options)
7209
0
{
7210
0
    xmlParserInputBufferPtr input;
7211
0
    xmlParserInputPtr stream;
7212
7213
0
    if (fd < 0)
7214
0
        return (NULL);
7215
0
    if (ctxt == NULL)
7216
0
        return (NULL);
7217
0
    xmlInitParser();
7218
7219
0
    htmlCtxtReset(ctxt);
7220
7221
7222
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7223
0
    if (input == NULL)
7224
0
        return (NULL);
7225
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7226
0
    if (stream == NULL) {
7227
0
        xmlFreeParserInputBuffer(input);
7228
0
        return (NULL);
7229
0
    }
7230
0
    inputPush(ctxt, stream);
7231
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7232
0
}
7233
7234
/**
7235
 * htmlCtxtReadIO:
7236
 * @ctxt:  an HTML parser context
7237
 * @ioread:  an I/O read function
7238
 * @ioclose:  an I/O close function
7239
 * @ioctx:  an I/O handler
7240
 * @URL:  the base URL to use for the document
7241
 * @encoding:  the document encoding, or NULL
7242
 * @options:  a combination of htmlParserOption(s)
7243
 *
7244
 * parse an HTML document from I/O functions and source and build a tree.
7245
 * This reuses the existing @ctxt parser context
7246
 *
7247
 * Returns the resulting document tree
7248
 */
7249
htmlDocPtr
7250
htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7251
              xmlInputCloseCallback ioclose, void *ioctx,
7252
        const char *URL,
7253
              const char *encoding, int options)
7254
0
{
7255
0
    xmlParserInputBufferPtr input;
7256
0
    xmlParserInputPtr stream;
7257
7258
0
    if (ioread == NULL)
7259
0
        return (NULL);
7260
0
    if (ctxt == NULL)
7261
0
        return (NULL);
7262
0
    xmlInitParser();
7263
7264
0
    htmlCtxtReset(ctxt);
7265
7266
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7267
0
                                         XML_CHAR_ENCODING_NONE);
7268
0
    if (input == NULL) {
7269
0
        if (ioclose != NULL)
7270
0
            ioclose(ioctx);
7271
0
        return (NULL);
7272
0
    }
7273
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7274
0
    if (stream == NULL) {
7275
0
        xmlFreeParserInputBuffer(input);
7276
0
        return (NULL);
7277
0
    }
7278
0
    inputPush(ctxt, stream);
7279
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7280
0
}
7281
7282
#endif /* LIBXML_HTML_ENABLED */