Coverage Report

Created: 2023-09-29 17:40

/src/libxml2/HTMLparser.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLparser.c : an HTML 4.0 non-verifying parser
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
#define IN_LIBXML
10
#include "libxml.h"
11
#ifdef LIBXML_HTML_ENABLED
12
13
#include <string.h>
14
#include <ctype.h>
15
#include <stdlib.h>
16
17
#include <libxml/xmlmemory.h>
18
#include <libxml/tree.h>
19
#include <libxml/parser.h>
20
#include <libxml/parserInternals.h>
21
#include <libxml/xmlerror.h>
22
#include <libxml/HTMLparser.h>
23
#include <libxml/HTMLtree.h>
24
#include <libxml/entities.h>
25
#include <libxml/encoding.h>
26
#include <libxml/valid.h>
27
#include <libxml/xmlIO.h>
28
#include <libxml/globals.h>
29
#include <libxml/uri.h>
30
31
#include "private/buf.h"
32
#include "private/enc.h"
33
#include "private/error.h"
34
#include "private/html.h"
35
#include "private/parser.h"
36
#include "private/tree.h"
37
38
#define HTML_MAX_NAMELEN 1000
39
0
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
40
0
#define HTML_PARSER_BUFFER_SIZE 100
41
42
/* #define DEBUG */
43
/* #define DEBUG_PUSH */
44
45
static int htmlOmittedDefaultValue = 1;
46
47
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
48
           xmlChar end, xmlChar  end2, xmlChar end3);
49
static void htmlParseComment(htmlParserCtxtPtr ctxt);
50
51
/************************************************************************
52
 *                  *
53
 *    Some factorized error routines        *
54
 *                  *
55
 ************************************************************************/
56
57
/**
58
 * htmlErrMemory:
59
 * @ctxt:  an HTML parser context
60
 * @extra:  extra information
61
 *
62
 * Handle a redefinition of attribute error
63
 */
64
static void
65
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
66
0
{
67
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
68
0
        (ctxt->instate == XML_PARSER_EOF))
69
0
  return;
70
0
    if (ctxt != NULL) {
71
0
        ctxt->errNo = XML_ERR_NO_MEMORY;
72
0
        ctxt->instate = XML_PARSER_EOF;
73
0
        ctxt->disableSAX = 1;
74
0
    }
75
0
    if (extra)
76
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
77
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
78
0
                        NULL, NULL, 0, 0,
79
0
                        "Memory allocation failed : %s\n", extra);
80
0
    else
81
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
82
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
83
0
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
84
0
}
85
86
/**
87
 * htmlParseErr:
88
 * @ctxt:  an HTML parser context
89
 * @error:  the error number
90
 * @msg:  the error message
91
 * @str1:  string infor
92
 * @str2:  string infor
93
 *
94
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
95
 */
96
static void LIBXML_ATTR_FORMAT(3,0)
97
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
98
             const char *msg, const xmlChar *str1, const xmlChar *str2)
99
0
{
100
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
101
0
        (ctxt->instate == XML_PARSER_EOF))
102
0
  return;
103
0
    if (ctxt != NULL)
104
0
  ctxt->errNo = error;
105
0
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
106
0
                    XML_ERR_ERROR, NULL, 0,
107
0
        (const char *) str1, (const char *) str2,
108
0
        NULL, 0, 0,
109
0
        msg, str1, str2);
110
0
    if (ctxt != NULL)
111
0
  ctxt->wellFormed = 0;
112
0
}
113
114
/**
115
 * htmlParseErrInt:
116
 * @ctxt:  an HTML parser context
117
 * @error:  the error number
118
 * @msg:  the error message
119
 * @val:  integer info
120
 *
121
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
122
 */
123
static void LIBXML_ATTR_FORMAT(3,0)
124
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
125
             const char *msg, int val)
126
0
{
127
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
128
0
        (ctxt->instate == XML_PARSER_EOF))
129
0
  return;
130
0
    if (ctxt != NULL)
131
0
  ctxt->errNo = error;
132
0
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
133
0
                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
134
0
        NULL, val, 0, msg, val);
135
0
    if (ctxt != NULL)
136
0
  ctxt->wellFormed = 0;
137
0
}
138
139
/************************************************************************
140
 *                  *
141
 *  Parser stacks related functions and macros    *
142
 *                  *
143
 ************************************************************************/
144
145
/**
146
 * htmlnamePush:
147
 * @ctxt:  an HTML parser context
148
 * @value:  the element name
149
 *
150
 * Pushes a new element name on top of the name stack
151
 *
152
 * Returns 0 in case of error, the index in the stack otherwise
153
 */
154
static int
155
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
156
0
{
157
0
    if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
158
0
        ctxt->html = 3;
159
0
    if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
160
0
        ctxt->html = 10;
161
0
    if (ctxt->nameNr >= ctxt->nameMax) {
162
0
        ctxt->nameMax *= 2;
163
0
        ctxt->nameTab = (const xmlChar * *)
164
0
                         xmlRealloc((xmlChar * *)ctxt->nameTab,
165
0
                                    ctxt->nameMax *
166
0
                                    sizeof(ctxt->nameTab[0]));
167
0
        if (ctxt->nameTab == NULL) {
168
0
            htmlErrMemory(ctxt, NULL);
169
0
            return (0);
170
0
        }
171
0
    }
172
0
    ctxt->nameTab[ctxt->nameNr] = value;
173
0
    ctxt->name = value;
174
0
    return (ctxt->nameNr++);
175
0
}
176
/**
177
 * htmlnamePop:
178
 * @ctxt: an HTML parser context
179
 *
180
 * Pops the top element name from the name stack
181
 *
182
 * Returns the name just removed
183
 */
184
static const xmlChar *
185
htmlnamePop(htmlParserCtxtPtr ctxt)
186
0
{
187
0
    const xmlChar *ret;
188
189
0
    if (ctxt->nameNr <= 0)
190
0
        return (NULL);
191
0
    ctxt->nameNr--;
192
0
    if (ctxt->nameNr < 0)
193
0
        return (NULL);
194
0
    if (ctxt->nameNr > 0)
195
0
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
196
0
    else
197
0
        ctxt->name = NULL;
198
0
    ret = ctxt->nameTab[ctxt->nameNr];
199
0
    ctxt->nameTab[ctxt->nameNr] = NULL;
200
0
    return (ret);
201
0
}
202
203
/**
204
 * htmlNodeInfoPush:
205
 * @ctxt:  an HTML parser context
206
 * @value:  the node info
207
 *
208
 * Pushes a new element name on top of the node info stack
209
 *
210
 * Returns 0 in case of error, the index in the stack otherwise
211
 */
212
static int
213
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
214
0
{
215
0
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
216
0
        if (ctxt->nodeInfoMax == 0)
217
0
                ctxt->nodeInfoMax = 5;
218
0
        ctxt->nodeInfoMax *= 2;
219
0
        ctxt->nodeInfoTab = (htmlParserNodeInfo *)
220
0
                         xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
221
0
                                    ctxt->nodeInfoMax *
222
0
                                    sizeof(ctxt->nodeInfoTab[0]));
223
0
        if (ctxt->nodeInfoTab == NULL) {
224
0
            htmlErrMemory(ctxt, NULL);
225
0
            return (0);
226
0
        }
227
0
    }
228
0
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
229
0
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
230
0
    return (ctxt->nodeInfoNr++);
231
0
}
232
233
/**
234
 * htmlNodeInfoPop:
235
 * @ctxt:  an HTML parser context
236
 *
237
 * Pops the top element name from the node info stack
238
 *
239
 * Returns 0 in case of error, the pointer to NodeInfo otherwise
240
 */
241
static htmlParserNodeInfo *
242
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
243
0
{
244
0
    if (ctxt->nodeInfoNr <= 0)
245
0
        return (NULL);
246
0
    ctxt->nodeInfoNr--;
247
0
    if (ctxt->nodeInfoNr < 0)
248
0
        return (NULL);
249
0
    if (ctxt->nodeInfoNr > 0)
250
0
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
251
0
    else
252
0
        ctxt->nodeInfo = NULL;
253
0
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
254
0
}
255
256
/*
 * Macros for accessing the content. Those should be used only by the parser,
 * and not exported. All of them expect a variable named `ctxt` pointing to
 * the parser context to be in scope.
 *
 * Dirty macros, i.e. one needs to make assumptions on the context to use them
 *
 *   CUR_PTR returns the current pointer to the xmlChar to be parsed.
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
 *           in UNICODE mode. This should be used internally by the parser
 *           only to compare to ASCII values otherwise it would break when
 *           running with UTF-8 encoding.
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
 *           to compare on ASCII based substring.
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
 *           it should be used only to compare on ASCII based substring.
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
 *           strings without newlines within the parser.
 *
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
 *
 *   NEXT    Skip to the next character, this does the proper decoding
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
 *   COPY_BUF(l,b,i,v) append char v of l xmlChars to buffer b at index i,
 *           advancing i accordingly
 *
 * Statement-like macros are wrapped in do { } while (0) so that they
 * behave as a single statement, including in front of an `else`.
 */

#define UPPER (toupper(*ctxt->input->cur))

#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val))

#define NXT(val) ctxt->input->cur[(val)]

#define UPP(val) (toupper(ctxt->input->cur[(val)]))

#define CUR_PTR ctxt->input->cur
#define BASE_PTR ctxt->input->base

/* Shrink the input once much has been consumed and little is left. */
#define SHRINK do {                                                     \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&    \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))       \
        xmlParserInputShrink(ctxt->input);                              \
  } while (0)

/* Outside progressive mode, refill when less than INPUT_CHUNK is left. */
#define GROW do {                                                       \
    if ((ctxt->progressive == 0) &&                                     \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))            \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                   \
  } while (0)

#define SKIP_BLANKS htmlSkipBlankChars(ctxt)

/* Imported from XML */

#define CUR (*ctxt->input->cur)
#define NEXT xmlNextChar(ctxt)

#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))


#define NEXTL(l) do {                                                   \
    if (*(ctxt->input->cur) == '\n') {                                  \
        ctxt->input->line++; ctxt->input->col = 1;                      \
    } else ctxt->input->col++;                                          \
    ctxt->token = 0; ctxt->input->cur += (l);                           \
  } while (0)

#define CUR_CHAR(l) htmlCurrentChar(ctxt, &(l))
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, (s), &(l))

#define COPY_BUF(l,b,i,v) do {                                          \
    if ((l) == 1) (b)[(i)++] = (v);                                     \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                       \
  } while (0)
331
332
/**
333
 * htmlFindEncoding:
334
 * @the HTML parser context
335
 *
336
 * Ty to find and encoding in the current data available in the input
337
 * buffer this is needed to try to switch to the proper encoding when
338
 * one face a character error.
339
 * That's an heuristic, since it's operating outside of parsing it could
340
 * try to use a meta which had been commented out, that's the reason it
341
 * should only be used in case of error, not as a default.
342
 *
343
 * Returns an encoding string or NULL if not found, the string need to
344
 *   be freed
345
 */
346
static xmlChar *
347
0
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
348
0
    const xmlChar *start, *cur, *end;
349
350
0
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
351
0
        (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
352
0
        (ctxt->input->buf->encoder != NULL))
353
0
        return(NULL);
354
0
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
355
0
        return(NULL);
356
357
0
    start = ctxt->input->cur;
358
0
    end = ctxt->input->end;
359
    /* we also expect the input buffer to be zero terminated */
360
0
    if (*end != 0)
361
0
        return(NULL);
362
363
0
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
364
0
    if (cur == NULL)
365
0
        return(NULL);
366
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
367
0
    if (cur == NULL)
368
0
        return(NULL);
369
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
370
0
    if (cur == NULL)
371
0
        return(NULL);
372
0
    cur += 8;
373
0
    start = cur;
374
0
    while (((*cur >= 'A') && (*cur <= 'Z')) ||
375
0
           ((*cur >= 'a') && (*cur <= 'z')) ||
376
0
           ((*cur >= '0') && (*cur <= '9')) ||
377
0
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
378
0
           cur++;
379
0
    if (cur == start)
380
0
        return(NULL);
381
0
    return(xmlStrndup(start, cur - start));
382
0
}
383
384
/**
385
 * htmlCurrentChar:
386
 * @ctxt:  the HTML parser context
387
 * @len:  pointer to the length of the char read
388
 *
389
 * The current char value, if using UTF-8 this may actually span multiple
390
 * bytes in the input buffer. Implement the end of line normalization:
391
 * 2.11 End-of-Line Handling
392
 * If the encoding is unspecified, in the case we find an ISO-Latin-1
393
 * char, then the encoding converter is plugged in automatically.
394
 *
395
 * Returns the current char value and its length
396
 */
397
398
static int
399
0
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
400
0
    const unsigned char *cur;
401
0
    unsigned char c;
402
0
    unsigned int val;
403
404
0
    if (ctxt->instate == XML_PARSER_EOF)
405
0
  return(0);
406
407
0
    if (ctxt->token != 0) {
408
0
  *len = 0;
409
0
  return(ctxt->token);
410
0
    }
411
0
    if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
412
0
        xmlChar * guess;
413
0
        xmlCharEncodingHandlerPtr handler;
414
415
        /*
416
         * Assume it's a fixed length encoding (1) with
417
         * a compatible encoding for the ASCII set, since
418
         * HTML constructs only use < 128 chars
419
         */
420
0
        if (*ctxt->input->cur < 0x80) {
421
0
            *len = 1;
422
0
            if ((*ctxt->input->cur == 0) &&
423
0
                (ctxt->input->cur < ctxt->input->end)) {
424
0
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
425
0
                                "Char 0x%X out of allowed range\n", 0);
426
0
                return(' ');
427
0
            }
428
0
            return(*ctxt->input->cur);
429
0
        }
430
431
        /*
432
         * Humm this is bad, do an automatic flow conversion
433
         */
434
0
        guess = htmlFindEncoding(ctxt);
435
0
        if (guess == NULL) {
436
0
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
437
0
        } else {
438
0
            if (ctxt->input->encoding != NULL)
439
0
                xmlFree((xmlChar *) ctxt->input->encoding);
440
0
            ctxt->input->encoding = guess;
441
0
            handler = xmlFindCharEncodingHandler((const char *) guess);
442
0
            if (handler != NULL) {
443
                /*
444
                 * Don't use UTF-8 encoder which isn't required and
445
                 * can produce invalid UTF-8.
446
                 */
447
0
                if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
448
0
                    xmlSwitchToEncoding(ctxt, handler);
449
0
            } else {
450
0
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
451
0
                             "Unsupported encoding %s", guess, NULL);
452
0
            }
453
0
        }
454
0
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
455
0
    }
456
457
    /*
458
     * We are supposed to handle UTF8, check it's valid
459
     * From rfc2044: encoding of the Unicode values on UTF-8:
460
     *
461
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
462
     * 0000 0000-0000 007F   0xxxxxxx
463
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
464
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
465
     *
466
     * Check for the 0x110000 limit too
467
     */
468
0
    cur = ctxt->input->cur;
469
0
    c = *cur;
470
0
    if (c & 0x80) {
471
0
        if ((c & 0x40) == 0)
472
0
            goto encoding_error;
473
0
        if (cur[1] == 0) {
474
0
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
475
0
            cur = ctxt->input->cur;
476
0
        }
477
0
        if ((cur[1] & 0xc0) != 0x80)
478
0
            goto encoding_error;
479
0
        if ((c & 0xe0) == 0xe0) {
480
481
0
            if (cur[2] == 0) {
482
0
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
483
0
                cur = ctxt->input->cur;
484
0
            }
485
0
            if ((cur[2] & 0xc0) != 0x80)
486
0
                goto encoding_error;
487
0
            if ((c & 0xf0) == 0xf0) {
488
0
                if (cur[3] == 0) {
489
0
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
490
0
                    cur = ctxt->input->cur;
491
0
                }
492
0
                if (((c & 0xf8) != 0xf0) ||
493
0
                    ((cur[3] & 0xc0) != 0x80))
494
0
                    goto encoding_error;
495
                /* 4-byte code */
496
0
                *len = 4;
497
0
                val = (cur[0] & 0x7) << 18;
498
0
                val |= (cur[1] & 0x3f) << 12;
499
0
                val |= (cur[2] & 0x3f) << 6;
500
0
                val |= cur[3] & 0x3f;
501
0
                if (val < 0x10000)
502
0
                    goto encoding_error;
503
0
            } else {
504
              /* 3-byte code */
505
0
                *len = 3;
506
0
                val = (cur[0] & 0xf) << 12;
507
0
                val |= (cur[1] & 0x3f) << 6;
508
0
                val |= cur[2] & 0x3f;
509
0
                if (val < 0x800)
510
0
                    goto encoding_error;
511
0
            }
512
0
        } else {
513
          /* 2-byte code */
514
0
            *len = 2;
515
0
            val = (cur[0] & 0x1f) << 6;
516
0
            val |= cur[1] & 0x3f;
517
0
            if (val < 0x80)
518
0
                goto encoding_error;
519
0
        }
520
0
        if (!IS_CHAR(val)) {
521
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
522
0
                            "Char 0x%X out of allowed range\n", val);
523
0
        }
524
0
        return(val);
525
0
    } else {
526
0
        if ((*ctxt->input->cur == 0) &&
527
0
            (ctxt->input->cur < ctxt->input->end)) {
528
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
529
0
                            "Char 0x%X out of allowed range\n", 0);
530
0
            *len = 1;
531
0
            return(' ');
532
0
        }
533
        /* 1-byte code */
534
0
        *len = 1;
535
0
        return(*ctxt->input->cur);
536
0
    }
537
538
0
encoding_error:
539
    /*
540
     * If we detect an UTF8 error that probably mean that the
541
     * input encoding didn't get properly advertised in the
542
     * declaration header. Report the error and switch the encoding
543
     * to ISO-Latin-1 (if you don't like this policy, just declare the
544
     * encoding !)
545
     */
546
0
    {
547
0
        char buffer[150];
548
549
0
  if (ctxt->input->end - ctxt->input->cur >= 4) {
550
0
      snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
551
0
          ctxt->input->cur[0], ctxt->input->cur[1],
552
0
          ctxt->input->cur[2], ctxt->input->cur[3]);
553
0
  } else {
554
0
      snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
555
0
  }
556
0
  htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
557
0
         "Input is not proper UTF-8, indicate encoding !\n",
558
0
         BAD_CAST buffer, NULL);
559
0
    }
560
561
    /*
562
     * Don't switch encodings twice. Note that if there's an encoder, we
563
     * shouldn't receive invalid UTF-8 anyway.
564
     *
565
     * Note that if ctxt->input->buf == NULL, switching encodings is
566
     * impossible, see Gitlab issue #34.
567
     */
568
0
    if ((ctxt->input->buf != NULL) &&
569
0
        (ctxt->input->buf->encoder == NULL))
570
0
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
571
0
    *len = 1;
572
0
    return(*ctxt->input->cur);
573
0
}
574
575
/**
576
 * htmlSkipBlankChars:
577
 * @ctxt:  the HTML parser context
578
 *
579
 * skip all blanks character found at that point in the input streams.
580
 *
581
 * Returns the number of space chars skipped
582
 */
583
584
static int
585
0
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
586
0
    int res = 0;
587
588
0
    while (IS_BLANK_CH(*(ctxt->input->cur))) {
589
0
  if ((*ctxt->input->cur == 0) &&
590
0
      (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
591
0
    xmlPopInput(ctxt);
592
0
  } else {
593
0
      if (*(ctxt->input->cur) == '\n') {
594
0
    ctxt->input->line++; ctxt->input->col = 1;
595
0
      } else ctxt->input->col++;
596
0
      ctxt->input->cur++;
597
0
      if (*ctxt->input->cur == 0)
598
0
    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
599
0
  }
600
0
  if (res < INT_MAX)
601
0
      res++;
602
0
    }
603
0
    return(res);
604
0
}
605
606
607
608
/************************************************************************
609
 *                  *
610
 *  The list of HTML elements and their properties    *
611
 *                  *
612
 ************************************************************************/
613
614
/*
615
 *  Start Tag: 1 means the start tag can be omitted
616
 *  End Tag:   1 means the end tag can be omitted
617
 *             2 means it's forbidden (empty elements)
618
 *             3 means the tag is stylistic and should be closed easily
619
 *  Depr:      this element is deprecated
620
 *  DTD:       1 means that this element is valid only in the Loose DTD
621
 *             2 means that this element is valid only in the Frameset DTD
622
 *
623
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
624
  , subElements , impliedsubelt , Attributes, userdata
625
 */
626
627
/*
 * Definitions and a couple of vars for HTML Elements.
 *
 * Each list macro expands to a comma separated run of string literals;
 * the matching NB_ macro is the number of entries it expands to.
 * Every table below is terminated by a NULL entry.
 */

#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
#define NB_FONTSTYLE 8
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
#define NB_PHRASE 10
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
#define NB_SPECIAL 16
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
#define NB_INLINE (NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL)
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
#define NB_BLOCK (NB_HEADING + NB_LIST + 14)
#define FORMCTRL "input", "select", "textarea", "label", "button"
#define NB_FORMCTRL 5
#define PCDATA
#define NB_PCDATA 0
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
#define NB_HEADING 6
#define LIST "ul", "ol", "dir", "menu"
#define NB_LIST 4
#define MODIFIER
#define NB_MODIFIER 0
#define FLOW BLOCK,INLINE
#define NB_FLOW (NB_BLOCK + NB_INLINE)
#define EMPTY NULL


static const char* const html_flow[] = { FLOW, NULL } ;
static const char* const html_inline[] = { INLINE, NULL } ;

/* placeholders: elts with content but no subelements */
static const char* const html_pcdata[] = { NULL } ;
#define html_cdata html_pcdata


/* ... and for HTML Attributes */

#define COREATTRS "id", "class", "style", "title"
#define NB_COREATTRS 4
#define I18N "lang", "dir"
#define NB_I18N 2
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
#define NB_EVENTS 9
#define ATTRS COREATTRS,I18N,EVENTS
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
#define CELLHALIGN "align", "char", "charoff"
#define NB_CELLHALIGN 3
#define CELLVALIGN "valign"
#define NB_CELLVALIGN 1

static const char* const html_attrs[] = { ATTRS, NULL } ;
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
static const char* const core_attrs[] = { COREATTRS, NULL } ;
static const char* const i18n_attrs[] = { I18N, NULL } ;


/* Other declarations that should go inline ... */
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
    "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
    "tabindex", "onfocus", "onblur", NULL } ;
static const char* const target_attr[] = { "target", NULL } ;
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
static const char* const alt_attr[] = { "alt", NULL } ;
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
static const char* const href_attrs[] = { "href", NULL } ;
static const char* const clear_attrs[] = { "clear", NULL } ;
static const char* const inline_p[] = { INLINE, "p", NULL } ;

static const char* const flow_param[] = { FLOW, "param", NULL } ;
static const char* const applet_attrs[] = { COREATTRS , "codebase",
    "archive", "alt", "name", "height", "width", "align",
    "hspace", "vspace", NULL } ;
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
    "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const basefont_attrs[] =
    { "id", "size", "color", "face", NULL } ;
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
static const char* const body_depr[] = { "background", "bgcolor", "text",
    "link", "vlink", "alink", NULL } ;
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
    "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;


static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const col_elt[] = { "col", NULL } ;
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
static const char* const compact_attr[] = { "compact", NULL } ;
static const char* const label_attr[] = { "label", NULL } ;
/* NULL terminated like every other table, so list walkers stop here. */
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ;
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p",
    "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr",
    "table", "fieldset", "address", NULL } ;
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept",
    "name", "onsubmit", "onreset", "accept-charset", NULL } ;
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src",
    "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" ,
    NULL } ;
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
static const char* const head_contents[] = { "title", "isindex", "base",
    "script", "style", "meta", "link", "object", NULL } ;
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
static const char* const version_attr[] = { "version", NULL } ;
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name",
    "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align",
    "height", "width", NULL } ;
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height",
    "width", "usemap", "ismap", NULL } ;
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border",
    "code", "codebase", "frameborder", "height", "hidden", "hspace", "name",
    "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace",
    "width", NULL } ;
static const char* const input_attrs[] = { ATTRS, "type", "name", "value",
    "checked", "disabled", "readonly", "size", "maxlength", "src", "alt",
    "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur",
    "onselect", "onchange", "accept", NULL } ;
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
static const char* const align_attr[] = { "align", NULL } ;
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang",
    "type", "rel", "rev", "media", NULL } ;
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
static const char* const object_contents[] = { FLOW, "param", NULL } ;
static const char* const object_attrs[] = { ATTRS, "declare", "classid",
    "codebase", "data", "type", "codetype", "archive", "standby", "height",
    "width", "usemap", "name", "tabindex", NULL } ;
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
static const char* const option_elt[] = { "option", NULL } ;
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
static const char* const width_attr[] = { "width", NULL } ;
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s",
    "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe",
    NULL } ;
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
static const char* const language_attr[] = { "language", NULL } ;
static const char* const select_content[] = { "optgroup", "option", NULL } ;
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple",
    "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border",
    "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
static const char* const table_contents[] = { "caption", "col", "colgroup",
    "thead", "tfoot", "tbody", "tr", NULL} ;
static const char* const tr_elt[] = { "tr", NULL } ;
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers",
    "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled",
    "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect",
    "onchange", NULL } ;
static const char* const tr_contents[] = { "th", "td", NULL } ;
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
static const char* const li_elt[] = { "li", NULL } ;
static const char* const ul_depr[] = { "type", "compact", NULL} ;
static const char* const dir_attr[] = { "dir", NULL} ;

/* Cast used to store the const tables above in the element table. */
#define DECL (const char**)
778
779
static const htmlElemDesc
780
html40ElementTable[] = {
781
{ "a",    0, 0, 0, 0, 0, 0, 1, "anchor ",
782
  DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
783
},
784
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
785
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
786
},
787
{ "acronym",  0, 0, 0, 0, 0, 0, 1, "",
788
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
789
},
790
{ "address",  0, 0, 0, 0, 0, 0, 0, "information on author ",
791
  DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
792
},
793
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
794
  DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
795
},
796
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
797
  EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
798
},
799
{ "b",    0, 3, 0, 0, 0, 0, 1, "bold text style",
800
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
801
},
802
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
803
  EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
804
},
805
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
806
  EMPTY , NULL , NULL, DECL basefont_attrs, NULL
807
},
808
{ "bdo",  0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
809
  DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
810
},
811
{ "big",  0, 3, 0, 0, 0, 0, 1, "large text style",
812
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
813
},
814
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
815
  DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
816
},
817
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
818
  DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
819
},
820
{ "br",   0, 2, 2, 1, 0, 0, 1, "forced line break ",
821
  EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
822
},
823
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
824
  DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
825
},
826
{ "caption",  0, 0, 0, 0, 0, 0, 0, "table caption ",
827
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
828
},
829
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
830
  DECL html_flow , NULL , NULL, DECL html_attrs, NULL
831
},
832
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
833
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
834
},
835
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
836
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
837
},
838
{ "col",  0, 2, 2, 1, 0, 0, 0, "table column ",
839
  EMPTY , NULL , DECL col_attrs , NULL, NULL
840
},
841
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
842
  DECL col_elt , "col" , DECL col_attrs , NULL, NULL
843
},
844
{ "dd",   0, 1, 0, 0, 0, 0, 0, "definition description ",
845
  DECL html_flow , NULL , DECL html_attrs, NULL, NULL
846
},
847
{ "del",  0, 0, 0, 0, 0, 0, 2, "deleted text ",
848
  DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
849
},
850
{ "dfn",  0, 0, 0, 0, 0, 0, 1, "instance definition",
851
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
852
},
853
{ "dir",  0, 0, 0, 0, 1, 1, 0, "directory list",
854
  DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
855
},
856
{ "div",  0, 0, 0, 0, 0, 0, 0, "generic language/style container",
857
  DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
858
},
859
{ "dl",   0, 0, 0, 0, 0, 0, 0, "definition list ",
860
  DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
861
},
862
{ "dt",   0, 1, 0, 0, 0, 0, 0, "definition term ",
863
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
864
},
865
{ "em",   0, 3, 0, 0, 0, 0, 1, "emphasis",
866
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
867
},
868
{ "embed",  0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
869
  EMPTY, NULL, DECL embed_attrs, NULL, NULL
870
},
871
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
872
  DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
873
},
874
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
875
  DECL html_inline, NULL, NULL, DECL font_attrs, NULL
876
},
877
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
878
  DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
879
},
880
{ "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
881
  EMPTY, NULL, NULL, DECL frame_attrs, NULL
882
},
883
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
884
  DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
885
},
886
{ "h1",   0, 0, 0, 0, 0, 0, 0, "heading ",
887
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
888
},
889
{ "h2",   0, 0, 0, 0, 0, 0, 0, "heading ",
890
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
891
},
892
{ "h3",   0, 0, 0, 0, 0, 0, 0, "heading ",
893
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894
},
895
{ "h4",   0, 0, 0, 0, 0, 0, 0, "heading ",
896
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
897
},
898
{ "h5",   0, 0, 0, 0, 0, 0, 0, "heading ",
899
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
900
},
901
{ "h6",   0, 0, 0, 0, 0, 0, 0, "heading ",
902
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
903
},
904
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
905
  DECL head_contents, NULL, DECL head_attrs, NULL, NULL
906
},
907
{ "hr",   0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
908
  EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
909
},
910
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
911
  DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
912
},
913
{ "i",    0, 3, 0, 0, 0, 0, 1, "italic text style",
914
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
915
},
916
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
917
  DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
918
},
919
{ "img",  0, 2, 2, 1, 0, 0, 1, "embedded image ",
920
  EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
921
},
922
{ "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
923
  EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
924
},
925
{ "ins",  0, 0, 0, 0, 0, 0, 2, "inserted text",
926
  DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
927
},
928
{ "isindex",  0, 2, 2, 1, 1, 1, 0, "single line prompt ",
929
  EMPTY, NULL, NULL, DECL prompt_attrs, NULL
930
},
931
{ "kbd",  0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
932
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
933
},
934
{ "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
935
  DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
936
},
937
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
938
  DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
939
},
940
{ "li",   0, 1, 1, 0, 0, 0, 0, "list item ",
941
  DECL html_flow, NULL, DECL html_attrs, NULL, NULL
942
},
943
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
944
  EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
945
},
946
{ "map",  0, 0, 0, 0, 0, 0, 2, "client-side image map ",
947
  DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
948
},
949
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
950
  DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
951
},
952
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
953
  EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
954
},
955
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
956
  DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
957
},
958
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
959
  DECL html_flow, "div", DECL html_attrs, NULL, NULL
960
},
961
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
962
  DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
963
},
964
{ "ol",   0, 0, 0, 0, 0, 0, 0, "ordered list ",
965
  DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
966
},
967
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
968
  DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
969
},
970
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
971
  DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
972
},
973
{ "p",    0, 1, 0, 0, 0, 0, 0, "paragraph ",
974
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
975
},
976
{ "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
977
  EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
978
},
979
{ "pre",  0, 0, 0, 0, 0, 0, 0, "preformatted text ",
980
  DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
981
},
982
{ "q",    0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
983
  DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
984
},
985
{ "s",    0, 3, 0, 0, 1, 1, 1, "strike-through text style",
986
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
987
},
988
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
989
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
990
},
991
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
992
  DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
993
},
994
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
995
  DECL select_content, NULL, DECL select_attrs, NULL, NULL
996
},
997
{ "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
998
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
999
},
1000
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1001
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1002
},
1003
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1004
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1005
},
1006
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1007
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1008
},
1009
{ "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
1010
  DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1011
},
1012
{ "sub",  0, 3, 0, 0, 0, 0, 1, "subscript",
1013
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1014
},
1015
{ "sup",  0, 3, 0, 0, 0, 0, 1, "superscript ",
1016
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1017
},
1018
{ "table",  0, 0, 0, 0, 0, 0, 0, "",
1019
  DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1020
},
1021
{ "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
1022
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1023
},
1024
{ "td",   0, 0, 0, 0, 0, 0, 0, "table data cell",
1025
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1026
},
1027
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1028
  DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1029
},
1030
{ "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
1031
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1032
},
1033
{ "th",   0, 1, 0, 0, 0, 0, 0, "table header cell",
1034
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1035
},
1036
{ "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
1037
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1038
},
1039
{ "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
1040
  DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1041
},
1042
{ "tr",   0, 0, 0, 0, 0, 0, 0, "table row ",
1043
  DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1044
},
1045
{ "tt",   0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1046
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1047
},
1048
{ "u",    0, 3, 0, 0, 1, 1, 1, "underlined text style",
1049
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1050
},
1051
{ "ul",   0, 0, 0, 0, 0, 0, 0, "unordered list ",
1052
  DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1053
},
1054
{ "var",  0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1055
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1056
}
1057
};
1058
1059
/* One auto-close rule: a start tag named newTag implies the end of an open oldTag element. */
typedef struct {
1060
    const char *oldTag; /* name of the element that is currently open */
1061
    const char *newTag; /* name of the start tag that closes it */
1062
} htmlStartCloseEntry;
1063
1064
/*
1065
 * start tags that imply the end of current element
1066
 */
1067
static const htmlStartCloseEntry htmlStartClose[] = {
1068
    { "a", "a" },
1069
    { "a", "fieldset" },
1070
    { "a", "table" },
1071
    { "a", "td" },
1072
    { "a", "th" },
1073
    { "address", "dd" },
1074
    { "address", "dl" },
1075
    { "address", "dt" },
1076
    { "address", "form" },
1077
    { "address", "li" },
1078
    { "address", "ul" },
1079
    { "b", "center" },
1080
    { "b", "p" },
1081
    { "b", "td" },
1082
    { "b", "th" },
1083
    { "big", "p" },
1084
    { "caption", "col" },
1085
    { "caption", "colgroup" },
1086
    { "caption", "tbody" },
1087
    { "caption", "tfoot" },
1088
    { "caption", "thead" },
1089
    { "caption", "tr" },
1090
    { "col", "col" },
1091
    { "col", "colgroup" },
1092
    { "col", "tbody" },
1093
    { "col", "tfoot" },
1094
    { "col", "thead" },
1095
    { "col", "tr" },
1096
    { "colgroup", "colgroup" },
1097
    { "colgroup", "tbody" },
1098
    { "colgroup", "tfoot" },
1099
    { "colgroup", "thead" },
1100
    { "colgroup", "tr" },
1101
    { "dd", "dt" },
1102
    { "dir", "dd" },
1103
    { "dir", "dl" },
1104
    { "dir", "dt" },
1105
    { "dir", "form" },
1106
    { "dir", "ul" },
1107
    { "dl", "form" },
1108
    { "dl", "li" },
1109
    { "dt", "dd" },
1110
    { "dt", "dl" },
1111
    { "font", "center" },
1112
    { "font", "td" },
1113
    { "font", "th" },
1114
    { "form", "form" },
1115
    { "h1", "fieldset" },
1116
    { "h1", "form" },
1117
    { "h1", "li" },
1118
    { "h1", "p" },
1119
    { "h1", "table" },
1120
    { "h2", "fieldset" },
1121
    { "h2", "form" },
1122
    { "h2", "li" },
1123
    { "h2", "p" },
1124
    { "h2", "table" },
1125
    { "h3", "fieldset" },
1126
    { "h3", "form" },
1127
    { "h3", "li" },
1128
    { "h3", "p" },
1129
    { "h3", "table" },
1130
    { "h4", "fieldset" },
1131
    { "h4", "form" },
1132
    { "h4", "li" },
1133
    { "h4", "p" },
1134
    { "h4", "table" },
1135
    { "h5", "fieldset" },
1136
    { "h5", "form" },
1137
    { "h5", "li" },
1138
    { "h5", "p" },
1139
    { "h5", "table" },
1140
    { "h6", "fieldset" },
1141
    { "h6", "form" },
1142
    { "h6", "li" },
1143
    { "h6", "p" },
1144
    { "h6", "table" },
1145
    { "head", "a" },
1146
    { "head", "abbr" },
1147
    { "head", "acronym" },
1148
    { "head", "address" },
1149
    { "head", "b" },
1150
    { "head", "bdo" },
1151
    { "head", "big" },
1152
    { "head", "blockquote" },
1153
    { "head", "body" },
1154
    { "head", "br" },
1155
    { "head", "center" },
1156
    { "head", "cite" },
1157
    { "head", "code" },
1158
    { "head", "dd" },
1159
    { "head", "dfn" },
1160
    { "head", "dir" },
1161
    { "head", "div" },
1162
    { "head", "dl" },
1163
    { "head", "dt" },
1164
    { "head", "em" },
1165
    { "head", "fieldset" },
1166
    { "head", "font" },
1167
    { "head", "form" },
1168
    { "head", "frameset" },
1169
    { "head", "h1" },
1170
    { "head", "h2" },
1171
    { "head", "h3" },
1172
    { "head", "h4" },
1173
    { "head", "h5" },
1174
    { "head", "h6" },
1175
    { "head", "hr" },
1176
    { "head", "i" },
1177
    { "head", "iframe" },
1178
    { "head", "img" },
1179
    { "head", "kbd" },
1180
    { "head", "li" },
1181
    { "head", "listing" },
1182
    { "head", "map" },
1183
    { "head", "menu" },
1184
    { "head", "ol" },
1185
    { "head", "p" },
1186
    { "head", "pre" },
1187
    { "head", "q" },
1188
    { "head", "s" },
1189
    { "head", "samp" },
1190
    { "head", "small" },
1191
    { "head", "span" },
1192
    { "head", "strike" },
1193
    { "head", "strong" },
1194
    { "head", "sub" },
1195
    { "head", "sup" },
1196
    { "head", "table" },
1197
    { "head", "tt" },
1198
    { "head", "u" },
1199
    { "head", "ul" },
1200
    { "head", "var" },
1201
    { "head", "xmp" },
1202
    { "hr", "form" },
1203
    { "i", "center" },
1204
    { "i", "p" },
1205
    { "i", "td" },
1206
    { "i", "th" },
1207
    { "legend", "fieldset" },
1208
    { "li", "li" },
1209
    { "link", "body" },
1210
    { "link", "frameset" },
1211
    { "listing", "dd" },
1212
    { "listing", "dl" },
1213
    { "listing", "dt" },
1214
    { "listing", "fieldset" },
1215
    { "listing", "form" },
1216
    { "listing", "li" },
1217
    { "listing", "table" },
1218
    { "listing", "ul" },
1219
    { "menu", "dd" },
1220
    { "menu", "dl" },
1221
    { "menu", "dt" },
1222
    { "menu", "form" },
1223
    { "menu", "ul" },
1224
    { "ol", "form" },
1225
    { "option", "optgroup" },
1226
    { "option", "option" },
1227
    { "p", "address" },
1228
    { "p", "blockquote" },
1229
    { "p", "body" },
1230
    { "p", "caption" },
1231
    { "p", "center" },
1232
    { "p", "col" },
1233
    { "p", "colgroup" },
1234
    { "p", "dd" },
1235
    { "p", "dir" },
1236
    { "p", "div" },
1237
    { "p", "dl" },
1238
    { "p", "dt" },
1239
    { "p", "fieldset" },
1240
    { "p", "form" },
1241
    { "p", "frameset" },
1242
    { "p", "h1" },
1243
    { "p", "h2" },
1244
    { "p", "h3" },
1245
    { "p", "h4" },
1246
    { "p", "h5" },
1247
    { "p", "h6" },
1248
    { "p", "head" },
1249
    { "p", "hr" },
1250
    { "p", "li" },
1251
    { "p", "listing" },
1252
    { "p", "menu" },
1253
    { "p", "ol" },
1254
    { "p", "p" },
1255
    { "p", "pre" },
1256
    { "p", "table" },
1257
    { "p", "tbody" },
1258
    { "p", "td" },
1259
    { "p", "tfoot" },
1260
    { "p", "th" },
1261
    { "p", "title" },
1262
    { "p", "tr" },
1263
    { "p", "ul" },
1264
    { "p", "xmp" },
1265
    { "pre", "dd" },
1266
    { "pre", "dl" },
1267
    { "pre", "dt" },
1268
    { "pre", "fieldset" },
1269
    { "pre", "form" },
1270
    { "pre", "li" },
1271
    { "pre", "table" },
1272
    { "pre", "ul" },
1273
    { "s", "p" },
1274
    { "script", "noscript" },
1275
    { "small", "p" },
1276
    { "span", "td" },
1277
    { "span", "th" },
1278
    { "strike", "p" },
1279
    { "style", "body" },
1280
    { "style", "frameset" },
1281
    { "tbody", "tbody" },
1282
    { "tbody", "tfoot" },
1283
    { "td", "tbody" },
1284
    { "td", "td" },
1285
    { "td", "tfoot" },
1286
    { "td", "th" },
1287
    { "td", "tr" },
1288
    { "tfoot", "tbody" },
1289
    { "th", "tbody" },
1290
    { "th", "td" },
1291
    { "th", "tfoot" },
1292
    { "th", "th" },
1293
    { "th", "tr" },
1294
    { "thead", "tbody" },
1295
    { "thead", "tfoot" },
1296
    { "title", "body" },
1297
    { "title", "frameset" },
1298
    { "tr", "tbody" },
1299
    { "tr", "tfoot" },
1300
    { "tr", "tr" },
1301
    { "tt", "p" },
1302
    { "u", "p" },
1303
    { "u", "td" },
1304
    { "u", "th" },
1305
    { "ul", "address" },
1306
    { "ul", "form" },
1307
    { "ul", "menu" },
1308
    { "ul", "pre" },
1309
    { "xmp", "dd" },
1310
    { "xmp", "dl" },
1311
    { "xmp", "dt" },
1312
    { "xmp", "fieldset" },
1313
    { "xmp", "form" },
1314
    { "xmp", "li" },
1315
    { "xmp", "table" },
1316
    { "xmp", "ul" }
1317
};
1318
1319
/*
1320
 * The list of HTML elements which are supposed not to have
1321
 * CDATA content and where a p element will be implied
1322
 *
1323
 * TODO: extend that list by reading the HTML SGML DTD on
1324
 *       implied paragraph
1325
 */
1326
static const char *const htmlNoContentElements[] = {
1327
    "html",
1328
    "head",
1329
    NULL
1330
};
1331
1332
/*
1333
 * The list of HTML attributes which are of content %Script;
1334
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
1335
 *       it assumes the name starts with 'on'
1336
 */
1337
static const char *const htmlScriptAttributes[] = {
1338
    "onclick",
1339
    "ondblclick",
1340
    "onmousedown",
1341
    "onmouseup",
1342
    "onmouseover",
1343
    "onmousemove",
1344
    "onmouseout",
1345
    "onkeypress",
1346
    "onkeydown",
1347
    "onkeyup",
1348
    "onload",
1349
    "onunload",
1350
    "onfocus",
1351
    "onblur",
1352
    "onsubmit",
1353
    "onreset",
1354
    "onchange",
1355
    "onselect"
1356
};
1357
1358
/*
1359
 * This table is used by the htmlparser to know what to do with
1360
 * broken html pages. By assigning different priorities to different
1361
 * elements the parser can decide how to handle extra endtags.
1362
 * Endtags are only allowed to close elements with lower or equal
1363
 * priority.
1364
 */
1365
1366
/* Associates an element name with the priority used when handling end tags. */
typedef struct {
1367
    const char *name; /* element name; NULL in the terminating default entry */
1368
    int priority; /* end tag priority returned by htmlGetEndPriority() */
1369
} elementPriority;
1370
1371
static const elementPriority htmlEndPriority[] = {
1372
    {"div",   150},
1373
    {"td",    160},
1374
    {"th",    160},
1375
    {"tr",    170},
1376
    {"thead", 180},
1377
    {"tbody", 180},
1378
    {"tfoot", 180},
1379
    {"table", 190},
1380
    {"head",  200},
1381
    {"body",  200},
1382
    {"html",  220},
1383
    {NULL,    100} /* Default priority */
1384
};
1385
1386
/************************************************************************
1387
 *                  *
1388
 *  functions to handle HTML specific data      *
1389
 *                  *
1390
 ************************************************************************/
1391
1392
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: Calling this function has no effect.
 */
void
htmlInitAutoClose(void)
{
    /* Intentionally empty. */
}
1400
1401
static int
1402
0
htmlCompareTags(const void *key, const void *member) {
1403
0
    const xmlChar *tag = (const xmlChar *) key;
1404
0
    const htmlElemDesc *desc = (const htmlElemDesc *) member;
1405
1406
0
    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1407
0
}
1408
1409
/**
1410
 * htmlTagLookup:
1411
 * @tag:  The tag name in lowercase
1412
 *
1413
 * Lookup the HTML tag in the ElementTable
1414
 *
1415
 * Returns the related htmlElemDescPtr or NULL if not found.
1416
 */
1417
const htmlElemDesc *
1418
0
htmlTagLookup(const xmlChar *tag) {
1419
0
    if (tag == NULL)
1420
0
        return(NULL);
1421
1422
0
    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1423
0
                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1424
0
                sizeof(htmlElemDesc), htmlCompareTags));
1425
0
}
1426
1427
/**
1428
 * htmlGetEndPriority:
1429
 * @name: The name of the element to look up the priority for.
1430
 *
1431
 * Return value: The "endtag" priority.
1432
 **/
1433
static int
1434
0
htmlGetEndPriority (const xmlChar *name) {
1435
0
    int i = 0;
1436
1437
0
    while ((htmlEndPriority[i].name != NULL) &&
1438
0
     (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1439
0
  i++;
1440
1441
0
    return(htmlEndPriority[i].priority);
1442
0
}
1443
1444
1445
static int
1446
0
htmlCompareStartClose(const void *vkey, const void *member) {
1447
0
    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1448
0
    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1449
0
    int ret;
1450
1451
0
    ret = strcmp(key->oldTag, entry->oldTag);
1452
0
    if (ret == 0)
1453
0
        ret = strcmp(key->newTag, entry->newTag);
1454
1455
0
    return(ret);
1456
0
}
1457
1458
/**
1459
 * htmlCheckAutoClose:
1460
 * @newtag:  The new tag name
1461
 * @oldtag:  The old tag name
1462
 *
1463
 * Checks whether the new tag is one of the registered valid tags for
1464
 * closing old.
1465
 *
1466
 * Returns 0 if no, 1 if yes.
1467
 */
1468
static int
1469
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1470
0
{
1471
0
    htmlStartCloseEntry key;
1472
0
    void *res;
1473
1474
0
    key.oldTag = (const char *) oldtag;
1475
0
    key.newTag = (const char *) newtag;
1476
0
    res = bsearch(&key, htmlStartClose,
1477
0
            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1478
0
            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1479
0
    return(res != NULL);
1480
0
}
1481
1482
/**
1483
 * htmlAutoCloseOnClose:
1484
 * @ctxt:  an HTML parser context
1485
 * @newtag:  The new tag name
1486
 * @force:  force the tag closure
1487
 *
1488
 * The HTML DTD allows an ending tag to implicitly close other tags.
1489
 */
1490
static void
1491
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1492
0
{
1493
0
    const htmlElemDesc *info;
1494
0
    int i, priority;
1495
1496
0
    priority = htmlGetEndPriority(newtag);
1497
1498
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1499
1500
0
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1501
0
            break;
1502
        /*
1503
         * A misplaced endtag can only close elements with lower
1504
         * or equal priority, so if we find an element with higher
1505
         * priority before we find an element with
1506
         * matching name, we just ignore this endtag
1507
         */
1508
0
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1509
0
            return;
1510
0
    }
1511
0
    if (i < 0)
1512
0
        return;
1513
1514
0
    while (!xmlStrEqual(newtag, ctxt->name)) {
1515
0
        info = htmlTagLookup(ctxt->name);
1516
0
        if ((info != NULL) && (info->endTag == 3)) {
1517
0
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1518
0
                   "Opening and ending tag mismatch: %s and %s\n",
1519
0
       newtag, ctxt->name);
1520
0
        }
1521
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1522
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1523
0
  htmlnamePop(ctxt);
1524
0
    }
1525
0
}
1526
1527
/**
1528
 * htmlAutoCloseOnEnd:
1529
 * @ctxt:  an HTML parser context
1530
 *
1531
 * Close all remaining tags at the end of the stream
1532
 */
1533
static void
1534
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1535
0
{
1536
0
    int i;
1537
1538
0
    if (ctxt->nameNr == 0)
1539
0
        return;
1540
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1541
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1542
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1543
0
  htmlnamePop(ctxt);
1544
0
    }
1545
0
}
1546
1547
/**
1548
 * htmlAutoClose:
1549
 * @ctxt:  an HTML parser context
1550
 * @newtag:  The new tag name or NULL
1551
 *
1552
 * The HTML DTD allows a tag to implicitly close other tags.
1553
 * The list is kept in htmlStartClose array. This function is
1554
 * called when a new tag has been detected and generates the
1555
 * appropriates closes if possible/needed.
1556
 * If newtag is NULL this mean we are at the end of the resource
1557
 * and we should check
1558
 */
1559
static void
1560
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1561
0
{
1562
0
    while ((newtag != NULL) && (ctxt->name != NULL) &&
1563
0
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1564
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1565
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1566
0
  htmlnamePop(ctxt);
1567
0
    }
1568
0
    if (newtag == NULL) {
1569
0
        htmlAutoCloseOnEnd(ctxt);
1570
0
        return;
1571
0
    }
1572
0
    while ((newtag == NULL) && (ctxt->name != NULL) &&
1573
0
           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1574
0
            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1575
0
            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1576
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1577
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1578
0
  htmlnamePop(ctxt);
1579
0
    }
1580
0
}
1581
1582
/**
1583
 * htmlAutoCloseTag:
1584
 * @doc:  the HTML document
1585
 * @name:  The tag name
1586
 * @elem:  the HTML element
1587
 *
1588
 * The HTML DTD allows a tag to implicitly close other tags.
1589
 * The list is kept in htmlStartClose array. This function checks
1590
 * if the element or one of it's children would autoclose the
1591
 * given tag.
1592
 *
1593
 * Returns 1 if autoclose, 0 otherwise
1594
 */
1595
int
1596
0
htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1597
0
    htmlNodePtr child;
1598
1599
0
    if (elem == NULL) return(1);
1600
0
    if (xmlStrEqual(name, elem->name)) return(0);
1601
0
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1602
0
    child = elem->children;
1603
0
    while (child != NULL) {
1604
0
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1605
0
  child = child->next;
1606
0
    }
1607
0
    return(0);
1608
0
}
1609
1610
/**
1611
 * htmlIsAutoClosed:
1612
 * @doc:  the HTML document
1613
 * @elem:  the HTML element
1614
 *
1615
 * The HTML DTD allows a tag to implicitly close other tags.
1616
 * The list is kept in htmlStartClose array. This function checks
1617
 * if a tag is autoclosed by one of it's child
1618
 *
1619
 * Returns 1 if autoclosed, 0 otherwise
1620
 */
1621
int
1622
0
htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1623
0
    htmlNodePtr child;
1624
1625
0
    if (elem == NULL) return(1);
1626
0
    child = elem->children;
1627
0
    while (child != NULL) {
1628
0
  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1629
0
  child = child->next;
1630
0
    }
1631
0
    return(0);
1632
0
}
1633
1634
/**
1635
 * htmlCheckImplied:
1636
 * @ctxt:  an HTML parser context
1637
 * @newtag:  The new tag name
1638
 *
1639
 * The HTML DTD allows a tag to exist only implicitly
1640
 * called when a new tag has been detected and generates the
1641
 * appropriate implicit tags if missing
1642
 */
1643
static void
1644
0
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1645
0
    int i;
1646
1647
0
    if (ctxt->options & HTML_PARSE_NOIMPLIED)
1648
0
        return;
1649
0
    if (!htmlOmittedDefaultValue)
1650
0
  return;
1651
0
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1652
0
  return;
1653
0
    if (ctxt->nameNr <= 0) {
1654
0
  htmlnamePush(ctxt, BAD_CAST"html");
1655
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1656
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1657
0
    }
1658
0
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1659
0
        return;
1660
0
    if ((ctxt->nameNr <= 1) &&
1661
0
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1662
0
   (xmlStrEqual(newtag, BAD_CAST"style")) ||
1663
0
   (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1664
0
   (xmlStrEqual(newtag, BAD_CAST"link")) ||
1665
0
   (xmlStrEqual(newtag, BAD_CAST"title")) ||
1666
0
   (xmlStrEqual(newtag, BAD_CAST"base")))) {
1667
0
        if (ctxt->html >= 3) {
1668
            /* we already saw or generated an <head> before */
1669
0
            return;
1670
0
        }
1671
        /*
1672
         * dropped OBJECT ... i you put it first BODY will be
1673
         * assumed !
1674
         */
1675
0
        htmlnamePush(ctxt, BAD_CAST"head");
1676
0
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1677
0
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1678
0
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1679
0
         (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1680
0
         (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1681
0
        if (ctxt->html >= 10) {
1682
            /* we already saw or generated a <body> before */
1683
0
            return;
1684
0
        }
1685
0
  for (i = 0;i < ctxt->nameNr;i++) {
1686
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1687
0
    return;
1688
0
      }
1689
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1690
0
    return;
1691
0
      }
1692
0
  }
1693
1694
0
  htmlnamePush(ctxt, BAD_CAST"body");
1695
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1696
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1697
0
    }
1698
0
}
1699
1700
/**
1701
 * htmlCheckParagraph
1702
 * @ctxt:  an HTML parser context
1703
 *
1704
 * Check whether a p element need to be implied before inserting
1705
 * characters in the current element.
1706
 *
1707
 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1708
 *         in case of error.
1709
 */
1710
1711
static int
1712
0
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1713
0
    const xmlChar *tag;
1714
0
    int i;
1715
1716
0
    if (ctxt == NULL)
1717
0
  return(-1);
1718
0
    tag = ctxt->name;
1719
0
    if (tag == NULL) {
1720
0
  htmlAutoClose(ctxt, BAD_CAST"p");
1721
0
  htmlCheckImplied(ctxt, BAD_CAST"p");
1722
0
  htmlnamePush(ctxt, BAD_CAST"p");
1723
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1724
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1725
0
  return(1);
1726
0
    }
1727
0
    if (!htmlOmittedDefaultValue)
1728
0
  return(0);
1729
0
    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1730
0
  if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1731
0
      htmlAutoClose(ctxt, BAD_CAST"p");
1732
0
      htmlCheckImplied(ctxt, BAD_CAST"p");
1733
0
      htmlnamePush(ctxt, BAD_CAST"p");
1734
0
      if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1735
0
    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1736
0
      return(1);
1737
0
  }
1738
0
    }
1739
0
    return(0);
1740
0
}
1741
1742
/**
1743
 * htmlIsScriptAttribute:
1744
 * @name:  an attribute name
1745
 *
1746
 * Check if an attribute is of content type Script
1747
 *
1748
 * Returns 1 is the attribute is a script 0 otherwise
1749
 */
1750
int
1751
0
htmlIsScriptAttribute(const xmlChar *name) {
1752
0
    unsigned int i;
1753
1754
0
    if (name == NULL)
1755
0
      return(0);
1756
    /*
1757
     * all script attributes start with 'on'
1758
     */
1759
0
    if ((name[0] != 'o') || (name[1] != 'n'))
1760
0
      return(0);
1761
0
    for (i = 0;
1762
0
   i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1763
0
   i++) {
1764
0
  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1765
0
      return(1);
1766
0
    }
1767
0
    return(0);
1768
0
}
1769
1770
/************************************************************************
1771
 *                  *
1772
 *  The list of HTML predefined entities      *
1773
 *                  *
1774
 ************************************************************************/
1775
1776
1777
static const htmlEntityDesc  html40EntitiesTable[] = {
1778
/*
1779
 * the 4 absolute ones, plus apostrophe.
1780
 */
1781
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1782
{ 38, "amp",  "ampersand, U+0026 ISOnum" },
1783
{ 39, "apos", "single quote" },
1784
{ 60, "lt", "less-than sign, U+003C ISOnum" },
1785
{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1786
1787
/*
1788
 * A bunch still in the 128-255 range
1789
 * Replacing them depend really on the charset used.
1790
 */
1791
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1792
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1793
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
1794
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
1795
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
1796
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1797
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1798
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
1799
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1800
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1801
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1802
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1803
{ 172,  "not",  "not sign, U+00AC ISOnum" },
1804
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1805
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1806
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1807
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1808
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1809
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1810
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1811
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1812
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
1813
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1814
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1815
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1816
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1817
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1818
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1819
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1820
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1821
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1822
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1823
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1824
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1825
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1826
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1827
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1828
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1829
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1830
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1831
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1832
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1833
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1834
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1835
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1836
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1837
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1838
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1839
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1840
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1841
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1842
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1843
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1844
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1845
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1846
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
1847
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1848
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1849
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1850
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1851
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1852
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1853
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1854
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1855
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1856
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1857
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1858
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1859
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1860
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1861
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1862
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1863
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1864
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1865
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1866
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1867
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1868
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1869
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1870
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1871
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1872
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1873
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1874
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1875
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1876
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1877
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1878
{ 247,  "divide","division sign, U+00F7 ISOnum" },
1879
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1880
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1881
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1882
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1883
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1884
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1885
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1886
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1887
1888
{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1889
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1890
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1891
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1892
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1893
1894
/*
1895
 * Anything below should really be kept as entities references
1896
 */
1897
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1898
1899
{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1900
{ 732,  "tilde","small tilde, U+02DC ISOdia" },
1901
1902
{ 913,  "Alpha","greek capital letter alpha, U+0391" },
1903
{ 914,  "Beta", "greek capital letter beta, U+0392" },
1904
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1905
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1906
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1907
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
1908
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
1909
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1910
{ 921,  "Iota", "greek capital letter iota, U+0399" },
1911
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
1912
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1913
{ 924,  "Mu", "greek capital letter mu, U+039C" },
1914
{ 925,  "Nu", "greek capital letter nu, U+039D" },
1915
{ 926,  "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1916
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
1917
{ 928,  "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1918
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
1919
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1920
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
1921
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1922
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1923
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
1924
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1925
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1926
1927
{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1928
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1929
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1930
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1931
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1932
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1933
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1934
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1935
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1936
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1937
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1938
{ 956,  "mu", "greek small letter mu, U+03BC ISOgrk3" },
1939
{ 957,  "nu", "greek small letter nu, U+03BD ISOgrk3" },
1940
{ 958,  "xi", "greek small letter xi, U+03BE ISOgrk3" },
1941
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1942
{ 960,  "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1943
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1944
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1945
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1946
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1947
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1948
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1949
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1950
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1951
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1952
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1953
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1954
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1955
1956
{ 8194, "ensp", "en space, U+2002 ISOpub" },
1957
{ 8195, "emsp", "em space, U+2003 ISOpub" },
1958
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1959
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1960
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1961
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1962
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1963
{ 8211, "ndash","en dash, U+2013 ISOpub" },
1964
{ 8212, "mdash","em dash, U+2014 ISOpub" },
1965
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1966
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1967
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1968
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1969
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1970
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1971
{ 8224, "dagger","dagger, U+2020 ISOpub" },
1972
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1973
1974
{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1975
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1976
1977
{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1978
1979
{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1980
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1981
1982
{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1983
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1984
1985
{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1986
{ 8260, "frasl","fraction slash, U+2044 NEW" },
1987
1988
{ 8364, "euro", "euro sign, U+20AC NEW" },
1989
1990
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1991
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1992
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1993
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1994
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1995
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1996
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1997
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1998
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1999
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
2000
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2001
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
2002
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
2003
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
2004
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
2005
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
2006
2007
{ 8704, "forall","for all, U+2200 ISOtech" },
2008
{ 8706, "part", "partial differential, U+2202 ISOtech" },
2009
{ 8707, "exist","there exists, U+2203 ISOtech" },
2010
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
2011
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
2012
{ 8712, "isin", "element of, U+2208 ISOtech" },
2013
{ 8713, "notin","not an element of, U+2209 ISOtech" },
2014
{ 8715, "ni", "contains as member, U+220B ISOtech" },
2015
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
2016
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
2017
{ 8722, "minus","minus sign, U+2212 ISOtech" },
2018
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
2019
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
2020
{ 8733, "prop", "proportional to, U+221D ISOtech" },
2021
{ 8734, "infin","infinity, U+221E ISOtech" },
2022
{ 8736, "ang",  "angle, U+2220 ISOamso" },
2023
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
2024
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
2025
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
2026
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
2027
{ 8747, "int",  "integral, U+222B ISOtech" },
2028
{ 8756, "there4","therefore, U+2234 ISOtech" },
2029
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
2030
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
2031
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2032
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
2033
{ 8801, "equiv","identical to, U+2261 ISOtech" },
2034
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
2035
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2036
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
2037
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
2038
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2039
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2040
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2041
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2042
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2043
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2044
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2045
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2046
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2047
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2048
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
2049
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2050
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2051
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },
2052
2053
{ 9824, "spades","black spade suit, U+2660 ISOpub" },
2054
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2055
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2056
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
2057
2058
};
2059
2060
/************************************************************************
2061
 *                  *
2062
 *    Commodity functions to handle entities      *
2063
 *                  *
2064
 ************************************************************************/
2065
2066
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that size. On
 * allocation failure it reports the error through htmlErrMemory(),
 * frees the old buffer and makes the enclosing function return NULL.
 * It expects a variable named ctxt and an integer named
 * <buffer>_size to be in scope at the point of use.
 */
#define growBuffer(buffer) {						\
    xmlChar *grown;							\
    buffer##_size *= 2;							\
    grown = (xmlChar *) xmlRealloc(buffer, buffer##_size);		\
    if (grown == NULL) {						\
	htmlErrMemory(ctxt, "growing buffer\n");			\
	xmlFree(buffer);						\
	return(NULL);							\
    }									\
    buffer = grown;							\
}
2080
2081
/**
2082
 * htmlEntityLookup:
2083
 * @name: the entity name
2084
 *
2085
 * Lookup the given entity in EntitiesTable
2086
 *
2087
 * TODO: the linear scan is really ugly, an hash table is really needed.
2088
 *
2089
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2090
 */
2091
const htmlEntityDesc *
2092
0
htmlEntityLookup(const xmlChar *name) {
2093
0
    unsigned int i;
2094
2095
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2096
0
                    sizeof(html40EntitiesTable[0]));i++) {
2097
0
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2098
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2099
0
  }
2100
0
    }
2101
0
    return(NULL);
2102
0
}
2103
2104
/**
2105
 * htmlEntityValueLookup:
2106
 * @value: the entity's unicode value
2107
 *
2108
 * Lookup the given entity in EntitiesTable
2109
 *
2110
 * TODO: the linear scan is really ugly, an hash table is really needed.
2111
 *
2112
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2113
 */
2114
const htmlEntityDesc *
2115
0
htmlEntityValueLookup(unsigned int value) {
2116
0
    unsigned int i;
2117
2118
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2119
0
                    sizeof(html40EntitiesTable[0]));i++) {
2120
0
        if (html40EntitiesTable[i].value >= value) {
2121
0
      if (html40EntitiesTable[i].value > value)
2122
0
    break;
2123
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2124
0
  }
2125
0
    }
2126
0
    return(NULL);
2127
0
}
2128
2129
/**
2130
 * UTF8ToHtml:
2131
 * @out:  a pointer to an array of bytes to store the result
2132
 * @outlen:  the length of @out
2133
 * @in:  a pointer to an array of UTF-8 chars
2134
 * @inlen:  the length of @in
2135
 *
2136
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2137
 * plus HTML entities block of chars out.
2138
 *
2139
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2140
 * The value of @inlen after return is the number of octets consumed
2141
 *     as the return value is positive, else unpredictable.
2142
 * The value of @outlen after return is the number of octets consumed.
2143
 */
2144
int
2145
UTF8ToHtml(unsigned char* out, int *outlen,
2146
0
              const unsigned char* in, int *inlen) {
2147
0
    const unsigned char* processed = in;
2148
0
    const unsigned char* outend;
2149
0
    const unsigned char* outstart = out;
2150
0
    const unsigned char* instart = in;
2151
0
    const unsigned char* inend;
2152
0
    unsigned int c, d;
2153
0
    int trailing;
2154
2155
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2156
0
    if (in == NULL) {
2157
        /*
2158
   * initialization nothing to do
2159
   */
2160
0
  *outlen = 0;
2161
0
  *inlen = 0;
2162
0
  return(0);
2163
0
    }
2164
0
    inend = in + (*inlen);
2165
0
    outend = out + (*outlen);
2166
0
    while (in < inend) {
2167
0
  d = *in++;
2168
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2169
0
  else if (d < 0xC0) {
2170
      /* trailing byte in leading position */
2171
0
      *outlen = out - outstart;
2172
0
      *inlen = processed - instart;
2173
0
      return(-2);
2174
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2175
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2176
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2177
0
  else {
2178
      /* no chance for this in Ascii */
2179
0
      *outlen = out - outstart;
2180
0
      *inlen = processed - instart;
2181
0
      return(-2);
2182
0
  }
2183
2184
0
  if (inend - in < trailing) {
2185
0
      break;
2186
0
  }
2187
2188
0
  for ( ; trailing; trailing--) {
2189
0
      if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2190
0
    break;
2191
0
      c <<= 6;
2192
0
      c |= d & 0x3F;
2193
0
  }
2194
2195
  /* assertion: c is a single UTF-4 value */
2196
0
  if (c < 0x80) {
2197
0
      if (out + 1 >= outend)
2198
0
    break;
2199
0
      *out++ = c;
2200
0
  } else {
2201
0
      int len;
2202
0
      const htmlEntityDesc * ent;
2203
0
      const char *cp;
2204
0
      char nbuf[16];
2205
2206
      /*
2207
       * Try to lookup a predefined HTML entity for it
2208
       */
2209
2210
0
      ent = htmlEntityValueLookup(c);
2211
0
      if (ent == NULL) {
2212
0
        snprintf(nbuf, sizeof(nbuf), "#%u", c);
2213
0
        cp = nbuf;
2214
0
      }
2215
0
      else
2216
0
        cp = ent->name;
2217
0
      len = strlen(cp);
2218
0
      if (out + 2 + len >= outend)
2219
0
    break;
2220
0
      *out++ = '&';
2221
0
      memcpy(out, cp, len);
2222
0
      out += len;
2223
0
      *out++ = ';';
2224
0
  }
2225
0
  processed = in;
2226
0
    }
2227
0
    *outlen = out - outstart;
2228
0
    *inlen = processed - instart;
2229
0
    return(0);
2230
0
}
2231
2232
/**
2233
 * htmlEncodeEntities:
2234
 * @out:  a pointer to an array of bytes to store the result
2235
 * @outlen:  the length of @out
2236
 * @in:  a pointer to an array of UTF-8 chars
2237
 * @inlen:  the length of @in
2238
 * @quoteChar: the quote character to escape (' or ") or zero.
2239
 *
2240
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2241
 * plus HTML entities block of chars out.
2242
 *
2243
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2244
 * The value of @inlen after return is the number of octets consumed
2245
 *     as the return value is positive, else unpredictable.
2246
 * The value of @outlen after return is the number of octets consumed.
2247
 */
2248
int
2249
htmlEncodeEntities(unsigned char* out, int *outlen,
2250
0
       const unsigned char* in, int *inlen, int quoteChar) {
2251
0
    const unsigned char* processed = in;
2252
0
    const unsigned char* outend;
2253
0
    const unsigned char* outstart = out;
2254
0
    const unsigned char* instart = in;
2255
0
    const unsigned char* inend;
2256
0
    unsigned int c, d;
2257
0
    int trailing;
2258
2259
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2260
0
        return(-1);
2261
0
    outend = out + (*outlen);
2262
0
    inend = in + (*inlen);
2263
0
    while (in < inend) {
2264
0
  d = *in++;
2265
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2266
0
  else if (d < 0xC0) {
2267
      /* trailing byte in leading position */
2268
0
      *outlen = out - outstart;
2269
0
      *inlen = processed - instart;
2270
0
      return(-2);
2271
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2272
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2273
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2274
0
  else {
2275
      /* no chance for this in Ascii */
2276
0
      *outlen = out - outstart;
2277
0
      *inlen = processed - instart;
2278
0
      return(-2);
2279
0
  }
2280
2281
0
  if (inend - in < trailing)
2282
0
      break;
2283
2284
0
  while (trailing--) {
2285
0
      if (((d= *in++) & 0xC0) != 0x80) {
2286
0
    *outlen = out - outstart;
2287
0
    *inlen = processed - instart;
2288
0
    return(-2);
2289
0
      }
2290
0
      c <<= 6;
2291
0
      c |= d & 0x3F;
2292
0
  }
2293
2294
  /* assertion: c is a single UTF-4 value */
2295
0
  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2296
0
      (c != '&') && (c != '<') && (c != '>')) {
2297
0
      if (out >= outend)
2298
0
    break;
2299
0
      *out++ = c;
2300
0
  } else {
2301
0
      const htmlEntityDesc * ent;
2302
0
      const char *cp;
2303
0
      char nbuf[16];
2304
0
      int len;
2305
2306
      /*
2307
       * Try to lookup a predefined HTML entity for it
2308
       */
2309
0
      ent = htmlEntityValueLookup(c);
2310
0
      if (ent == NULL) {
2311
0
    snprintf(nbuf, sizeof(nbuf), "#%u", c);
2312
0
    cp = nbuf;
2313
0
      }
2314
0
      else
2315
0
    cp = ent->name;
2316
0
      len = strlen(cp);
2317
0
      if (outend - out < len + 2)
2318
0
    break;
2319
0
      *out++ = '&';
2320
0
      memcpy(out, cp, len);
2321
0
      out += len;
2322
0
      *out++ = ';';
2323
0
  }
2324
0
  processed = in;
2325
0
    }
2326
0
    *outlen = out - outstart;
2327
0
    *inlen = processed - instart;
2328
0
    return(0);
2329
0
}
2330
2331
/************************************************************************
2332
 *                  *
2333
 *    Commodity functions to handle streams     *
2334
 *                  *
2335
 ************************************************************************/
2336
2337
#ifdef LIBXML_PUSH_ENABLED
2338
/**
2339
 * htmlNewInputStream:
2340
 * @ctxt:  an HTML parser context
2341
 *
2342
 * Create a new input stream structure
2343
 * Returns the new input stream or NULL
2344
 */
2345
static htmlParserInputPtr
2346
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2347
    htmlParserInputPtr input;
2348
2349
    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2350
    if (input == NULL) {
2351
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2352
  return(NULL);
2353
    }
2354
    memset(input, 0, sizeof(htmlParserInput));
2355
    input->filename = NULL;
2356
    input->directory = NULL;
2357
    input->base = NULL;
2358
    input->cur = NULL;
2359
    input->buf = NULL;
2360
    input->line = 1;
2361
    input->col = 1;
2362
    input->buf = NULL;
2363
    input->free = NULL;
2364
    input->version = NULL;
2365
    input->consumed = 0;
2366
    input->length = 0;
2367
    return(input);
2368
}
2369
#endif
2370
2371
2372
/************************************************************************
2373
 *                  *
2374
 *    Commodity functions, cleanup needed ?     *
2375
 *                  *
2376
 ************************************************************************/
2377
/*
 * Names of all the elements allowing PCDATA in the HTML 4.01 loose DTD.
 * areBlanks() scans this table to decide whether whitespace must be kept.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array but I don't want to risk any
 * binary incompatibility
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2391
2392
/**
2393
 * areBlanks:
2394
 * @ctxt:  an HTML parser context
2395
 * @str:  a xmlChar *
2396
 * @len:  the size of @str
2397
 *
2398
 * Is this a sequence of blank chars that one can ignore ?
2399
 *
2400
 * Returns 1 if ignorable 0 otherwise.
2401
 */
2402
2403
0
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2404
0
    unsigned int i;
2405
0
    int j;
2406
0
    xmlNodePtr lastChild;
2407
0
    xmlDtdPtr dtd;
2408
2409
0
    for (j = 0;j < len;j++)
2410
0
        if (!(IS_BLANK_CH(str[j]))) return(0);
2411
2412
0
    if (CUR == 0) return(1);
2413
0
    if (CUR != '<') return(0);
2414
0
    if (ctxt->name == NULL)
2415
0
  return(1);
2416
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2417
0
  return(1);
2418
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2419
0
  return(1);
2420
2421
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2422
0
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2423
0
        dtd = xmlGetIntSubset(ctxt->myDoc);
2424
0
        if (dtd != NULL && dtd->ExternalID != NULL) {
2425
0
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2426
0
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2427
0
                return(1);
2428
0
        }
2429
0
    }
2430
2431
0
    if (ctxt->node == NULL) return(0);
2432
0
    lastChild = xmlGetLastChild(ctxt->node);
2433
0
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2434
0
  lastChild = lastChild->prev;
2435
0
    if (lastChild == NULL) {
2436
0
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2437
0
            (ctxt->node->content != NULL)) return(0);
2438
  /* keep ws in constructs like ...<b> </b>...
2439
     for all tags "b" allowing PCDATA */
2440
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2441
0
      if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2442
0
    return(0);
2443
0
      }
2444
0
  }
2445
0
    } else if (xmlNodeIsText(lastChild)) {
2446
0
        return(0);
2447
0
    } else {
2448
  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2449
     for all tags "p" allowing PCDATA */
2450
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2451
0
      if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2452
0
    return(0);
2453
0
      }
2454
0
  }
2455
0
    }
2456
0
    return(1);
2457
0
}
2458
2459
/**
2460
 * htmlNewDocNoDtD:
2461
 * @URI:  URI for the dtd, or NULL
2462
 * @ExternalID:  the external ID of the DTD, or NULL
2463
 *
2464
 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2465
 * are NULL
2466
 *
2467
 * Returns a new document, do not initialize the DTD if not provided
2468
 */
2469
htmlDocPtr
2470
0
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2471
0
    xmlDocPtr cur;
2472
2473
    /*
2474
     * Allocate a new document and fill the fields.
2475
     */
2476
0
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2477
0
    if (cur == NULL) {
2478
0
  htmlErrMemory(NULL, "HTML document creation failed\n");
2479
0
  return(NULL);
2480
0
    }
2481
0
    memset(cur, 0, sizeof(xmlDoc));
2482
2483
0
    cur->type = XML_HTML_DOCUMENT_NODE;
2484
0
    cur->version = NULL;
2485
0
    cur->intSubset = NULL;
2486
0
    cur->doc = cur;
2487
0
    cur->name = NULL;
2488
0
    cur->children = NULL;
2489
0
    cur->extSubset = NULL;
2490
0
    cur->oldNs = NULL;
2491
0
    cur->encoding = NULL;
2492
0
    cur->standalone = 1;
2493
0
    cur->compression = 0;
2494
0
    cur->ids = NULL;
2495
0
    cur->refs = NULL;
2496
0
    cur->_private = NULL;
2497
0
    cur->charset = XML_CHAR_ENCODING_UTF8;
2498
0
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2499
0
    if ((ExternalID != NULL) ||
2500
0
  (URI != NULL))
2501
0
  xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2502
0
    if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2503
0
  xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2504
0
    return(cur);
2505
0
}
2506
2507
/**
2508
 * htmlNewDoc:
2509
 * @URI:  URI for the dtd, or NULL
2510
 * @ExternalID:  the external ID of the DTD, or NULL
2511
 *
2512
 * Creates a new HTML document
2513
 *
2514
 * Returns a new document
2515
 */
2516
htmlDocPtr
2517
0
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2518
0
    if ((URI == NULL) && (ExternalID == NULL))
2519
0
  return(htmlNewDocNoDtD(
2520
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2521
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2522
2523
0
    return(htmlNewDocNoDtD(URI, ExternalID));
2524
0
}
2525
2526
2527
/************************************************************************
2528
 *                  *
2529
 *      The parser itself       *
2530
 *  Relates to http://www.w3.org/TR/html40        *
2531
 *                  *
2532
 ************************************************************************/
2533
2534
/************************************************************************
2535
 *                  *
2536
 *      The parser itself       *
2537
 *                  *
2538
 ************************************************************************/
2539
2540
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2541
2542
static void
2543
0
htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2544
0
    int c;
2545
2546
0
    htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2547
0
                 "Incorrectly opened comment\n", NULL, NULL);
2548
2549
0
    do {
2550
0
        c = CUR;
2551
0
        if (c == 0)
2552
0
            break;
2553
0
        NEXT;
2554
0
    } while (c != '>');
2555
0
}
2556
2557
/**
2558
 * htmlParseHTMLName:
2559
 * @ctxt:  an HTML parser context
2560
 *
2561
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2562
 * since HTML names are not case-sensitive.
2563
 *
2564
 * Returns the Tag Name parsed or NULL
2565
 */
2566
2567
static const xmlChar *
2568
0
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2569
0
    int i = 0;
2570
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2571
2572
0
    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2573
0
        (CUR != ':') && (CUR != '.')) return(NULL);
2574
2575
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2576
0
           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2577
0
     (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2578
0
           (CUR == '.'))) {
2579
0
  if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2580
0
        else loc[i] = CUR;
2581
0
  i++;
2582
2583
0
  NEXT;
2584
0
    }
2585
2586
0
    return(xmlDictLookup(ctxt->dict, loc, i));
2587
0
}
2588
2589
2590
/**
2591
 * htmlParseHTMLName_nonInvasive:
2592
 * @ctxt:  an HTML parser context
2593
 *
2594
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2595
 * since HTML names are not case-sensitive, this doesn't consume the data
2596
 * from the stream, it's a look-ahead
2597
 *
2598
 * Returns the Tag Name parsed or NULL
2599
 */
2600
2601
static const xmlChar *
2602
0
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2603
0
    int i = 0;
2604
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2605
2606
0
    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2607
0
        (NXT(1) != ':')) return(NULL);
2608
2609
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2610
0
           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2611
0
     (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2612
0
  if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2613
0
        else loc[i] = NXT(1+i);
2614
0
  i++;
2615
0
    }
2616
2617
0
    return(xmlDictLookup(ctxt->dict, loc, i));
2618
0
}
2619
2620
2621
/**
2622
 * htmlParseName:
2623
 * @ctxt:  an HTML parser context
2624
 *
2625
 * parse an HTML name, this routine is case sensitive.
2626
 *
2627
 * Returns the Name parsed or NULL
2628
 */
2629
2630
static const xmlChar *
2631
0
htmlParseName(htmlParserCtxtPtr ctxt) {
2632
0
    const xmlChar *in;
2633
0
    const xmlChar *ret;
2634
0
    int count = 0;
2635
2636
0
    GROW;
2637
2638
    /*
2639
     * Accelerator for simple ASCII names
2640
     */
2641
0
    in = ctxt->input->cur;
2642
0
    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2643
0
  ((*in >= 0x41) && (*in <= 0x5A)) ||
2644
0
  (*in == '_') || (*in == ':')) {
2645
0
  in++;
2646
0
  while (((*in >= 0x61) && (*in <= 0x7A)) ||
2647
0
         ((*in >= 0x41) && (*in <= 0x5A)) ||
2648
0
         ((*in >= 0x30) && (*in <= 0x39)) ||
2649
0
         (*in == '_') || (*in == '-') ||
2650
0
         (*in == ':') || (*in == '.'))
2651
0
      in++;
2652
2653
0
  if (in == ctxt->input->end)
2654
0
      return(NULL);
2655
2656
0
  if ((*in > 0) && (*in < 0x80)) {
2657
0
      count = in - ctxt->input->cur;
2658
0
      ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2659
0
      ctxt->input->cur = in;
2660
0
      ctxt->input->col += count;
2661
0
      return(ret);
2662
0
  }
2663
0
    }
2664
0
    return(htmlParseNameComplex(ctxt));
2665
0
}
2666
2667
static const xmlChar *
2668
0
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2669
0
    int len = 0, l;
2670
0
    int c;
2671
0
    int count = 0;
2672
0
    const xmlChar *base = ctxt->input->base;
2673
2674
    /*
2675
     * Handler for more complex cases
2676
     */
2677
0
    GROW;
2678
0
    c = CUR_CHAR(l);
2679
0
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2680
0
  (!IS_LETTER(c) && (c != '_') &&
2681
0
         (c != ':'))) {
2682
0
  return(NULL);
2683
0
    }
2684
2685
0
    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2686
0
     ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2687
0
            (c == '.') || (c == '-') ||
2688
0
      (c == '_') || (c == ':') ||
2689
0
      (IS_COMBINING(c)) ||
2690
0
      (IS_EXTENDER(c)))) {
2691
0
  if (count++ > 100) {
2692
0
      count = 0;
2693
0
      GROW;
2694
0
  }
2695
0
  len += l;
2696
0
  NEXTL(l);
2697
0
  c = CUR_CHAR(l);
2698
0
  if (ctxt->input->base != base) {
2699
      /*
2700
       * We changed encoding from an unknown encoding
2701
       * Input buffer changed location, so we better start again
2702
       */
2703
0
      return(htmlParseNameComplex(ctxt));
2704
0
  }
2705
0
    }
2706
2707
0
    if (ctxt->input->cur - ctxt->input->base < len) {
2708
        /* Sanity check */
2709
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2710
0
                     "unexpected change of input buffer", NULL, NULL);
2711
0
        return (NULL);
2712
0
    }
2713
2714
0
    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2715
0
}
2716
2717
2718
/**
2719
 * htmlParseHTMLAttribute:
2720
 * @ctxt:  an HTML parser context
2721
 * @stop:  a char stop value
2722
 *
2723
 * parse an HTML attribute value till the stop (quote), if
2724
 * stop is 0 then it stops at the first space
2725
 *
2726
 * Returns the attribute parsed or NULL
2727
 */
2728
2729
static xmlChar *
2730
0
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2731
0
    xmlChar *buffer = NULL;
2732
0
    int buffer_size = 0;
2733
0
    xmlChar *out = NULL;
2734
0
    const xmlChar *name = NULL;
2735
0
    const xmlChar *cur = NULL;
2736
0
    const htmlEntityDesc * ent;
2737
2738
    /*
2739
     * allocate a translation buffer.
2740
     */
2741
0
    buffer_size = HTML_PARSER_BUFFER_SIZE;
2742
0
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2743
0
    if (buffer == NULL) {
2744
0
  htmlErrMemory(ctxt, "buffer allocation failed\n");
2745
0
  return(NULL);
2746
0
    }
2747
0
    out = buffer;
2748
2749
    /*
2750
     * Ok loop until we reach one of the ending chars
2751
     */
2752
0
    while ((CUR != 0) && (CUR != stop)) {
2753
0
  if ((stop == 0) && (CUR == '>')) break;
2754
0
  if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2755
0
        if (CUR == '&') {
2756
0
      if (NXT(1) == '#') {
2757
0
    unsigned int c;
2758
0
    int bits;
2759
2760
0
    c = htmlParseCharRef(ctxt);
2761
0
    if      (c <    0x80)
2762
0
            { *out++  = c;                bits= -6; }
2763
0
    else if (c <   0x800)
2764
0
            { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2765
0
    else if (c < 0x10000)
2766
0
            { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2767
0
    else
2768
0
            { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2769
2770
0
    for ( ; bits >= 0; bits-= 6) {
2771
0
        *out++  = ((c >> bits) & 0x3F) | 0x80;
2772
0
    }
2773
2774
0
    if (out - buffer > buffer_size - 100) {
2775
0
      int indx = out - buffer;
2776
2777
0
      growBuffer(buffer);
2778
0
      out = &buffer[indx];
2779
0
    }
2780
0
      } else {
2781
0
    ent = htmlParseEntityRef(ctxt, &name);
2782
0
    if (name == NULL) {
2783
0
        *out++ = '&';
2784
0
        if (out - buffer > buffer_size - 100) {
2785
0
      int indx = out - buffer;
2786
2787
0
      growBuffer(buffer);
2788
0
      out = &buffer[indx];
2789
0
        }
2790
0
    } else if (ent == NULL) {
2791
0
        *out++ = '&';
2792
0
        cur = name;
2793
0
        while (*cur != 0) {
2794
0
      if (out - buffer > buffer_size - 100) {
2795
0
          int indx = out - buffer;
2796
2797
0
          growBuffer(buffer);
2798
0
          out = &buffer[indx];
2799
0
      }
2800
0
      *out++ = *cur++;
2801
0
        }
2802
0
    } else {
2803
0
        unsigned int c;
2804
0
        int bits;
2805
2806
0
        if (out - buffer > buffer_size - 100) {
2807
0
      int indx = out - buffer;
2808
2809
0
      growBuffer(buffer);
2810
0
      out = &buffer[indx];
2811
0
        }
2812
0
        c = ent->value;
2813
0
        if      (c <    0x80)
2814
0
      { *out++  = c;                bits= -6; }
2815
0
        else if (c <   0x800)
2816
0
      { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2817
0
        else if (c < 0x10000)
2818
0
      { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2819
0
        else
2820
0
      { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2821
2822
0
        for ( ; bits >= 0; bits-= 6) {
2823
0
      *out++  = ((c >> bits) & 0x3F) | 0x80;
2824
0
        }
2825
0
    }
2826
0
      }
2827
0
  } else {
2828
0
      unsigned int c;
2829
0
      int bits, l;
2830
2831
0
      if (out - buffer > buffer_size - 100) {
2832
0
    int indx = out - buffer;
2833
2834
0
    growBuffer(buffer);
2835
0
    out = &buffer[indx];
2836
0
      }
2837
0
      c = CUR_CHAR(l);
2838
0
      if      (c <    0x80)
2839
0
        { *out++  = c;                bits= -6; }
2840
0
      else if (c <   0x800)
2841
0
        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2842
0
      else if (c < 0x10000)
2843
0
        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2844
0
      else
2845
0
        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2846
2847
0
      for ( ; bits >= 0; bits-= 6) {
2848
0
    *out++  = ((c >> bits) & 0x3F) | 0x80;
2849
0
      }
2850
0
      NEXT;
2851
0
  }
2852
0
    }
2853
0
    *out = 0;
2854
0
    return(buffer);
2855
0
}
2856
2857
/**
2858
 * htmlParseEntityRef:
2859
 * @ctxt:  an HTML parser context
2860
 * @str:  location to store the entity name
2861
 *
2862
 * DEPRECATED: Internal function, don't use.
2863
 *
2864
 * parse an HTML ENTITY references
2865
 *
2866
 * [68] EntityRef ::= '&' Name ';'
2867
 *
2868
 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2869
 *         if non-NULL *str will have to be freed by the caller.
2870
 */
2871
const htmlEntityDesc *
2872
0
htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2873
0
    const xmlChar *name;
2874
0
    const htmlEntityDesc * ent = NULL;
2875
2876
0
    if (str != NULL) *str = NULL;
2877
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2878
2879
0
    if (CUR == '&') {
2880
0
        NEXT;
2881
0
        name = htmlParseName(ctxt);
2882
0
  if (name == NULL) {
2883
0
      htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2884
0
                   "htmlParseEntityRef: no name\n", NULL, NULL);
2885
0
  } else {
2886
0
      GROW;
2887
0
      if (CUR == ';') {
2888
0
          if (str != NULL)
2889
0
        *str = name;
2890
2891
    /*
2892
     * Lookup the entity in the table.
2893
     */
2894
0
    ent = htmlEntityLookup(name);
2895
0
    if (ent != NULL) /* OK that's ugly !!! */
2896
0
        NEXT;
2897
0
      } else {
2898
0
    htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2899
0
                 "htmlParseEntityRef: expecting ';'\n",
2900
0
           NULL, NULL);
2901
0
          if (str != NULL)
2902
0
        *str = name;
2903
0
      }
2904
0
  }
2905
0
    }
2906
0
    return(ent);
2907
0
}
2908
2909
/**
2910
 * htmlParseAttValue:
2911
 * @ctxt:  an HTML parser context
2912
 *
2913
 * parse a value for an attribute
2914
 * Note: the parser won't do substitution of entities here, this
2915
 * will be handled later in xmlStringGetNodeList, unless it was
2916
 * asked for ctxt->replaceEntities != 0
2917
 *
2918
 * Returns the AttValue parsed or NULL.
2919
 */
2920
2921
static xmlChar *
2922
0
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2923
0
    xmlChar *ret = NULL;
2924
2925
0
    if (CUR == '"') {
2926
0
        NEXT;
2927
0
  ret = htmlParseHTMLAttribute(ctxt, '"');
2928
0
        if (CUR != '"') {
2929
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2930
0
                   "AttValue: \" expected\n", NULL, NULL);
2931
0
  } else
2932
0
      NEXT;
2933
0
    } else if (CUR == '\'') {
2934
0
        NEXT;
2935
0
  ret = htmlParseHTMLAttribute(ctxt, '\'');
2936
0
        if (CUR != '\'') {
2937
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2938
0
                   "AttValue: ' expected\n", NULL, NULL);
2939
0
  } else
2940
0
      NEXT;
2941
0
    } else {
2942
        /*
2943
   * That's an HTMLism, the attribute value may not be quoted
2944
   */
2945
0
  ret = htmlParseHTMLAttribute(ctxt, 0);
2946
0
  if (ret == NULL) {
2947
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2948
0
                   "AttValue: no value found\n", NULL, NULL);
2949
0
  }
2950
0
    }
2951
0
    return(ret);
2952
0
}
2953
2954
/**
2955
 * htmlParseSystemLiteral:
2956
 * @ctxt:  an HTML parser context
2957
 *
2958
 * parse an HTML Literal
2959
 *
2960
 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2961
 *
2962
 * Returns the SystemLiteral parsed or NULL
2963
 */
2964
2965
static xmlChar *
2966
0
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2967
0
    size_t len = 0, startPosition = 0;
2968
0
    int err = 0;
2969
0
    int quote;
2970
0
    xmlChar *ret = NULL;
2971
2972
0
    if ((CUR != '"') && (CUR != '\'')) {
2973
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2974
0
               "SystemLiteral \" or ' expected\n", NULL, NULL);
2975
0
        return(NULL);
2976
0
    }
2977
0
    quote = CUR;
2978
0
    NEXT;
2979
2980
0
    if (CUR_PTR < BASE_PTR)
2981
0
        return(ret);
2982
0
    startPosition = CUR_PTR - BASE_PTR;
2983
2984
0
    while ((CUR != 0) && (CUR != quote)) {
2985
        /* TODO: Handle UTF-8 */
2986
0
        if (!IS_CHAR_CH(CUR)) {
2987
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2988
0
                            "Invalid char in SystemLiteral 0x%X\n", CUR);
2989
0
            err = 1;
2990
0
        }
2991
0
        NEXT;
2992
0
        len++;
2993
0
    }
2994
0
    if (CUR != quote) {
2995
0
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2996
0
                     "Unfinished SystemLiteral\n", NULL, NULL);
2997
0
    } else {
2998
0
        NEXT;
2999
0
        if (err == 0)
3000
0
            ret = xmlStrndup((BASE_PTR+startPosition), len);
3001
0
    }
3002
3003
0
    return(ret);
3004
0
}
3005
3006
/**
3007
 * htmlParsePubidLiteral:
3008
 * @ctxt:  an HTML parser context
3009
 *
3010
 * parse an HTML public literal
3011
 *
3012
 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3013
 *
3014
 * Returns the PubidLiteral parsed or NULL.
3015
 */
3016
3017
static xmlChar *
3018
0
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3019
0
    size_t len = 0, startPosition = 0;
3020
0
    int err = 0;
3021
0
    int quote;
3022
0
    xmlChar *ret = NULL;
3023
3024
0
    if ((CUR != '"') && (CUR != '\'')) {
3025
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3026
0
               "PubidLiteral \" or ' expected\n", NULL, NULL);
3027
0
        return(NULL);
3028
0
    }
3029
0
    quote = CUR;
3030
0
    NEXT;
3031
3032
    /*
3033
     * Name ::= (Letter | '_') (NameChar)*
3034
     */
3035
0
    if (CUR_PTR < BASE_PTR)
3036
0
        return(ret);
3037
0
    startPosition = CUR_PTR - BASE_PTR;
3038
3039
0
    while ((CUR != 0) && (CUR != quote)) {
3040
0
        if (!IS_PUBIDCHAR_CH(CUR)) {
3041
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3042
0
                            "Invalid char in PubidLiteral 0x%X\n", CUR);
3043
0
            err = 1;
3044
0
        }
3045
0
        len++;
3046
0
        NEXT;
3047
0
    }
3048
3049
0
    if (CUR != quote) {
3050
0
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3051
0
                     "Unfinished PubidLiteral\n", NULL, NULL);
3052
0
    } else {
3053
0
        NEXT;
3054
0
        if (err == 0)
3055
0
            ret = xmlStrndup((BASE_PTR + startPosition), len);
3056
0
    }
3057
3058
0
    return(ret);
3059
0
}
3060
3061
/**
3062
 * htmlParseScript:
3063
 * @ctxt:  an HTML parser context
3064
 *
3065
 * parse the content of an HTML SCRIPT or STYLE element
3066
 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3067
 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3068
 * http://www.w3.org/TR/html4/types.html#type-script
3069
 * http://www.w3.org/TR/html4/types.html#h-6.15
3070
 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3071
 *
3072
 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3073
 * element and the value of intrinsic event attributes. User agents must
3074
 * not evaluate script data as HTML markup but instead must pass it on as
3075
 * data to a script engine.
3076
 * NOTES:
3077
 * - The content is passed like CDATA
3078
 * - the attributes for style and scripting "onXXX" are also described
3079
 *   as CDATA but SGML allows entities references in attributes so their
3080
 *   processing is identical as other attributes
3081
 */
3082
static void
3083
0
htmlParseScript(htmlParserCtxtPtr ctxt) {
3084
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3085
0
    int nbchar = 0;
3086
0
    int cur,l;
3087
3088
0
    SHRINK;
3089
0
    cur = CUR_CHAR(l);
3090
0
    while (cur != 0) {
3091
0
  if ((cur == '<') && (NXT(1) == '/')) {
3092
            /*
3093
             * One should break here, the specification is clear:
3094
             * Authors should therefore escape "</" within the content.
3095
             * Escape mechanisms are specific to each scripting or
3096
             * style sheet language.
3097
             *
3098
             * In recovery mode, only break if end tag match the
3099
             * current tag, effectively ignoring all tags inside the
3100
             * script/style block and treating the entire block as
3101
             * CDATA.
3102
             */
3103
0
            if (ctxt->recovery) {
3104
0
                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3105
0
           xmlStrlen(ctxt->name)) == 0)
3106
0
                {
3107
0
                    break; /* while */
3108
0
                } else {
3109
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3110
0
         "Element %s embeds close tag\n",
3111
0
                     ctxt->name, NULL);
3112
0
    }
3113
0
            } else {
3114
0
                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3115
0
                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3116
0
                {
3117
0
                    break; /* while */
3118
0
                }
3119
0
            }
3120
0
  }
3121
0
        if (IS_CHAR(cur)) {
3122
0
      COPY_BUF(l,buf,nbchar,cur);
3123
0
        } else {
3124
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3125
0
                            "Invalid char in CDATA 0x%X\n", cur);
3126
0
        }
3127
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3128
0
            buf[nbchar] = 0;
3129
0
      if (ctxt->sax->cdataBlock!= NULL) {
3130
    /*
3131
     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3132
     */
3133
0
    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3134
0
      } else if (ctxt->sax->characters != NULL) {
3135
0
    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3136
0
      }
3137
0
      nbchar = 0;
3138
0
  }
3139
0
  GROW;
3140
0
  NEXTL(l);
3141
0
  cur = CUR_CHAR(l);
3142
0
    }
3143
3144
0
    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3145
0
        buf[nbchar] = 0;
3146
0
  if (ctxt->sax->cdataBlock!= NULL) {
3147
      /*
3148
       * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3149
       */
3150
0
      ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3151
0
  } else if (ctxt->sax->characters != NULL) {
3152
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3153
0
  }
3154
0
    }
3155
0
}
3156
3157
3158
/**
3159
 * htmlParseCharDataInternal:
3160
 * @ctxt:  an HTML parser context
3161
 * @readahead: optional read ahead character in ascii range
3162
 *
3163
 * parse a CharData section.
3164
 * if we are within a CDATA section ']]>' marks an end of section.
3165
 *
3166
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3167
 */
3168
3169
static void
3170
0
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3171
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3172
0
    int nbchar = 0;
3173
0
    int cur, l;
3174
0
    int chunk = 0;
3175
3176
0
    if (readahead)
3177
0
        buf[nbchar++] = readahead;
3178
3179
0
    SHRINK;
3180
0
    cur = CUR_CHAR(l);
3181
0
    while (((cur != '<') || (ctxt->token == '<')) &&
3182
0
           ((cur != '&') || (ctxt->token == '&')) &&
3183
0
     (cur != 0)) {
3184
0
  if (!(IS_CHAR(cur))) {
3185
0
      htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3186
0
                  "Invalid char in CDATA 0x%X\n", cur);
3187
0
  } else {
3188
0
      COPY_BUF(l,buf,nbchar,cur);
3189
0
  }
3190
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3191
0
            buf[nbchar] = 0;
3192
3193
      /*
3194
       * Ok the segment is to be consumed as chars.
3195
       */
3196
0
      if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3197
0
    if (areBlanks(ctxt, buf, nbchar)) {
3198
0
        if (ctxt->keepBlanks) {
3199
0
      if (ctxt->sax->characters != NULL)
3200
0
          ctxt->sax->characters(ctxt->userData, buf, nbchar);
3201
0
        } else {
3202
0
      if (ctxt->sax->ignorableWhitespace != NULL)
3203
0
          ctxt->sax->ignorableWhitespace(ctxt->userData,
3204
0
                                         buf, nbchar);
3205
0
        }
3206
0
    } else {
3207
0
        htmlCheckParagraph(ctxt);
3208
0
        if (ctxt->sax->characters != NULL)
3209
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3210
0
    }
3211
0
      }
3212
0
      nbchar = 0;
3213
0
  }
3214
0
  NEXTL(l);
3215
0
        chunk++;
3216
0
        if (chunk > HTML_PARSER_BUFFER_SIZE) {
3217
0
            chunk = 0;
3218
0
            SHRINK;
3219
0
            GROW;
3220
0
        }
3221
0
  cur = CUR_CHAR(l);
3222
0
  if (cur == 0) {
3223
0
      SHRINK;
3224
0
      GROW;
3225
0
      cur = CUR_CHAR(l);
3226
0
  }
3227
0
    }
3228
0
    if (nbchar != 0) {
3229
0
        buf[nbchar] = 0;
3230
3231
  /*
3232
   * Ok the segment is to be consumed as chars.
3233
   */
3234
0
  if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3235
0
      if (areBlanks(ctxt, buf, nbchar)) {
3236
0
    if (ctxt->keepBlanks) {
3237
0
        if (ctxt->sax->characters != NULL)
3238
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3239
0
    } else {
3240
0
        if (ctxt->sax->ignorableWhitespace != NULL)
3241
0
      ctxt->sax->ignorableWhitespace(ctxt->userData,
3242
0
                                     buf, nbchar);
3243
0
    }
3244
0
      } else {
3245
0
    htmlCheckParagraph(ctxt);
3246
0
    if (ctxt->sax->characters != NULL)
3247
0
        ctxt->sax->characters(ctxt->userData, buf, nbchar);
3248
0
      }
3249
0
  }
3250
0
    } else {
3251
  /*
3252
   * Loop detection
3253
   */
3254
0
  if (cur == 0)
3255
0
      ctxt->instate = XML_PARSER_EOF;
3256
0
    }
3257
0
}
3258
3259
/**
3260
 * htmlParseCharData:
3261
 * @ctxt:  an HTML parser context
3262
 *
3263
 * parse a CharData section.
3264
 * if we are within a CDATA section ']]>' marks an end of section.
3265
 *
3266
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3267
 */
3268
3269
static void
3270
0
htmlParseCharData(htmlParserCtxtPtr ctxt) {
3271
0
    htmlParseCharDataInternal(ctxt, 0);
3272
0
}
3273
3274
/**
3275
 * htmlParseExternalID:
3276
 * @ctxt:  an HTML parser context
3277
 * @publicID:  a xmlChar** receiving PubidLiteral
3278
 *
3279
 * Parse an External ID or a Public ID
3280
 *
3281
 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3282
 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3283
 *
3284
 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3285
 *
3286
 * Returns the function returns SystemLiteral and in the second
3287
 *                case publicID receives PubidLiteral, is strict is off
3288
 *                it is possible to return NULL and have publicID set.
3289
 */
3290
3291
static xmlChar *
3292
0
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3293
0
    xmlChar *URI = NULL;
3294
3295
0
    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3296
0
         (UPP(2) == 'S') && (UPP(3) == 'T') &&
3297
0
   (UPP(4) == 'E') && (UPP(5) == 'M')) {
3298
0
        SKIP(6);
3299
0
  if (!IS_BLANK_CH(CUR)) {
3300
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3301
0
                   "Space required after 'SYSTEM'\n", NULL, NULL);
3302
0
  }
3303
0
        SKIP_BLANKS;
3304
0
  URI = htmlParseSystemLiteral(ctxt);
3305
0
  if (URI == NULL) {
3306
0
      htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3307
0
                   "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3308
0
        }
3309
0
    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3310
0
         (UPP(2) == 'B') && (UPP(3) == 'L') &&
3311
0
         (UPP(4) == 'I') && (UPP(5) == 'C')) {
3312
0
        SKIP(6);
3313
0
  if (!IS_BLANK_CH(CUR)) {
3314
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3315
0
                   "Space required after 'PUBLIC'\n", NULL, NULL);
3316
0
  }
3317
0
        SKIP_BLANKS;
3318
0
  *publicID = htmlParsePubidLiteral(ctxt);
3319
0
  if (*publicID == NULL) {
3320
0
      htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3321
0
                   "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3322
0
       NULL, NULL);
3323
0
  }
3324
0
        SKIP_BLANKS;
3325
0
        if ((CUR == '"') || (CUR == '\'')) {
3326
0
      URI = htmlParseSystemLiteral(ctxt);
3327
0
  }
3328
0
    }
3329
0
    return(URI);
3330
0
}
3331
3332
/**
3333
 * xmlParsePI:
3334
 * @ctxt:  an XML parser context
3335
 *
3336
 * parse an XML Processing Instruction.
3337
 *
3338
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3339
 */
3340
static void
3341
0
htmlParsePI(htmlParserCtxtPtr ctxt) {
3342
0
    xmlChar *buf = NULL;
3343
0
    int len = 0;
3344
0
    int size = HTML_PARSER_BUFFER_SIZE;
3345
0
    int cur, l;
3346
0
    const xmlChar *target;
3347
0
    xmlParserInputState state;
3348
0
    int count = 0;
3349
3350
0
    if ((RAW == '<') && (NXT(1) == '?')) {
3351
0
  state = ctxt->instate;
3352
0
        ctxt->instate = XML_PARSER_PI;
3353
  /*
3354
   * this is a Processing Instruction.
3355
   */
3356
0
  SKIP(2);
3357
0
  SHRINK;
3358
3359
  /*
3360
   * Parse the target name and check for special support like
3361
   * namespace.
3362
   */
3363
0
        target = htmlParseName(ctxt);
3364
0
  if (target != NULL) {
3365
0
      if (RAW == '>') {
3366
0
    SKIP(1);
3367
3368
    /*
3369
     * SAX: PI detected.
3370
     */
3371
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3372
0
        (ctxt->sax->processingInstruction != NULL))
3373
0
        ctxt->sax->processingInstruction(ctxt->userData,
3374
0
                                         target, NULL);
3375
0
    ctxt->instate = state;
3376
0
    return;
3377
0
      }
3378
0
      buf = (xmlChar *) xmlMallocAtomic(size);
3379
0
      if (buf == NULL) {
3380
0
    htmlErrMemory(ctxt, NULL);
3381
0
    ctxt->instate = state;
3382
0
    return;
3383
0
      }
3384
0
      cur = CUR;
3385
0
      if (!IS_BLANK(cur)) {
3386
0
    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3387
0
        "ParsePI: PI %s space expected\n", target, NULL);
3388
0
      }
3389
0
            SKIP_BLANKS;
3390
0
      cur = CUR_CHAR(l);
3391
0
      while ((cur != 0) && (cur != '>')) {
3392
0
    if (len + 5 >= size) {
3393
0
        xmlChar *tmp;
3394
3395
0
        size *= 2;
3396
0
        tmp = (xmlChar *) xmlRealloc(buf, size);
3397
0
        if (tmp == NULL) {
3398
0
      htmlErrMemory(ctxt, NULL);
3399
0
      xmlFree(buf);
3400
0
      ctxt->instate = state;
3401
0
      return;
3402
0
        }
3403
0
        buf = tmp;
3404
0
    }
3405
0
    count++;
3406
0
    if (count > 50) {
3407
0
        GROW;
3408
0
        count = 0;
3409
0
    }
3410
0
                if (IS_CHAR(cur)) {
3411
0
        COPY_BUF(l,buf,len,cur);
3412
0
                } else {
3413
0
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3414
0
                                    "Invalid char in processing instruction "
3415
0
                                    "0x%X\n", cur);
3416
0
                }
3417
0
    NEXTL(l);
3418
0
    cur = CUR_CHAR(l);
3419
0
    if (cur == 0) {
3420
0
        SHRINK;
3421
0
        GROW;
3422
0
        cur = CUR_CHAR(l);
3423
0
    }
3424
0
      }
3425
0
      buf[len] = 0;
3426
0
      if (cur != '>') {
3427
0
    htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3428
0
          "ParsePI: PI %s never end ...\n", target, NULL);
3429
0
      } else {
3430
0
    SKIP(1);
3431
3432
    /*
3433
     * SAX: PI detected.
3434
     */
3435
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3436
0
        (ctxt->sax->processingInstruction != NULL))
3437
0
        ctxt->sax->processingInstruction(ctxt->userData,
3438
0
                                         target, buf);
3439
0
      }
3440
0
      xmlFree(buf);
3441
0
  } else {
3442
0
      htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3443
0
                         "PI is not started correctly", NULL, NULL);
3444
0
  }
3445
0
  ctxt->instate = state;
3446
0
    }
3447
0
}
3448
3449
/**
3450
 * htmlParseComment:
3451
 * @ctxt:  an HTML parser context
3452
 *
3453
 * Parse an XML (SGML) comment <!-- .... -->
3454
 *
3455
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3456
 */
3457
static void
3458
0
htmlParseComment(htmlParserCtxtPtr ctxt) {
3459
0
    xmlChar *buf = NULL;
3460
0
    int len;
3461
0
    int size = HTML_PARSER_BUFFER_SIZE;
3462
0
    int q, ql;
3463
0
    int r, rl;
3464
0
    int cur, l;
3465
0
    int next, nl;
3466
0
    xmlParserInputState state;
3467
3468
    /*
3469
     * Check that there is a comment right here.
3470
     */
3471
0
    if ((RAW != '<') || (NXT(1) != '!') ||
3472
0
        (NXT(2) != '-') || (NXT(3) != '-')) return;
3473
3474
0
    state = ctxt->instate;
3475
0
    ctxt->instate = XML_PARSER_COMMENT;
3476
0
    SHRINK;
3477
0
    SKIP(4);
3478
0
    buf = (xmlChar *) xmlMallocAtomic(size);
3479
0
    if (buf == NULL) {
3480
0
        htmlErrMemory(ctxt, "buffer allocation failed\n");
3481
0
  ctxt->instate = state;
3482
0
  return;
3483
0
    }
3484
0
    len = 0;
3485
0
    buf[len] = 0;
3486
0
    q = CUR_CHAR(ql);
3487
0
    if (q == 0)
3488
0
        goto unfinished;
3489
0
    if (q == '>') {
3490
0
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3491
0
        cur = '>';
3492
0
        goto finished;
3493
0
    }
3494
0
    NEXTL(ql);
3495
0
    r = CUR_CHAR(rl);
3496
0
    if (r == 0)
3497
0
        goto unfinished;
3498
0
    if (q == '-' && r == '>') {
3499
0
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3500
0
        cur = '>';
3501
0
        goto finished;
3502
0
    }
3503
0
    NEXTL(rl);
3504
0
    cur = CUR_CHAR(l);
3505
0
    while ((cur != 0) &&
3506
0
           ((cur != '>') ||
3507
0
      (r != '-') || (q != '-'))) {
3508
0
  NEXTL(l);
3509
0
  next = CUR_CHAR(nl);
3510
0
  if (next == 0) {
3511
0
      SHRINK;
3512
0
      GROW;
3513
0
      next = CUR_CHAR(nl);
3514
0
  }
3515
3516
0
  if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3517
0
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3518
0
           "Comment incorrectly closed by '--!>'", NULL, NULL);
3519
0
    cur = '>';
3520
0
    break;
3521
0
  }
3522
3523
0
  if (len + 5 >= size) {
3524
0
      xmlChar *tmp;
3525
3526
0
      size *= 2;
3527
0
      tmp = (xmlChar *) xmlRealloc(buf, size);
3528
0
      if (tmp == NULL) {
3529
0
          xmlFree(buf);
3530
0
          htmlErrMemory(ctxt, "growing buffer failed\n");
3531
0
    ctxt->instate = state;
3532
0
    return;
3533
0
      }
3534
0
      buf = tmp;
3535
0
  }
3536
0
        if (IS_CHAR(q)) {
3537
0
      COPY_BUF(ql,buf,len,q);
3538
0
        } else {
3539
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3540
0
                            "Invalid char in comment 0x%X\n", q);
3541
0
        }
3542
3543
0
  q = r;
3544
0
  ql = rl;
3545
0
  r = cur;
3546
0
  rl = l;
3547
0
  cur = next;
3548
0
  l = nl;
3549
0
    }
3550
0
finished:
3551
0
    buf[len] = 0;
3552
0
    if (cur == '>') {
3553
0
        NEXT;
3554
0
  if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3555
0
      (!ctxt->disableSAX))
3556
0
      ctxt->sax->comment(ctxt->userData, buf);
3557
0
  xmlFree(buf);
3558
0
  ctxt->instate = state;
3559
0
  return;
3560
0
    }
3561
3562
0
unfinished:
3563
0
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3564
0
     "Comment not terminated \n<!--%.50s\n", buf, NULL);
3565
0
    xmlFree(buf);
3566
0
}
3567
3568
/**
3569
 * htmlParseCharRef:
3570
 * @ctxt:  an HTML parser context
3571
 *
3572
 * DEPRECATED: Internal function, don't use.
3573
 *
3574
 * parse Reference declarations
3575
 *
3576
 * [66] CharRef ::= '&#' [0-9]+ ';' |
3577
 *                  '&#x' [0-9a-fA-F]+ ';'
3578
 *
3579
 * Returns the value parsed (as an int)
3580
 */
3581
int
3582
0
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3583
0
    int val = 0;
3584
3585
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3586
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3587
0
         "htmlParseCharRef: context error\n",
3588
0
         NULL, NULL);
3589
0
        return(0);
3590
0
    }
3591
0
    if ((CUR == '&') && (NXT(1) == '#') &&
3592
0
        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3593
0
  SKIP(3);
3594
0
  while (CUR != ';') {
3595
0
      if ((CUR >= '0') && (CUR <= '9')) {
3596
0
                if (val < 0x110000)
3597
0
              val = val * 16 + (CUR - '0');
3598
0
            } else if ((CUR >= 'a') && (CUR <= 'f')) {
3599
0
                if (val < 0x110000)
3600
0
              val = val * 16 + (CUR - 'a') + 10;
3601
0
            } else if ((CUR >= 'A') && (CUR <= 'F')) {
3602
0
                if (val < 0x110000)
3603
0
              val = val * 16 + (CUR - 'A') + 10;
3604
0
            } else {
3605
0
          htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3606
0
                 "htmlParseCharRef: missing semicolon\n",
3607
0
           NULL, NULL);
3608
0
    break;
3609
0
      }
3610
0
      NEXT;
3611
0
  }
3612
0
  if (CUR == ';')
3613
0
      NEXT;
3614
0
    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3615
0
  SKIP(2);
3616
0
  while (CUR != ';') {
3617
0
      if ((CUR >= '0') && (CUR <= '9')) {
3618
0
                if (val < 0x110000)
3619
0
              val = val * 10 + (CUR - '0');
3620
0
            } else {
3621
0
          htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3622
0
                 "htmlParseCharRef: missing semicolon\n",
3623
0
           NULL, NULL);
3624
0
    break;
3625
0
      }
3626
0
      NEXT;
3627
0
  }
3628
0
  if (CUR == ';')
3629
0
      NEXT;
3630
0
    } else {
3631
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3632
0
               "htmlParseCharRef: invalid value\n", NULL, NULL);
3633
0
    }
3634
    /*
3635
     * Check the value IS_CHAR ...
3636
     */
3637
0
    if (IS_CHAR(val)) {
3638
0
        return(val);
3639
0
    } else if (val >= 0x110000) {
3640
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3641
0
         "htmlParseCharRef: value too large\n", NULL, NULL);
3642
0
    } else {
3643
0
  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3644
0
      "htmlParseCharRef: invalid xmlChar value %d\n",
3645
0
      val);
3646
0
    }
3647
0
    return(0);
3648
0
}
3649
3650
3651
/**
3652
 * htmlParseDocTypeDecl:
3653
 * @ctxt:  an HTML parser context
3654
 *
3655
 * parse a DOCTYPE declaration
3656
 *
3657
 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3658
 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3659
 */
3660
3661
static void
3662
0
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3663
0
    const xmlChar *name;
3664
0
    xmlChar *ExternalID = NULL;
3665
0
    xmlChar *URI = NULL;
3666
3667
    /*
3668
     * We know that '<!DOCTYPE' has been detected.
3669
     */
3670
0
    SKIP(9);
3671
3672
0
    SKIP_BLANKS;
3673
3674
    /*
3675
     * Parse the DOCTYPE name.
3676
     */
3677
0
    name = htmlParseName(ctxt);
3678
0
    if (name == NULL) {
3679
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3680
0
               "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3681
0
         NULL, NULL);
3682
0
    }
3683
    /*
3684
     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3685
     */
3686
3687
0
    SKIP_BLANKS;
3688
3689
    /*
3690
     * Check for SystemID and ExternalID
3691
     */
3692
0
    URI = htmlParseExternalID(ctxt, &ExternalID);
3693
0
    SKIP_BLANKS;
3694
3695
    /*
3696
     * We should be at the end of the DOCTYPE declaration.
3697
     */
3698
0
    if (CUR != '>') {
3699
0
  htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3700
0
               "DOCTYPE improperly terminated\n", NULL, NULL);
3701
        /* Ignore bogus content */
3702
0
        while ((CUR != 0) && (CUR != '>'))
3703
0
            NEXT;
3704
0
    }
3705
0
    if (CUR == '>')
3706
0
        NEXT;
3707
3708
    /*
3709
     * Create or update the document accordingly to the DOCTYPE
3710
     */
3711
0
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3712
0
  (!ctxt->disableSAX))
3713
0
  ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3714
3715
    /*
3716
     * Cleanup, since we don't use all those identifiers
3717
     */
3718
0
    if (URI != NULL) xmlFree(URI);
3719
0
    if (ExternalID != NULL) xmlFree(ExternalID);
3720
0
}
3721
3722
/**
3723
 * htmlParseAttribute:
3724
 * @ctxt:  an HTML parser context
3725
 * @value:  a xmlChar ** used to store the value of the attribute
3726
 *
3727
 * parse an attribute
3728
 *
3729
 * [41] Attribute ::= Name Eq AttValue
3730
 *
3731
 * [25] Eq ::= S? '=' S?
3732
 *
3733
 * With namespace:
3734
 *
3735
 * [NS 11] Attribute ::= QName Eq AttValue
3736
 *
3737
 * Also the case QName == xmlns:??? is handled independently as a namespace
3738
 * definition.
3739
 *
3740
 * Returns the attribute name, and the value in *value.
3741
 */
3742
3743
static const xmlChar *
3744
0
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3745
0
    const xmlChar *name;
3746
0
    xmlChar *val = NULL;
3747
3748
0
    *value = NULL;
3749
0
    name = htmlParseHTMLName(ctxt);
3750
0
    if (name == NULL) {
3751
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3752
0
               "error parsing attribute name\n", NULL, NULL);
3753
0
        return(NULL);
3754
0
    }
3755
3756
    /*
3757
     * read the value
3758
     */
3759
0
    SKIP_BLANKS;
3760
0
    if (CUR == '=') {
3761
0
        NEXT;
3762
0
  SKIP_BLANKS;
3763
0
  val = htmlParseAttValue(ctxt);
3764
0
    }
3765
3766
0
    *value = val;
3767
0
    return(name);
3768
0
}
3769
3770
/**
3771
 * htmlCheckEncodingDirect:
3772
 * @ctxt:  an HTML parser context
3773
 * @attvalue: the attribute value
3774
 *
3775
 * Checks an attribute value to detect
3776
 * the encoding
3777
 * If a new encoding is detected the parser is switched to decode
3778
 * it and pass UTF8
3779
 */
3780
static void
3781
0
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3782
3783
0
    if ((ctxt == NULL) || (encoding == NULL) ||
3784
0
        (ctxt->options & HTML_PARSE_IGNORE_ENC))
3785
0
  return;
3786
3787
    /* do not change encoding */
3788
0
    if (ctxt->input->encoding != NULL)
3789
0
        return;
3790
3791
0
    if (encoding != NULL) {
3792
0
  xmlCharEncoding enc;
3793
0
  xmlCharEncodingHandlerPtr handler;
3794
3795
0
  while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3796
3797
0
  if (ctxt->input->encoding != NULL)
3798
0
      xmlFree((xmlChar *) ctxt->input->encoding);
3799
0
  ctxt->input->encoding = xmlStrdup(encoding);
3800
3801
0
  enc = xmlParseCharEncoding((const char *) encoding);
3802
  /*
3803
   * registered set of known encodings
3804
   */
3805
0
  if (enc != XML_CHAR_ENCODING_ERROR) {
3806
0
      if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3807
0
           (enc == XML_CHAR_ENCODING_UTF16BE) ||
3808
0
     (enc == XML_CHAR_ENCODING_UCS4LE) ||
3809
0
     (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3810
0
    (ctxt->input->buf != NULL) &&
3811
0
    (ctxt->input->buf->encoder == NULL)) {
3812
0
    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3813
0
                 "htmlCheckEncoding: wrong encoding meta\n",
3814
0
           NULL, NULL);
3815
0
      } else {
3816
0
    xmlSwitchEncoding(ctxt, enc);
3817
0
      }
3818
0
      ctxt->charset = XML_CHAR_ENCODING_UTF8;
3819
0
  } else {
3820
      /*
3821
       * fallback for unknown encodings
3822
       */
3823
0
      handler = xmlFindCharEncodingHandler((const char *) encoding);
3824
0
      if (handler != NULL) {
3825
0
    xmlSwitchToEncoding(ctxt, handler);
3826
0
    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3827
0
      } else {
3828
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3829
0
                 "htmlCheckEncoding: unknown encoding %s\n",
3830
0
           encoding, NULL);
3831
0
      }
3832
0
  }
3833
3834
0
  if ((ctxt->input->buf != NULL) &&
3835
0
      (ctxt->input->buf->encoder != NULL) &&
3836
0
      (ctxt->input->buf->raw != NULL) &&
3837
0
      (ctxt->input->buf->buffer != NULL)) {
3838
0
      int nbchars;
3839
0
      int processed;
3840
3841
      /*
3842
       * convert as much as possible to the parser reading buffer.
3843
       */
3844
0
      processed = ctxt->input->cur - ctxt->input->base;
3845
0
      xmlBufShrink(ctxt->input->buf->buffer, processed);
3846
0
      nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3847
0
            xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3848
0
      if (nbchars < 0) {
3849
0
    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3850
0
                 "htmlCheckEncoding: encoder error\n",
3851
0
           NULL, NULL);
3852
0
      }
3853
0
  }
3854
0
    }
3855
0
}
3856
3857
/**
3858
 * htmlCheckEncoding:
3859
 * @ctxt:  an HTML parser context
3860
 * @attvalue: the attribute value
3861
 *
3862
 * Checks an http-equiv attribute from a Meta tag to detect
3863
 * the encoding
3864
 * If a new encoding is detected the parser is switched to decode
3865
 * it and pass UTF8
3866
 */
3867
static void
3868
0
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3869
0
    const xmlChar *encoding;
3870
3871
0
    if (!attvalue)
3872
0
  return;
3873
3874
0
    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3875
0
    if (encoding != NULL) {
3876
0
  encoding += 7;
3877
0
    }
3878
    /*
3879
     * skip blank
3880
     */
3881
0
    if (encoding && IS_BLANK_CH(*encoding))
3882
0
  encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3883
0
    if (encoding && *encoding == '=') {
3884
0
  encoding ++;
3885
0
  htmlCheckEncodingDirect(ctxt, encoding);
3886
0
    }
3887
0
}
3888
3889
/**
3890
 * htmlCheckMeta:
3891
 * @ctxt:  an HTML parser context
3892
 * @atts:  the attributes values
3893
 *
3894
 * Checks an attributes from a Meta tag
3895
 */
3896
static void
3897
0
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3898
0
    int i;
3899
0
    const xmlChar *att, *value;
3900
0
    int http = 0;
3901
0
    const xmlChar *content = NULL;
3902
3903
0
    if ((ctxt == NULL) || (atts == NULL))
3904
0
  return;
3905
3906
0
    i = 0;
3907
0
    att = atts[i++];
3908
0
    while (att != NULL) {
3909
0
  value = atts[i++];
3910
0
  if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3911
0
   && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3912
0
      http = 1;
3913
0
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3914
0
      htmlCheckEncodingDirect(ctxt, value);
3915
0
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3916
0
      content = value;
3917
0
  att = atts[i++];
3918
0
    }
3919
0
    if ((http) && (content != NULL))
3920
0
  htmlCheckEncoding(ctxt, content);
3921
3922
0
}
3923
3924
/**
3925
 * htmlParseStartTag:
3926
 * @ctxt:  an HTML parser context
3927
 *
3928
 * parse a start of tag either for rule element or
3929
 * EmptyElement. In both case we don't parse the tag closing chars.
3930
 *
3931
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3932
 *
3933
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3934
 *
3935
 * With namespace:
3936
 *
3937
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3938
 *
3939
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3940
 *
3941
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3942
 */
3943
3944
static int
3945
0
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3946
0
    const xmlChar *name;
3947
0
    const xmlChar *attname;
3948
0
    xmlChar *attvalue;
3949
0
    const xmlChar **atts;
3950
0
    int nbatts = 0;
3951
0
    int maxatts;
3952
0
    int meta = 0;
3953
0
    int i;
3954
0
    int discardtag = 0;
3955
3956
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3957
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3958
0
         "htmlParseStartTag: context error\n", NULL, NULL);
3959
0
  return -1;
3960
0
    }
3961
0
    if (ctxt->instate == XML_PARSER_EOF)
3962
0
        return(-1);
3963
0
    if (CUR != '<') return -1;
3964
0
    NEXT;
3965
3966
0
    atts = ctxt->atts;
3967
0
    maxatts = ctxt->maxatts;
3968
3969
0
    GROW;
3970
0
    name = htmlParseHTMLName(ctxt);
3971
0
    if (name == NULL) {
3972
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3973
0
               "htmlParseStartTag: invalid element name\n",
3974
0
         NULL, NULL);
3975
  /* Dump the bogus tag like browsers do */
3976
0
  while ((CUR != 0) && (CUR != '>') &&
3977
0
               (ctxt->instate != XML_PARSER_EOF))
3978
0
      NEXT;
3979
0
        return -1;
3980
0
    }
3981
0
    if (xmlStrEqual(name, BAD_CAST"meta"))
3982
0
  meta = 1;
3983
3984
    /*
3985
     * Check for auto-closure of HTML elements.
3986
     */
3987
0
    htmlAutoClose(ctxt, name);
3988
3989
    /*
3990
     * Check for implied HTML elements.
3991
     */
3992
0
    htmlCheckImplied(ctxt, name);
3993
3994
    /*
3995
     * Avoid html at any level > 0, head at any level != 1
3996
     * or any attempt to recurse body
3997
     */
3998
0
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3999
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4000
0
               "htmlParseStartTag: misplaced <html> tag\n",
4001
0
         name, NULL);
4002
0
  discardtag = 1;
4003
0
  ctxt->depth++;
4004
0
    }
4005
0
    if ((ctxt->nameNr != 1) &&
4006
0
  (xmlStrEqual(name, BAD_CAST"head"))) {
4007
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4008
0
               "htmlParseStartTag: misplaced <head> tag\n",
4009
0
         name, NULL);
4010
0
  discardtag = 1;
4011
0
  ctxt->depth++;
4012
0
    }
4013
0
    if (xmlStrEqual(name, BAD_CAST"body")) {
4014
0
  int indx;
4015
0
  for (indx = 0;indx < ctxt->nameNr;indx++) {
4016
0
      if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4017
0
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4018
0
                 "htmlParseStartTag: misplaced <body> tag\n",
4019
0
           name, NULL);
4020
0
    discardtag = 1;
4021
0
    ctxt->depth++;
4022
0
      }
4023
0
  }
4024
0
    }
4025
4026
    /*
4027
     * Now parse the attributes, it ends up with the ending
4028
     *
4029
     * (S Attribute)* S?
4030
     */
4031
0
    SKIP_BLANKS;
4032
0
    while ((CUR != 0) &&
4033
0
           (CUR != '>') &&
4034
0
     ((CUR != '/') || (NXT(1) != '>'))) {
4035
0
  GROW;
4036
0
  attname = htmlParseAttribute(ctxt, &attvalue);
4037
0
        if (attname != NULL) {
4038
4039
      /*
4040
       * Well formedness requires at most one declaration of an attribute
4041
       */
4042
0
      for (i = 0; i < nbatts;i += 2) {
4043
0
          if (xmlStrEqual(atts[i], attname)) {
4044
0
        htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4045
0
                     "Attribute %s redefined\n", attname, NULL);
4046
0
        if (attvalue != NULL)
4047
0
      xmlFree(attvalue);
4048
0
        goto failed;
4049
0
    }
4050
0
      }
4051
4052
      /*
4053
       * Add the pair to atts
4054
       */
4055
0
      if (atts == NULL) {
4056
0
          maxatts = 22; /* allow for 10 attrs by default */
4057
0
          atts = (const xmlChar **)
4058
0
           xmlMalloc(maxatts * sizeof(xmlChar *));
4059
0
    if (atts == NULL) {
4060
0
        htmlErrMemory(ctxt, NULL);
4061
0
        if (attvalue != NULL)
4062
0
      xmlFree(attvalue);
4063
0
        goto failed;
4064
0
    }
4065
0
    ctxt->atts = atts;
4066
0
    ctxt->maxatts = maxatts;
4067
0
      } else if (nbatts + 4 > maxatts) {
4068
0
          const xmlChar **n;
4069
4070
0
          maxatts *= 2;
4071
0
          n = (const xmlChar **) xmlRealloc((void *) atts,
4072
0
               maxatts * sizeof(const xmlChar *));
4073
0
    if (n == NULL) {
4074
0
        htmlErrMemory(ctxt, NULL);
4075
0
        if (attvalue != NULL)
4076
0
      xmlFree(attvalue);
4077
0
        goto failed;
4078
0
    }
4079
0
    atts = n;
4080
0
    ctxt->atts = atts;
4081
0
    ctxt->maxatts = maxatts;
4082
0
      }
4083
0
      atts[nbatts++] = attname;
4084
0
      atts[nbatts++] = attvalue;
4085
0
      atts[nbatts] = NULL;
4086
0
      atts[nbatts + 1] = NULL;
4087
0
  }
4088
0
  else {
4089
0
      if (attvalue != NULL)
4090
0
          xmlFree(attvalue);
4091
      /* Dump the bogus attribute string up to the next blank or
4092
       * the end of the tag. */
4093
0
      while ((CUR != 0) &&
4094
0
             !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4095
0
       ((CUR != '/') || (NXT(1) != '>')))
4096
0
    NEXT;
4097
0
  }
4098
4099
0
failed:
4100
0
  SKIP_BLANKS;
4101
0
    }
4102
4103
    /*
4104
     * Handle specific association to the META tag
4105
     */
4106
0
    if (meta && (nbatts != 0))
4107
0
  htmlCheckMeta(ctxt, atts);
4108
4109
    /*
4110
     * SAX: Start of Element !
4111
     */
4112
0
    if (!discardtag) {
4113
0
  htmlnamePush(ctxt, name);
4114
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4115
0
      if (nbatts != 0)
4116
0
    ctxt->sax->startElement(ctxt->userData, name, atts);
4117
0
      else
4118
0
    ctxt->sax->startElement(ctxt->userData, name, NULL);
4119
0
  }
4120
0
    }
4121
4122
0
    if (atts != NULL) {
4123
0
        for (i = 1;i < nbatts;i += 2) {
4124
0
      if (atts[i] != NULL)
4125
0
    xmlFree((xmlChar *) atts[i]);
4126
0
  }
4127
0
    }
4128
4129
0
    return(discardtag);
4130
0
}
4131
4132
/**
4133
 * htmlParseEndTag:
4134
 * @ctxt:  an HTML parser context
4135
 *
4136
 * parse an end of tag
4137
 *
4138
 * [42] ETag ::= '</' Name S? '>'
4139
 *
4140
 * With namespace
4141
 *
4142
 * [NS 9] ETag ::= '</' QName S? '>'
4143
 *
4144
 * Returns 1 if the current level should be closed.
4145
 */
4146
4147
static int
4148
htmlParseEndTag(htmlParserCtxtPtr ctxt)
4149
0
{
4150
0
    const xmlChar *name;
4151
0
    const xmlChar *oldname;
4152
0
    int i, ret;
4153
4154
0
    if ((CUR != '<') || (NXT(1) != '/')) {
4155
0
        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4156
0
               "htmlParseEndTag: '</' not found\n", NULL, NULL);
4157
0
        return (0);
4158
0
    }
4159
0
    SKIP(2);
4160
4161
0
    name = htmlParseHTMLName(ctxt);
4162
0
    if (name == NULL)
4163
0
        return (0);
4164
    /*
4165
     * We should definitely be at the ending "S? '>'" part
4166
     */
4167
0
    SKIP_BLANKS;
4168
0
    if (CUR != '>') {
4169
0
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4170
0
               "End tag : expected '>'\n", NULL, NULL);
4171
        /* Skip to next '>' */
4172
0
        while ((CUR != 0) && (CUR != '>'))
4173
0
            NEXT;
4174
0
    }
4175
0
    if (CUR == '>')
4176
0
        NEXT;
4177
4178
    /*
4179
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
4180
     * out now.
4181
     */
4182
0
    if ((ctxt->depth > 0) &&
4183
0
        (xmlStrEqual(name, BAD_CAST "html") ||
4184
0
         xmlStrEqual(name, BAD_CAST "body") ||
4185
0
   xmlStrEqual(name, BAD_CAST "head"))) {
4186
0
  ctxt->depth--;
4187
0
  return (0);
4188
0
    }
4189
4190
    /*
4191
     * If the name read is not one of the element in the parsing stack
4192
     * then return, it's just an error.
4193
     */
4194
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4195
0
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4196
0
            break;
4197
0
    }
4198
0
    if (i < 0) {
4199
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4200
0
               "Unexpected end tag : %s\n", name, NULL);
4201
0
        return (0);
4202
0
    }
4203
4204
4205
    /*
4206
     * Check for auto-closure of HTML elements.
4207
     */
4208
4209
0
    htmlAutoCloseOnClose(ctxt, name);
4210
4211
    /*
4212
     * Well formedness constraints, opening and closing must match.
4213
     * With the exception that the autoclose may have popped stuff out
4214
     * of the stack.
4215
     */
4216
0
    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4217
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4218
0
                     "Opening and ending tag mismatch: %s and %s\n",
4219
0
                     name, ctxt->name);
4220
0
    }
4221
4222
    /*
4223
     * SAX: End of Tag
4224
     */
4225
0
    oldname = ctxt->name;
4226
0
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4227
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4228
0
            ctxt->sax->endElement(ctxt->userData, name);
4229
0
  htmlNodeInfoPop(ctxt);
4230
0
        htmlnamePop(ctxt);
4231
0
        ret = 1;
4232
0
    } else {
4233
0
        ret = 0;
4234
0
    }
4235
4236
0
    return (ret);
4237
0
}
4238
4239
4240
/**
4241
 * htmlParseReference:
4242
 * @ctxt:  an HTML parser context
4243
 *
4244
 * parse and handle entity references in content,
4245
 * this will end-up in a call to character() since this is either a
4246
 * CharRef, or a predefined entity.
4247
 */
4248
static void
4249
0
htmlParseReference(htmlParserCtxtPtr ctxt) {
4250
0
    const htmlEntityDesc * ent;
4251
0
    xmlChar out[6];
4252
0
    const xmlChar *name;
4253
0
    if (CUR != '&') return;
4254
4255
0
    if (NXT(1) == '#') {
4256
0
  unsigned int c;
4257
0
  int bits, i = 0;
4258
4259
0
  c = htmlParseCharRef(ctxt);
4260
0
  if (c == 0)
4261
0
      return;
4262
4263
0
        if      (c <    0x80) { out[i++]= c;                bits= -6; }
4264
0
        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4265
0
        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4266
0
        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4267
4268
0
        for ( ; bits >= 0; bits-= 6) {
4269
0
            out[i++]= ((c >> bits) & 0x3F) | 0x80;
4270
0
        }
4271
0
  out[i] = 0;
4272
4273
0
  htmlCheckParagraph(ctxt);
4274
0
  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4275
0
      ctxt->sax->characters(ctxt->userData, out, i);
4276
0
    } else {
4277
0
  ent = htmlParseEntityRef(ctxt, &name);
4278
0
  if (name == NULL) {
4279
0
      htmlCheckParagraph(ctxt);
4280
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4281
0
          ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4282
0
      return;
4283
0
  }
4284
0
  if ((ent == NULL) || !(ent->value > 0)) {
4285
0
      htmlCheckParagraph(ctxt);
4286
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4287
0
    ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4288
0
    ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4289
    /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4290
0
      }
4291
0
  } else {
4292
0
      unsigned int c;
4293
0
      int bits, i = 0;
4294
4295
0
      c = ent->value;
4296
0
      if      (c <    0x80)
4297
0
              { out[i++]= c;                bits= -6; }
4298
0
      else if (c <   0x800)
4299
0
              { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4300
0
      else if (c < 0x10000)
4301
0
              { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4302
0
      else
4303
0
              { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4304
4305
0
      for ( ; bits >= 0; bits-= 6) {
4306
0
    out[i++]= ((c >> bits) & 0x3F) | 0x80;
4307
0
      }
4308
0
      out[i] = 0;
4309
4310
0
      htmlCheckParagraph(ctxt);
4311
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4312
0
    ctxt->sax->characters(ctxt->userData, out, i);
4313
0
  }
4314
0
    }
4315
0
}
4316
4317
/**
4318
 * htmlParseContent:
4319
 * @ctxt:  an HTML parser context
4320
 *
4321
 * Parse a content: comment, sub-element, reference or text.
4322
 * Kept for compatibility with old code
4323
 */
4324
4325
static void
4326
0
htmlParseContent(htmlParserCtxtPtr ctxt) {
4327
0
    xmlChar *currentNode;
4328
0
    int depth;
4329
0
    const xmlChar *name;
4330
4331
0
    currentNode = xmlStrdup(ctxt->name);
4332
0
    depth = ctxt->nameNr;
4333
0
    while (1) {
4334
0
        GROW;
4335
4336
0
        if (ctxt->instate == XML_PARSER_EOF)
4337
0
            break;
4338
4339
  /*
4340
   * Our tag or one of it's parent or children is ending.
4341
   */
4342
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4343
0
      if (htmlParseEndTag(ctxt) &&
4344
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4345
0
    if (currentNode != NULL)
4346
0
        xmlFree(currentNode);
4347
0
    return;
4348
0
      }
4349
0
      continue; /* while */
4350
0
        }
4351
4352
0
  else if ((CUR == '<') &&
4353
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4354
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4355
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4356
0
      if (name == NULL) {
4357
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4358
0
       "htmlParseStartTag: invalid element name\n",
4359
0
       NULL, NULL);
4360
          /* Dump the bogus tag like browsers do */
4361
0
                while ((CUR != 0) && (CUR != '>'))
4362
0
              NEXT;
4363
4364
0
          if (currentNode != NULL)
4365
0
              xmlFree(currentNode);
4366
0
          return;
4367
0
      }
4368
4369
0
      if (ctxt->name != NULL) {
4370
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4371
0
              htmlAutoClose(ctxt, name);
4372
0
              continue;
4373
0
          }
4374
0
      }
4375
0
  }
4376
4377
  /*
4378
   * Has this node been popped out during parsing of
4379
   * the next element
4380
   */
4381
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4382
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4383
0
       {
4384
0
      if (currentNode != NULL) xmlFree(currentNode);
4385
0
      return;
4386
0
  }
4387
4388
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4389
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4390
      /*
4391
       * Handle SCRIPT/STYLE separately
4392
       */
4393
0
      htmlParseScript(ctxt);
4394
0
  }
4395
4396
0
        else if ((CUR == '<') && (NXT(1) == '!')) {
4397
            /*
4398
             * Sometimes DOCTYPE arrives in the middle of the document
4399
             */
4400
0
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4401
0
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4402
0
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4403
0
                (UPP(8) == 'E')) {
4404
0
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4405
0
                             "Misplaced DOCTYPE declaration\n",
4406
0
                             BAD_CAST "DOCTYPE" , NULL);
4407
0
                htmlParseDocTypeDecl(ctxt);
4408
0
            }
4409
            /*
4410
             * First case :  a comment
4411
             */
4412
0
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4413
0
                htmlParseComment(ctxt);
4414
0
            }
4415
0
            else {
4416
0
                htmlSkipBogusComment(ctxt);
4417
0
            }
4418
0
        }
4419
4420
        /*
4421
         * Second case : a Processing Instruction.
4422
         */
4423
0
        else if ((CUR == '<') && (NXT(1) == '?')) {
4424
0
            htmlParsePI(ctxt);
4425
0
        }
4426
4427
        /*
4428
         * Third case :  a sub-element.
4429
         */
4430
0
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4431
0
            htmlParseElement(ctxt);
4432
0
        }
4433
0
        else if (CUR == '<') {
4434
0
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4435
0
                (ctxt->sax->characters != NULL))
4436
0
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4437
0
            NEXT;
4438
0
        }
4439
4440
        /*
4441
         * Fourth case : a reference. If if has not been resolved,
4442
         *    parsing returns it's Name, create the node
4443
         */
4444
0
        else if (CUR == '&') {
4445
0
            htmlParseReference(ctxt);
4446
0
        }
4447
4448
        /*
4449
         * Fifth case : end of the resource
4450
         */
4451
0
        else if (CUR == 0) {
4452
0
            htmlAutoCloseOnEnd(ctxt);
4453
0
            break;
4454
0
        }
4455
4456
        /*
4457
         * Last case, text. Note that References are handled directly.
4458
         */
4459
0
        else {
4460
0
            htmlParseCharData(ctxt);
4461
0
        }
4462
0
        GROW;
4463
0
    }
4464
0
    if (currentNode != NULL) xmlFree(currentNode);
4465
0
}
4466
4467
/**
4468
 * htmlParseElement:
4469
 * @ctxt:  an HTML parser context
4470
 *
4471
 * DEPRECATED: Internal function, don't use.
4472
 *
4473
 * parse an HTML element, this is highly recursive
4474
 * this is kept for compatibility with previous code versions
4475
 *
4476
 * [39] element ::= EmptyElemTag | STag content ETag
4477
 *
4478
 * [41] Attribute ::= Name Eq AttValue
4479
 */
4480
4481
void
4482
0
htmlParseElement(htmlParserCtxtPtr ctxt) {
4483
0
    const xmlChar *name;
4484
0
    xmlChar *currentNode = NULL;
4485
0
    const htmlElemDesc * info;
4486
0
    htmlParserNodeInfo node_info;
4487
0
    int failed;
4488
0
    int depth;
4489
0
    const xmlChar *oldptr;
4490
4491
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4492
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4493
0
         "htmlParseElement: context error\n", NULL, NULL);
4494
0
  return;
4495
0
    }
4496
4497
0
    if (ctxt->instate == XML_PARSER_EOF)
4498
0
        return;
4499
4500
    /* Capture start position */
4501
0
    if (ctxt->record_info) {
4502
0
        node_info.begin_pos = ctxt->input->consumed +
4503
0
                          (CUR_PTR - ctxt->input->base);
4504
0
  node_info.begin_line = ctxt->input->line;
4505
0
    }
4506
4507
0
    failed = htmlParseStartTag(ctxt);
4508
0
    name = ctxt->name;
4509
0
    if ((failed == -1) || (name == NULL)) {
4510
0
  if (CUR == '>')
4511
0
      NEXT;
4512
0
        return;
4513
0
    }
4514
4515
    /*
4516
     * Lookup the info for that element.
4517
     */
4518
0
    info = htmlTagLookup(name);
4519
0
    if (info == NULL) {
4520
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4521
0
               "Tag %s invalid\n", name, NULL);
4522
0
    }
4523
4524
    /*
4525
     * Check for an Empty Element labeled the XML/SGML way
4526
     */
4527
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4528
0
        SKIP(2);
4529
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4530
0
      ctxt->sax->endElement(ctxt->userData, name);
4531
0
  htmlnamePop(ctxt);
4532
0
  return;
4533
0
    }
4534
4535
0
    if (CUR == '>') {
4536
0
        NEXT;
4537
0
    } else {
4538
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4539
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4540
4541
  /*
4542
   * end of parsing of this node.
4543
   */
4544
0
  if (xmlStrEqual(name, ctxt->name)) {
4545
0
      nodePop(ctxt);
4546
0
      htmlnamePop(ctxt);
4547
0
  }
4548
4549
  /*
4550
   * Capture end position and add node
4551
   */
4552
0
  if (ctxt->record_info) {
4553
0
     node_info.end_pos = ctxt->input->consumed +
4554
0
            (CUR_PTR - ctxt->input->base);
4555
0
     node_info.end_line = ctxt->input->line;
4556
0
     node_info.node = ctxt->node;
4557
0
     xmlParserAddNodeInfo(ctxt, &node_info);
4558
0
  }
4559
0
  return;
4560
0
    }
4561
4562
    /*
4563
     * Check for an Empty Element from DTD definition
4564
     */
4565
0
    if ((info != NULL) && (info->empty)) {
4566
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4567
0
      ctxt->sax->endElement(ctxt->userData, name);
4568
0
  htmlnamePop(ctxt);
4569
0
  return;
4570
0
    }
4571
4572
    /*
4573
     * Parse the content of the element:
4574
     */
4575
0
    currentNode = xmlStrdup(ctxt->name);
4576
0
    depth = ctxt->nameNr;
4577
0
    while (CUR != 0) {
4578
0
  oldptr = ctxt->input->cur;
4579
0
  htmlParseContent(ctxt);
4580
0
  if (oldptr==ctxt->input->cur) break;
4581
0
  if (ctxt->nameNr < depth) break;
4582
0
    }
4583
4584
    /*
4585
     * Capture end position and add node
4586
     */
4587
0
    if ( currentNode != NULL && ctxt->record_info ) {
4588
0
       node_info.end_pos = ctxt->input->consumed +
4589
0
                          (CUR_PTR - ctxt->input->base);
4590
0
       node_info.end_line = ctxt->input->line;
4591
0
       node_info.node = ctxt->node;
4592
0
       xmlParserAddNodeInfo(ctxt, &node_info);
4593
0
    }
4594
0
    if (CUR == 0) {
4595
0
  htmlAutoCloseOnEnd(ctxt);
4596
0
    }
4597
4598
0
    if (currentNode != NULL)
4599
0
  xmlFree(currentNode);
4600
0
}
4601
4602
static void
4603
0
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4604
    /*
4605
     * Capture end position and add node
4606
     */
4607
0
    if ( ctxt->node != NULL && ctxt->record_info ) {
4608
0
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4609
0
                                (CUR_PTR - ctxt->input->base);
4610
0
       ctxt->nodeInfo->end_line = ctxt->input->line;
4611
0
       ctxt->nodeInfo->node = ctxt->node;
4612
0
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4613
0
       htmlNodeInfoPop(ctxt);
4614
0
    }
4615
0
    if (CUR == 0) {
4616
0
       htmlAutoCloseOnEnd(ctxt);
4617
0
    }
4618
0
}
4619
4620
/**
4621
 * htmlParseElementInternal:
4622
 * @ctxt:  an HTML parser context
4623
 *
4624
 * parse an HTML element, new version, non recursive
4625
 *
4626
 * [39] element ::= EmptyElemTag | STag content ETag
4627
 *
4628
 * [41] Attribute ::= Name Eq AttValue
4629
 */
4630
4631
static void
4632
0
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4633
0
    const xmlChar *name;
4634
0
    const htmlElemDesc * info;
4635
0
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4636
0
    int failed;
4637
4638
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4639
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4640
0
         "htmlParseElementInternal: context error\n", NULL, NULL);
4641
0
  return;
4642
0
    }
4643
4644
0
    if (ctxt->instate == XML_PARSER_EOF)
4645
0
        return;
4646
4647
    /* Capture start position */
4648
0
    if (ctxt->record_info) {
4649
0
        node_info.begin_pos = ctxt->input->consumed +
4650
0
                          (CUR_PTR - ctxt->input->base);
4651
0
  node_info.begin_line = ctxt->input->line;
4652
0
    }
4653
4654
0
    failed = htmlParseStartTag(ctxt);
4655
0
    name = ctxt->name;
4656
0
    if ((failed == -1) || (name == NULL)) {
4657
0
  if (CUR == '>')
4658
0
      NEXT;
4659
0
        return;
4660
0
    }
4661
4662
    /*
4663
     * Lookup the info for that element.
4664
     */
4665
0
    info = htmlTagLookup(name);
4666
0
    if (info == NULL) {
4667
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4668
0
               "Tag %s invalid\n", name, NULL);
4669
0
    }
4670
4671
    /*
4672
     * Check for an Empty Element labeled the XML/SGML way
4673
     */
4674
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4675
0
        SKIP(2);
4676
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4677
0
      ctxt->sax->endElement(ctxt->userData, name);
4678
0
  htmlnamePop(ctxt);
4679
0
  return;
4680
0
    }
4681
4682
0
    if (CUR == '>') {
4683
0
        NEXT;
4684
0
    } else {
4685
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4686
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4687
4688
  /*
4689
   * end of parsing of this node.
4690
   */
4691
0
  if (xmlStrEqual(name, ctxt->name)) {
4692
0
      nodePop(ctxt);
4693
0
      htmlnamePop(ctxt);
4694
0
  }
4695
4696
0
        if (ctxt->record_info)
4697
0
            htmlNodeInfoPush(ctxt, &node_info);
4698
0
        htmlParserFinishElementParsing(ctxt);
4699
0
  return;
4700
0
    }
4701
4702
    /*
4703
     * Check for an Empty Element from DTD definition
4704
     */
4705
0
    if ((info != NULL) && (info->empty)) {
4706
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4707
0
      ctxt->sax->endElement(ctxt->userData, name);
4708
0
  htmlnamePop(ctxt);
4709
0
  return;
4710
0
    }
4711
4712
0
    if (ctxt->record_info)
4713
0
        htmlNodeInfoPush(ctxt, &node_info);
4714
0
}
4715
4716
/**
4717
 * htmlParseContentInternal:
4718
 * @ctxt:  an HTML parser context
4719
 *
4720
 * Parse a content: comment, sub-element, reference or text.
4721
 * New version for non recursive htmlParseElementInternal
4722
 */
4723
4724
static void
4725
0
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4726
0
    xmlChar *currentNode;
4727
0
    int depth;
4728
0
    const xmlChar *name;
4729
4730
0
    currentNode = xmlStrdup(ctxt->name);
4731
0
    depth = ctxt->nameNr;
4732
0
    while (1) {
4733
0
        GROW;
4734
4735
0
        if (ctxt->instate == XML_PARSER_EOF)
4736
0
            break;
4737
4738
  /*
4739
   * Our tag or one of it's parent or children is ending.
4740
   */
4741
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4742
0
      if (htmlParseEndTag(ctxt) &&
4743
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4744
0
    if (currentNode != NULL)
4745
0
        xmlFree(currentNode);
4746
4747
0
          currentNode = xmlStrdup(ctxt->name);
4748
0
          depth = ctxt->nameNr;
4749
0
      }
4750
0
      continue; /* while */
4751
0
        }
4752
4753
0
  else if ((CUR == '<') &&
4754
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4755
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4756
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4757
0
      if (name == NULL) {
4758
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4759
0
       "htmlParseStartTag: invalid element name\n",
4760
0
       NULL, NULL);
4761
          /* Dump the bogus tag like browsers do */
4762
0
          while ((CUR == 0) && (CUR != '>'))
4763
0
              NEXT;
4764
4765
0
          htmlParserFinishElementParsing(ctxt);
4766
0
          if (currentNode != NULL)
4767
0
              xmlFree(currentNode);
4768
4769
0
          currentNode = xmlStrdup(ctxt->name);
4770
0
          depth = ctxt->nameNr;
4771
0
          continue;
4772
0
      }
4773
4774
0
      if (ctxt->name != NULL) {
4775
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4776
0
              htmlAutoClose(ctxt, name);
4777
0
              continue;
4778
0
          }
4779
0
      }
4780
0
  }
4781
4782
  /*
4783
   * Has this node been popped out during parsing of
4784
   * the next element
4785
   */
4786
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4787
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4788
0
       {
4789
0
      htmlParserFinishElementParsing(ctxt);
4790
0
      if (currentNode != NULL) xmlFree(currentNode);
4791
4792
0
      currentNode = xmlStrdup(ctxt->name);
4793
0
      depth = ctxt->nameNr;
4794
0
      continue;
4795
0
  }
4796
4797
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4798
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4799
      /*
4800
       * Handle SCRIPT/STYLE separately
4801
       */
4802
0
      htmlParseScript(ctxt);
4803
0
  }
4804
4805
0
        else if ((CUR == '<') && (NXT(1) == '!')) {
4806
            /*
4807
             * Sometimes DOCTYPE arrives in the middle of the document
4808
             */
4809
0
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4810
0
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4811
0
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4812
0
                (UPP(8) == 'E')) {
4813
0
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4814
0
                             "Misplaced DOCTYPE declaration\n",
4815
0
                             BAD_CAST "DOCTYPE" , NULL);
4816
0
                htmlParseDocTypeDecl(ctxt);
4817
0
            }
4818
            /*
4819
             * First case :  a comment
4820
             */
4821
0
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4822
0
                htmlParseComment(ctxt);
4823
0
            }
4824
0
            else {
4825
0
                htmlSkipBogusComment(ctxt);
4826
0
            }
4827
0
        }
4828
4829
        /*
4830
         * Second case : a Processing Instruction.
4831
         */
4832
0
        else if ((CUR == '<') && (NXT(1) == '?')) {
4833
0
            htmlParsePI(ctxt);
4834
0
        }
4835
4836
        /*
4837
         * Third case :  a sub-element.
4838
         */
4839
0
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4840
0
            htmlParseElementInternal(ctxt);
4841
0
            if (currentNode != NULL) xmlFree(currentNode);
4842
4843
0
            currentNode = xmlStrdup(ctxt->name);
4844
0
            depth = ctxt->nameNr;
4845
0
        }
4846
0
        else if (CUR == '<') {
4847
0
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4848
0
                (ctxt->sax->characters != NULL))
4849
0
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4850
0
            NEXT;
4851
0
        }
4852
4853
        /*
4854
         * Fourth case : a reference. If if has not been resolved,
4855
         *    parsing returns it's Name, create the node
4856
         */
4857
0
        else if (CUR == '&') {
4858
0
            htmlParseReference(ctxt);
4859
0
        }
4860
4861
        /*
4862
         * Fifth case : end of the resource
4863
         */
4864
0
        else if (CUR == 0) {
4865
0
            htmlAutoCloseOnEnd(ctxt);
4866
0
            break;
4867
0
        }
4868
4869
        /*
4870
         * Last case, text. Note that References are handled directly.
4871
         */
4872
0
        else {
4873
0
            htmlParseCharData(ctxt);
4874
0
        }
4875
0
        GROW;
4876
0
    }
4877
0
    if (currentNode != NULL) xmlFree(currentNode);
4878
0
}
4879
4880
/**
4881
 * htmlParseContent:
4882
 * @ctxt:  an HTML parser context
4883
 *
4884
 * Parse a content: comment, sub-element, reference or text.
4885
 * This is the entry point when called from parser.c
4886
 */
4887
4888
void
4889
0
__htmlParseContent(void *ctxt) {
4890
0
    if (ctxt != NULL)
4891
0
  htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4892
0
}
4893
4894
/**
4895
 * htmlParseDocument:
4896
 * @ctxt:  an HTML parser context
4897
 *
4898
 * parse an HTML document (and build a tree if using the standard SAX
4899
 * interface).
4900
 *
4901
 * Returns 0, -1 in case of error. the parser context is augmented
4902
 *                as a result of the parsing.
4903
 */
4904
4905
int
4906
0
htmlParseDocument(htmlParserCtxtPtr ctxt) {
4907
0
    xmlChar start[4];
4908
0
    xmlCharEncoding enc;
4909
0
    xmlDtdPtr dtd;
4910
4911
0
    xmlInitParser();
4912
4913
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4914
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4915
0
         "htmlParseDocument: context error\n", NULL, NULL);
4916
0
  return(XML_ERR_INTERNAL_ERROR);
4917
0
    }
4918
0
    GROW;
4919
    /*
4920
     * SAX: beginning of the document processing.
4921
     */
4922
0
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4923
0
        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4924
4925
0
    if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4926
0
        ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4927
  /*
4928
   * Get the 4 first bytes and decode the charset
4929
   * if enc != XML_CHAR_ENCODING_NONE
4930
   * plug some encoding conversion routines.
4931
   */
4932
0
  start[0] = RAW;
4933
0
  start[1] = NXT(1);
4934
0
  start[2] = NXT(2);
4935
0
  start[3] = NXT(3);
4936
0
  enc = xmlDetectCharEncoding(&start[0], 4);
4937
0
  if (enc != XML_CHAR_ENCODING_NONE) {
4938
0
      xmlSwitchEncoding(ctxt, enc);
4939
0
  }
4940
0
    }
4941
4942
    /*
4943
     * Wipe out everything which is before the first '<'
4944
     */
4945
0
    SKIP_BLANKS;
4946
0
    if (CUR == 0) {
4947
0
  htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4948
0
               "Document is empty\n", NULL, NULL);
4949
0
    }
4950
4951
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4952
0
  ctxt->sax->startDocument(ctxt->userData);
4953
4954
4955
    /*
4956
     * Parse possible comments and PIs before any content
4957
     */
4958
0
    while (((CUR == '<') && (NXT(1) == '!') &&
4959
0
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4960
0
     ((CUR == '<') && (NXT(1) == '?'))) {
4961
0
        htmlParseComment(ctxt);
4962
0
        htmlParsePI(ctxt);
4963
0
  SKIP_BLANKS;
4964
0
    }
4965
4966
4967
    /*
4968
     * Then possibly doc type declaration(s) and more Misc
4969
     * (doctypedecl Misc*)?
4970
     */
4971
0
    if ((CUR == '<') && (NXT(1) == '!') &&
4972
0
  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4973
0
  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4974
0
  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4975
0
  (UPP(8) == 'E')) {
4976
0
  htmlParseDocTypeDecl(ctxt);
4977
0
    }
4978
0
    SKIP_BLANKS;
4979
4980
    /*
4981
     * Parse possible comments and PIs before any content
4982
     */
4983
0
    while (((CUR == '<') && (NXT(1) == '!') &&
4984
0
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4985
0
     ((CUR == '<') && (NXT(1) == '?'))) {
4986
0
        htmlParseComment(ctxt);
4987
0
        htmlParsePI(ctxt);
4988
0
  SKIP_BLANKS;
4989
0
    }
4990
4991
    /*
4992
     * Time to start parsing the tree itself
4993
     */
4994
0
    htmlParseContentInternal(ctxt);
4995
4996
    /*
4997
     * autoclose
4998
     */
4999
0
    if (CUR == 0)
5000
0
  htmlAutoCloseOnEnd(ctxt);
5001
5002
5003
    /*
5004
     * SAX: end of the document processing.
5005
     */
5006
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5007
0
        ctxt->sax->endDocument(ctxt->userData);
5008
5009
0
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5010
0
  dtd = xmlGetIntSubset(ctxt->myDoc);
5011
0
  if (dtd == NULL)
5012
0
      ctxt->myDoc->intSubset =
5013
0
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5014
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5015
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5016
0
    }
5017
0
    if (! ctxt->wellFormed) return(-1);
5018
0
    return(0);
5019
0
}
5020
5021
5022
/************************************************************************
5023
 *                  *
5024
 *      Parser contexts handling      *
5025
 *                  *
5026
 ************************************************************************/
5027
5028
/**
5029
 * htmlInitParserCtxt:
5030
 * @ctxt:  an HTML parser context
5031
 * @sax:  SAX handler
5032
 * @userData:  user data
5033
 *
5034
 * Initialize a parser context
5035
 *
5036
 * Returns 0 in case of success and -1 in case of error
5037
 */
5038
5039
static int
5040
htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
5041
                   void *userData)
5042
0
{
5043
0
    if (ctxt == NULL) return(-1);
5044
0
    memset(ctxt, 0, sizeof(htmlParserCtxt));
5045
5046
0
    ctxt->dict = xmlDictCreate();
5047
0
    if (ctxt->dict == NULL) {
5048
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5049
0
  return(-1);
5050
0
    }
5051
5052
0
    if (ctxt->sax == NULL)
5053
0
        ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5054
0
    if (ctxt->sax == NULL) {
5055
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5056
0
  return(-1);
5057
0
    }
5058
0
    if (sax == NULL) {
5059
0
        memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
5060
0
        xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
5061
0
        ctxt->userData = ctxt;
5062
0
    } else {
5063
0
        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5064
0
        ctxt->userData = userData ? userData : ctxt;
5065
0
    }
5066
5067
    /* Allocate the Input stack */
5068
0
    ctxt->inputTab = (htmlParserInputPtr *)
5069
0
                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
5070
0
    if (ctxt->inputTab == NULL) {
5071
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5072
0
  ctxt->inputNr = 0;
5073
0
  ctxt->inputMax = 0;
5074
0
  ctxt->input = NULL;
5075
0
  return(-1);
5076
0
    }
5077
0
    ctxt->inputNr = 0;
5078
0
    ctxt->inputMax = 5;
5079
0
    ctxt->input = NULL;
5080
0
    ctxt->version = NULL;
5081
0
    ctxt->encoding = NULL;
5082
0
    ctxt->standalone = -1;
5083
0
    ctxt->instate = XML_PARSER_START;
5084
5085
    /* Allocate the Node stack */
5086
0
    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5087
0
    if (ctxt->nodeTab == NULL) {
5088
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5089
0
  ctxt->nodeNr = 0;
5090
0
  ctxt->nodeMax = 0;
5091
0
  ctxt->node = NULL;
5092
0
  ctxt->inputNr = 0;
5093
0
  ctxt->inputMax = 0;
5094
0
  ctxt->input = NULL;
5095
0
  return(-1);
5096
0
    }
5097
0
    ctxt->nodeNr = 0;
5098
0
    ctxt->nodeMax = 10;
5099
0
    ctxt->node = NULL;
5100
5101
    /* Allocate the Name stack */
5102
0
    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5103
0
    if (ctxt->nameTab == NULL) {
5104
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5105
0
  ctxt->nameNr = 0;
5106
0
  ctxt->nameMax = 0;
5107
0
  ctxt->name = NULL;
5108
0
  ctxt->nodeNr = 0;
5109
0
  ctxt->nodeMax = 0;
5110
0
  ctxt->node = NULL;
5111
0
  ctxt->inputNr = 0;
5112
0
  ctxt->inputMax = 0;
5113
0
  ctxt->input = NULL;
5114
0
  return(-1);
5115
0
    }
5116
0
    ctxt->nameNr = 0;
5117
0
    ctxt->nameMax = 10;
5118
0
    ctxt->name = NULL;
5119
5120
0
    ctxt->nodeInfoTab = NULL;
5121
0
    ctxt->nodeInfoNr  = 0;
5122
0
    ctxt->nodeInfoMax = 0;
5123
5124
0
    ctxt->myDoc = NULL;
5125
0
    ctxt->wellFormed = 1;
5126
0
    ctxt->replaceEntities = 0;
5127
0
    ctxt->linenumbers = xmlLineNumbersDefaultValue;
5128
0
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5129
0
    ctxt->html = 1;
5130
0
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5131
0
    ctxt->vctxt.userData = ctxt;
5132
0
    ctxt->vctxt.error = xmlParserValidityError;
5133
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
5134
0
    ctxt->record_info = 0;
5135
0
    ctxt->validate = 0;
5136
0
    ctxt->checkIndex = 0;
5137
0
    ctxt->catalogs = NULL;
5138
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
5139
0
    return(0);
5140
0
}
5141
5142
/**
5143
 * htmlFreeParserCtxt:
5144
 * @ctxt:  an HTML parser context
5145
 *
5146
 * Free all the memory used by a parser context. However the parsed
5147
 * document in ctxt->myDoc is not freed.
5148
 */
5149
5150
void
5151
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5152
0
{
5153
0
    xmlFreeParserCtxt(ctxt);
5154
0
}
5155
5156
/**
5157
 * htmlNewParserCtxt:
5158
 *
5159
 * Allocate and initialize a new parser context.
5160
 *
5161
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5162
 */
5163
5164
htmlParserCtxtPtr
5165
htmlNewParserCtxt(void)
5166
0
{
5167
0
    return(htmlNewSAXParserCtxt(NULL, NULL));
5168
0
}
5169
5170
/**
5171
 * htmlNewSAXParserCtxt:
5172
 * @sax:  SAX handler
5173
 * @userData:  user data
5174
 *
5175
 * Allocate and initialize a new SAX parser context. If userData is NULL,
5176
 * the parser context will be passed as user data.
5177
 *
5178
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5179
 */
5180
5181
htmlParserCtxtPtr
5182
htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5183
0
{
5184
0
    xmlParserCtxtPtr ctxt;
5185
5186
0
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5187
0
    if (ctxt == NULL) {
5188
0
        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5189
0
  return(NULL);
5190
0
    }
5191
0
    memset(ctxt, 0, sizeof(xmlParserCtxt));
5192
0
    if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5193
0
        htmlFreeParserCtxt(ctxt);
5194
0
  return(NULL);
5195
0
    }
5196
0
    return(ctxt);
5197
0
}
5198
5199
/**
5200
 * htmlCreateMemoryParserCtxt:
5201
 * @buffer:  a pointer to a char array
5202
 * @size:  the size of the array
5203
 *
5204
 * Create a parser context for an HTML in-memory document.
5205
 *
5206
 * Returns the new parser context or NULL
5207
 */
5208
htmlParserCtxtPtr
5209
0
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5210
0
    xmlParserCtxtPtr ctxt;
5211
0
    xmlParserInputPtr input;
5212
0
    xmlParserInputBufferPtr buf;
5213
5214
0
    if (buffer == NULL)
5215
0
  return(NULL);
5216
0
    if (size <= 0)
5217
0
  return(NULL);
5218
5219
0
    ctxt = htmlNewParserCtxt();
5220
0
    if (ctxt == NULL)
5221
0
  return(NULL);
5222
5223
0
    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5224
0
    if (buf == NULL) return(NULL);
5225
5226
0
    input = xmlNewInputStream(ctxt);
5227
0
    if (input == NULL) {
5228
0
  xmlFreeParserInputBuffer(buf);
5229
0
  xmlFreeParserCtxt(ctxt);
5230
0
  return(NULL);
5231
0
    }
5232
5233
0
    input->filename = NULL;
5234
0
    input->buf = buf;
5235
0
    xmlBufResetInput(buf->buffer, input);
5236
5237
0
    inputPush(ctxt, input);
5238
0
    return(ctxt);
5239
0
}
5240
5241
/**
5242
 * htmlCreateDocParserCtxt:
5243
 * @cur:  a pointer to an array of xmlChar
5244
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5245
 *
5246
 * Create a parser context for an HTML document.
5247
 *
5248
 * TODO: check the need to add encoding handling there
5249
 *
5250
 * Returns the new parser context or NULL
5251
 */
5252
static htmlParserCtxtPtr
5253
0
htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5254
0
    int len;
5255
0
    htmlParserCtxtPtr ctxt;
5256
5257
0
    if (cur == NULL)
5258
0
  return(NULL);
5259
0
    len = xmlStrlen(cur);
5260
0
    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5261
0
    if (ctxt == NULL)
5262
0
  return(NULL);
5263
5264
0
    if (encoding != NULL) {
5265
0
  xmlCharEncoding enc;
5266
0
  xmlCharEncodingHandlerPtr handler;
5267
5268
0
  if (ctxt->input->encoding != NULL)
5269
0
      xmlFree((xmlChar *) ctxt->input->encoding);
5270
0
  ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5271
5272
0
  enc = xmlParseCharEncoding(encoding);
5273
  /*
5274
   * registered set of known encodings
5275
   */
5276
0
  if (enc != XML_CHAR_ENCODING_ERROR) {
5277
0
      xmlSwitchEncoding(ctxt, enc);
5278
0
      if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5279
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5280
0
                 "Unsupported encoding %s\n",
5281
0
           (const xmlChar *) encoding, NULL);
5282
0
      }
5283
0
  } else {
5284
      /*
5285
       * fallback for unknown encodings
5286
       */
5287
0
      handler = xmlFindCharEncodingHandler((const char *) encoding);
5288
0
      if (handler != NULL) {
5289
0
    xmlSwitchToEncoding(ctxt, handler);
5290
0
      } else {
5291
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5292
0
                 "Unsupported encoding %s\n",
5293
0
           (const xmlChar *) encoding, NULL);
5294
0
      }
5295
0
  }
5296
0
    }
5297
0
    return(ctxt);
5298
0
}
5299
5300
#ifdef LIBXML_PUSH_ENABLED
5301
/************************************************************************
5302
 *                  *
5303
 *  Progressive parsing interfaces        *
5304
 *                  *
5305
 ************************************************************************/
5306
5307
/**
5308
 * htmlParseLookupSequence:
5309
 * @ctxt:  an HTML parser context
5310
 * @first:  the first char to lookup
5311
 * @next:  the next char to lookup or zero
5312
 * @third:  the next char to lookup or zero
5313
 * @ignoreattrval: skip over attribute values
5314
 *
5315
 * Try to find if a sequence (first, next, third) or  just (first next) or
5316
 * (first) is available in the input stream.
5317
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5318
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5319
 * parser, do not use liberally.
5320
 * This is basically similar to xmlParseLookupSequence()
5321
 *
5322
 * Returns the index to the current parsing point if the full sequence
5323
 *      is available, -1 otherwise.
5324
 */
5325
static int
5326
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5327
                        xmlChar next, xmlChar third, int ignoreattrval)
5328
{
5329
    int base, len;
5330
    htmlParserInputPtr in;
5331
    const xmlChar *buf;
5332
    int quote;
5333
5334
    in = ctxt->input;
5335
    if (in == NULL)
5336
        return (-1);
5337
5338
    base = ctxt->checkIndex;
5339
    quote = ctxt->endCheckState;
5340
5341
    buf = in->cur;
5342
    len = in->end - in->cur;
5343
5344
    /* take into account the sequence length */
5345
    if (third)
5346
        len -= 2;
5347
    else if (next)
5348
        len--;
5349
    for (; base < len; base++) {
5350
        if (ignoreattrval) {
5351
            if (quote) {
5352
                if (buf[base] == quote)
5353
                    quote = 0;
5354
                continue;
5355
            }
5356
            if (buf[base] == '"' || buf[base] == '\'') {
5357
                quote = buf[base];
5358
                continue;
5359
            }
5360
        }
5361
        if (buf[base] == first) {
5362
            if (third != 0) {
5363
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
5364
                    continue;
5365
            } else if (next != 0) {
5366
                if (buf[base + 1] != next)
5367
                    continue;
5368
            }
5369
            ctxt->checkIndex = 0;
5370
            ctxt->endCheckState = 0;
5371
            return (base);
5372
        }
5373
    }
5374
    ctxt->checkIndex = base;
5375
    ctxt->endCheckState = quote;
5376
#ifdef DEBUG_PUSH
5377
    if (next == 0)
5378
        xmlGenericError(xmlGenericErrorContext,
5379
                        "HPP: lookup '%c' failed\n", first);
5380
    else if (third == 0)
5381
        xmlGenericError(xmlGenericErrorContext,
5382
                        "HPP: lookup '%c%c' failed\n", first, next);
5383
    else
5384
        xmlGenericError(xmlGenericErrorContext,
5385
                        "HPP: lookup '%c%c%c' failed\n", first, next,
5386
                        third);
5387
#endif
5388
    return (-1);
5389
}
5390
5391
/**
5392
 * htmlParseLookupCommentEnd:
5393
 * @ctxt: an HTML parser context
5394
 *
5395
 * Try to find a comment end tag in the input stream
5396
 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5397
 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5398
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5399
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5400
 * parser, do not use liberally.
5401
 * This wraps to htmlParseLookupSequence()
5402
 *
5403
 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5404
 */
5405
static int
5406
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5407
{
5408
    int mark = 0;
5409
    int offset;
5410
5411
    while (1) {
5412
  mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5413
  if (mark < 0)
5414
            break;
5415
        if ((NXT(mark+2) == '>') ||
5416
      ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5417
            ctxt->checkIndex = 0;
5418
      break;
5419
  }
5420
        offset = (NXT(mark+2) == '!') ? 3 : 2;
5421
        if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5422
      ctxt->checkIndex = mark;
5423
            return(-1);
5424
        }
5425
  ctxt->checkIndex = mark + 1;
5426
    }
5427
    return mark;
5428
}
5429
5430
5431
/**
5432
 * htmlParseTryOrFinish:
5433
 * @ctxt:  an HTML parser context
5434
 * @terminate:  last chunk indicator
5435
 *
5436
 * Try to progress on parsing
5437
 *
5438
 * Returns zero if no parsing was possible
5439
 */
5440
static int
5441
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5442
    int ret = 0;
5443
    htmlParserInputPtr in;
5444
    ptrdiff_t avail = 0;
5445
    xmlChar cur, next;
5446
5447
    htmlParserNodeInfo node_info;
5448
5449
#ifdef DEBUG_PUSH
5450
    switch (ctxt->instate) {
5451
  case XML_PARSER_EOF:
5452
      xmlGenericError(xmlGenericErrorContext,
5453
        "HPP: try EOF\n"); break;
5454
  case XML_PARSER_START:
5455
      xmlGenericError(xmlGenericErrorContext,
5456
        "HPP: try START\n"); break;
5457
  case XML_PARSER_MISC:
5458
      xmlGenericError(xmlGenericErrorContext,
5459
        "HPP: try MISC\n");break;
5460
  case XML_PARSER_COMMENT:
5461
      xmlGenericError(xmlGenericErrorContext,
5462
        "HPP: try COMMENT\n");break;
5463
  case XML_PARSER_PROLOG:
5464
      xmlGenericError(xmlGenericErrorContext,
5465
        "HPP: try PROLOG\n");break;
5466
  case XML_PARSER_START_TAG:
5467
      xmlGenericError(xmlGenericErrorContext,
5468
        "HPP: try START_TAG\n");break;
5469
  case XML_PARSER_CONTENT:
5470
      xmlGenericError(xmlGenericErrorContext,
5471
        "HPP: try CONTENT\n");break;
5472
  case XML_PARSER_CDATA_SECTION:
5473
      xmlGenericError(xmlGenericErrorContext,
5474
        "HPP: try CDATA_SECTION\n");break;
5475
  case XML_PARSER_END_TAG:
5476
      xmlGenericError(xmlGenericErrorContext,
5477
        "HPP: try END_TAG\n");break;
5478
  case XML_PARSER_ENTITY_DECL:
5479
      xmlGenericError(xmlGenericErrorContext,
5480
        "HPP: try ENTITY_DECL\n");break;
5481
  case XML_PARSER_ENTITY_VALUE:
5482
      xmlGenericError(xmlGenericErrorContext,
5483
        "HPP: try ENTITY_VALUE\n");break;
5484
  case XML_PARSER_ATTRIBUTE_VALUE:
5485
      xmlGenericError(xmlGenericErrorContext,
5486
        "HPP: try ATTRIBUTE_VALUE\n");break;
5487
  case XML_PARSER_DTD:
5488
      xmlGenericError(xmlGenericErrorContext,
5489
        "HPP: try DTD\n");break;
5490
  case XML_PARSER_EPILOG:
5491
      xmlGenericError(xmlGenericErrorContext,
5492
        "HPP: try EPILOG\n");break;
5493
  case XML_PARSER_PI:
5494
      xmlGenericError(xmlGenericErrorContext,
5495
        "HPP: try PI\n");break;
5496
  case XML_PARSER_SYSTEM_LITERAL:
5497
      xmlGenericError(xmlGenericErrorContext,
5498
        "HPP: try SYSTEM_LITERAL\n");break;
5499
    }
5500
#endif
5501
5502
    while (1) {
5503
5504
  in = ctxt->input;
5505
  if (in == NULL) break;
5506
  if (in->buf == NULL)
5507
      avail = in->length - (in->cur - in->base);
5508
  else
5509
      avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5510
                    (in->cur - in->base);
5511
  if ((avail == 0) && (terminate)) {
5512
      htmlAutoCloseOnEnd(ctxt);
5513
      if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5514
    /*
5515
     * SAX: end of the document processing.
5516
     */
5517
    ctxt->instate = XML_PARSER_EOF;
5518
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5519
        ctxt->sax->endDocument(ctxt->userData);
5520
      }
5521
  }
5522
        if (avail < 1)
5523
      goto done;
5524
        /*
5525
         * This is done to make progress and avoid an infinite loop
5526
         * if a parsing attempt was aborted by hitting a NUL byte. After
5527
         * changing htmlCurrentChar, this probably isn't necessary anymore.
5528
         * We should consider removing this check.
5529
         */
5530
  cur = in->cur[0];
5531
  if (cur == 0) {
5532
      SKIP(1);
5533
      continue;
5534
  }
5535
5536
        switch (ctxt->instate) {
5537
            case XML_PARSER_EOF:
5538
          /*
5539
     * Document parsing is done !
5540
     */
5541
          goto done;
5542
            case XML_PARSER_START:
5543
          /*
5544
     * Very first chars read from the document flow.
5545
     */
5546
    cur = in->cur[0];
5547
    if (IS_BLANK_CH(cur)) {
5548
        SKIP_BLANKS;
5549
        if (in->buf == NULL)
5550
      avail = in->length - (in->cur - in->base);
5551
        else
5552
      avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5553
                                (in->cur - in->base);
5554
    }
5555
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5556
        ctxt->sax->setDocumentLocator(ctxt->userData,
5557
              &xmlDefaultSAXLocator);
5558
    if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5559
              (!ctxt->disableSAX))
5560
        ctxt->sax->startDocument(ctxt->userData);
5561
5562
    cur = in->cur[0];
5563
    next = in->cur[1];
5564
    if ((cur == '<') && (next == '!') &&
5565
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5566
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5567
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5568
        (UPP(8) == 'E')) {
5569
        if ((!terminate) &&
5570
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5571
      goto done;
5572
#ifdef DEBUG_PUSH
5573
        xmlGenericError(xmlGenericErrorContext,
5574
          "HPP: Parsing internal subset\n");
5575
#endif
5576
        htmlParseDocTypeDecl(ctxt);
5577
        ctxt->instate = XML_PARSER_PROLOG;
5578
#ifdef DEBUG_PUSH
5579
        xmlGenericError(xmlGenericErrorContext,
5580
          "HPP: entering PROLOG\n");
5581
#endif
5582
                } else {
5583
        ctxt->instate = XML_PARSER_MISC;
5584
#ifdef DEBUG_PUSH
5585
        xmlGenericError(xmlGenericErrorContext,
5586
          "HPP: entering MISC\n");
5587
#endif
5588
    }
5589
    break;
5590
            case XML_PARSER_MISC:
5591
    SKIP_BLANKS;
5592
    if (in->buf == NULL)
5593
        avail = in->length - (in->cur - in->base);
5594
    else
5595
        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5596
                            (in->cur - in->base);
5597
    /*
5598
     * no chars in buffer
5599
     */
5600
    if (avail < 1)
5601
        goto done;
5602
    /*
5603
     * not enough chars in buffer
5604
     */
5605
    if (avail < 2) {
5606
        if (!terminate)
5607
      goto done;
5608
        else
5609
      next = ' ';
5610
    } else {
5611
        next = in->cur[1];
5612
    }
5613
    cur = in->cur[0];
5614
          if ((cur == '<') && (next == '!') &&
5615
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5616
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5617
      goto done;
5618
#ifdef DEBUG_PUSH
5619
        xmlGenericError(xmlGenericErrorContext,
5620
          "HPP: Parsing Comment\n");
5621
#endif
5622
        htmlParseComment(ctxt);
5623
        ctxt->instate = XML_PARSER_MISC;
5624
          } else if ((cur == '<') && (next == '?')) {
5625
        if ((!terminate) &&
5626
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5627
      goto done;
5628
#ifdef DEBUG_PUSH
5629
        xmlGenericError(xmlGenericErrorContext,
5630
          "HPP: Parsing PI\n");
5631
#endif
5632
        htmlParsePI(ctxt);
5633
        ctxt->instate = XML_PARSER_MISC;
5634
    } else if ((cur == '<') && (next == '!') &&
5635
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5636
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5637
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5638
        (UPP(8) == 'E')) {
5639
        if ((!terminate) &&
5640
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5641
      goto done;
5642
#ifdef DEBUG_PUSH
5643
        xmlGenericError(xmlGenericErrorContext,
5644
          "HPP: Parsing internal subset\n");
5645
#endif
5646
        htmlParseDocTypeDecl(ctxt);
5647
        ctxt->instate = XML_PARSER_PROLOG;
5648
#ifdef DEBUG_PUSH
5649
        xmlGenericError(xmlGenericErrorContext,
5650
          "HPP: entering PROLOG\n");
5651
#endif
5652
    } else if ((cur == '<') && (next == '!') &&
5653
               (avail < 9)) {
5654
        goto done;
5655
    } else {
5656
        ctxt->instate = XML_PARSER_CONTENT;
5657
#ifdef DEBUG_PUSH
5658
        xmlGenericError(xmlGenericErrorContext,
5659
          "HPP: entering START_TAG\n");
5660
#endif
5661
    }
5662
    break;
5663
            case XML_PARSER_PROLOG:
5664
    SKIP_BLANKS;
5665
    if (in->buf == NULL)
5666
        avail = in->length - (in->cur - in->base);
5667
    else
5668
        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5669
                            (in->cur - in->base);
5670
    if (avail < 2)
5671
        goto done;
5672
    cur = in->cur[0];
5673
    next = in->cur[1];
5674
    if ((cur == '<') && (next == '!') &&
5675
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5676
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5677
      goto done;
5678
#ifdef DEBUG_PUSH
5679
        xmlGenericError(xmlGenericErrorContext,
5680
          "HPP: Parsing Comment\n");
5681
#endif
5682
        htmlParseComment(ctxt);
5683
        ctxt->instate = XML_PARSER_PROLOG;
5684
          } else if ((cur == '<') && (next == '?')) {
5685
        if ((!terminate) &&
5686
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5687
      goto done;
5688
#ifdef DEBUG_PUSH
5689
        xmlGenericError(xmlGenericErrorContext,
5690
          "HPP: Parsing PI\n");
5691
#endif
5692
        htmlParsePI(ctxt);
5693
        ctxt->instate = XML_PARSER_PROLOG;
5694
    } else if ((cur == '<') && (next == '!') &&
5695
               (avail < 4)) {
5696
        goto done;
5697
    } else {
5698
        ctxt->instate = XML_PARSER_CONTENT;
5699
#ifdef DEBUG_PUSH
5700
        xmlGenericError(xmlGenericErrorContext,
5701
          "HPP: entering START_TAG\n");
5702
#endif
5703
    }
5704
    break;
5705
            case XML_PARSER_EPILOG:
5706
    if (in->buf == NULL)
5707
        avail = in->length - (in->cur - in->base);
5708
    else
5709
        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5710
                            (in->cur - in->base);
5711
    if (avail < 1)
5712
        goto done;
5713
    cur = in->cur[0];
5714
    if (IS_BLANK_CH(cur)) {
5715
        htmlParseCharData(ctxt);
5716
        goto done;
5717
    }
5718
    if (avail < 2)
5719
        goto done;
5720
    next = in->cur[1];
5721
          if ((cur == '<') && (next == '!') &&
5722
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5723
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5724
      goto done;
5725
#ifdef DEBUG_PUSH
5726
        xmlGenericError(xmlGenericErrorContext,
5727
          "HPP: Parsing Comment\n");
5728
#endif
5729
        htmlParseComment(ctxt);
5730
        ctxt->instate = XML_PARSER_EPILOG;
5731
          } else if ((cur == '<') && (next == '?')) {
5732
        if ((!terminate) &&
5733
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5734
      goto done;
5735
#ifdef DEBUG_PUSH
5736
        xmlGenericError(xmlGenericErrorContext,
5737
          "HPP: Parsing PI\n");
5738
#endif
5739
        htmlParsePI(ctxt);
5740
        ctxt->instate = XML_PARSER_EPILOG;
5741
    } else if ((cur == '<') && (next == '!') &&
5742
               (avail < 4)) {
5743
        goto done;
5744
    } else {
5745
        ctxt->errNo = XML_ERR_DOCUMENT_END;
5746
        ctxt->wellFormed = 0;
5747
        ctxt->instate = XML_PARSER_EOF;
5748
#ifdef DEBUG_PUSH
5749
        xmlGenericError(xmlGenericErrorContext,
5750
          "HPP: entering EOF\n");
5751
#endif
5752
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5753
      ctxt->sax->endDocument(ctxt->userData);
5754
        goto done;
5755
    }
5756
    break;
5757
            case XML_PARSER_START_TAG: {
5758
          const xmlChar *name;
5759
    int failed;
5760
    const htmlElemDesc * info;
5761
5762
    /*
5763
     * no chars in buffer
5764
     */
5765
    if (avail < 1)
5766
        goto done;
5767
    /*
5768
     * not enough chars in buffer
5769
     */
5770
    if (avail < 2) {
5771
        if (!terminate)
5772
      goto done;
5773
        else
5774
      next = ' ';
5775
    } else {
5776
        next = in->cur[1];
5777
    }
5778
    cur = in->cur[0];
5779
          if (cur != '<') {
5780
        ctxt->instate = XML_PARSER_CONTENT;
5781
#ifdef DEBUG_PUSH
5782
        xmlGenericError(xmlGenericErrorContext,
5783
          "HPP: entering CONTENT\n");
5784
#endif
5785
        break;
5786
    }
5787
    if (next == '/') {
5788
        ctxt->instate = XML_PARSER_END_TAG;
5789
        ctxt->checkIndex = 0;
5790
#ifdef DEBUG_PUSH
5791
        xmlGenericError(xmlGenericErrorContext,
5792
          "HPP: entering END_TAG\n");
5793
#endif
5794
        break;
5795
    }
5796
    if ((!terminate) &&
5797
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5798
        goto done;
5799
5800
                /* Capture start position */
5801
          if (ctxt->record_info) {
5802
               node_info.begin_pos = ctxt->input->consumed +
5803
                                  (CUR_PTR - ctxt->input->base);
5804
               node_info.begin_line = ctxt->input->line;
5805
          }
5806
5807
5808
    failed = htmlParseStartTag(ctxt);
5809
    name = ctxt->name;
5810
    if ((failed == -1) ||
5811
        (name == NULL)) {
5812
        if (CUR == '>')
5813
      NEXT;
5814
        break;
5815
    }
5816
5817
    /*
5818
     * Lookup the info for that element.
5819
     */
5820
    info = htmlTagLookup(name);
5821
    if (info == NULL) {
5822
        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5823
                     "Tag %s invalid\n", name, NULL);
5824
    }
5825
5826
    /*
5827
     * Check for an Empty Element labeled the XML/SGML way
5828
     */
5829
    if ((CUR == '/') && (NXT(1) == '>')) {
5830
        SKIP(2);
5831
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5832
      ctxt->sax->endElement(ctxt->userData, name);
5833
        htmlnamePop(ctxt);
5834
        ctxt->instate = XML_PARSER_CONTENT;
5835
#ifdef DEBUG_PUSH
5836
        xmlGenericError(xmlGenericErrorContext,
5837
          "HPP: entering CONTENT\n");
5838
#endif
5839
        break;
5840
    }
5841
5842
    if (CUR == '>') {
5843
        NEXT;
5844
    } else {
5845
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5846
                     "Couldn't find end of Start Tag %s\n",
5847
         name, NULL);
5848
5849
        /*
5850
         * end of parsing of this node.
5851
         */
5852
        if (xmlStrEqual(name, ctxt->name)) {
5853
      nodePop(ctxt);
5854
      htmlnamePop(ctxt);
5855
        }
5856
5857
        if (ctxt->record_info)
5858
            htmlNodeInfoPush(ctxt, &node_info);
5859
5860
        ctxt->instate = XML_PARSER_CONTENT;
5861
#ifdef DEBUG_PUSH
5862
        xmlGenericError(xmlGenericErrorContext,
5863
          "HPP: entering CONTENT\n");
5864
#endif
5865
        break;
5866
    }
5867
5868
    /*
5869
     * Check for an Empty Element from DTD definition
5870
     */
5871
    if ((info != NULL) && (info->empty)) {
5872
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5873
      ctxt->sax->endElement(ctxt->userData, name);
5874
        htmlnamePop(ctxt);
5875
    }
5876
5877
                if (ctxt->record_info)
5878
              htmlNodeInfoPush(ctxt, &node_info);
5879
5880
    ctxt->instate = XML_PARSER_CONTENT;
5881
#ifdef DEBUG_PUSH
5882
    xmlGenericError(xmlGenericErrorContext,
5883
      "HPP: entering CONTENT\n");
5884
#endif
5885
                break;
5886
      }
5887
            case XML_PARSER_CONTENT: {
5888
    xmlChar chr[2] = { 0, 0 };
5889
5890
                /*
5891
     * Handle preparsed entities and charRef
5892
     */
5893
    if (ctxt->token != 0) {
5894
        chr[0] = ctxt->token;
5895
        htmlCheckParagraph(ctxt);
5896
        if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5897
      ctxt->sax->characters(ctxt->userData, chr, 1);
5898
        ctxt->token = 0;
5899
        ctxt->checkIndex = 0;
5900
    }
5901
    if ((avail == 1) && (terminate)) {
5902
        cur = in->cur[0];
5903
        if ((cur != '<') && (cur != '&')) {
5904
      if (ctxt->sax != NULL) {
5905
                            chr[0] = cur;
5906
          if (IS_BLANK_CH(cur)) {
5907
        if (ctxt->keepBlanks) {
5908
            if (ctxt->sax->characters != NULL)
5909
          ctxt->sax->characters(
5910
            ctxt->userData, chr, 1);
5911
        } else {
5912
            if (ctxt->sax->ignorableWhitespace != NULL)
5913
          ctxt->sax->ignorableWhitespace(
5914
            ctxt->userData, chr, 1);
5915
        }
5916
          } else {
5917
        htmlCheckParagraph(ctxt);
5918
        if (ctxt->sax->characters != NULL)
5919
            ctxt->sax->characters(
5920
              ctxt->userData, chr, 1);
5921
          }
5922
      }
5923
      ctxt->token = 0;
5924
      ctxt->checkIndex = 0;
5925
      in->cur++;
5926
      break;
5927
        }
5928
    }
5929
    if (avail < 2)
5930
        goto done;
5931
    cur = in->cur[0];
5932
    next = in->cur[1];
5933
    if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5934
        (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5935
        /*
5936
         * Handle SCRIPT/STYLE separately
5937
         */
5938
        if (!terminate) {
5939
            int idx;
5940
      xmlChar val;
5941
5942
      idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5943
      if (idx < 0)
5944
          goto done;
5945
            val = in->cur[idx + 2];
5946
      if (val == 0) /* bad cut of input */
5947
          goto done;
5948
        }
5949
        htmlParseScript(ctxt);
5950
        if ((cur == '<') && (next == '/')) {
5951
      ctxt->instate = XML_PARSER_END_TAG;
5952
      ctxt->checkIndex = 0;
5953
#ifdef DEBUG_PUSH
5954
      xmlGenericError(xmlGenericErrorContext,
5955
        "HPP: entering END_TAG\n");
5956
#endif
5957
      break;
5958
        }
5959
    } else if ((cur == '<') && (next == '!')) {
5960
                    if (avail < 4)
5961
                        goto done;
5962
                    /*
5963
                     * Sometimes DOCTYPE arrives in the middle of the document
5964
                     */
5965
                    if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5966
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5967
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5968
                        (UPP(8) == 'E')) {
5969
                        if ((!terminate) &&
5970
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5971
                            goto done;
5972
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5973
                                     "Misplaced DOCTYPE declaration\n",
5974
                                     BAD_CAST "DOCTYPE" , NULL);
5975
                        htmlParseDocTypeDecl(ctxt);
5976
                    } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5977
                        if ((!terminate) &&
5978
                            (htmlParseLookupCommentEnd(ctxt) < 0))
5979
                            goto done;
5980
#ifdef DEBUG_PUSH
5981
                        xmlGenericError(xmlGenericErrorContext,
5982
                                "HPP: Parsing Comment\n");
5983
#endif
5984
                        htmlParseComment(ctxt);
5985
                        ctxt->instate = XML_PARSER_CONTENT;
5986
                    } else {
5987
                        if ((!terminate) &&
5988
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5989
                            goto done;
5990
                        htmlSkipBogusComment(ctxt);
5991
                    }
5992
                } else if ((cur == '<') && (next == '?')) {
5993
                    if ((!terminate) &&
5994
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5995
                        goto done;
5996
#ifdef DEBUG_PUSH
5997
                    xmlGenericError(xmlGenericErrorContext,
5998
                            "HPP: Parsing PI\n");
5999
#endif
6000
                    htmlParsePI(ctxt);
6001
                    ctxt->instate = XML_PARSER_CONTENT;
6002
                } else if ((cur == '<') && (next == '/')) {
6003
                    ctxt->instate = XML_PARSER_END_TAG;
6004
                    ctxt->checkIndex = 0;
6005
#ifdef DEBUG_PUSH
6006
                    xmlGenericError(xmlGenericErrorContext,
6007
                            "HPP: entering END_TAG\n");
6008
#endif
6009
                    break;
6010
                } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
6011
                    if ((!terminate) && (next == 0))
6012
                        goto done;
6013
                    ctxt->instate = XML_PARSER_START_TAG;
6014
                    ctxt->checkIndex = 0;
6015
#ifdef DEBUG_PUSH
6016
                    xmlGenericError(xmlGenericErrorContext,
6017
                            "HPP: entering START_TAG\n");
6018
#endif
6019
                    break;
6020
                } else if (cur == '<') {
6021
                    if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6022
                        (ctxt->sax->characters != NULL))
6023
                        ctxt->sax->characters(ctxt->userData,
6024
                                              BAD_CAST "<", 1);
6025
                    NEXT;
6026
                } else {
6027
                    /*
6028
                     * check that the text sequence is complete
6029
                     * before handing out the data to the parser
6030
                     * to avoid problems with erroneous end of
6031
                     * data detection.
6032
                     */
6033
                    if ((!terminate) &&
6034
                        (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6035
                        goto done;
6036
                    ctxt->checkIndex = 0;
6037
#ifdef DEBUG_PUSH
6038
                    xmlGenericError(xmlGenericErrorContext,
6039
                            "HPP: Parsing char data\n");
6040
#endif
6041
                    while ((ctxt->instate != XML_PARSER_EOF) &&
6042
                           (cur != '<') && (in->cur < in->end)) {
6043
                        if (cur == '&') {
6044
                            htmlParseReference(ctxt);
6045
                        } else {
6046
                            htmlParseCharData(ctxt);
6047
                        }
6048
                        cur = in->cur[0];
6049
                    }
6050
    }
6051
6052
    break;
6053
      }
6054
            case XML_PARSER_END_TAG:
6055
    if (avail < 2)
6056
        goto done;
6057
    if ((!terminate) &&
6058
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6059
        goto done;
6060
    htmlParseEndTag(ctxt);
6061
    if (ctxt->nameNr == 0) {
6062
        ctxt->instate = XML_PARSER_EPILOG;
6063
    } else {
6064
        ctxt->instate = XML_PARSER_CONTENT;
6065
    }
6066
    ctxt->checkIndex = 0;
6067
#ifdef DEBUG_PUSH
6068
    xmlGenericError(xmlGenericErrorContext,
6069
      "HPP: entering CONTENT\n");
6070
#endif
6071
          break;
6072
            case XML_PARSER_CDATA_SECTION:
6073
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6074
      "HPP: internal error, state == CDATA\n",
6075
           NULL, NULL);
6076
    ctxt->instate = XML_PARSER_CONTENT;
6077
    ctxt->checkIndex = 0;
6078
#ifdef DEBUG_PUSH
6079
    xmlGenericError(xmlGenericErrorContext,
6080
      "HPP: entering CONTENT\n");
6081
#endif
6082
    break;
6083
            case XML_PARSER_DTD:
6084
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6085
      "HPP: internal error, state == DTD\n",
6086
           NULL, NULL);
6087
    ctxt->instate = XML_PARSER_CONTENT;
6088
    ctxt->checkIndex = 0;
6089
#ifdef DEBUG_PUSH
6090
    xmlGenericError(xmlGenericErrorContext,
6091
      "HPP: entering CONTENT\n");
6092
#endif
6093
    break;
6094
            case XML_PARSER_COMMENT:
6095
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6096
      "HPP: internal error, state == COMMENT\n",
6097
           NULL, NULL);
6098
    ctxt->instate = XML_PARSER_CONTENT;
6099
    ctxt->checkIndex = 0;
6100
#ifdef DEBUG_PUSH
6101
    xmlGenericError(xmlGenericErrorContext,
6102
      "HPP: entering CONTENT\n");
6103
#endif
6104
    break;
6105
            case XML_PARSER_PI:
6106
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6107
      "HPP: internal error, state == PI\n",
6108
           NULL, NULL);
6109
    ctxt->instate = XML_PARSER_CONTENT;
6110
    ctxt->checkIndex = 0;
6111
#ifdef DEBUG_PUSH
6112
    xmlGenericError(xmlGenericErrorContext,
6113
      "HPP: entering CONTENT\n");
6114
#endif
6115
    break;
6116
            case XML_PARSER_ENTITY_DECL:
6117
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6118
      "HPP: internal error, state == ENTITY_DECL\n",
6119
           NULL, NULL);
6120
    ctxt->instate = XML_PARSER_CONTENT;
6121
    ctxt->checkIndex = 0;
6122
#ifdef DEBUG_PUSH
6123
    xmlGenericError(xmlGenericErrorContext,
6124
      "HPP: entering CONTENT\n");
6125
#endif
6126
    break;
6127
            case XML_PARSER_ENTITY_VALUE:
6128
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6129
      "HPP: internal error, state == ENTITY_VALUE\n",
6130
           NULL, NULL);
6131
    ctxt->instate = XML_PARSER_CONTENT;
6132
    ctxt->checkIndex = 0;
6133
#ifdef DEBUG_PUSH
6134
    xmlGenericError(xmlGenericErrorContext,
6135
      "HPP: entering DTD\n");
6136
#endif
6137
    break;
6138
            case XML_PARSER_ATTRIBUTE_VALUE:
6139
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6140
      "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6141
           NULL, NULL);
6142
    ctxt->instate = XML_PARSER_START_TAG;
6143
    ctxt->checkIndex = 0;
6144
#ifdef DEBUG_PUSH
6145
    xmlGenericError(xmlGenericErrorContext,
6146
      "HPP: entering START_TAG\n");
6147
#endif
6148
    break;
6149
      case XML_PARSER_SYSTEM_LITERAL:
6150
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6151
        "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6152
           NULL, NULL);
6153
    ctxt->instate = XML_PARSER_CONTENT;
6154
    ctxt->checkIndex = 0;
6155
#ifdef DEBUG_PUSH
6156
    xmlGenericError(xmlGenericErrorContext,
6157
      "HPP: entering CONTENT\n");
6158
#endif
6159
    break;
6160
      case XML_PARSER_IGNORE:
6161
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6162
      "HPP: internal error, state == XML_PARSER_IGNORE\n",
6163
           NULL, NULL);
6164
    ctxt->instate = XML_PARSER_CONTENT;
6165
    ctxt->checkIndex = 0;
6166
#ifdef DEBUG_PUSH
6167
    xmlGenericError(xmlGenericErrorContext,
6168
      "HPP: entering CONTENT\n");
6169
#endif
6170
    break;
6171
      case XML_PARSER_PUBLIC_LITERAL:
6172
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6173
      "HPP: internal error, state == XML_PARSER_LITERAL\n",
6174
           NULL, NULL);
6175
    ctxt->instate = XML_PARSER_CONTENT;
6176
    ctxt->checkIndex = 0;
6177
#ifdef DEBUG_PUSH
6178
    xmlGenericError(xmlGenericErrorContext,
6179
      "HPP: entering CONTENT\n");
6180
#endif
6181
    break;
6182
6183
  }
6184
    }
6185
done:
6186
    if ((avail == 0) && (terminate)) {
6187
  htmlAutoCloseOnEnd(ctxt);
6188
  if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6189
      /*
6190
       * SAX: end of the document processing.
6191
       */
6192
      ctxt->instate = XML_PARSER_EOF;
6193
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6194
    ctxt->sax->endDocument(ctxt->userData);
6195
  }
6196
    }
6197
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6198
  ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6199
   (ctxt->instate == XML_PARSER_EPILOG))) {
6200
  xmlDtdPtr dtd;
6201
  dtd = xmlGetIntSubset(ctxt->myDoc);
6202
  if (dtd == NULL)
6203
      ctxt->myDoc->intSubset =
6204
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6205
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6206
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6207
    }
6208
#ifdef DEBUG_PUSH
6209
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6210
#endif
6211
    return(ret);
6212
}
6213
6214
/**
6215
 * htmlParseChunk:
6216
 * @ctxt:  an HTML parser context
6217
 * @chunk:  an char array
6218
 * @size:  the size in byte of the chunk
6219
 * @terminate:  last chunk indicator
6220
 *
6221
 * Parse a Chunk of memory
6222
 *
6223
 * Returns zero if no error, the xmlParserErrors otherwise.
6224
 */
6225
int
6226
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6227
              int terminate) {
6228
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
6229
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6230
         "htmlParseChunk: context error\n", NULL, NULL);
6231
  return(XML_ERR_INTERNAL_ERROR);
6232
    }
6233
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6234
        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6235
  size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6236
  size_t cur = ctxt->input->cur - ctxt->input->base;
6237
  int res;
6238
6239
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6240
        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6241
  if (res < 0) {
6242
      ctxt->errNo = XML_PARSER_EOF;
6243
      ctxt->disableSAX = 1;
6244
      return (XML_PARSER_EOF);
6245
  }
6246
#ifdef DEBUG_PUSH
6247
  xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6248
#endif
6249
6250
#if 0
6251
  if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6252
      htmlParseTryOrFinish(ctxt, terminate);
6253
#endif
6254
    } else if (ctxt->instate != XML_PARSER_EOF) {
6255
  if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6256
      xmlParserInputBufferPtr in = ctxt->input->buf;
6257
      if ((in->encoder != NULL) && (in->buffer != NULL) &&
6258
        (in->raw != NULL)) {
6259
    int nbchars;
6260
    size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6261
    size_t current = ctxt->input->cur - ctxt->input->base;
6262
6263
    nbchars = xmlCharEncInput(in, terminate);
6264
    xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6265
    if (nbchars < 0) {
6266
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6267
               "encoder error\n", NULL, NULL);
6268
        return(XML_ERR_INVALID_ENCODING);
6269
    }
6270
      }
6271
  }
6272
    }
6273
    htmlParseTryOrFinish(ctxt, terminate);
6274
    if (terminate) {
6275
  if ((ctxt->instate != XML_PARSER_EOF) &&
6276
      (ctxt->instate != XML_PARSER_EPILOG) &&
6277
      (ctxt->instate != XML_PARSER_MISC)) {
6278
      ctxt->errNo = XML_ERR_DOCUMENT_END;
6279
      ctxt->wellFormed = 0;
6280
  }
6281
  if (ctxt->instate != XML_PARSER_EOF) {
6282
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6283
    ctxt->sax->endDocument(ctxt->userData);
6284
  }
6285
  ctxt->instate = XML_PARSER_EOF;
6286
    }
6287
    return((xmlParserErrors) ctxt->errNo);
6288
}
6289
6290
/************************************************************************
6291
 *                  *
6292
 *      User entry points       *
6293
 *                  *
6294
 ************************************************************************/
6295
6296
/**
6297
 * htmlCreatePushParserCtxt:
6298
 * @sax:  a SAX handler
6299
 * @user_data:  The user data returned on SAX callbacks
6300
 * @chunk:  a pointer to an array of chars
6301
 * @size:  number of chars in the array
6302
 * @filename:  an optional file name or URI
6303
 * @enc:  an optional encoding
6304
 *
6305
 * Create a parser context for using the HTML parser in push mode
6306
 * The value of @filename is used for fetching external entities
6307
 * and error/warning reports.
6308
 *
6309
 * Returns the new parser context or NULL
6310
 */
6311
htmlParserCtxtPtr
6312
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6313
                         const char *chunk, int size, const char *filename,
6314
       xmlCharEncoding enc) {
6315
    htmlParserCtxtPtr ctxt;
6316
    htmlParserInputPtr inputStream;
6317
    xmlParserInputBufferPtr buf;
6318
6319
    xmlInitParser();
6320
6321
    buf = xmlAllocParserInputBuffer(enc);
6322
    if (buf == NULL) return(NULL);
6323
6324
    ctxt = htmlNewSAXParserCtxt(sax, user_data);
6325
    if (ctxt == NULL) {
6326
  xmlFreeParserInputBuffer(buf);
6327
  return(NULL);
6328
    }
6329
    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6330
  ctxt->charset=XML_CHAR_ENCODING_UTF8;
6331
    if (filename == NULL) {
6332
  ctxt->directory = NULL;
6333
    } else {
6334
        ctxt->directory = xmlParserGetDirectory(filename);
6335
    }
6336
6337
    inputStream = htmlNewInputStream(ctxt);
6338
    if (inputStream == NULL) {
6339
  xmlFreeParserCtxt(ctxt);
6340
  xmlFree(buf);
6341
  return(NULL);
6342
    }
6343
6344
    if (filename == NULL)
6345
  inputStream->filename = NULL;
6346
    else
6347
  inputStream->filename = (char *)
6348
      xmlCanonicPath((const xmlChar *) filename);
6349
    inputStream->buf = buf;
6350
    xmlBufResetInput(buf->buffer, inputStream);
6351
6352
    inputPush(ctxt, inputStream);
6353
6354
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6355
        (ctxt->input->buf != NULL))  {
6356
  size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6357
  size_t cur = ctxt->input->cur - ctxt->input->base;
6358
6359
  xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6360
6361
        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6362
#ifdef DEBUG_PUSH
6363
  xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6364
#endif
6365
    }
6366
    ctxt->progressive = 1;
6367
6368
    return(ctxt);
6369
}
6370
#endif /* LIBXML_PUSH_ENABLED */
6371
6372
/**
6373
 * htmlSAXParseDoc:
6374
 * @cur:  a pointer to an array of xmlChar
6375
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6376
 * @sax:  the SAX handler block
6377
 * @userData: if using SAX, this pointer will be provided on callbacks.
6378
 *
6379
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
6380
 *
6381
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6382
 * to handle parse events. If sax is NULL, fallback to the default DOM
6383
 * behavior and return a tree.
6384
 *
6385
 * Returns the resulting document tree unless SAX is NULL or the document is
6386
 *     not well formed.
6387
 */
6388
6389
htmlDocPtr
6390
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6391
0
                htmlSAXHandlerPtr sax, void *userData) {
6392
0
    htmlDocPtr ret;
6393
0
    htmlParserCtxtPtr ctxt;
6394
6395
0
    xmlInitParser();
6396
6397
0
    if (cur == NULL) return(NULL);
6398
6399
6400
0
    ctxt = htmlCreateDocParserCtxt(cur, encoding);
6401
0
    if (ctxt == NULL) return(NULL);
6402
0
    if (sax != NULL) {
6403
0
        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6404
0
        ctxt->sax = sax;
6405
0
        ctxt->userData = userData;
6406
0
    }
6407
6408
0
    htmlParseDocument(ctxt);
6409
0
    ret = ctxt->myDoc;
6410
0
    if (sax != NULL) {
6411
0
  ctxt->sax = NULL;
6412
0
  ctxt->userData = NULL;
6413
0
    }
6414
0
    htmlFreeParserCtxt(ctxt);
6415
6416
0
    return(ret);
6417
0
}
6418
6419
/**
6420
 * htmlParseDoc:
6421
 * @cur:  a pointer to an array of xmlChar
6422
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6423
 *
6424
 * parse an HTML in-memory document and build a tree.
6425
 *
6426
 * Returns the resulting document tree
6427
 */
6428
6429
htmlDocPtr
6430
0
htmlParseDoc(const xmlChar *cur, const char *encoding) {
6431
0
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6432
0
}
6433
6434
6435
/**
6436
 * htmlCreateFileParserCtxt:
6437
 * @filename:  the filename
6438
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6439
 *
6440
 * Create a parser context for a file content.
6441
 * Automatic support for ZLIB/Compress compressed document is provided
6442
 * by default if found at compile-time.
6443
 *
6444
 * Returns the new parser context or NULL
6445
 */
6446
htmlParserCtxtPtr
6447
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6448
0
{
6449
0
    htmlParserCtxtPtr ctxt;
6450
0
    htmlParserInputPtr inputStream;
6451
0
    char *canonicFilename;
6452
    /* htmlCharEncoding enc; */
6453
0
    xmlChar *content, *content_line = (xmlChar *) "charset=";
6454
6455
0
    if (filename == NULL)
6456
0
        return(NULL);
6457
6458
0
    ctxt = htmlNewParserCtxt();
6459
0
    if (ctxt == NULL) {
6460
0
  return(NULL);
6461
0
    }
6462
0
    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6463
0
    if (canonicFilename == NULL) {
6464
0
  xmlFreeParserCtxt(ctxt);
6465
0
  return(NULL);
6466
0
    }
6467
6468
0
    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6469
0
    xmlFree(canonicFilename);
6470
0
    if (inputStream == NULL) {
6471
0
  xmlFreeParserCtxt(ctxt);
6472
0
  return(NULL);
6473
0
    }
6474
6475
0
    inputPush(ctxt, inputStream);
6476
6477
    /* set encoding */
6478
0
    if (encoding) {
6479
0
        size_t l = strlen(encoding);
6480
6481
0
  if (l < 1000) {
6482
0
      content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6483
0
      if (content) {
6484
0
    strcpy ((char *)content, (char *)content_line);
6485
0
    strcat ((char *)content, (char *)encoding);
6486
0
    htmlCheckEncoding (ctxt, content);
6487
0
    xmlFree (content);
6488
0
      }
6489
0
  }
6490
0
    }
6491
6492
0
    return(ctxt);
6493
0
}
6494
6495
/**
6496
 * htmlSAXParseFile:
6497
 * @filename:  the filename
6498
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6499
 * @sax:  the SAX handler block
6500
 * @userData: if using SAX, this pointer will be provided on callbacks.
6501
 *
6502
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
6503
 *
6504
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6505
 * compressed document is provided by default if found at compile-time.
6506
 * It use the given SAX function block to handle the parsing callback.
6507
 * If sax is NULL, fallback to the default DOM tree building routines.
6508
 *
6509
 * Returns the resulting document tree unless SAX is NULL or the document is
6510
 *     not well formed.
6511
 */
6512
6513
htmlDocPtr
6514
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6515
0
                 void *userData) {
6516
0
    htmlDocPtr ret;
6517
0
    htmlParserCtxtPtr ctxt;
6518
0
    htmlSAXHandlerPtr oldsax = NULL;
6519
6520
0
    xmlInitParser();
6521
6522
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6523
0
    if (ctxt == NULL) return(NULL);
6524
0
    if (sax != NULL) {
6525
0
  oldsax = ctxt->sax;
6526
0
        ctxt->sax = sax;
6527
0
        ctxt->userData = userData;
6528
0
    }
6529
6530
0
    htmlParseDocument(ctxt);
6531
6532
0
    ret = ctxt->myDoc;
6533
0
    if (sax != NULL) {
6534
0
        ctxt->sax = oldsax;
6535
0
        ctxt->userData = NULL;
6536
0
    }
6537
0
    htmlFreeParserCtxt(ctxt);
6538
6539
0
    return(ret);
6540
0
}
6541
6542
/**
6543
 * htmlParseFile:
6544
 * @filename:  the filename
6545
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6546
 *
6547
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6548
 * compressed document is provided by default if found at compile-time.
6549
 *
6550
 * Returns the resulting document tree
6551
 */
6552
6553
htmlDocPtr
6554
0
htmlParseFile(const char *filename, const char *encoding) {
6555
0
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6556
0
}
6557
6558
/**
6559
 * htmlHandleOmittedElem:
6560
 * @val:  int 0 or 1
6561
 *
6562
 * Set and return the previous value for handling HTML omitted tags.
6563
 *
6564
 * Returns the last value for 0 for no handling, 1 for auto insertion.
6565
 */
6566
6567
int
6568
0
htmlHandleOmittedElem(int val) {
6569
0
    int old = htmlOmittedDefaultValue;
6570
6571
0
    htmlOmittedDefaultValue = val;
6572
0
    return(old);
6573
0
}
6574
6575
/**
6576
 * htmlElementAllowedHere:
6577
 * @parent: HTML parent element
6578
 * @elt: HTML element
6579
 *
6580
 * Checks whether an HTML element may be a direct child of a parent element.
6581
 * Note - doesn't check for deprecated elements
6582
 *
6583
 * Returns 1 if allowed; 0 otherwise.
6584
 */
6585
int
6586
0
htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6587
0
  const char** p ;
6588
6589
0
  if ( ! elt || ! parent || ! parent->subelts )
6590
0
  return 0 ;
6591
6592
0
  for ( p = parent->subelts; *p; ++p )
6593
0
    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6594
0
      return 1 ;
6595
6596
0
  return 0 ;
6597
0
}
6598
/**
6599
 * htmlElementStatusHere:
6600
 * @parent: HTML parent element
6601
 * @elt: HTML element
6602
 *
6603
 * Checks whether an HTML element may be a direct child of a parent element.
6604
 * and if so whether it is valid or deprecated.
6605
 *
6606
 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6607
 */
6608
htmlStatus
6609
0
htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6610
0
  if ( ! parent || ! elt )
6611
0
    return HTML_INVALID ;
6612
0
  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6613
0
    return HTML_INVALID ;
6614
6615
0
  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6616
0
}
6617
/**
6618
 * htmlAttrAllowed:
6619
 * @elt: HTML element
6620
 * @attr: HTML attribute
6621
 * @legacy: whether to allow deprecated attributes
6622
 *
6623
 * Checks whether an attribute is valid for an element
6624
 * Has full knowledge of Required and Deprecated attributes
6625
 *
6626
 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6627
 */
6628
htmlStatus
6629
0
htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6630
0
  const char** p ;
6631
6632
0
  if ( !elt || ! attr )
6633
0
  return HTML_INVALID ;
6634
6635
0
  if ( elt->attrs_req )
6636
0
    for ( p = elt->attrs_req; *p; ++p)
6637
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6638
0
        return HTML_REQUIRED ;
6639
6640
0
  if ( elt->attrs_opt )
6641
0
    for ( p = elt->attrs_opt; *p; ++p)
6642
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6643
0
        return HTML_VALID ;
6644
6645
0
  if ( legacy && elt->attrs_depr )
6646
0
    for ( p = elt->attrs_depr; *p; ++p)
6647
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6648
0
        return HTML_DEPRECATED ;
6649
6650
0
  return HTML_INVALID ;
6651
0
}
6652
/**
6653
 * htmlNodeStatus:
6654
 * @node: an htmlNodePtr in a tree
6655
 * @legacy: whether to allow deprecated elements (YES is faster here
6656
 *  for Element nodes)
6657
 *
6658
 * Checks whether the tree node is valid.  Experimental (the author
6659
 *     only uses the HTML enhancements in a SAX parser)
6660
 *
6661
 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6662
 *  legacy allowed) or htmlElementStatusHere (otherwise).
6663
 *  for Attribute nodes, a return from htmlAttrAllowed
6664
 *  for other nodes, HTML_NA (no checks performed)
6665
 */
6666
htmlStatus
6667
0
htmlNodeStatus(const htmlNodePtr node, int legacy) {
6668
0
  if ( ! node )
6669
0
    return HTML_INVALID ;
6670
6671
0
  switch ( node->type ) {
6672
0
    case XML_ELEMENT_NODE:
6673
0
      return legacy
6674
0
  ? ( htmlElementAllowedHere (
6675
0
    htmlTagLookup(node->parent->name) , node->name
6676
0
    ) ? HTML_VALID : HTML_INVALID )
6677
0
  : htmlElementStatusHere(
6678
0
    htmlTagLookup(node->parent->name) ,
6679
0
    htmlTagLookup(node->name) )
6680
0
  ;
6681
0
    case XML_ATTRIBUTE_NODE:
6682
0
      return htmlAttrAllowed(
6683
0
  htmlTagLookup(node->parent->name) , node->name, legacy) ;
6684
0
    default: return HTML_NA ;
6685
0
  }
6686
0
}
6687
/************************************************************************
6688
 *                  *
6689
 *  New set (2.6.0) of simpler and more flexible APIs   *
6690
 *                  *
6691
 ************************************************************************/
6692
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored.
 *
 * The expansion is a single statement that needs a terminating
 * semicolon at the call site, so it is safe inside an unbraced
 * if/else. @str is evaluated more than once.
 */
#define DICT_FREE(str)						\
    do {							\
	if ((str) && ((!dict) ||				\
	    (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	    xmlFree((char *)(str));				\
    } while (0)
6703
6704
/**
6705
 * htmlCtxtReset:
6706
 * @ctxt: an HTML parser context
6707
 *
6708
 * Reset a parser context
6709
 */
6710
void
6711
htmlCtxtReset(htmlParserCtxtPtr ctxt)
6712
0
{
6713
0
    xmlParserInputPtr input;
6714
0
    xmlDictPtr dict;
6715
6716
0
    if (ctxt == NULL)
6717
0
        return;
6718
6719
0
    xmlInitParser();
6720
0
    dict = ctxt->dict;
6721
6722
0
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6723
0
        xmlFreeInputStream(input);
6724
0
    }
6725
0
    ctxt->inputNr = 0;
6726
0
    ctxt->input = NULL;
6727
6728
0
    ctxt->spaceNr = 0;
6729
0
    if (ctxt->spaceTab != NULL) {
6730
0
  ctxt->spaceTab[0] = -1;
6731
0
  ctxt->space = &ctxt->spaceTab[0];
6732
0
    } else {
6733
0
  ctxt->space = NULL;
6734
0
    }
6735
6736
6737
0
    ctxt->nodeNr = 0;
6738
0
    ctxt->node = NULL;
6739
6740
0
    ctxt->nameNr = 0;
6741
0
    ctxt->name = NULL;
6742
6743
0
    ctxt->nsNr = 0;
6744
6745
0
    DICT_FREE(ctxt->version);
6746
0
    ctxt->version = NULL;
6747
0
    DICT_FREE(ctxt->encoding);
6748
0
    ctxt->encoding = NULL;
6749
0
    DICT_FREE(ctxt->directory);
6750
0
    ctxt->directory = NULL;
6751
0
    DICT_FREE(ctxt->extSubURI);
6752
0
    ctxt->extSubURI = NULL;
6753
0
    DICT_FREE(ctxt->extSubSystem);
6754
0
    ctxt->extSubSystem = NULL;
6755
0
    if (ctxt->myDoc != NULL)
6756
0
        xmlFreeDoc(ctxt->myDoc);
6757
0
    ctxt->myDoc = NULL;
6758
6759
0
    ctxt->standalone = -1;
6760
0
    ctxt->hasExternalSubset = 0;
6761
0
    ctxt->hasPErefs = 0;
6762
0
    ctxt->html = 1;
6763
0
    ctxt->external = 0;
6764
0
    ctxt->instate = XML_PARSER_START;
6765
0
    ctxt->token = 0;
6766
6767
0
    ctxt->wellFormed = 1;
6768
0
    ctxt->nsWellFormed = 1;
6769
0
    ctxt->disableSAX = 0;
6770
0
    ctxt->valid = 1;
6771
0
    ctxt->vctxt.userData = ctxt;
6772
0
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6773
0
    ctxt->vctxt.error = xmlParserValidityError;
6774
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
6775
0
    ctxt->record_info = 0;
6776
0
    ctxt->checkIndex = 0;
6777
0
    ctxt->endCheckState = 0;
6778
0
    ctxt->inSubset = 0;
6779
0
    ctxt->errNo = XML_ERR_OK;
6780
0
    ctxt->depth = 0;
6781
0
    ctxt->charset = XML_CHAR_ENCODING_NONE;
6782
0
    ctxt->catalogs = NULL;
6783
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
6784
6785
0
    if (ctxt->attsDefault != NULL) {
6786
0
        xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6787
0
        ctxt->attsDefault = NULL;
6788
0
    }
6789
0
    if (ctxt->attsSpecial != NULL) {
6790
0
        xmlHashFree(ctxt->attsSpecial, NULL);
6791
0
        ctxt->attsSpecial = NULL;
6792
0
    }
6793
6794
0
    ctxt->nbErrors = 0;
6795
0
    ctxt->nbWarnings = 0;
6796
0
    if (ctxt->lastError.code != XML_ERR_OK)
6797
0
        xmlResetError(&ctxt->lastError);
6798
0
}
6799
6800
/**
6801
 * htmlCtxtUseOptions:
6802
 * @ctxt: an HTML parser context
6803
 * @options:  a combination of htmlParserOption(s)
6804
 *
6805
 * Applies the options to the parser context
6806
 *
6807
 * Returns 0 in case of success, the set of unknown or unimplemented options
6808
 *         in case of error.
6809
 */
6810
int
6811
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6812
0
{
6813
0
    if (ctxt == NULL)
6814
0
        return(-1);
6815
6816
0
    if (options & HTML_PARSE_NOWARNING) {
6817
0
        ctxt->sax->warning = NULL;
6818
0
        ctxt->vctxt.warning = NULL;
6819
0
        options -= XML_PARSE_NOWARNING;
6820
0
  ctxt->options |= XML_PARSE_NOWARNING;
6821
0
    }
6822
0
    if (options & HTML_PARSE_NOERROR) {
6823
0
        ctxt->sax->error = NULL;
6824
0
        ctxt->vctxt.error = NULL;
6825
0
        ctxt->sax->fatalError = NULL;
6826
0
        options -= XML_PARSE_NOERROR;
6827
0
  ctxt->options |= XML_PARSE_NOERROR;
6828
0
    }
6829
0
    if (options & HTML_PARSE_PEDANTIC) {
6830
0
        ctxt->pedantic = 1;
6831
0
        options -= XML_PARSE_PEDANTIC;
6832
0
  ctxt->options |= XML_PARSE_PEDANTIC;
6833
0
    } else
6834
0
        ctxt->pedantic = 0;
6835
0
    if (options & XML_PARSE_NOBLANKS) {
6836
0
        ctxt->keepBlanks = 0;
6837
0
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6838
0
        options -= XML_PARSE_NOBLANKS;
6839
0
  ctxt->options |= XML_PARSE_NOBLANKS;
6840
0
    } else
6841
0
        ctxt->keepBlanks = 1;
6842
0
    if (options & HTML_PARSE_RECOVER) {
6843
0
        ctxt->recovery = 1;
6844
0
  options -= HTML_PARSE_RECOVER;
6845
0
    } else
6846
0
        ctxt->recovery = 0;
6847
0
    if (options & HTML_PARSE_COMPACT) {
6848
0
  ctxt->options |= HTML_PARSE_COMPACT;
6849
0
        options -= HTML_PARSE_COMPACT;
6850
0
    }
6851
0
    if (options & XML_PARSE_HUGE) {
6852
0
  ctxt->options |= XML_PARSE_HUGE;
6853
0
        options -= XML_PARSE_HUGE;
6854
0
    }
6855
0
    if (options & HTML_PARSE_NODEFDTD) {
6856
0
  ctxt->options |= HTML_PARSE_NODEFDTD;
6857
0
        options -= HTML_PARSE_NODEFDTD;
6858
0
    }
6859
0
    if (options & HTML_PARSE_IGNORE_ENC) {
6860
0
  ctxt->options |= HTML_PARSE_IGNORE_ENC;
6861
0
        options -= HTML_PARSE_IGNORE_ENC;
6862
0
    }
6863
0
    if (options & HTML_PARSE_NOIMPLIED) {
6864
0
        ctxt->options |= HTML_PARSE_NOIMPLIED;
6865
0
        options -= HTML_PARSE_NOIMPLIED;
6866
0
    }
6867
0
    ctxt->dictNames = 0;
6868
0
    ctxt->linenumbers = 1;
6869
0
    return (options);
6870
0
}
6871
6872
/**
6873
 * htmlDoRead:
6874
 * @ctxt:  an HTML parser context
6875
 * @URL:  the base URL to use for the document
6876
 * @encoding:  the document encoding, or NULL
6877
 * @options:  a combination of htmlParserOption(s)
6878
 * @reuse:  keep the context for reuse
6879
 *
6880
 * Common front-end for the htmlRead functions
6881
 *
6882
 * Returns the resulting document tree or NULL
6883
 */
6884
static htmlDocPtr
6885
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6886
          int options, int reuse)
6887
0
{
6888
0
    htmlDocPtr ret;
6889
6890
0
    htmlCtxtUseOptions(ctxt, options);
6891
0
    ctxt->html = 1;
6892
0
    if (encoding != NULL) {
6893
0
        xmlCharEncodingHandlerPtr hdlr;
6894
6895
0
  hdlr = xmlFindCharEncodingHandler(encoding);
6896
0
  if (hdlr != NULL) {
6897
0
      xmlSwitchToEncoding(ctxt, hdlr);
6898
0
      if (ctxt->input->encoding != NULL)
6899
0
        xmlFree((xmlChar *) ctxt->input->encoding);
6900
0
            ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6901
0
        }
6902
0
    }
6903
0
    if ((URL != NULL) && (ctxt->input != NULL) &&
6904
0
        (ctxt->input->filename == NULL))
6905
0
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6906
0
    htmlParseDocument(ctxt);
6907
0
    ret = ctxt->myDoc;
6908
0
    ctxt->myDoc = NULL;
6909
0
    if (!reuse) {
6910
0
        if ((ctxt->dictNames) &&
6911
0
      (ret != NULL) &&
6912
0
      (ret->dict == ctxt->dict))
6913
0
      ctxt->dict = NULL;
6914
0
  xmlFreeParserCtxt(ctxt);
6915
0
    }
6916
0
    return (ret);
6917
0
}
6918
6919
/**
6920
 * htmlReadDoc:
6921
 * @cur:  a pointer to a zero terminated string
6922
 * @URL:  the base URL to use for the document
6923
 * @encoding:  the document encoding, or NULL
6924
 * @options:  a combination of htmlParserOption(s)
6925
 *
6926
 * parse an XML in-memory document and build a tree.
6927
 *
6928
 * Returns the resulting document tree
6929
 */
6930
htmlDocPtr
6931
htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6932
0
{
6933
0
    htmlParserCtxtPtr ctxt;
6934
6935
0
    if (cur == NULL)
6936
0
        return (NULL);
6937
6938
0
    xmlInitParser();
6939
0
    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6940
0
    if (ctxt == NULL)
6941
0
        return (NULL);
6942
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6943
0
}
6944
6945
/**
6946
 * htmlReadFile:
6947
 * @filename:  a file or URL
6948
 * @encoding:  the document encoding, or NULL
6949
 * @options:  a combination of htmlParserOption(s)
6950
 *
6951
 * parse an XML file from the filesystem or the network.
6952
 *
6953
 * Returns the resulting document tree
6954
 */
6955
htmlDocPtr
6956
htmlReadFile(const char *filename, const char *encoding, int options)
6957
0
{
6958
0
    htmlParserCtxtPtr ctxt;
6959
6960
0
    xmlInitParser();
6961
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6962
0
    if (ctxt == NULL)
6963
0
        return (NULL);
6964
0
    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6965
0
}
6966
6967
/**
6968
 * htmlReadMemory:
6969
 * @buffer:  a pointer to a char array
6970
 * @size:  the size of the array
6971
 * @URL:  the base URL to use for the document
6972
 * @encoding:  the document encoding, or NULL
6973
 * @options:  a combination of htmlParserOption(s)
6974
 *
6975
 * parse an XML in-memory document and build a tree.
6976
 *
6977
 * Returns the resulting document tree
6978
 */
6979
htmlDocPtr
6980
htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6981
0
{
6982
0
    htmlParserCtxtPtr ctxt;
6983
6984
0
    xmlInitParser();
6985
0
    ctxt = htmlCreateMemoryParserCtxt(buffer, size);
6986
0
    if (ctxt == NULL)
6987
0
        return (NULL);
6988
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6989
0
}
6990
6991
/**
6992
 * htmlReadFd:
6993
 * @fd:  an open file descriptor
6994
 * @URL:  the base URL to use for the document
6995
 * @encoding:  the document encoding, or NULL
6996
 * @options:  a combination of htmlParserOption(s)
6997
 *
6998
 * parse an HTML from a file descriptor and build a tree.
6999
 * NOTE that the file descriptor will not be closed when the
7000
 *      reader is closed or reset.
7001
 *
7002
 * Returns the resulting document tree
7003
 */
7004
htmlDocPtr
7005
htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7006
0
{
7007
0
    htmlParserCtxtPtr ctxt;
7008
0
    xmlParserInputBufferPtr input;
7009
0
    htmlParserInputPtr stream;
7010
7011
0
    if (fd < 0)
7012
0
        return (NULL);
7013
7014
0
    xmlInitParser();
7015
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7016
0
    if (input == NULL)
7017
0
        return (NULL);
7018
0
    input->closecallback = NULL;
7019
0
    ctxt = htmlNewParserCtxt();
7020
0
    if (ctxt == NULL) {
7021
0
        xmlFreeParserInputBuffer(input);
7022
0
        return (NULL);
7023
0
    }
7024
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7025
0
    if (stream == NULL) {
7026
0
        xmlFreeParserInputBuffer(input);
7027
0
  htmlFreeParserCtxt(ctxt);
7028
0
        return (NULL);
7029
0
    }
7030
0
    inputPush(ctxt, stream);
7031
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
7032
0
}
7033
7034
/**
7035
 * htmlReadIO:
7036
 * @ioread:  an I/O read function
7037
 * @ioclose:  an I/O close function
7038
 * @ioctx:  an I/O handler
7039
 * @URL:  the base URL to use for the document
7040
 * @encoding:  the document encoding, or NULL
7041
 * @options:  a combination of htmlParserOption(s)
7042
 *
7043
 * parse an HTML document from I/O functions and source and build a tree.
7044
 *
7045
 * Returns the resulting document tree
7046
 */
7047
htmlDocPtr
7048
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7049
          void *ioctx, const char *URL, const char *encoding, int options)
7050
0
{
7051
0
    htmlParserCtxtPtr ctxt;
7052
0
    xmlParserInputBufferPtr input;
7053
0
    xmlParserInputPtr stream;
7054
7055
0
    if (ioread == NULL)
7056
0
        return (NULL);
7057
0
    xmlInitParser();
7058
7059
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7060
0
                                         XML_CHAR_ENCODING_NONE);
7061
0
    if (input == NULL) {
7062
0
        if (ioclose != NULL)
7063
0
            ioclose(ioctx);
7064
0
        return (NULL);
7065
0
    }
7066
0
    ctxt = htmlNewParserCtxt();
7067
0
    if (ctxt == NULL) {
7068
0
        xmlFreeParserInputBuffer(input);
7069
0
        return (NULL);
7070
0
    }
7071
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7072
0
    if (stream == NULL) {
7073
0
        xmlFreeParserInputBuffer(input);
7074
0
  xmlFreeParserCtxt(ctxt);
7075
0
        return (NULL);
7076
0
    }
7077
0
    inputPush(ctxt, stream);
7078
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
7079
0
}
7080
7081
/**
7082
 * htmlCtxtReadDoc:
7083
 * @ctxt:  an HTML parser context
7084
 * @cur:  a pointer to a zero terminated string
7085
 * @URL:  the base URL to use for the document
7086
 * @encoding:  the document encoding, or NULL
7087
 * @options:  a combination of htmlParserOption(s)
7088
 *
7089
 * parse an XML in-memory document and build a tree.
7090
 * This reuses the existing @ctxt parser context
7091
 *
7092
 * Returns the resulting document tree
7093
 */
7094
htmlDocPtr
7095
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7096
               const char *URL, const char *encoding, int options)
7097
0
{
7098
0
    if (cur == NULL)
7099
0
        return (NULL);
7100
0
    return (htmlCtxtReadMemory(ctxt, (const char *) cur, xmlStrlen(cur), URL,
7101
0
                               encoding, options));
7102
0
}
7103
7104
/**
7105
 * htmlCtxtReadFile:
7106
 * @ctxt:  an HTML parser context
7107
 * @filename:  a file or URL
7108
 * @encoding:  the document encoding, or NULL
7109
 * @options:  a combination of htmlParserOption(s)
7110
 *
7111
 * parse an XML file from the filesystem or the network.
7112
 * This reuses the existing @ctxt parser context
7113
 *
7114
 * Returns the resulting document tree
7115
 */
7116
htmlDocPtr
7117
htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7118
                const char *encoding, int options)
7119
0
{
7120
0
    xmlParserInputPtr stream;
7121
7122
0
    if (filename == NULL)
7123
0
        return (NULL);
7124
0
    if (ctxt == NULL)
7125
0
        return (NULL);
7126
0
    xmlInitParser();
7127
7128
0
    htmlCtxtReset(ctxt);
7129
7130
0
    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7131
0
    if (stream == NULL) {
7132
0
        return (NULL);
7133
0
    }
7134
0
    inputPush(ctxt, stream);
7135
0
    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7136
0
}
7137
7138
/**
7139
 * htmlCtxtReadMemory:
7140
 * @ctxt:  an HTML parser context
7141
 * @buffer:  a pointer to a char array
7142
 * @size:  the size of the array
7143
 * @URL:  the base URL to use for the document
7144
 * @encoding:  the document encoding, or NULL
7145
 * @options:  a combination of htmlParserOption(s)
7146
 *
7147
 * parse an XML in-memory document and build a tree.
7148
 * This reuses the existing @ctxt parser context
7149
 *
7150
 * Returns the resulting document tree
7151
 */
7152
htmlDocPtr
7153
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7154
                  const char *URL, const char *encoding, int options)
7155
0
{
7156
0
    xmlParserInputBufferPtr input;
7157
0
    xmlParserInputPtr stream;
7158
7159
0
    if (ctxt == NULL)
7160
0
        return (NULL);
7161
0
    if (buffer == NULL)
7162
0
        return (NULL);
7163
0
    xmlInitParser();
7164
7165
0
    htmlCtxtReset(ctxt);
7166
7167
0
    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7168
0
    if (input == NULL) {
7169
0
  return(NULL);
7170
0
    }
7171
7172
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7173
0
    if (stream == NULL) {
7174
0
  xmlFreeParserInputBuffer(input);
7175
0
  return(NULL);
7176
0
    }
7177
7178
0
    inputPush(ctxt, stream);
7179
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7180
0
}
7181
7182
/**
7183
 * htmlCtxtReadFd:
7184
 * @ctxt:  an HTML parser context
7185
 * @fd:  an open file descriptor
7186
 * @URL:  the base URL to use for the document
7187
 * @encoding:  the document encoding, or NULL
7188
 * @options:  a combination of htmlParserOption(s)
7189
 *
7190
 * parse an XML from a file descriptor and build a tree.
7191
 * This reuses the existing @ctxt parser context
7192
 *
7193
 * Returns the resulting document tree
7194
 */
7195
htmlDocPtr
7196
htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7197
              const char *URL, const char *encoding, int options)
7198
0
{
7199
0
    xmlParserInputBufferPtr input;
7200
0
    xmlParserInputPtr stream;
7201
7202
0
    if (fd < 0)
7203
0
        return (NULL);
7204
0
    if (ctxt == NULL)
7205
0
        return (NULL);
7206
0
    xmlInitParser();
7207
7208
0
    htmlCtxtReset(ctxt);
7209
7210
7211
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7212
0
    if (input == NULL)
7213
0
        return (NULL);
7214
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7215
0
    if (stream == NULL) {
7216
0
        xmlFreeParserInputBuffer(input);
7217
0
        return (NULL);
7218
0
    }
7219
0
    inputPush(ctxt, stream);
7220
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7221
0
}
7222
7223
/**
7224
 * htmlCtxtReadIO:
7225
 * @ctxt:  an HTML parser context
7226
 * @ioread:  an I/O read function
7227
 * @ioclose:  an I/O close function
7228
 * @ioctx:  an I/O handler
7229
 * @URL:  the base URL to use for the document
7230
 * @encoding:  the document encoding, or NULL
7231
 * @options:  a combination of htmlParserOption(s)
7232
 *
7233
 * parse an HTML document from I/O functions and source and build a tree.
7234
 * This reuses the existing @ctxt parser context
7235
 *
7236
 * Returns the resulting document tree
7237
 */
7238
htmlDocPtr
7239
htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7240
              xmlInputCloseCallback ioclose, void *ioctx,
7241
        const char *URL,
7242
              const char *encoding, int options)
7243
0
{
7244
0
    xmlParserInputBufferPtr input;
7245
0
    xmlParserInputPtr stream;
7246
7247
0
    if (ioread == NULL)
7248
0
        return (NULL);
7249
0
    if (ctxt == NULL)
7250
0
        return (NULL);
7251
0
    xmlInitParser();
7252
7253
0
    htmlCtxtReset(ctxt);
7254
7255
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7256
0
                                         XML_CHAR_ENCODING_NONE);
7257
0
    if (input == NULL) {
7258
0
        if (ioclose != NULL)
7259
0
            ioclose(ioctx);
7260
0
        return (NULL);
7261
0
    }
7262
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7263
0
    if (stream == NULL) {
7264
0
        xmlFreeParserInputBuffer(input);
7265
0
        return (NULL);
7266
0
    }
7267
0
    inputPush(ctxt, stream);
7268
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7269
0
}
7270
7271
#endif /* LIBXML_HTML_ENABLED */