Coverage Report

Created: 2025-07-23 08:13

/src/fontconfig/subprojects/libxml2-2.12.6/HTMLparser.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLparser.c : an HTML 4.0 non-verifying parser
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
#define IN_LIBXML
10
#include "libxml.h"
11
#ifdef LIBXML_HTML_ENABLED
12
13
#include <string.h>
14
#include <ctype.h>
15
#include <stdlib.h>
16
17
#include <libxml/HTMLparser.h>
18
#include <libxml/xmlmemory.h>
19
#include <libxml/tree.h>
20
#include <libxml/parser.h>
21
#include <libxml/parserInternals.h>
22
#include <libxml/xmlerror.h>
23
#include <libxml/HTMLtree.h>
24
#include <libxml/entities.h>
25
#include <libxml/encoding.h>
26
#include <libxml/xmlIO.h>
27
#include <libxml/uri.h>
28
29
#include "private/buf.h"
30
#include "private/enc.h"
31
#include "private/error.h"
32
#include "private/html.h"
33
#include "private/io.h"
34
#include "private/parser.h"
35
#include "private/tree.h"
36
37
#define HTML_MAX_NAMELEN 1000
38
0
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
39
0
#define HTML_PARSER_BUFFER_SIZE 100
40
41
static int htmlOmittedDefaultValue = 1;
42
43
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
44
           xmlChar end, xmlChar  end2, xmlChar end3);
45
static void htmlParseComment(htmlParserCtxtPtr ctxt);
46
47
/************************************************************************
48
 *                  *
49
 *    Some factorized error routines        *
50
 *                  *
51
 ************************************************************************/
52
53
/**
54
 * htmlErrMemory:
55
 * @ctxt:  an HTML parser context
56
 * @extra:  extra information
57
 *
58
 * Handle an out of memory error
59
 */
60
static void
61
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
62
0
{
63
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
64
0
        (ctxt->instate == XML_PARSER_EOF))
65
0
  return;
66
0
    if (ctxt != NULL) {
67
0
        ctxt->errNo = XML_ERR_NO_MEMORY;
68
0
        ctxt->instate = XML_PARSER_EOF;
69
0
        ctxt->disableSAX = 1;
70
0
    }
71
0
    if (extra)
72
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
73
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
74
0
                        NULL, NULL, 0, 0,
75
0
                        "Memory allocation failed : %s\n", extra);
76
0
    else
77
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
78
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
79
0
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
80
0
}
81
82
/**
83
 * htmlParseErr:
84
 * @ctxt:  an HTML parser context
85
 * @error:  the error number
86
 * @msg:  the error message
87
 * @str1:  string info
88
 * @str2:  string info
89
 *
90
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
91
 */
92
static void LIBXML_ATTR_FORMAT(3,0)
93
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
94
             const char *msg, const xmlChar *str1, const xmlChar *str2)
95
0
{
96
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
97
0
        (ctxt->instate == XML_PARSER_EOF))
98
0
  return;
99
0
    if (ctxt != NULL)
100
0
  ctxt->errNo = error;
101
0
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
102
0
                    XML_ERR_ERROR, NULL, 0,
103
0
        (const char *) str1, (const char *) str2,
104
0
        NULL, 0, 0,
105
0
        msg, str1, str2);
106
0
    if (ctxt != NULL)
107
0
  ctxt->wellFormed = 0;
108
0
}
109
110
/**
111
 * htmlParseErrInt:
112
 * @ctxt:  an HTML parser context
113
 * @error:  the error number
114
 * @msg:  the error message
115
 * @val:  integer info
116
 *
117
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
118
 */
119
static void LIBXML_ATTR_FORMAT(3,0)
120
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
121
             const char *msg, int val)
122
0
{
123
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
124
0
        (ctxt->instate == XML_PARSER_EOF))
125
0
  return;
126
0
    if (ctxt != NULL)
127
0
  ctxt->errNo = error;
128
0
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
129
0
                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
130
0
        NULL, val, 0, msg, val);
131
0
    if (ctxt != NULL)
132
0
  ctxt->wellFormed = 0;
133
0
}
134
135
/************************************************************************
136
 *                  *
137
 *  Parser stacks related functions and macros    *
138
 *                  *
139
 ************************************************************************/
140
141
/**
142
 * htmlnamePush:
143
 * @ctxt:  an HTML parser context
144
 * @value:  the element name
145
 *
146
 * Pushes a new element name on top of the name stack
147
 *
148
 * Returns -1 in case of error, the index in the stack otherwise
149
 */
150
static int
151
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
152
0
{
153
0
    if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
154
0
        ctxt->html = 3;
155
0
    if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
156
0
        ctxt->html = 10;
157
0
    if (ctxt->nameNr >= ctxt->nameMax) {
158
0
        size_t newSize = ctxt->nameMax * 2;
159
0
        const xmlChar **tmp;
160
161
0
        tmp = xmlRealloc((xmlChar **) ctxt->nameTab,
162
0
                         newSize * sizeof(ctxt->nameTab[0]));
163
0
        if (tmp == NULL) {
164
0
            htmlErrMemory(ctxt, NULL);
165
0
            return (-1);
166
0
        }
167
0
        ctxt->nameTab = tmp;
168
0
        ctxt->nameMax = newSize;
169
0
    }
170
0
    ctxt->nameTab[ctxt->nameNr] = value;
171
0
    ctxt->name = value;
172
0
    return (ctxt->nameNr++);
173
0
}
174
/**
175
 * htmlnamePop:
176
 * @ctxt: an HTML parser context
177
 *
178
 * Pops the top element name from the name stack
179
 *
180
 * Returns the name just removed
181
 */
182
static const xmlChar *
183
htmlnamePop(htmlParserCtxtPtr ctxt)
184
0
{
185
0
    const xmlChar *ret;
186
187
0
    if (ctxt->nameNr <= 0)
188
0
        return (NULL);
189
0
    ctxt->nameNr--;
190
0
    if (ctxt->nameNr < 0)
191
0
        return (NULL);
192
0
    if (ctxt->nameNr > 0)
193
0
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
194
0
    else
195
0
        ctxt->name = NULL;
196
0
    ret = ctxt->nameTab[ctxt->nameNr];
197
0
    ctxt->nameTab[ctxt->nameNr] = NULL;
198
0
    return (ret);
199
0
}
200
201
/**
202
 * htmlNodeInfoPush:
203
 * @ctxt:  an HTML parser context
204
 * @value:  the node info
205
 *
206
 * Pushes a copy of a node info record on top of the node info stack
207
 *
208
 * Returns 0 in case of error, the index in the stack otherwise
209
 */
210
static int
211
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
212
0
{
213
0
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
214
0
        if (ctxt->nodeInfoMax == 0)
215
0
                ctxt->nodeInfoMax = 5;
216
0
        ctxt->nodeInfoMax *= 2;
217
0
        ctxt->nodeInfoTab = (htmlParserNodeInfo *)
218
0
                         xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
219
0
                                    ctxt->nodeInfoMax *
220
0
                                    sizeof(ctxt->nodeInfoTab[0]));
221
0
        if (ctxt->nodeInfoTab == NULL) {
222
0
            htmlErrMemory(ctxt, NULL);
223
0
            return (0);
224
0
        }
225
0
    }
226
0
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
227
0
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
228
0
    return (ctxt->nodeInfoNr++);
229
0
}
230
231
/**
232
 * htmlNodeInfoPop:
233
 * @ctxt:  an HTML parser context
234
 *
235
 * Pops the top node info record from the node info stack
236
 *
237
 * Returns NULL if the stack is empty, the pointer to the popped NodeInfo otherwise
238
 */
239
static htmlParserNodeInfo *
240
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
241
0
{
242
0
    if (ctxt->nodeInfoNr <= 0)
243
0
        return (NULL);
244
0
    ctxt->nodeInfoNr--;
245
0
    if (ctxt->nodeInfoNr < 0)
246
0
        return (NULL);
247
0
    if (ctxt->nodeInfoNr > 0)
248
0
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
249
0
    else
250
0
        ctxt->nodeInfo = NULL;
251
0
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
252
0
}
253
254
/*
255
 * Macros for accessing the content. Those should be used only by the parser,
256
 * and not exported.
257
 *
258
 * Dirty macros, i.e. one need to make assumption on the context to use them
259
 *
260
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
261
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
262
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
263
 *           in UNICODE mode. This should be used internally by the parser
264
 *           only to compare to ASCII values otherwise it would break when
265
 *           running with UTF-8 encoding.
266
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
267
 *           to compare on ASCII based substring.
268
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
269
 *           it should be used only to compare on ASCII based substring.
270
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
271
 *           strings without newlines within the parser.
272
 *
273
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
274
 *
275
 *   NEXT    Skip to the next character, this does the proper decoding
276
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
277
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
278
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
279
 */
280
281
0
#define UPPER (toupper(*ctxt->input->cur))
282
283
0
#define SKIP(val) (ctxt->input->cur += (val), ctxt->input->col += (val)) /* parenthesized: safe inside larger expressions */
284
285
0
#define NXT(val) ctxt->input->cur[(val)]
286
287
0
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
288
289
0
#define CUR_PTR ctxt->input->cur
290
0
#define BASE_PTR ctxt->input->base
291
292
0
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
293
0
       (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
294
0
  xmlParserShrink(ctxt)
295
296
0
#define GROW if ((ctxt->progressive == 0) &&       \
297
0
     (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))  \
298
0
  xmlParserGrow(ctxt)
299
300
0
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
301
302
/* Imported from XML */
303
304
0
#define CUR (*ctxt->input->cur)
305
0
#define NEXT xmlNextChar(ctxt)
306
307
0
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
308
309
310
0
#define NEXTL(l) do {             \
311
0
    if (*(ctxt->input->cur) == '\n') {         \
312
0
  ctxt->input->line++; ctxt->input->col = 1;      \
313
0
    } else ctxt->input->col++;           \
314
0
    ctxt->token = 0; ctxt->input->cur += l;       \
315
0
  } while (0)
316
317
/************
318
    \
319
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
320
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
321
 ************/
322
323
0
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
324
325
#define COPY_BUF(l,b,i,v)           \
326
0
    if (l == 1) b[i++] = v;           \
327
0
    else i += xmlCopyChar(l,&b[i],v)
328
329
/**
330
 * htmlFindEncoding:
331
 * @ctxt:  the HTML parser context
332
 *
333
 * Try to find an encoding in the current data available in the input
334
 * buffer this is needed to try to switch to the proper encoding when
335
 * one faces a character error.
336
 * That's an heuristic, since it's operating outside of parsing it could
337
 * try to use a meta which had been commented out, that's the reason it
338
 * should only be used in case of error, not as a default.
339
 *
340
 * Returns an encoding string or NULL if not found, the string need to
341
 *   be freed
342
 */
343
static xmlChar *
344
0
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
345
0
    const xmlChar *start, *cur, *end;
346
347
0
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
348
0
        (ctxt->input->flags & XML_INPUT_HAS_ENCODING))
349
0
        return(NULL);
350
0
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
351
0
        return(NULL);
352
353
0
    start = ctxt->input->cur;
354
0
    end = ctxt->input->end;
355
    /* we also expect the input buffer to be zero terminated */
356
0
    if (*end != 0)
357
0
        return(NULL);
358
359
0
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
360
0
    if (cur == NULL)
361
0
        return(NULL);
362
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
363
0
    if (cur == NULL)
364
0
        return(NULL);
365
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
366
0
    if (cur == NULL)
367
0
        return(NULL);
368
0
    cur += 8;
369
0
    start = cur;
370
0
    while (((*cur >= 'A') && (*cur <= 'Z')) ||
371
0
           ((*cur >= 'a') && (*cur <= 'z')) ||
372
0
           ((*cur >= '0') && (*cur <= '9')) ||
373
0
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
374
0
           cur++;
375
0
    if (cur == start)
376
0
        return(NULL);
377
0
    return(xmlStrndup(start, cur - start));
378
0
}
379
380
/**
381
 * htmlCurrentChar:
382
 * @ctxt:  the HTML parser context
383
 * @len:  pointer to the length of the char read
384
 *
385
 * The current char value, if using UTF-8 this may actually span multiple
386
 * bytes in the input buffer. Implement the end of line normalization:
387
 * 2.11 End-of-Line Handling
388
 * If the encoding is unspecified, in the case we find an ISO-Latin-1
389
 * char, then the encoding converter is plugged in automatically.
390
 *
391
 * Returns the current char value and its length
392
 */
393
394
static int
395
0
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
396
0
    const unsigned char *cur;
397
0
    unsigned char c;
398
0
    unsigned int val;
399
400
0
    if (ctxt->instate == XML_PARSER_EOF)
401
0
  return(0);
402
403
0
    if (ctxt->token != 0) {
404
0
  *len = 0;
405
0
  return(ctxt->token);
406
0
    }
407
408
0
    if (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK) {
409
0
        xmlParserGrow(ctxt);
410
0
        if (ctxt->instate == XML_PARSER_EOF)
411
0
            return(0);
412
0
    }
413
414
0
    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) {
415
0
        xmlChar * guess;
416
0
        xmlCharEncodingHandlerPtr handler;
417
418
        /*
419
         * Assume it's a fixed length encoding (1) with
420
         * a compatible encoding for the ASCII set, since
421
         * HTML constructs only use < 128 chars
422
         */
423
0
        if (*ctxt->input->cur < 0x80) {
424
0
            *len = 1;
425
0
            if ((*ctxt->input->cur == 0) &&
426
0
                (ctxt->input->cur < ctxt->input->end)) {
427
0
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
428
0
                                "Char 0x%X out of allowed range\n", 0);
429
0
                return(' ');
430
0
            }
431
0
            return(*ctxt->input->cur);
432
0
        }
433
434
        /*
435
         * Humm this is bad, do an automatic flow conversion
436
         */
437
0
        guess = htmlFindEncoding(ctxt);
438
0
        if (guess == NULL) {
439
0
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
440
0
        } else {
441
0
            handler = xmlFindCharEncodingHandler((const char *) guess);
442
0
            if (handler != NULL) {
443
                /*
444
                 * Don't use UTF-8 encoder which isn't required and
445
                 * can produce invalid UTF-8.
446
                 */
447
0
                if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
448
0
                    xmlSwitchToEncoding(ctxt, handler);
449
0
            } else {
450
0
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
451
0
                             "Unsupported encoding %s", guess, NULL);
452
0
            }
453
0
            xmlFree(guess);
454
0
        }
455
0
        ctxt->input->flags |= XML_INPUT_HAS_ENCODING;
456
0
    }
457
458
    /*
459
     * We are supposed to handle UTF8, check it's valid
460
     * From rfc2044: encoding of the Unicode values on UTF-8:
461
     *
462
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
463
     * 0000 0000-0000 007F   0xxxxxxx
464
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
465
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
466
     *
467
     * Check for the 0x110000 limit too
468
     */
469
0
    cur = ctxt->input->cur;
470
0
    c = *cur;
471
0
    if (c & 0x80) {
472
0
        size_t avail;
473
474
0
        if ((c & 0x40) == 0)
475
0
            goto encoding_error;
476
477
0
        avail = ctxt->input->end - ctxt->input->cur;
478
479
0
        if ((avail < 2) || ((cur[1] & 0xc0) != 0x80))
480
0
            goto encoding_error;
481
0
        if ((c & 0xe0) == 0xe0) {
482
0
            if ((avail < 3) || ((cur[2] & 0xc0) != 0x80))
483
0
                goto encoding_error;
484
0
            if ((c & 0xf0) == 0xf0) {
485
0
                if (((c & 0xf8) != 0xf0) ||
486
0
                    (avail < 4) || ((cur[3] & 0xc0) != 0x80))
487
0
                    goto encoding_error;
488
                /* 4-byte code */
489
0
                *len = 4;
490
0
                val = (cur[0] & 0x7) << 18;
491
0
                val |= (cur[1] & 0x3f) << 12;
492
0
                val |= (cur[2] & 0x3f) << 6;
493
0
                val |= cur[3] & 0x3f;
494
0
                if (val < 0x10000)
495
0
                    goto encoding_error;
496
0
            } else {
497
              /* 3-byte code */
498
0
                *len = 3;
499
0
                val = (cur[0] & 0xf) << 12;
500
0
                val |= (cur[1] & 0x3f) << 6;
501
0
                val |= cur[2] & 0x3f;
502
0
                if (val < 0x800)
503
0
                    goto encoding_error;
504
0
            }
505
0
        } else {
506
          /* 2-byte code */
507
0
            *len = 2;
508
0
            val = (cur[0] & 0x1f) << 6;
509
0
            val |= cur[1] & 0x3f;
510
0
            if (val < 0x80)
511
0
                goto encoding_error;
512
0
        }
513
0
        if (!IS_CHAR(val)) {
514
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
515
0
                            "Char 0x%X out of allowed range\n", val);
516
0
        }
517
0
        return(val);
518
0
    } else {
519
0
        if ((*ctxt->input->cur == 0) &&
520
0
            (ctxt->input->cur < ctxt->input->end)) {
521
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
522
0
                            "Char 0x%X out of allowed range\n", 0);
523
0
            *len = 1;
524
0
            return(' ');
525
0
        }
526
        /* 1-byte code */
527
0
        *len = 1;
528
0
        return(*ctxt->input->cur);
529
0
    }
530
531
0
encoding_error:
532
0
    {
533
0
        char buffer[150];
534
535
0
  if (ctxt->input->end - ctxt->input->cur >= 4) {
536
0
      snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
537
0
          ctxt->input->cur[0], ctxt->input->cur[1],
538
0
          ctxt->input->cur[2], ctxt->input->cur[3]);
539
0
  } else {
540
0
      snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
541
0
  }
542
0
  htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
543
0
         "Input is not proper UTF-8, indicate encoding !\n",
544
0
         BAD_CAST buffer, NULL);
545
0
    }
546
547
0
    if ((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0)
548
0
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
549
0
    *len = 1;
550
0
    return(*ctxt->input->cur);
551
0
}
552
553
/**
554
 * htmlSkipBlankChars:
555
 * @ctxt:  the HTML parser context
556
 *
557
 * Skip all blank characters found at that point in the input stream.
558
 *
559
 * Returns the number of space chars skipped
560
 */
561
562
static int
563
0
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
564
0
    int res = 0;
565
566
0
    while (IS_BLANK_CH(*(ctxt->input->cur))) {
567
0
        if (*(ctxt->input->cur) == '\n') {
568
0
            ctxt->input->line++; ctxt->input->col = 1;
569
0
        } else ctxt->input->col++;
570
0
        ctxt->input->cur++;
571
0
        if (*ctxt->input->cur == 0)
572
0
            xmlParserGrow(ctxt);
573
0
  if (res < INT_MAX)
574
0
      res++;
575
0
    }
576
0
    return(res);
577
0
}
578
579
580
581
/************************************************************************
582
 *                  *
583
 *  The list of HTML elements and their properties    *
584
 *                  *
585
 ************************************************************************/
586
587
/*
588
 *  Start Tag: 1 means the start tag can be omitted
589
 *  End Tag:   1 means the end tag can be omitted
590
 *             2 means it's forbidden (empty elements)
591
 *             3 means the tag is stylistic and should be closed easily
592
 *  Depr:      this element is deprecated
593
 *  DTD:       1 means that this element is valid only in the Loose DTD
594
 *             2 means that this element is valid only in the Frameset DTD
595
 *
596
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
597
  , subElements , impliedsubelt , Attributes, userdata
598
 */
599
600
/* Definitions and a couple of vars for HTML Elements */
601
602
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
603
#define NB_FONTSTYLE 8
604
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
605
#define NB_PHRASE 10
606
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
607
#define NB_SPECIAL 16
608
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
609
#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
610
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
611
#define NB_BLOCK NB_HEADING + NB_LIST + 14
612
#define FORMCTRL "input", "select", "textarea", "label", "button"
613
#define NB_FORMCTRL 5
614
#define PCDATA
615
#define NB_PCDATA 0
616
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
617
#define NB_HEADING 6
618
#define LIST "ul", "ol", "dir", "menu"
619
#define NB_LIST 4
620
#define MODIFIER
621
#define NB_MODIFIER 0
622
#define FLOW BLOCK,INLINE
623
#define NB_FLOW NB_BLOCK + NB_INLINE
624
#define EMPTY NULL
625
626
627
static const char* const html_flow[] = { FLOW, NULL } ;
628
static const char* const html_inline[] = { INLINE, NULL } ;
629
630
/* placeholders: elts with content but no subelements */
631
static const char* const html_pcdata[] = { NULL } ;
632
#define html_cdata html_pcdata
633
634
635
/* ... and for HTML Attributes */
636
637
#define COREATTRS "id", "class", "style", "title"
638
#define NB_COREATTRS 4
639
#define I18N "lang", "dir"
640
#define NB_I18N 2
641
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
642
#define NB_EVENTS 9
643
#define ATTRS COREATTRS,I18N,EVENTS
644
#define NB_ATTRS NB_COREATTRS + NB_I18N + NB_EVENTS
645
#define CELLHALIGN "align", "char", "charoff"
646
#define NB_CELLHALIGN 3
647
#define CELLVALIGN "valign"
648
#define NB_CELLVALIGN 1
649
650
static const char* const html_attrs[] = { ATTRS, NULL } ;
651
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
652
static const char* const core_attrs[] = { COREATTRS, NULL } ;
653
static const char* const i18n_attrs[] = { I18N, NULL } ;
654
655
656
/* Other declarations that should go inline ... */
657
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
658
  "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
659
  "tabindex", "onfocus", "onblur", NULL } ;
660
static const char* const target_attr[] = { "target", NULL } ;
661
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
662
static const char* const alt_attr[] = { "alt", NULL } ;
663
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
664
static const char* const href_attrs[] = { "href", NULL } ;
665
static const char* const clear_attrs[] = { "clear", NULL } ;
666
static const char* const inline_p[] = { INLINE, "p", NULL } ;
667
668
static const char* const flow_param[] = { FLOW, "param", NULL } ;
669
static const char* const applet_attrs[] = { COREATTRS , "codebase",
670
    "archive", "alt", "name", "height", "width", "align",
671
    "hspace", "vspace", NULL } ;
672
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
673
  "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
674
static const char* const basefont_attrs[] =
675
  { "id", "size", "color", "face", NULL } ;
676
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
677
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
678
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
679
static const char* const body_depr[] = { "background", "bgcolor", "text",
680
  "link", "vlink", "alink", NULL } ;
681
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
682
  "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
683
684
685
static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
686
static const char* const col_elt[] = { "col", NULL } ;
687
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
688
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
689
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
690
static const char* const compact_attr[] = { "compact", NULL } ;
691
static const char* const label_attr[] = { "label", NULL } ;
692
static const char* const fieldset_contents[] = { FLOW, "legend", NULL } ; /* NULL sentinel terminates the list */
693
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
694
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
695
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
696
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
697
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
698
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
699
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
700
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
701
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
702
static const char* const version_attr[] = { "version", NULL } ;
703
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
704
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
705
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
706
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
707
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
708
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
709
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
710
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
711
static const char* const align_attr[] = { "align", NULL } ;
712
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
713
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
714
static const char* const name_attr[] = { "name", NULL } ;
715
static const char* const action_attr[] = { "action", NULL } ;
716
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
717
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
718
static const char* const content_attr[] = { "content", NULL } ;
719
static const char* const type_attr[] = { "type", NULL } ;
720
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
721
static const char* const object_contents[] = { FLOW, "param", NULL } ;
722
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
723
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
724
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
725
static const char* const option_elt[] = { "option", NULL } ;
726
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
727
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
728
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
729
static const char* const width_attr[] = { "width", NULL } ;
730
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
731
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
732
static const char* const language_attr[] = { "language", NULL } ;
733
static const char* const select_content[] = { "optgroup", "option", NULL } ;
734
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
735
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
736
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
737
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
738
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
739
static const char* const tr_elt[] = { "tr", NULL } ;
740
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
741
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
742
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
743
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
744
static const char* const tr_contents[] = { "th", "td", NULL } ;
745
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
746
static const char* const li_elt[] = { "li", NULL } ;
747
static const char* const ul_depr[] = { "type", "compact", NULL} ;
748
static const char* const dir_attr[] = { "dir", NULL} ;
749
750
#define DECL (const char**)
751
752
static const htmlElemDesc
753
html40ElementTable[] = {
754
{ "a",    0, 0, 0, 0, 0, 0, 1, "anchor ",
755
  DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
756
},
757
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
758
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
759
},
760
{ "acronym",  0, 0, 0, 0, 0, 0, 1, "",
761
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
762
},
763
{ "address",  0, 0, 0, 0, 0, 0, 0, "information on author ",
764
  DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
765
},
766
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
767
  DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
768
},
769
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
770
  EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
771
},
772
{ "b",    0, 3, 0, 0, 0, 0, 1, "bold text style",
773
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
774
},
775
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
776
  EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
777
},
778
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
779
  EMPTY , NULL , NULL, DECL basefont_attrs, NULL
780
},
781
{ "bdo",  0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
782
  DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
783
},
784
{ "big",  0, 3, 0, 0, 0, 0, 1, "large text style",
785
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
786
},
787
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
788
  DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
789
},
790
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
791
  DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
792
},
793
{ "br",   0, 2, 2, 1, 0, 0, 1, "forced line break ",
794
  EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
795
},
796
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
797
  DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
798
},
799
{ "caption",  0, 0, 0, 0, 0, 0, 0, "table caption ",
800
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
801
},
802
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
803
  DECL html_flow , NULL , NULL, DECL html_attrs, NULL
804
},
805
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
806
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
807
},
808
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
809
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
810
},
811
{ "col",  0, 2, 2, 1, 0, 0, 0, "table column ",
812
  EMPTY , NULL , DECL col_attrs , NULL, NULL
813
},
814
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
815
  DECL col_elt , "col" , DECL col_attrs , NULL, NULL
816
},
817
{ "dd",   0, 1, 0, 0, 0, 0, 0, "definition description ",
818
  DECL html_flow , NULL , DECL html_attrs, NULL, NULL
819
},
820
{ "del",  0, 0, 0, 0, 0, 0, 2, "deleted text ",
821
  DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
822
},
823
{ "dfn",  0, 0, 0, 0, 0, 0, 1, "instance definition",
824
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
825
},
826
{ "dir",  0, 0, 0, 0, 1, 1, 0, "directory list",
827
  DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
828
},
829
{ "div",  0, 0, 0, 0, 0, 0, 0, "generic language/style container",
830
  DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
831
},
832
{ "dl",   0, 0, 0, 0, 0, 0, 0, "definition list ",
833
  DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
834
},
835
{ "dt",   0, 1, 0, 0, 0, 0, 0, "definition term ",
836
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
837
},
838
{ "em",   0, 3, 0, 0, 0, 0, 1, "emphasis",
839
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
840
},
841
{ "embed",  0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
842
  EMPTY, NULL, DECL embed_attrs, NULL, NULL
843
},
844
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
845
  DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
846
},
847
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
848
  DECL html_inline, NULL, NULL, DECL font_attrs, NULL
849
},
850
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
851
  DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
852
},
853
{ "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
854
  EMPTY, NULL, NULL, DECL frame_attrs, NULL
855
},
856
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
857
  DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
858
},
859
{ "h1",   0, 0, 0, 0, 0, 0, 0, "heading ",
860
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
861
},
862
{ "h2",   0, 0, 0, 0, 0, 0, 0, "heading ",
863
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
864
},
865
{ "h3",   0, 0, 0, 0, 0, 0, 0, "heading ",
866
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
867
},
868
{ "h4",   0, 0, 0, 0, 0, 0, 0, "heading ",
869
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
870
},
871
{ "h5",   0, 0, 0, 0, 0, 0, 0, "heading ",
872
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
873
},
874
{ "h6",   0, 0, 0, 0, 0, 0, 0, "heading ",
875
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
876
},
877
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
878
  DECL head_contents, NULL, DECL head_attrs, NULL, NULL
879
},
880
{ "hr",   0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
881
  EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
882
},
883
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
884
  DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
885
},
886
{ "i",    0, 3, 0, 0, 0, 0, 1, "italic text style",
887
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
888
},
889
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
890
  DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
891
},
892
{ "img",  0, 2, 2, 1, 0, 0, 1, "embedded image ",
893
  EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
894
},
895
{ "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
896
  EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
897
},
898
{ "ins",  0, 0, 0, 0, 0, 0, 2, "inserted text",
899
  DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
900
},
901
{ "isindex",  0, 2, 2, 1, 1, 1, 0, "single line prompt ",
902
  EMPTY, NULL, NULL, DECL prompt_attrs, NULL
903
},
904
{ "kbd",  0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
905
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
906
},
907
{ "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
908
  DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
909
},
910
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
911
  DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
912
},
913
{ "li",   0, 1, 1, 0, 0, 0, 0, "list item ",
914
  DECL html_flow, NULL, DECL html_attrs, NULL, NULL
915
},
916
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
917
  EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
918
},
919
{ "map",  0, 0, 0, 0, 0, 0, 2, "client-side image map ",
920
  DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
921
},
922
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
923
  DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
924
},
925
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
926
  EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
927
},
928
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
929
  DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
930
},
931
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
932
  DECL html_flow, "div", DECL html_attrs, NULL, NULL
933
},
934
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
935
  DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
936
},
937
{ "ol",   0, 0, 0, 0, 0, 0, 0, "ordered list ",
938
  DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
939
},
940
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
941
  DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
942
},
943
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
944
  DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
945
},
946
{ "p",    0, 1, 0, 0, 0, 0, 0, "paragraph ",
947
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
948
},
949
{ "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
950
  EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
951
},
952
{ "pre",  0, 0, 0, 0, 0, 0, 0, "preformatted text ",
953
  DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
954
},
955
{ "q",    0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
956
  DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
957
},
958
{ "s",    0, 3, 0, 0, 1, 1, 1, "strike-through text style",
959
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
960
},
961
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
962
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
963
},
964
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
965
  DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
966
},
967
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
968
  DECL select_content, NULL, DECL select_attrs, NULL, NULL
969
},
970
{ "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
971
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
972
},
973
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
974
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
975
},
976
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
977
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
978
},
979
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
980
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
981
},
982
{ "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
983
  DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
984
},
985
{ "sub",  0, 3, 0, 0, 0, 0, 1, "subscript",
986
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
987
},
988
{ "sup",  0, 3, 0, 0, 0, 0, 1, "superscript ",
989
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
990
},
991
{ "table",  0, 0, 0, 0, 0, 0, 0, "",
992
  DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
993
},
994
{ "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
995
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
996
},
997
{ "td",   0, 0, 0, 0, 0, 0, 0, "table data cell",
998
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
999
},
1000
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1001
  DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1002
},
1003
{ "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
1004
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1005
},
1006
{ "th",   0, 1, 0, 0, 0, 0, 0, "table header cell",
1007
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1008
},
1009
{ "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
1010
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1011
},
1012
{ "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
1013
  DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1014
},
1015
{ "tr",   0, 0, 0, 0, 0, 0, 0, "table row ",
1016
  DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1017
},
1018
{ "tt",   0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1019
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1020
},
1021
{ "u",    0, 3, 0, 0, 1, 1, 1, "underlined text style",
1022
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1023
},
1024
{ "ul",   0, 0, 0, 0, 0, 0, 0, "unordered list ",
1025
  DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1026
},
1027
{ "var",  0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1028
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1029
}
1030
};
1031
1032
/*
 * One auto-close rule: seeing a start tag named newTag closes a currently
 * open element named oldTag.
 */
typedef struct {
    const char *oldTag;     /* name of the element that is currently open */
    const char *newTag;     /* name of the start tag that closes oldTag */
} htmlStartCloseEntry;
1036
1037
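/*
 * Start tags that imply the end of the current element: each entry means
 * that an open oldTag element is closed when a newTag start tag is seen.
 *
 * htmlCheckAutoClose() searches this table with bsearch(), so the entries
 * must stay sorted by oldTag and then by newTag, in strcmp() order.
 */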
1040
static const htmlStartCloseEntry htmlStartClose[] = {
1041
    { "a", "a" },
1042
    { "a", "fieldset" },
1043
    { "a", "table" },
1044
    { "a", "td" },
1045
    { "a", "th" },
1046
    { "address", "dd" },
1047
    { "address", "dl" },
1048
    { "address", "dt" },
1049
    { "address", "form" },
1050
    { "address", "li" },
1051
    { "address", "ul" },
1052
    { "b", "center" },
1053
    { "b", "p" },
1054
    { "b", "td" },
1055
    { "b", "th" },
1056
    { "big", "p" },
1057
    { "caption", "col" },
1058
    { "caption", "colgroup" },
1059
    { "caption", "tbody" },
1060
    { "caption", "tfoot" },
1061
    { "caption", "thead" },
1062
    { "caption", "tr" },
1063
    { "col", "col" },
1064
    { "col", "colgroup" },
1065
    { "col", "tbody" },
1066
    { "col", "tfoot" },
1067
    { "col", "thead" },
1068
    { "col", "tr" },
1069
    { "colgroup", "colgroup" },
1070
    { "colgroup", "tbody" },
1071
    { "colgroup", "tfoot" },
1072
    { "colgroup", "thead" },
1073
    { "colgroup", "tr" },
1074
    { "dd", "dt" },
1075
    { "dir", "dd" },
1076
    { "dir", "dl" },
1077
    { "dir", "dt" },
1078
    { "dir", "form" },
1079
    { "dir", "ul" },
1080
    { "dl", "form" },
1081
    { "dl", "li" },
1082
    { "dt", "dd" },
1083
    { "dt", "dl" },
1084
    { "font", "center" },
1085
    { "font", "td" },
1086
    { "font", "th" },
1087
    { "form", "form" },
1088
    { "h1", "fieldset" },
1089
    { "h1", "form" },
1090
    { "h1", "li" },
1091
    { "h1", "p" },
1092
    { "h1", "table" },
1093
    { "h2", "fieldset" },
1094
    { "h2", "form" },
1095
    { "h2", "li" },
1096
    { "h2", "p" },
1097
    { "h2", "table" },
1098
    { "h3", "fieldset" },
1099
    { "h3", "form" },
1100
    { "h3", "li" },
1101
    { "h3", "p" },
1102
    { "h3", "table" },
1103
    { "h4", "fieldset" },
1104
    { "h4", "form" },
1105
    { "h4", "li" },
1106
    { "h4", "p" },
1107
    { "h4", "table" },
1108
    { "h5", "fieldset" },
1109
    { "h5", "form" },
1110
    { "h5", "li" },
1111
    { "h5", "p" },
1112
    { "h5", "table" },
1113
    { "h6", "fieldset" },
1114
    { "h6", "form" },
1115
    { "h6", "li" },
1116
    { "h6", "p" },
1117
    { "h6", "table" },
1118
    { "head", "a" },
1119
    { "head", "abbr" },
1120
    { "head", "acronym" },
1121
    { "head", "address" },
1122
    { "head", "b" },
1123
    { "head", "bdo" },
1124
    { "head", "big" },
1125
    { "head", "blockquote" },
1126
    { "head", "body" },
1127
    { "head", "br" },
1128
    { "head", "center" },
1129
    { "head", "cite" },
1130
    { "head", "code" },
1131
    { "head", "dd" },
1132
    { "head", "dfn" },
1133
    { "head", "dir" },
1134
    { "head", "div" },
1135
    { "head", "dl" },
1136
    { "head", "dt" },
1137
    { "head", "em" },
1138
    { "head", "fieldset" },
1139
    { "head", "font" },
1140
    { "head", "form" },
1141
    { "head", "frameset" },
1142
    { "head", "h1" },
1143
    { "head", "h2" },
1144
    { "head", "h3" },
1145
    { "head", "h4" },
1146
    { "head", "h5" },
1147
    { "head", "h6" },
1148
    { "head", "hr" },
1149
    { "head", "i" },
1150
    { "head", "iframe" },
1151
    { "head", "img" },
1152
    { "head", "kbd" },
1153
    { "head", "li" },
1154
    { "head", "listing" },
1155
    { "head", "map" },
1156
    { "head", "menu" },
1157
    { "head", "ol" },
1158
    { "head", "p" },
1159
    { "head", "pre" },
1160
    { "head", "q" },
1161
    { "head", "s" },
1162
    { "head", "samp" },
1163
    { "head", "small" },
1164
    { "head", "span" },
1165
    { "head", "strike" },
1166
    { "head", "strong" },
1167
    { "head", "sub" },
1168
    { "head", "sup" },
1169
    { "head", "table" },
1170
    { "head", "tt" },
1171
    { "head", "u" },
1172
    { "head", "ul" },
1173
    { "head", "var" },
1174
    { "head", "xmp" },
1175
    { "hr", "form" },
1176
    { "i", "center" },
1177
    { "i", "p" },
1178
    { "i", "td" },
1179
    { "i", "th" },
1180
    { "legend", "fieldset" },
1181
    { "li", "li" },
1182
    { "link", "body" },
1183
    { "link", "frameset" },
1184
    { "listing", "dd" },
1185
    { "listing", "dl" },
1186
    { "listing", "dt" },
1187
    { "listing", "fieldset" },
1188
    { "listing", "form" },
1189
    { "listing", "li" },
1190
    { "listing", "table" },
1191
    { "listing", "ul" },
1192
    { "menu", "dd" },
1193
    { "menu", "dl" },
1194
    { "menu", "dt" },
1195
    { "menu", "form" },
1196
    { "menu", "ul" },
1197
    { "ol", "form" },
1198
    { "option", "optgroup" },
1199
    { "option", "option" },
1200
    { "p", "address" },
1201
    { "p", "blockquote" },
1202
    { "p", "body" },
1203
    { "p", "caption" },
1204
    { "p", "center" },
1205
    { "p", "col" },
1206
    { "p", "colgroup" },
1207
    { "p", "dd" },
1208
    { "p", "dir" },
1209
    { "p", "div" },
1210
    { "p", "dl" },
1211
    { "p", "dt" },
1212
    { "p", "fieldset" },
1213
    { "p", "form" },
1214
    { "p", "frameset" },
1215
    { "p", "h1" },
1216
    { "p", "h2" },
1217
    { "p", "h3" },
1218
    { "p", "h4" },
1219
    { "p", "h5" },
1220
    { "p", "h6" },
1221
    { "p", "head" },
1222
    { "p", "hr" },
1223
    { "p", "li" },
1224
    { "p", "listing" },
1225
    { "p", "menu" },
1226
    { "p", "ol" },
1227
    { "p", "p" },
1228
    { "p", "pre" },
1229
    { "p", "table" },
1230
    { "p", "tbody" },
1231
    { "p", "td" },
1232
    { "p", "tfoot" },
1233
    { "p", "th" },
1234
    { "p", "title" },
1235
    { "p", "tr" },
1236
    { "p", "ul" },
1237
    { "p", "xmp" },
1238
    { "pre", "dd" },
1239
    { "pre", "dl" },
1240
    { "pre", "dt" },
1241
    { "pre", "fieldset" },
1242
    { "pre", "form" },
1243
    { "pre", "li" },
1244
    { "pre", "table" },
1245
    { "pre", "ul" },
1246
    { "s", "p" },
1247
    { "script", "noscript" },
1248
    { "small", "p" },
1249
    { "span", "td" },
1250
    { "span", "th" },
1251
    { "strike", "p" },
1252
    { "style", "body" },
1253
    { "style", "frameset" },
1254
    { "tbody", "tbody" },
1255
    { "tbody", "tfoot" },
1256
    { "td", "tbody" },
1257
    { "td", "td" },
1258
    { "td", "tfoot" },
1259
    { "td", "th" },
1260
    { "td", "tr" },
1261
    { "tfoot", "tbody" },
1262
    { "th", "tbody" },
1263
    { "th", "td" },
1264
    { "th", "tfoot" },
1265
    { "th", "th" },
1266
    { "th", "tr" },
1267
    { "thead", "tbody" },
1268
    { "thead", "tfoot" },
1269
    { "title", "body" },
1270
    { "title", "frameset" },
1271
    { "tr", "tbody" },
1272
    { "tr", "tfoot" },
1273
    { "tr", "tr" },
1274
    { "tt", "p" },
1275
    { "u", "p" },
1276
    { "u", "td" },
1277
    { "u", "th" },
1278
    { "ul", "address" },
1279
    { "ul", "form" },
1280
    { "ul", "menu" },
1281
    { "ul", "pre" },
1282
    { "xmp", "dd" },
1283
    { "xmp", "dl" },
1284
    { "xmp", "dt" },
1285
    { "xmp", "fieldset" },
1286
    { "xmp", "form" },
1287
    { "xmp", "li" },
1288
    { "xmp", "table" },
1289
    { "xmp", "ul" }
1290
};
1291
1292
/*
 * HTML elements which are not supposed to carry CDATA content themselves;
 * when character data shows up directly inside one of them a p element
 * is implied (see htmlCheckParagraph()).
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = {
    "html", "head", NULL
};
1304
1305
/*
 * HTML attributes whose content is of type %Script;.
 *
 * NOTE: htmlIsScriptAttribute() rejects any name that does not start with
 *       "on" before it consults this list, so revisit that function when
 *       adding an entry with a different prefix. The list has no NULL
 *       terminator; its length is taken with sizeof.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    "onkeypress", "onkeydown", "onkeyup",
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset",
    "onchange", "onselect"
};
1330
1331
/*
 * Priorities used when recovering from broken HTML. Every element gets a
 * priority, and an end tag may only close open elements whose priority is
 * lower than or equal to its own; this is how the parser decides what to
 * do with stray end tags.
 */

typedef struct {
    const char *name;   /* element name, NULL in the terminating entry */
    int priority;       /* end tag priority of that element */
} elementPriority;

static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* terminator, doubles as the default priority */
};
1358
1359
/************************************************************************
1360
 *                  *
1361
 *  functions to handle HTML specific data      *
1362
 *                  *
1363
 ************************************************************************/
1364
1365
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This is a no-op.
 *
 * The function has an empty body and is only kept so that existing
 * callers still link.
 */
void
htmlInitAutoClose(void)
{
    /* intentionally empty */
}
1373
1374
static int
1375
0
htmlCompareTags(const void *key, const void *member) {
1376
0
    const xmlChar *tag = (const xmlChar *) key;
1377
0
    const htmlElemDesc *desc = (const htmlElemDesc *) member;
1378
1379
0
    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1380
0
}
1381
1382
/**
1383
 * htmlTagLookup:
1384
 * @tag:  The tag name in lowercase
1385
 *
1386
 * Lookup the HTML tag in the ElementTable
1387
 *
1388
 * Returns the related htmlElemDescPtr or NULL if not found.
1389
 */
1390
const htmlElemDesc *
1391
0
htmlTagLookup(const xmlChar *tag) {
1392
0
    if (tag == NULL)
1393
0
        return(NULL);
1394
1395
0
    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1396
0
                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1397
0
                sizeof(htmlElemDesc), htmlCompareTags));
1398
0
}
1399
1400
/**
1401
 * htmlGetEndPriority:
1402
 * @name: The name of the element to look up the priority for.
1403
 *
1404
 * Return value: The "endtag" priority.
1405
 **/
1406
static int
1407
0
htmlGetEndPriority (const xmlChar *name) {
1408
0
    int i = 0;
1409
1410
0
    while ((htmlEndPriority[i].name != NULL) &&
1411
0
     (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1412
0
  i++;
1413
1414
0
    return(htmlEndPriority[i].priority);
1415
0
}
1416
1417
1418
static int
1419
0
htmlCompareStartClose(const void *vkey, const void *member) {
1420
0
    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1421
0
    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1422
0
    int ret;
1423
1424
0
    ret = strcmp(key->oldTag, entry->oldTag);
1425
0
    if (ret == 0)
1426
0
        ret = strcmp(key->newTag, entry->newTag);
1427
1428
0
    return(ret);
1429
0
}
1430
1431
/**
1432
 * htmlCheckAutoClose:
1433
 * @newtag:  The new tag name
1434
 * @oldtag:  The old tag name
1435
 *
1436
 * Checks whether the new tag is one of the registered valid tags for
1437
 * closing old.
1438
 *
1439
 * Returns 0 if no, 1 if yes.
1440
 */
1441
static int
1442
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1443
0
{
1444
0
    htmlStartCloseEntry key;
1445
0
    void *res;
1446
1447
0
    key.oldTag = (const char *) oldtag;
1448
0
    key.newTag = (const char *) newtag;
1449
0
    res = bsearch(&key, htmlStartClose,
1450
0
            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1451
0
            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1452
0
    return(res != NULL);
1453
0
}
1454
1455
/**
1456
 * htmlAutoCloseOnClose:
1457
 * @ctxt:  an HTML parser context
1458
 * @newtag:  The new tag name
1459
 * @force:  force the tag closure
1460
 *
1461
 * The HTML DTD allows an ending tag to implicitly close other tags.
1462
 */
1463
static void
1464
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1465
0
{
1466
0
    const htmlElemDesc *info;
1467
0
    int i, priority;
1468
1469
0
    priority = htmlGetEndPriority(newtag);
1470
1471
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1472
1473
0
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1474
0
            break;
1475
        /*
1476
         * A misplaced endtag can only close elements with lower
1477
         * or equal priority, so if we find an element with higher
1478
         * priority before we find an element with
1479
         * matching name, we just ignore this endtag
1480
         */
1481
0
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1482
0
            return;
1483
0
    }
1484
0
    if (i < 0)
1485
0
        return;
1486
1487
0
    while (!xmlStrEqual(newtag, ctxt->name)) {
1488
0
        info = htmlTagLookup(ctxt->name);
1489
0
        if ((info != NULL) && (info->endTag == 3)) {
1490
0
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1491
0
                   "Opening and ending tag mismatch: %s and %s\n",
1492
0
       newtag, ctxt->name);
1493
0
        }
1494
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1495
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1496
0
  htmlnamePop(ctxt);
1497
0
    }
1498
0
}
1499
1500
/**
1501
 * htmlAutoCloseOnEnd:
1502
 * @ctxt:  an HTML parser context
1503
 *
1504
 * Close all remaining tags at the end of the stream
1505
 */
1506
static void
1507
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1508
0
{
1509
0
    int i;
1510
1511
0
    if (ctxt->nameNr == 0)
1512
0
        return;
1513
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1514
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1515
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1516
0
  htmlnamePop(ctxt);
1517
0
    }
1518
0
}
1519
1520
/**
1521
 * htmlAutoClose:
1522
 * @ctxt:  an HTML parser context
1523
 * @newtag:  The new tag name or NULL
1524
 *
1525
 * The HTML DTD allows a tag to implicitly close other tags.
1526
 * The list is kept in htmlStartClose array. This function is
1527
 * called when a new tag has been detected and generates the
1528
 * appropriates closes if possible/needed.
1529
 * If newtag is NULL this mean we are at the end of the resource
1530
 * and we should check
1531
 */
1532
static void
1533
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1534
0
{
1535
0
    if (newtag == NULL)
1536
0
        return;
1537
1538
0
    while ((ctxt->name != NULL) &&
1539
0
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1540
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1541
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1542
0
  htmlnamePop(ctxt);
1543
0
    }
1544
0
}
1545
1546
/**
1547
 * htmlAutoCloseTag:
1548
 * @doc:  the HTML document
1549
 * @name:  The tag name
1550
 * @elem:  the HTML element
1551
 *
1552
 * The HTML DTD allows a tag to implicitly close other tags.
1553
 * The list is kept in htmlStartClose array. This function checks
1554
 * if the element or one of it's children would autoclose the
1555
 * given tag.
1556
 *
1557
 * Returns 1 if autoclose, 0 otherwise
1558
 */
1559
int
1560
0
htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1561
0
    htmlNodePtr child;
1562
1563
0
    if (elem == NULL) return(1);
1564
0
    if (xmlStrEqual(name, elem->name)) return(0);
1565
0
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1566
0
    child = elem->children;
1567
0
    while (child != NULL) {
1568
0
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1569
0
  child = child->next;
1570
0
    }
1571
0
    return(0);
1572
0
}
1573
1574
/**
1575
 * htmlIsAutoClosed:
1576
 * @doc:  the HTML document
1577
 * @elem:  the HTML element
1578
 *
1579
 * The HTML DTD allows a tag to implicitly close other tags.
1580
 * The list is kept in htmlStartClose array. This function checks
1581
 * if a tag is autoclosed by one of it's child
1582
 *
1583
 * Returns 1 if autoclosed, 0 otherwise
1584
 */
1585
int
1586
0
htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1587
0
    htmlNodePtr child;
1588
1589
0
    if (elem == NULL) return(1);
1590
0
    child = elem->children;
1591
0
    while (child != NULL) {
1592
0
  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1593
0
  child = child->next;
1594
0
    }
1595
0
    return(0);
1596
0
}
1597
1598
/**
1599
 * htmlCheckImplied:
1600
 * @ctxt:  an HTML parser context
1601
 * @newtag:  The new tag name
1602
 *
1603
 * The HTML DTD allows a tag to exists only implicitly
1604
 * called when a new tag has been detected and generates the
1605
 * appropriates implicit tags if missing
1606
 */
1607
static void
1608
0
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1609
0
    int i;
1610
1611
0
    if (ctxt->options & HTML_PARSE_NOIMPLIED)
1612
0
        return;
1613
0
    if (!htmlOmittedDefaultValue)
1614
0
  return;
1615
0
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1616
0
  return;
1617
0
    if (ctxt->nameNr <= 0) {
1618
0
  htmlnamePush(ctxt, BAD_CAST"html");
1619
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1620
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1621
0
    }
1622
0
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1623
0
        return;
1624
0
    if ((ctxt->nameNr <= 1) &&
1625
0
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1626
0
   (xmlStrEqual(newtag, BAD_CAST"style")) ||
1627
0
   (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1628
0
   (xmlStrEqual(newtag, BAD_CAST"link")) ||
1629
0
   (xmlStrEqual(newtag, BAD_CAST"title")) ||
1630
0
   (xmlStrEqual(newtag, BAD_CAST"base")))) {
1631
0
        if (ctxt->html >= 3) {
1632
            /* we already saw or generated an <head> before */
1633
0
            return;
1634
0
        }
1635
        /*
1636
         * dropped OBJECT ... i you put it first BODY will be
1637
         * assumed !
1638
         */
1639
0
        htmlnamePush(ctxt, BAD_CAST"head");
1640
0
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1641
0
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1642
0
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1643
0
         (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1644
0
         (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1645
0
        if (ctxt->html >= 10) {
1646
            /* we already saw or generated a <body> before */
1647
0
            return;
1648
0
        }
1649
0
  for (i = 0;i < ctxt->nameNr;i++) {
1650
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1651
0
    return;
1652
0
      }
1653
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1654
0
    return;
1655
0
      }
1656
0
  }
1657
1658
0
  htmlnamePush(ctxt, BAD_CAST"body");
1659
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1660
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1661
0
    }
1662
0
}
1663
1664
/**
1665
 * htmlCheckParagraph
1666
 * @ctxt:  an HTML parser context
1667
 *
1668
 * Check whether a p element need to be implied before inserting
1669
 * characters in the current element.
1670
 *
1671
 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1672
 *         in case of error.
1673
 */
1674
1675
static int
1676
0
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1677
0
    const xmlChar *tag;
1678
0
    int i;
1679
1680
0
    if (ctxt == NULL)
1681
0
  return(-1);
1682
0
    tag = ctxt->name;
1683
0
    if (tag == NULL) {
1684
0
  htmlAutoClose(ctxt, BAD_CAST"p");
1685
0
  htmlCheckImplied(ctxt, BAD_CAST"p");
1686
0
  htmlnamePush(ctxt, BAD_CAST"p");
1687
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1688
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1689
0
  return(1);
1690
0
    }
1691
0
    if (!htmlOmittedDefaultValue)
1692
0
  return(0);
1693
0
    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1694
0
  if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1695
0
      htmlAutoClose(ctxt, BAD_CAST"p");
1696
0
      htmlCheckImplied(ctxt, BAD_CAST"p");
1697
0
      htmlnamePush(ctxt, BAD_CAST"p");
1698
0
      if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1699
0
    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1700
0
      return(1);
1701
0
  }
1702
0
    }
1703
0
    return(0);
1704
0
}
1705
1706
/**
1707
 * htmlIsScriptAttribute:
1708
 * @name:  an attribute name
1709
 *
1710
 * Check if an attribute is of content type Script
1711
 *
1712
 * Returns 1 if the attribute is a script attribute, 0 otherwise
1713
 */
1714
int
1715
0
htmlIsScriptAttribute(const xmlChar *name) {
1716
0
    unsigned int i;
1717
1718
0
    if (name == NULL)
1719
0
      return(0);
1720
    /*
1721
     * all script attributes start with 'on'
1722
     */
1723
0
    if ((name[0] != 'o') || (name[1] != 'n'))
1724
0
      return(0);
1725
0
    for (i = 0;
1726
0
   i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1727
0
   i++) {
1728
0
  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1729
0
      return(1);
1730
0
    }
1731
0
    return(0);
1732
0
}
1733
1734
/************************************************************************
1735
 *                  *
1736
 *  The list of HTML predefined entities      *
1737
 *                  *
1738
 ************************************************************************/
1739
1740
1741
static const htmlEntityDesc  html40EntitiesTable[] = {
1742
/*
1743
 * the 4 absolute ones, plus apostrophe.
1744
 */
1745
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1746
{ 38, "amp",  "ampersand, U+0026 ISOnum" },
1747
{ 39, "apos", "single quote" },
1748
{ 60, "lt", "less-than sign, U+003C ISOnum" },
1749
{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1750
1751
/*
1752
 * A bunch still in the 128-255 range
1753
 * Replacing them really depends on the charset used.
1754
 */
1755
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1756
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1757
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
1758
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
1759
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
1760
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1761
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1762
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
1763
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1764
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1765
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1766
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1767
{ 172,  "not",  "not sign, U+00AC ISOnum" },
1768
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1769
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1770
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1771
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1772
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1773
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1774
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1775
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1776
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
1777
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1778
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1779
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1780
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1781
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1782
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1783
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1784
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1785
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1786
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1787
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1788
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1789
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1790
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1791
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1792
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1793
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1794
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1795
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1796
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1797
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1798
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1799
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1800
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1801
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1802
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1803
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1804
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1805
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1806
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1807
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1808
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1809
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1810
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
1811
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1812
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1813
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1814
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1815
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1816
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1817
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1818
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1819
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1820
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1821
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1822
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1823
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1824
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1825
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1826
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1827
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1828
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1829
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1830
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1831
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1832
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1833
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1834
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1835
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1836
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1837
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1838
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1839
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1840
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1841
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1842
{ 247,  "divide","division sign, U+00F7 ISOnum" },
1843
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1844
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1845
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1846
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1847
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1848
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1849
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1850
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1851
1852
{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1853
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1854
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1855
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1856
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1857
1858
/*
1859
 * Anything below should really be kept as entity references
1860
 */
1861
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1862
1863
{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1864
{ 732,  "tilde","small tilde, U+02DC ISOdia" },
1865
1866
{ 913,  "Alpha","greek capital letter alpha, U+0391" },
1867
{ 914,  "Beta", "greek capital letter beta, U+0392" },
1868
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1869
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1870
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1871
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
1872
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
1873
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1874
{ 921,  "Iota", "greek capital letter iota, U+0399" },
1875
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
1876
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1877
{ 924,  "Mu", "greek capital letter mu, U+039C" },
1878
{ 925,  "Nu", "greek capital letter nu, U+039D" },
1879
{ 926,  "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1880
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
1881
{ 928,  "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1882
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
1883
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1884
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
1885
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1886
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1887
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
1888
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1889
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1890
1891
{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1892
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1893
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1894
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1895
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1896
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1897
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1898
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1899
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1900
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1901
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1902
{ 956,  "mu", "greek small letter mu, U+03BC ISOgrk3" },
1903
{ 957,  "nu", "greek small letter nu, U+03BD ISOgrk3" },
1904
{ 958,  "xi", "greek small letter xi, U+03BE ISOgrk3" },
1905
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1906
{ 960,  "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1907
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1908
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1909
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1910
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1911
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1912
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1913
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1914
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1915
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1916
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1917
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1918
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1919
1920
{ 8194, "ensp", "en space, U+2002 ISOpub" },
1921
{ 8195, "emsp", "em space, U+2003 ISOpub" },
1922
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1923
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1924
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1925
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1926
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1927
{ 8211, "ndash","en dash, U+2013 ISOpub" },
1928
{ 8212, "mdash","em dash, U+2014 ISOpub" },
1929
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1930
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1931
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1932
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1933
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1934
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1935
{ 8224, "dagger","dagger, U+2020 ISOpub" },
1936
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1937
1938
{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1939
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1940
1941
{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1942
1943
{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1944
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1945
1946
{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1947
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1948
1949
{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1950
{ 8260, "frasl","fraction slash, U+2044 NEW" },
1951
1952
{ 8364, "euro", "euro sign, U+20AC NEW" },
1953
1954
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1955
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1956
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1957
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1958
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
1959
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
1960
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
1961
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
1962
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
1963
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
1964
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
1965
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
1966
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
1967
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
1968
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
1969
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
1970
1971
{ 8704, "forall","for all, U+2200 ISOtech" },
1972
{ 8706, "part", "partial differential, U+2202 ISOtech" },
1973
{ 8707, "exist","there exists, U+2203 ISOtech" },
1974
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
1975
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
1976
{ 8712, "isin", "element of, U+2208 ISOtech" },
1977
{ 8713, "notin","not an element of, U+2209 ISOtech" },
1978
{ 8715, "ni", "contains as member, U+220B ISOtech" },
1979
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
1980
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
1981
{ 8722, "minus","minus sign, U+2212 ISOtech" },
1982
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
1983
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
1984
{ 8733, "prop", "proportional to, U+221D ISOtech" },
1985
{ 8734, "infin","infinity, U+221E ISOtech" },
1986
{ 8736, "ang",  "angle, U+2220 ISOamso" },
1987
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
1988
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
1989
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
1990
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
1991
{ 8747, "int",  "integral, U+222B ISOtech" },
1992
{ 8756, "there4","therefore, U+2234 ISOtech" },
1993
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
1994
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
1995
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
1996
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
1997
{ 8801, "equiv","identical to, U+2261 ISOtech" },
1998
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
1999
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2000
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
2001
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
2002
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2003
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2004
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2005
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2006
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2007
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2008
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2009
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2010
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2011
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2012
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
2013
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2014
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2015
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },
2016
2017
{ 9824, "spades","black spade suit, U+2660 ISOpub" },
2018
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2019
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2020
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
2021
2022
};
2023
2024
/************************************************************************
2025
 *                  *
2026
 *    Commodity functions to handle entities      *
2027
 *                  *
2028
 ************************************************************************/
2029
2030
/*
2031
 * Macro used to grow the current buffer.
2032
 */
2033
0
#define growBuffer(buffer) {           \
2034
0
    xmlChar *tmp;             \
2035
0
    buffer##_size *= 2;             \
2036
0
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size);    \
2037
0
    if (tmp == NULL) {             \
2038
0
  htmlErrMemory(ctxt, "growing buffer\n");      \
2039
0
  xmlFree(buffer);            \
2040
0
  return(NULL);             \
2041
0
    }                  \
2042
0
    buffer = tmp;             \
2043
0
}
2044
2045
/**
2046
 * htmlEntityLookup:
2047
 * @name: the entity name
2048
 *
2049
 * Lookup the given entity in EntitiesTable
2050
 *
2051
 * TODO: the linear scan is really ugly, a hash table is really needed.
2052
 *
2053
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2054
 */
2055
const htmlEntityDesc *
2056
0
htmlEntityLookup(const xmlChar *name) {
2057
0
    unsigned int i;
2058
2059
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2060
0
                    sizeof(html40EntitiesTable[0]));i++) {
2061
0
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2062
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2063
0
  }
2064
0
    }
2065
0
    return(NULL);
2066
0
}
2067
2068
/**
2069
 * htmlEntityValueLookup:
2070
 * @value: the entity's unicode value
2071
 *
2072
 * Lookup the entity with the given Unicode value in EntitiesTable
2073
 *
2074
 * TODO: the linear scan is really ugly, a hash table is really needed.
2075
 *
2076
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2077
 */
2078
const htmlEntityDesc *
2079
0
htmlEntityValueLookup(unsigned int value) {
2080
0
    unsigned int i;
2081
2082
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2083
0
                    sizeof(html40EntitiesTable[0]));i++) {
2084
0
        if (html40EntitiesTable[i].value >= value) {
2085
0
      if (html40EntitiesTable[i].value > value)
2086
0
    break;
2087
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2088
0
  }
2089
0
    }
2090
0
    return(NULL);
2091
0
}
2092
2093
/**
2094
 * UTF8ToHtml:
2095
 * @out:  a pointer to an array of bytes to store the result
2096
 * @outlen:  the length of @out
2097
 * @in:  a pointer to an array of UTF-8 chars
2098
 * @inlen:  the length of @in
2099
 *
2100
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2101
 * plus HTML entities block of chars out.
2102
 *
2103
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2104
 * The value of @inlen after return is the number of octets consumed
2105
 *     if the return value is 0 or -2; @inlen is not modified when -1 is returned.
2106
 * The value of @outlen after return is the number of octets produced.
2107
 */
2108
int
2109
UTF8ToHtml(unsigned char* out, int *outlen,
2110
0
              const unsigned char* in, int *inlen) {
2111
0
    const unsigned char* processed = in;
2112
0
    const unsigned char* outend;
2113
0
    const unsigned char* outstart = out;
2114
0
    const unsigned char* instart = in;
2115
0
    const unsigned char* inend;
2116
0
    unsigned int c, d;
2117
0
    int trailing;
2118
2119
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2120
0
    if (in == NULL) {
2121
        /*
2122
   * initialization nothing to do
2123
   */
2124
0
  *outlen = 0;
2125
0
  *inlen = 0;
2126
0
  return(0);
2127
0
    }
2128
0
    inend = in + (*inlen);
2129
0
    outend = out + (*outlen);
2130
0
    while (in < inend) {
2131
0
  d = *in++;
2132
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2133
0
  else if (d < 0xC0) {
2134
      /* trailing byte in leading position */
2135
0
      *outlen = out - outstart;
2136
0
      *inlen = processed - instart;
2137
0
      return(-2);
2138
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2139
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2140
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2141
0
  else {
2142
      /* no chance for this in Ascii */
2143
0
      *outlen = out - outstart;
2144
0
      *inlen = processed - instart;
2145
0
      return(-2);
2146
0
  }
2147
2148
0
  if (inend - in < trailing) {
2149
0
      break;
2150
0
  }
2151
2152
0
  for ( ; trailing; trailing--) {
2153
0
      if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2154
0
    break;
2155
0
      c <<= 6;
2156
0
      c |= d & 0x3F;
2157
0
  }
2158
2159
  /* assertion: c is a single UCS-4 value */
2160
0
  if (c < 0x80) {
2161
0
      if (out + 1 >= outend)
2162
0
    break;
2163
0
      *out++ = c;
2164
0
  } else {
2165
0
      int len;
2166
0
      const htmlEntityDesc * ent;
2167
0
      const char *cp;
2168
0
      char nbuf[16];
2169
2170
      /*
2171
       * Try to lookup a predefined HTML entity for it
2172
       */
2173
2174
0
      ent = htmlEntityValueLookup(c);
2175
0
      if (ent == NULL) {
2176
0
        snprintf(nbuf, sizeof(nbuf), "#%u", c);
2177
0
        cp = nbuf;
2178
0
      }
2179
0
      else
2180
0
        cp = ent->name;
2181
0
      len = strlen(cp);
2182
0
      if (out + 2 + len >= outend)
2183
0
    break;
2184
0
      *out++ = '&';
2185
0
      memcpy(out, cp, len);
2186
0
      out += len;
2187
0
      *out++ = ';';
2188
0
  }
2189
0
  processed = in;
2190
0
    }
2191
0
    *outlen = out - outstart;
2192
0
    *inlen = processed - instart;
2193
0
    return(0);
2194
0
}
2195
2196
/**
2197
 * htmlEncodeEntities:
2198
 * @out:  a pointer to an array of bytes to store the result
2199
 * @outlen:  the length of @out
2200
 * @in:  a pointer to an array of UTF-8 chars
2201
 * @inlen:  the length of @in
2202
 * @quoteChar: the quote character to escape (' or ") or zero.
2203
 *
2204
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2205
 * plus HTML entities block of chars out.
2206
 *
2207
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2208
 * The value of @inlen after return is the number of octets consumed
2209
 *     if the return value is 0 or -2; @inlen is not modified when -1 is returned.
2210
 * The value of @outlen after return is the number of octets produced.
2211
 */
2212
int
2213
htmlEncodeEntities(unsigned char* out, int *outlen,
2214
0
       const unsigned char* in, int *inlen, int quoteChar) {
2215
0
    const unsigned char* processed = in;
2216
0
    const unsigned char* outend;
2217
0
    const unsigned char* outstart = out;
2218
0
    const unsigned char* instart = in;
2219
0
    const unsigned char* inend;
2220
0
    unsigned int c, d;
2221
0
    int trailing;
2222
2223
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2224
0
        return(-1);
2225
0
    outend = out + (*outlen);
2226
0
    inend = in + (*inlen);
2227
0
    while (in < inend) {
2228
0
  d = *in++;
2229
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2230
0
  else if (d < 0xC0) {
2231
      /* trailing byte in leading position */
2232
0
      *outlen = out - outstart;
2233
0
      *inlen = processed - instart;
2234
0
      return(-2);
2235
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2236
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2237
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2238
0
  else {
2239
      /* no chance for this in Ascii */
2240
0
      *outlen = out - outstart;
2241
0
      *inlen = processed - instart;
2242
0
      return(-2);
2243
0
  }
2244
2245
0
  if (inend - in < trailing)
2246
0
      break;
2247
2248
0
  while (trailing--) {
2249
0
      if (((d= *in++) & 0xC0) != 0x80) {
2250
0
    *outlen = out - outstart;
2251
0
    *inlen = processed - instart;
2252
0
    return(-2);
2253
0
      }
2254
0
      c <<= 6;
2255
0
      c |= d & 0x3F;
2256
0
  }
2257
2258
  /* assertion: c is a single UCS-4 value */
2259
0
  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2260
0
      (c != '&') && (c != '<') && (c != '>')) {
2261
0
      if (out >= outend)
2262
0
    break;
2263
0
      *out++ = c;
2264
0
  } else {
2265
0
      const htmlEntityDesc * ent;
2266
0
      const char *cp;
2267
0
      char nbuf[16];
2268
0
      int len;
2269
2270
      /*
2271
       * Try to lookup a predefined HTML entity for it
2272
       */
2273
0
      ent = htmlEntityValueLookup(c);
2274
0
      if (ent == NULL) {
2275
0
    snprintf(nbuf, sizeof(nbuf), "#%u", c);
2276
0
    cp = nbuf;
2277
0
      }
2278
0
      else
2279
0
    cp = ent->name;
2280
0
      len = strlen(cp);
2281
0
      if (outend - out < len + 2)
2282
0
    break;
2283
0
      *out++ = '&';
2284
0
      memcpy(out, cp, len);
2285
0
      out += len;
2286
0
      *out++ = ';';
2287
0
  }
2288
0
  processed = in;
2289
0
    }
2290
0
    *outlen = out - outstart;
2291
0
    *inlen = processed - instart;
2292
0
    return(0);
2293
0
}
2294
2295
/************************************************************************
2296
 *                  *
2297
 *    Commodity functions to handle streams     *
2298
 *                  *
2299
 ************************************************************************/
2300
2301
#ifdef LIBXML_PUSH_ENABLED
2302
/**
2303
 * htmlNewInputStream:
2304
 * @ctxt:  an HTML parser context
2305
 *
2306
 * Create a new input stream structure
2307
 * Returns the new input stream or NULL
2308
 */
2309
static htmlParserInputPtr
2310
0
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2311
0
    htmlParserInputPtr input;
2312
2313
0
    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2314
0
    if (input == NULL) {
2315
0
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2316
0
  return(NULL);
2317
0
    }
2318
0
    memset(input, 0, sizeof(htmlParserInput));
2319
0
    input->filename = NULL;
2320
0
    input->directory = NULL;
2321
0
    input->base = NULL;
2322
0
    input->cur = NULL;
2323
0
    input->buf = NULL;
2324
0
    input->line = 1;
2325
0
    input->col = 1;
2326
0
    input->buf = NULL;
2327
0
    input->free = NULL;
2328
0
    input->version = NULL;
2329
0
    input->consumed = 0;
2330
0
    input->length = 0;
2331
0
    return(input);
2332
0
}
2333
#endif
2334
2335
2336
/************************************************************************
2337
 *                  *
2338
 *    Commodity functions, cleanup needed ?     *
2339
 *                  *
2340
 ************************************************************************/
2341
/*
2342
 * all tags allowing pc data from the html 4.01 loose dtd
2343
 * NOTE: it might be more appropriate to integrate this information
2344
 * into the html40ElementTable array but I don't want to risk any
2345
 * binary incompatibility
2346
 */
2347
static const char *allowPCData[] = {
2348
    "a", "abbr", "acronym", "address", "applet", "b", "bdo", "big",
2349
    "blockquote", "body", "button", "caption", "center", "cite", "code",
2350
    "dd", "del", "dfn", "div", "dt", "em", "font", "form", "h1", "h2",
2351
    "h3", "h4", "h5", "h6", "i", "iframe", "ins", "kbd", "label", "legend",
2352
    "li", "noframes", "noscript", "object", "p", "pre", "q", "s", "samp",
2353
    "small", "span", "strike", "strong", "td", "th", "tt", "u", "var"
2354
};
2355
2356
/**
2357
 * areBlanks:
2358
 * @ctxt:  an HTML parser context
2359
 * @str:  a xmlChar *
2360
 * @len:  the size of @str
2361
 *
2362
 * Is this a sequence of blank chars that one can ignore ?
2363
 *
2364
 * Returns 1 if ignorable 0 otherwise.
2365
 */
2366
2367
0
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2368
0
    unsigned int i;
2369
0
    int j;
2370
0
    xmlNodePtr lastChild;
2371
0
    xmlDtdPtr dtd;
2372
2373
0
    for (j = 0;j < len;j++)
2374
0
        if (!(IS_BLANK_CH(str[j]))) return(0);
2375
2376
0
    if (CUR == 0) return(1);
2377
0
    if (CUR != '<') return(0);
2378
0
    if (ctxt->name == NULL)
2379
0
  return(1);
2380
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2381
0
  return(1);
2382
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2383
0
  return(1);
2384
2385
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2386
0
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2387
0
        dtd = xmlGetIntSubset(ctxt->myDoc);
2388
0
        if (dtd != NULL && dtd->ExternalID != NULL) {
2389
0
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2390
0
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2391
0
                return(1);
2392
0
        }
2393
0
    }
2394
2395
0
    if (ctxt->node == NULL) return(0);
2396
0
    lastChild = xmlGetLastChild(ctxt->node);
2397
0
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2398
0
  lastChild = lastChild->prev;
2399
0
    if (lastChild == NULL) {
2400
0
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2401
0
            (ctxt->node->content != NULL)) return(0);
2402
  /* keep ws in constructs like ...<b> </b>...
2403
     for all tags "b" allowing PCDATA */
2404
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2405
0
      if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2406
0
    return(0);
2407
0
      }
2408
0
  }
2409
0
    } else if (xmlNodeIsText(lastChild)) {
2410
0
        return(0);
2411
0
    } else {
2412
  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2413
     for all tags "p" allowing PCDATA */
2414
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2415
0
      if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2416
0
    return(0);
2417
0
      }
2418
0
  }
2419
0
    }
2420
0
    return(1);
2421
0
}
2422
2423
/**
2424
 * htmlNewDocNoDtD:
2425
 * @URI:  URI for the dtd, or NULL
2426
 * @ExternalID:  the external ID of the DTD, or NULL
2427
 *
2428
 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2429
 * are NULL
2430
 *
2431
 * Returns a new document, do not initialize the DTD if not provided
2432
 */
2433
htmlDocPtr
2434
0
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2435
0
    xmlDocPtr cur;
2436
2437
    /*
2438
     * Allocate a new document and fill the fields.
2439
     */
2440
0
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2441
0
    if (cur == NULL) {
2442
0
  htmlErrMemory(NULL, "HTML document creation failed\n");
2443
0
  return(NULL);
2444
0
    }
2445
0
    memset(cur, 0, sizeof(xmlDoc));
2446
2447
0
    cur->type = XML_HTML_DOCUMENT_NODE;
2448
0
    cur->version = NULL;
2449
0
    cur->intSubset = NULL;
2450
0
    cur->doc = cur;
2451
0
    cur->name = NULL;
2452
0
    cur->children = NULL;
2453
0
    cur->extSubset = NULL;
2454
0
    cur->oldNs = NULL;
2455
0
    cur->encoding = NULL;
2456
0
    cur->standalone = 1;
2457
0
    cur->compression = 0;
2458
0
    cur->ids = NULL;
2459
0
    cur->refs = NULL;
2460
0
    cur->_private = NULL;
2461
0
    cur->charset = XML_CHAR_ENCODING_UTF8;
2462
0
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2463
0
    if ((ExternalID != NULL) ||
2464
0
  (URI != NULL))
2465
0
  xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2466
0
    if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2467
0
  xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2468
0
    return(cur);
2469
0
}
2470
2471
/**
2472
 * htmlNewDoc:
2473
 * @URI:  URI for the dtd, or NULL
2474
 * @ExternalID:  the external ID of the DTD, or NULL
2475
 *
2476
 * Creates a new HTML document
2477
 *
2478
 * Returns a new document
2479
 */
2480
htmlDocPtr
2481
0
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2482
0
    if ((URI == NULL) && (ExternalID == NULL))
2483
0
  return(htmlNewDocNoDtD(
2484
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2485
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2486
2487
0
    return(htmlNewDocNoDtD(URI, ExternalID));
2488
0
}
2489
2490
2491
/************************************************************************
2492
 *                  *
2493
 *      The parser itself       *
2494
 *  Relates to http://www.w3.org/TR/html40        *
2495
 *                  *
2496
 ************************************************************************/
2497
2498
/************************************************************************
2499
 *                  *
2500
 *      The parser itself       *
2501
 *                  *
2502
 ************************************************************************/
2503
2504
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2505
2506
static void
2507
0
htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2508
0
    int c;
2509
2510
0
    htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2511
0
                 "Incorrectly opened comment\n", NULL, NULL);
2512
2513
0
    do {
2514
0
        c = CUR;
2515
0
        if (c == 0)
2516
0
            break;
2517
0
        NEXT;
2518
0
    } while (c != '>');
2519
0
}
2520
2521
/**
2522
 * htmlParseHTMLName:
2523
 * @ctxt:  an HTML parser context
2524
 *
2525
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2526
 * since HTML names are not case-sensitive.
2527
 *
2528
 * Returns the Tag Name parsed or NULL
2529
 */
2530
2531
static const xmlChar *
2532
0
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2533
0
    const xmlChar *ret;
2534
0
    int i = 0;
2535
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2536
2537
0
    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2538
0
        (CUR != ':') && (CUR != '.')) return(NULL);
2539
2540
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2541
0
           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2542
0
     (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2543
0
           (CUR == '.'))) {
2544
0
  if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2545
0
        else loc[i] = CUR;
2546
0
  i++;
2547
2548
0
  NEXT;
2549
0
    }
2550
2551
0
    ret = xmlDictLookup(ctxt->dict, loc, i);
2552
0
    if (ret == NULL)
2553
0
        htmlErrMemory(ctxt, NULL);
2554
2555
0
    return(ret);
2556
0
}
2557
2558
2559
/**
2560
 * htmlParseHTMLName_nonInvasive:
2561
 * @ctxt:  an HTML parser context
2562
 *
2563
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2564
 * since HTML names are not case-sensitive, this doesn't consume the data
2565
 * from the stream, it's a look-ahead
2566
 *
2567
 * Returns the Tag Name parsed or NULL
2568
 */
2569
2570
static const xmlChar *
2571
0
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2572
0
    int i = 0;
2573
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2574
2575
0
    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2576
0
        (NXT(1) != ':')) return(NULL);
2577
2578
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2579
0
           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2580
0
     (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2581
0
  if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2582
0
        else loc[i] = NXT(1+i);
2583
0
  i++;
2584
0
    }
2585
2586
0
    return(xmlDictLookup(ctxt->dict, loc, i));
2587
0
}
2588
2589
2590
/**
2591
 * htmlParseName:
2592
 * @ctxt:  an HTML parser context
2593
 *
2594
 * parse an HTML name, this routine is case sensitive.
2595
 *
2596
 * Returns the Name parsed or NULL
2597
 */
2598
2599
static const xmlChar *
2600
0
htmlParseName(htmlParserCtxtPtr ctxt) {
2601
0
    const xmlChar *in;
2602
0
    const xmlChar *ret;
2603
0
    int count = 0;
2604
2605
0
    GROW;
2606
2607
    /*
2608
     * Accelerator for simple ASCII names
2609
     */
2610
0
    in = ctxt->input->cur;
2611
0
    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2612
0
  ((*in >= 0x41) && (*in <= 0x5A)) ||
2613
0
  (*in == '_') || (*in == ':')) {
2614
0
  in++;
2615
0
  while (((*in >= 0x61) && (*in <= 0x7A)) ||
2616
0
         ((*in >= 0x41) && (*in <= 0x5A)) ||
2617
0
         ((*in >= 0x30) && (*in <= 0x39)) ||
2618
0
         (*in == '_') || (*in == '-') ||
2619
0
         (*in == ':') || (*in == '.'))
2620
0
      in++;
2621
2622
0
  if (in == ctxt->input->end)
2623
0
      return(NULL);
2624
2625
0
  if ((*in > 0) && (*in < 0x80)) {
2626
0
      count = in - ctxt->input->cur;
2627
0
      ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2628
0
      ctxt->input->cur = in;
2629
0
      ctxt->input->col += count;
2630
0
      return(ret);
2631
0
  }
2632
0
    }
2633
0
    return(htmlParseNameComplex(ctxt));
2634
0
}
2635
2636
static const xmlChar *
2637
0
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2638
0
    int len = 0, l;
2639
0
    int c;
2640
0
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2641
0
                    XML_MAX_TEXT_LENGTH :
2642
0
                    XML_MAX_NAME_LENGTH;
2643
0
    const xmlChar *base = ctxt->input->base;
2644
2645
    /*
2646
     * Handler for more complex cases
2647
     */
2648
0
    c = CUR_CHAR(l);
2649
0
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2650
0
  (!IS_LETTER(c) && (c != '_') &&
2651
0
         (c != ':'))) {
2652
0
  return(NULL);
2653
0
    }
2654
2655
0
    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2656
0
     ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2657
0
            (c == '.') || (c == '-') ||
2658
0
      (c == '_') || (c == ':') ||
2659
0
      (IS_COMBINING(c)) ||
2660
0
      (IS_EXTENDER(c)))) {
2661
0
  len += l;
2662
0
        if (len > maxLength) {
2663
0
            htmlParseErr(ctxt, XML_ERR_NAME_TOO_LONG, "name too long", NULL, NULL);
2664
0
            return(NULL);
2665
0
        }
2666
0
  NEXTL(l);
2667
0
  c = CUR_CHAR(l);
2668
0
  if (ctxt->input->base != base) {
2669
      /*
2670
       * We changed encoding from an unknown encoding
2671
       * Input buffer changed location, so we better start again
2672
       */
2673
0
      return(htmlParseNameComplex(ctxt));
2674
0
  }
2675
0
    }
2676
0
    if (ctxt->instate == XML_PARSER_EOF)
2677
0
        return(NULL);
2678
2679
0
    if (ctxt->input->cur - ctxt->input->base < len) {
2680
        /* Sanity check */
2681
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2682
0
                     "unexpected change of input buffer", NULL, NULL);
2683
0
        return (NULL);
2684
0
    }
2685
2686
0
    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2687
0
}
2688
2689
2690
/**
2691
 * htmlParseHTMLAttribute:
2692
 * @ctxt:  an HTML parser context
2693
 * @stop:  a char stop value
2694
 *
2695
 * parse an HTML attribute value till the stop (quote), if
2696
 * stop is 0 then it stops at the first space
2697
 *
2698
 * Returns the attribute parsed or NULL
2699
 */
2700
2701
static xmlChar *
2702
0
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2703
0
    xmlChar *buffer = NULL;
2704
0
    int buffer_size = 0;
2705
0
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
2706
0
                    XML_MAX_HUGE_LENGTH :
2707
0
                    XML_MAX_TEXT_LENGTH;
2708
0
    xmlChar *out = NULL;
2709
0
    const xmlChar *name = NULL;
2710
0
    const xmlChar *cur = NULL;
2711
0
    const htmlEntityDesc * ent;
2712
2713
    /*
2714
     * allocate a translation buffer.
2715
     */
2716
0
    buffer_size = HTML_PARSER_BUFFER_SIZE;
2717
0
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2718
0
    if (buffer == NULL) {
2719
0
  htmlErrMemory(ctxt, "buffer allocation failed\n");
2720
0
  return(NULL);
2721
0
    }
2722
0
    out = buffer;
2723
2724
    /*
2725
     * Ok loop until we reach one of the ending chars
2726
     */
2727
0
    while ((CUR != 0) && (CUR != stop)) {
2728
0
  if ((stop == 0) && (CUR == '>')) break;
2729
0
  if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2730
0
        if (CUR == '&') {
2731
0
      if (NXT(1) == '#') {
2732
0
    unsigned int c;
2733
0
    int bits;
2734
2735
0
    c = htmlParseCharRef(ctxt);
2736
0
    if      (c <    0x80)
2737
0
            { *out++  = c;                bits= -6; }
2738
0
    else if (c <   0x800)
2739
0
            { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2740
0
    else if (c < 0x10000)
2741
0
            { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2742
0
    else
2743
0
            { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2744
2745
0
    for ( ; bits >= 0; bits-= 6) {
2746
0
        *out++  = ((c >> bits) & 0x3F) | 0x80;
2747
0
    }
2748
2749
0
    if (out - buffer > buffer_size - 100) {
2750
0
      int indx = out - buffer;
2751
2752
0
      growBuffer(buffer);
2753
0
      out = &buffer[indx];
2754
0
    }
2755
0
      } else {
2756
0
    ent = htmlParseEntityRef(ctxt, &name);
2757
0
    if (name == NULL) {
2758
0
        *out++ = '&';
2759
0
        if (out - buffer > buffer_size - 100) {
2760
0
      int indx = out - buffer;
2761
2762
0
      growBuffer(buffer);
2763
0
      out = &buffer[indx];
2764
0
        }
2765
0
    } else if (ent == NULL) {
2766
0
        *out++ = '&';
2767
0
        cur = name;
2768
0
        while (*cur != 0) {
2769
0
      if (out - buffer > buffer_size - 100) {
2770
0
          int indx = out - buffer;
2771
2772
0
          growBuffer(buffer);
2773
0
          out = &buffer[indx];
2774
0
      }
2775
0
      *out++ = *cur++;
2776
0
        }
2777
0
    } else {
2778
0
        unsigned int c;
2779
0
        int bits;
2780
2781
0
        if (out - buffer > buffer_size - 100) {
2782
0
      int indx = out - buffer;
2783
2784
0
      growBuffer(buffer);
2785
0
      out = &buffer[indx];
2786
0
        }
2787
0
        c = ent->value;
2788
0
        if      (c <    0x80)
2789
0
      { *out++  = c;                bits= -6; }
2790
0
        else if (c <   0x800)
2791
0
      { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2792
0
        else if (c < 0x10000)
2793
0
      { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2794
0
        else
2795
0
      { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2796
2797
0
        for ( ; bits >= 0; bits-= 6) {
2798
0
      *out++  = ((c >> bits) & 0x3F) | 0x80;
2799
0
        }
2800
0
    }
2801
0
      }
2802
0
  } else {
2803
0
      unsigned int c;
2804
0
      int bits, l;
2805
2806
0
      if (out - buffer > buffer_size - 100) {
2807
0
    int indx = out - buffer;
2808
2809
0
    growBuffer(buffer);
2810
0
    out = &buffer[indx];
2811
0
      }
2812
0
      c = CUR_CHAR(l);
2813
0
            if (ctxt->instate == XML_PARSER_EOF) {
2814
0
                xmlFree(buffer);
2815
0
                return(NULL);
2816
0
            }
2817
0
      if      (c <    0x80)
2818
0
        { *out++  = c;                bits= -6; }
2819
0
      else if (c <   0x800)
2820
0
        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2821
0
      else if (c < 0x10000)
2822
0
        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2823
0
      else
2824
0
        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2825
2826
0
      for ( ; bits >= 0; bits-= 6) {
2827
0
    *out++  = ((c >> bits) & 0x3F) | 0x80;
2828
0
      }
2829
0
      NEXTL(l);
2830
0
  }
2831
0
        if (out - buffer > maxLength) {
2832
0
            htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2833
0
                         "attribute value too long\n", NULL, NULL);
2834
0
            xmlFree(buffer);
2835
0
            return(NULL);
2836
0
        }
2837
0
    }
2838
0
    *out = 0;
2839
0
    return(buffer);
2840
0
}
2841
2842
/**
2843
 * htmlParseEntityRef:
2844
 * @ctxt:  an HTML parser context
2845
 * @str:  location to store the entity name
2846
 *
2847
 * DEPRECATED: Internal function, don't use.
2848
 *
2849
 * parse an HTML ENTITY references
2850
 *
2851
 * [68] EntityRef ::= '&' Name ';'
2852
 *
2853
 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2854
 *         if non-NULL *str will have to be freed by the caller.
2855
 */
2856
const htmlEntityDesc *
2857
0
htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2858
0
    const xmlChar *name;
2859
0
    const htmlEntityDesc * ent = NULL;
2860
2861
0
    if (str != NULL) *str = NULL;
2862
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2863
2864
0
    if (CUR == '&') {
2865
0
        NEXT;
2866
0
        name = htmlParseName(ctxt);
2867
0
  if (name == NULL) {
2868
0
      htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2869
0
                   "htmlParseEntityRef: no name\n", NULL, NULL);
2870
0
  } else {
2871
0
      GROW;
2872
0
      if (CUR == ';') {
2873
0
          if (str != NULL)
2874
0
        *str = name;
2875
2876
    /*
2877
     * Lookup the entity in the table.
2878
     */
2879
0
    ent = htmlEntityLookup(name);
2880
0
    if (ent != NULL) /* OK that's ugly !!! */
2881
0
        NEXT;
2882
0
      } else {
2883
0
    htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2884
0
                 "htmlParseEntityRef: expecting ';'\n",
2885
0
           NULL, NULL);
2886
0
          if (str != NULL)
2887
0
        *str = name;
2888
0
      }
2889
0
  }
2890
0
    }
2891
0
    return(ent);
2892
0
}
2893
2894
/**
2895
 * htmlParseAttValue:
2896
 * @ctxt:  an HTML parser context
2897
 *
2898
 * parse a value for an attribute
2899
 * Note: the parser won't do substitution of entities here, this
2900
 * will be handled later in xmlStringGetNodeList, unless it was
2901
 * asked for ctxt->replaceEntities != 0
2902
 *
2903
 * Returns the AttValue parsed or NULL.
2904
 */
2905
2906
static xmlChar *
2907
0
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2908
0
    xmlChar *ret = NULL;
2909
2910
0
    if (CUR == '"') {
2911
0
        NEXT;
2912
0
  ret = htmlParseHTMLAttribute(ctxt, '"');
2913
0
        if (CUR != '"') {
2914
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2915
0
                   "AttValue: \" expected\n", NULL, NULL);
2916
0
  } else
2917
0
      NEXT;
2918
0
    } else if (CUR == '\'') {
2919
0
        NEXT;
2920
0
  ret = htmlParseHTMLAttribute(ctxt, '\'');
2921
0
        if (CUR != '\'') {
2922
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2923
0
                   "AttValue: ' expected\n", NULL, NULL);
2924
0
  } else
2925
0
      NEXT;
2926
0
    } else {
2927
        /*
2928
   * That's an HTMLism, the attribute value may not be quoted
2929
   */
2930
0
  ret = htmlParseHTMLAttribute(ctxt, 0);
2931
0
  if (ret == NULL) {
2932
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2933
0
                   "AttValue: no value found\n", NULL, NULL);
2934
0
  }
2935
0
    }
2936
0
    return(ret);
2937
0
}
2938
2939
/**
2940
 * htmlParseSystemLiteral:
2941
 * @ctxt:  an HTML parser context
2942
 *
2943
 * parse an HTML Literal
2944
 *
2945
 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2946
 *
2947
 * Returns the SystemLiteral parsed or NULL
2948
 */
2949
2950
static xmlChar *
2951
0
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2952
0
    size_t len = 0, startPosition = 0;
2953
0
    int err = 0;
2954
0
    int quote;
2955
0
    xmlChar *ret = NULL;
2956
2957
0
    if ((CUR != '"') && (CUR != '\'')) {
2958
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2959
0
               "SystemLiteral \" or ' expected\n", NULL, NULL);
2960
0
        return(NULL);
2961
0
    }
2962
0
    quote = CUR;
2963
0
    NEXT;
2964
2965
0
    if (CUR_PTR < BASE_PTR)
2966
0
        return(ret);
2967
0
    startPosition = CUR_PTR - BASE_PTR;
2968
2969
0
    while ((CUR != 0) && (CUR != quote)) {
2970
        /* TODO: Handle UTF-8 */
2971
0
        if (!IS_CHAR_CH(CUR)) {
2972
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2973
0
                            "Invalid char in SystemLiteral 0x%X\n", CUR);
2974
0
            err = 1;
2975
0
        }
2976
0
        NEXT;
2977
0
        len++;
2978
0
    }
2979
0
    if (CUR != quote) {
2980
0
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2981
0
                     "Unfinished SystemLiteral\n", NULL, NULL);
2982
0
    } else {
2983
0
        if (err == 0)
2984
0
            ret = xmlStrndup((BASE_PTR+startPosition), len);
2985
0
        NEXT;
2986
0
    }
2987
2988
0
    return(ret);
2989
0
}
2990
2991
/**
2992
 * htmlParsePubidLiteral:
2993
 * @ctxt:  an HTML parser context
2994
 *
2995
 * parse an HTML public literal
2996
 *
2997
 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2998
 *
2999
 * Returns the PubidLiteral parsed or NULL.
3000
 */
3001
3002
static xmlChar *
3003
0
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3004
0
    size_t len = 0, startPosition = 0;
3005
0
    int err = 0;
3006
0
    int quote;
3007
0
    xmlChar *ret = NULL;
3008
3009
0
    if ((CUR != '"') && (CUR != '\'')) {
3010
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3011
0
               "PubidLiteral \" or ' expected\n", NULL, NULL);
3012
0
        return(NULL);
3013
0
    }
3014
0
    quote = CUR;
3015
0
    NEXT;
3016
3017
    /*
3018
     * Name ::= (Letter | '_') (NameChar)*
3019
     */
3020
0
    if (CUR_PTR < BASE_PTR)
3021
0
        return(ret);
3022
0
    startPosition = CUR_PTR - BASE_PTR;
3023
3024
0
    while ((CUR != 0) && (CUR != quote)) {
3025
0
        if (!IS_PUBIDCHAR_CH(CUR)) {
3026
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3027
0
                            "Invalid char in PubidLiteral 0x%X\n", CUR);
3028
0
            err = 1;
3029
0
        }
3030
0
        len++;
3031
0
        NEXT;
3032
0
    }
3033
3034
0
    if (CUR != quote) {
3035
0
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3036
0
                     "Unfinished PubidLiteral\n", NULL, NULL);
3037
0
    } else {
3038
0
        if (err == 0)
3039
0
            ret = xmlStrndup((BASE_PTR + startPosition), len);
3040
0
        NEXT;
3041
0
    }
3042
3043
0
    return(ret);
3044
0
}
3045
3046
/**
3047
 * htmlParseScript:
3048
 * @ctxt:  an HTML parser context
3049
 *
3050
 * parse the content of an HTML SCRIPT or STYLE element
3051
 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3052
 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3053
 * http://www.w3.org/TR/html4/types.html#type-script
3054
 * http://www.w3.org/TR/html4/types.html#h-6.15
3055
 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3056
 *
3057
 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3058
 * element and the value of intrinsic event attributes. User agents must
3059
 * not evaluate script data as HTML markup but instead must pass it on as
3060
 * data to a script engine.
3061
 * NOTES:
3062
 * - The content is passed like CDATA
3063
 * - the attributes for style and scripting "onXXX" are also described
3064
 *   as CDATA but SGML allows entities references in attributes so their
3065
 *   processing is identical as other attributes
3066
 */
3067
static void
3068
0
htmlParseScript(htmlParserCtxtPtr ctxt) {
3069
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3070
0
    int nbchar = 0;
3071
0
    int cur,l;
3072
3073
0
    cur = CUR_CHAR(l);
3074
0
    while (cur != 0) {
3075
0
  if ((cur == '<') && (NXT(1) == '/')) {
3076
            /*
3077
             * One should break here, the specification is clear:
3078
             * Authors should therefore escape "</" within the content.
3079
             * Escape mechanisms are specific to each scripting or
3080
             * style sheet language.
3081
             *
3082
             * In recovery mode, only break if end tag match the
3083
             * current tag, effectively ignoring all tags inside the
3084
             * script/style block and treating the entire block as
3085
             * CDATA.
3086
             */
3087
0
            if (ctxt->recovery) {
3088
0
                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3089
0
           xmlStrlen(ctxt->name)) == 0)
3090
0
                {
3091
0
                    break; /* while */
3092
0
                } else {
3093
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3094
0
         "Element %s embeds close tag\n",
3095
0
                     ctxt->name, NULL);
3096
0
    }
3097
0
            } else {
3098
0
                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3099
0
                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3100
0
                {
3101
0
                    break; /* while */
3102
0
                }
3103
0
            }
3104
0
  }
3105
0
        if (IS_CHAR(cur)) {
3106
0
      COPY_BUF(l,buf,nbchar,cur);
3107
0
        } else {
3108
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3109
0
                            "Invalid char in CDATA 0x%X\n", cur);
3110
0
        }
3111
0
  NEXTL(l);
3112
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3113
0
            buf[nbchar] = 0;
3114
0
      if (ctxt->sax->cdataBlock!= NULL) {
3115
    /*
3116
     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3117
     */
3118
0
    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3119
0
      } else if (ctxt->sax->characters != NULL) {
3120
0
    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3121
0
      }
3122
0
      nbchar = 0;
3123
0
            SHRINK;
3124
0
  }
3125
0
  cur = CUR_CHAR(l);
3126
0
    }
3127
3128
0
    if (ctxt->instate == XML_PARSER_EOF)
3129
0
        return;
3130
3131
0
    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3132
0
        buf[nbchar] = 0;
3133
0
  if (ctxt->sax->cdataBlock!= NULL) {
3134
      /*
3135
       * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3136
       */
3137
0
      ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3138
0
  } else if (ctxt->sax->characters != NULL) {
3139
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3140
0
  }
3141
0
    }
3142
0
}
3143
3144
3145
/**
3146
 * htmlParseCharDataInternal:
3147
 * @ctxt:  an HTML parser context
3148
 * @readahead: optional read ahead character in ascii range
3149
 *
3150
 * parse a CharData section.
3151
 * if we are within a CDATA section ']]>' marks an end of section.
3152
 *
3153
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3154
 */
3155
3156
static void
3157
0
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3158
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3159
0
    int nbchar = 0;
3160
0
    int cur, l;
3161
3162
0
    if (readahead)
3163
0
        buf[nbchar++] = readahead;
3164
3165
0
    cur = CUR_CHAR(l);
3166
0
    while (((cur != '<') || (ctxt->token == '<')) &&
3167
0
           ((cur != '&') || (ctxt->token == '&')) &&
3168
0
     (cur != 0)) {
3169
0
  if (!(IS_CHAR(cur))) {
3170
0
      htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3171
0
                  "Invalid char in CDATA 0x%X\n", cur);
3172
0
  } else {
3173
0
      COPY_BUF(l,buf,nbchar,cur);
3174
0
  }
3175
0
  NEXTL(l);
3176
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3177
0
            buf[nbchar] = 0;
3178
3179
      /*
3180
       * Ok the segment is to be consumed as chars.
3181
       */
3182
0
      if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3183
0
    if (areBlanks(ctxt, buf, nbchar)) {
3184
0
        if (ctxt->keepBlanks) {
3185
0
      if (ctxt->sax->characters != NULL)
3186
0
          ctxt->sax->characters(ctxt->userData, buf, nbchar);
3187
0
        } else {
3188
0
      if (ctxt->sax->ignorableWhitespace != NULL)
3189
0
          ctxt->sax->ignorableWhitespace(ctxt->userData,
3190
0
                                         buf, nbchar);
3191
0
        }
3192
0
    } else {
3193
0
        htmlCheckParagraph(ctxt);
3194
0
        if (ctxt->sax->characters != NULL)
3195
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3196
0
    }
3197
0
      }
3198
0
      nbchar = 0;
3199
0
            SHRINK;
3200
0
  }
3201
0
  cur = CUR_CHAR(l);
3202
0
    }
3203
0
    if (ctxt->instate == XML_PARSER_EOF)
3204
0
        return;
3205
0
    if (nbchar != 0) {
3206
0
        buf[nbchar] = 0;
3207
3208
  /*
3209
   * Ok the segment is to be consumed as chars.
3210
   */
3211
0
  if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3212
0
      if (areBlanks(ctxt, buf, nbchar)) {
3213
0
    if (ctxt->keepBlanks) {
3214
0
        if (ctxt->sax->characters != NULL)
3215
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3216
0
    } else {
3217
0
        if (ctxt->sax->ignorableWhitespace != NULL)
3218
0
      ctxt->sax->ignorableWhitespace(ctxt->userData,
3219
0
                                     buf, nbchar);
3220
0
    }
3221
0
      } else {
3222
0
    htmlCheckParagraph(ctxt);
3223
0
    if (ctxt->sax->characters != NULL)
3224
0
        ctxt->sax->characters(ctxt->userData, buf, nbchar);
3225
0
      }
3226
0
  }
3227
0
    }
3228
0
}
3229
3230
/**
3231
 * htmlParseCharData:
3232
 * @ctxt:  an HTML parser context
3233
 *
3234
 * parse a CharData section.
3235
 * if we are within a CDATA section ']]>' marks an end of section.
3236
 *
3237
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3238
 */
3239
3240
static void
3241
0
htmlParseCharData(htmlParserCtxtPtr ctxt) {
3242
0
    htmlParseCharDataInternal(ctxt, 0);
3243
0
}
3244
3245
/**
3246
 * htmlParseExternalID:
3247
 * @ctxt:  an HTML parser context
3248
 * @publicID:  a xmlChar** receiving PubidLiteral
3249
 *
3250
 * Parse an External ID or a Public ID
3251
 *
3252
 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3253
 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3254
 *
3255
 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3256
 *
3257
 * Returns the function returns SystemLiteral and in the second
3258
 *                case publicID receives PubidLiteral, is strict is off
3259
 *                it is possible to return NULL and have publicID set.
3260
 */
3261
3262
static xmlChar *
3263
0
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3264
0
    xmlChar *URI = NULL;
3265
3266
0
    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3267
0
         (UPP(2) == 'S') && (UPP(3) == 'T') &&
3268
0
   (UPP(4) == 'E') && (UPP(5) == 'M')) {
3269
0
        SKIP(6);
3270
0
  if (!IS_BLANK_CH(CUR)) {
3271
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3272
0
                   "Space required after 'SYSTEM'\n", NULL, NULL);
3273
0
  }
3274
0
        SKIP_BLANKS;
3275
0
  URI = htmlParseSystemLiteral(ctxt);
3276
0
  if (URI == NULL) {
3277
0
      htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3278
0
                   "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3279
0
        }
3280
0
    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3281
0
         (UPP(2) == 'B') && (UPP(3) == 'L') &&
3282
0
         (UPP(4) == 'I') && (UPP(5) == 'C')) {
3283
0
        SKIP(6);
3284
0
  if (!IS_BLANK_CH(CUR)) {
3285
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3286
0
                   "Space required after 'PUBLIC'\n", NULL, NULL);
3287
0
  }
3288
0
        SKIP_BLANKS;
3289
0
  *publicID = htmlParsePubidLiteral(ctxt);
3290
0
  if (*publicID == NULL) {
3291
0
      htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3292
0
                   "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3293
0
       NULL, NULL);
3294
0
  }
3295
0
        SKIP_BLANKS;
3296
0
        if ((CUR == '"') || (CUR == '\'')) {
3297
0
      URI = htmlParseSystemLiteral(ctxt);
3298
0
  }
3299
0
    }
3300
0
    return(URI);
3301
0
}
3302
3303
/**
3304
 * htmlParsePI:
3305
 * @ctxt:  an XML parser context
3306
 *
3307
 * parse an XML Processing Instruction.
3308
 *
3309
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3310
 */
3311
static void
3312
0
htmlParsePI(htmlParserCtxtPtr ctxt) {
3313
0
    xmlChar *buf = NULL;
3314
0
    int len = 0;
3315
0
    int size = HTML_PARSER_BUFFER_SIZE;
3316
0
    int cur, l;
3317
0
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3318
0
                    XML_MAX_HUGE_LENGTH :
3319
0
                    XML_MAX_TEXT_LENGTH;
3320
0
    const xmlChar *target;
3321
0
    xmlParserInputState state;
3322
3323
0
    if ((RAW == '<') && (NXT(1) == '?')) {
3324
0
  state = ctxt->instate;
3325
0
        ctxt->instate = XML_PARSER_PI;
3326
  /*
3327
   * this is a Processing Instruction.
3328
   */
3329
0
  SKIP(2);
3330
3331
  /*
3332
   * Parse the target name and check for special support like
3333
   * namespace.
3334
   */
3335
0
        target = htmlParseName(ctxt);
3336
0
  if (target != NULL) {
3337
0
      if (RAW == '>') {
3338
0
    SKIP(1);
3339
3340
    /*
3341
     * SAX: PI detected.
3342
     */
3343
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3344
0
        (ctxt->sax->processingInstruction != NULL))
3345
0
        ctxt->sax->processingInstruction(ctxt->userData,
3346
0
                                         target, NULL);
3347
0
    ctxt->instate = state;
3348
0
    return;
3349
0
      }
3350
0
      buf = (xmlChar *) xmlMallocAtomic(size);
3351
0
      if (buf == NULL) {
3352
0
    htmlErrMemory(ctxt, NULL);
3353
0
    ctxt->instate = state;
3354
0
    return;
3355
0
      }
3356
0
      cur = CUR;
3357
0
      if (!IS_BLANK(cur)) {
3358
0
    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3359
0
        "ParsePI: PI %s space expected\n", target, NULL);
3360
0
      }
3361
0
            SKIP_BLANKS;
3362
0
      cur = CUR_CHAR(l);
3363
0
      while ((cur != 0) && (cur != '>')) {
3364
0
    if (len + 5 >= size) {
3365
0
        xmlChar *tmp;
3366
3367
0
        size *= 2;
3368
0
        tmp = (xmlChar *) xmlRealloc(buf, size);
3369
0
        if (tmp == NULL) {
3370
0
      htmlErrMemory(ctxt, NULL);
3371
0
      xmlFree(buf);
3372
0
      ctxt->instate = state;
3373
0
      return;
3374
0
        }
3375
0
        buf = tmp;
3376
0
    }
3377
0
                if (IS_CHAR(cur)) {
3378
0
        COPY_BUF(l,buf,len,cur);
3379
0
                } else {
3380
0
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3381
0
                                    "Invalid char in processing instruction "
3382
0
                                    "0x%X\n", cur);
3383
0
                }
3384
0
                if (len > maxLength) {
3385
0
                    htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3386
0
                                 "PI %s too long", target, NULL);
3387
0
                    xmlFree(buf);
3388
0
                    ctxt->instate = state;
3389
0
                    return;
3390
0
                }
3391
0
    NEXTL(l);
3392
0
    cur = CUR_CHAR(l);
3393
0
      }
3394
0
      buf[len] = 0;
3395
0
            if (ctxt->instate == XML_PARSER_EOF) {
3396
0
                xmlFree(buf);
3397
0
                return;
3398
0
            }
3399
0
      if (cur != '>') {
3400
0
    htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3401
0
          "ParsePI: PI %s never end ...\n", target, NULL);
3402
0
      } else {
3403
0
    SKIP(1);
3404
3405
    /*
3406
     * SAX: PI detected.
3407
     */
3408
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3409
0
        (ctxt->sax->processingInstruction != NULL))
3410
0
        ctxt->sax->processingInstruction(ctxt->userData,
3411
0
                                         target, buf);
3412
0
      }
3413
0
      xmlFree(buf);
3414
0
  } else {
3415
0
      htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3416
0
                         "PI is not started correctly", NULL, NULL);
3417
0
  }
3418
0
  ctxt->instate = state;
3419
0
    }
3420
0
}
3421
3422
/**
3423
 * htmlParseComment:
3424
 * @ctxt:  an HTML parser context
3425
 *
3426
 * Parse an XML (SGML) comment <!-- .... -->
3427
 *
3428
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3429
 */
3430
static void
3431
0
htmlParseComment(htmlParserCtxtPtr ctxt) {
3432
0
    xmlChar *buf = NULL;
3433
0
    int len;
3434
0
    int size = HTML_PARSER_BUFFER_SIZE;
3435
0
    int q, ql;
3436
0
    int r, rl;
3437
0
    int cur, l;
3438
0
    int next, nl;
3439
0
    int maxLength = (ctxt->options & XML_PARSE_HUGE) ?
3440
0
                    XML_MAX_HUGE_LENGTH :
3441
0
                    XML_MAX_TEXT_LENGTH;
3442
0
    xmlParserInputState state;
3443
3444
    /*
3445
     * Check that there is a comment right here.
3446
     */
3447
0
    if ((RAW != '<') || (NXT(1) != '!') ||
3448
0
        (NXT(2) != '-') || (NXT(3) != '-')) return;
3449
3450
0
    state = ctxt->instate;
3451
0
    ctxt->instate = XML_PARSER_COMMENT;
3452
0
    SKIP(4);
3453
0
    buf = (xmlChar *) xmlMallocAtomic(size);
3454
0
    if (buf == NULL) {
3455
0
        htmlErrMemory(ctxt, "buffer allocation failed\n");
3456
0
  ctxt->instate = state;
3457
0
  return;
3458
0
    }
3459
0
    len = 0;
3460
0
    buf[len] = 0;
3461
0
    q = CUR_CHAR(ql);
3462
0
    if (q == 0)
3463
0
        goto unfinished;
3464
0
    if (q == '>') {
3465
0
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3466
0
        cur = '>';
3467
0
        goto finished;
3468
0
    }
3469
0
    NEXTL(ql);
3470
0
    r = CUR_CHAR(rl);
3471
0
    if (r == 0)
3472
0
        goto unfinished;
3473
0
    if (q == '-' && r == '>') {
3474
0
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3475
0
        cur = '>';
3476
0
        goto finished;
3477
0
    }
3478
0
    NEXTL(rl);
3479
0
    cur = CUR_CHAR(l);
3480
0
    while ((cur != 0) &&
3481
0
           ((cur != '>') ||
3482
0
      (r != '-') || (q != '-'))) {
3483
0
  NEXTL(l);
3484
0
  next = CUR_CHAR(nl);
3485
3486
0
  if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3487
0
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3488
0
           "Comment incorrectly closed by '--!>'", NULL, NULL);
3489
0
    cur = '>';
3490
0
    break;
3491
0
  }
3492
3493
0
  if (len + 5 >= size) {
3494
0
      xmlChar *tmp;
3495
3496
0
      size *= 2;
3497
0
      tmp = (xmlChar *) xmlRealloc(buf, size);
3498
0
      if (tmp == NULL) {
3499
0
          xmlFree(buf);
3500
0
          htmlErrMemory(ctxt, "growing buffer failed\n");
3501
0
    ctxt->instate = state;
3502
0
    return;
3503
0
      }
3504
0
      buf = tmp;
3505
0
  }
3506
0
        if (IS_CHAR(q)) {
3507
0
      COPY_BUF(ql,buf,len,q);
3508
0
        } else {
3509
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3510
0
                            "Invalid char in comment 0x%X\n", q);
3511
0
        }
3512
0
        if (len > maxLength) {
3513
0
            htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3514
0
                         "comment too long", NULL, NULL);
3515
0
            xmlFree(buf);
3516
0
            ctxt->instate = state;
3517
0
            return;
3518
0
        }
3519
3520
0
  q = r;
3521
0
  ql = rl;
3522
0
  r = cur;
3523
0
  rl = l;
3524
0
  cur = next;
3525
0
  l = nl;
3526
0
    }
3527
0
finished:
3528
0
    buf[len] = 0;
3529
0
    if (ctxt->instate == XML_PARSER_EOF) {
3530
0
        xmlFree(buf);
3531
0
        return;
3532
0
    }
3533
0
    if (cur == '>') {
3534
0
        NEXT;
3535
0
  if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3536
0
      (!ctxt->disableSAX))
3537
0
      ctxt->sax->comment(ctxt->userData, buf);
3538
0
  xmlFree(buf);
3539
0
  ctxt->instate = state;
3540
0
  return;
3541
0
    }
3542
3543
0
unfinished:
3544
0
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3545
0
     "Comment not terminated \n<!--%.50s\n", buf, NULL);
3546
0
    xmlFree(buf);
3547
0
}
3548
3549
/**
3550
 * htmlParseCharRef:
3551
 * @ctxt:  an HTML parser context
3552
 *
3553
 * DEPRECATED: Internal function, don't use.
3554
 *
3555
 * parse Reference declarations
3556
 *
3557
 * [66] CharRef ::= '&#' [0-9]+ ';' |
3558
 *                  '&#x' [0-9a-fA-F]+ ';'
3559
 *
3560
 * Returns the value parsed (as an int)
3561
 */
3562
int
3563
0
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3564
0
    int val = 0;
3565
3566
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3567
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3568
0
         "htmlParseCharRef: context error\n",
3569
0
         NULL, NULL);
3570
0
        return(0);
3571
0
    }
3572
0
    if ((CUR == '&') && (NXT(1) == '#') &&
3573
0
        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3574
0
  SKIP(3);
3575
0
  while (CUR != ';') {
3576
0
      if ((CUR >= '0') && (CUR <= '9')) {
3577
0
                if (val < 0x110000)
3578
0
              val = val * 16 + (CUR - '0');
3579
0
            } else if ((CUR >= 'a') && (CUR <= 'f')) {
3580
0
                if (val < 0x110000)
3581
0
              val = val * 16 + (CUR - 'a') + 10;
3582
0
            } else if ((CUR >= 'A') && (CUR <= 'F')) {
3583
0
                if (val < 0x110000)
3584
0
              val = val * 16 + (CUR - 'A') + 10;
3585
0
            } else {
3586
0
          htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3587
0
                 "htmlParseCharRef: missing semicolon\n",
3588
0
           NULL, NULL);
3589
0
    break;
3590
0
      }
3591
0
      NEXT;
3592
0
  }
3593
0
  if (CUR == ';')
3594
0
      NEXT;
3595
0
    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3596
0
  SKIP(2);
3597
0
  while (CUR != ';') {
3598
0
      if ((CUR >= '0') && (CUR <= '9')) {
3599
0
                if (val < 0x110000)
3600
0
              val = val * 10 + (CUR - '0');
3601
0
            } else {
3602
0
          htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3603
0
                 "htmlParseCharRef: missing semicolon\n",
3604
0
           NULL, NULL);
3605
0
    break;
3606
0
      }
3607
0
      NEXT;
3608
0
  }
3609
0
  if (CUR == ';')
3610
0
      NEXT;
3611
0
    } else {
3612
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3613
0
               "htmlParseCharRef: invalid value\n", NULL, NULL);
3614
0
    }
3615
    /*
3616
     * Check the value IS_CHAR ...
3617
     */
3618
0
    if (IS_CHAR(val)) {
3619
0
        return(val);
3620
0
    } else if (val >= 0x110000) {
3621
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3622
0
         "htmlParseCharRef: value too large\n", NULL, NULL);
3623
0
    } else {
3624
0
  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3625
0
      "htmlParseCharRef: invalid xmlChar value %d\n",
3626
0
      val);
3627
0
    }
3628
0
    return(0);
3629
0
}
3630
3631
3632
/**
3633
 * htmlParseDocTypeDecl:
3634
 * @ctxt:  an HTML parser context
3635
 *
3636
 * parse a DOCTYPE declaration
3637
 *
3638
 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3639
 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3640
 */
3641
3642
static void
3643
0
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3644
0
    const xmlChar *name;
3645
0
    xmlChar *ExternalID = NULL;
3646
0
    xmlChar *URI = NULL;
3647
3648
    /*
3649
     * We know that '<!DOCTYPE' has been detected.
3650
     */
3651
0
    SKIP(9);
3652
3653
0
    SKIP_BLANKS;
3654
3655
    /*
3656
     * Parse the DOCTYPE name.
3657
     */
3658
0
    name = htmlParseName(ctxt);
3659
0
    if (name == NULL) {
3660
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3661
0
               "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3662
0
         NULL, NULL);
3663
0
    }
3664
    /*
3665
     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3666
     */
3667
3668
0
    SKIP_BLANKS;
3669
3670
    /*
3671
     * Check for SystemID and ExternalID
3672
     */
3673
0
    URI = htmlParseExternalID(ctxt, &ExternalID);
3674
0
    SKIP_BLANKS;
3675
3676
    /*
3677
     * We should be at the end of the DOCTYPE declaration.
3678
     */
3679
0
    if (CUR != '>') {
3680
0
  htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3681
0
               "DOCTYPE improperly terminated\n", NULL, NULL);
3682
        /* Ignore bogus content */
3683
0
        while ((CUR != 0) && (CUR != '>') &&
3684
0
               (ctxt->instate != XML_PARSER_EOF))
3685
0
            NEXT;
3686
0
    }
3687
0
    if (CUR == '>')
3688
0
        NEXT;
3689
3690
    /*
3691
     * Create or update the document accordingly to the DOCTYPE
3692
     */
3693
0
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3694
0
  (!ctxt->disableSAX))
3695
0
  ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3696
3697
    /*
3698
     * Cleanup, since we don't use all those identifiers
3699
     */
3700
0
    if (URI != NULL) xmlFree(URI);
3701
0
    if (ExternalID != NULL) xmlFree(ExternalID);
3702
0
}
3703
3704
/**
3705
 * htmlParseAttribute:
3706
 * @ctxt:  an HTML parser context
3707
 * @value:  a xmlChar ** used to store the value of the attribute
3708
 *
3709
 * parse an attribute
3710
 *
3711
 * [41] Attribute ::= Name Eq AttValue
3712
 *
3713
 * [25] Eq ::= S? '=' S?
3714
 *
3715
 * With namespace:
3716
 *
3717
 * [NS 11] Attribute ::= QName Eq AttValue
3718
 *
3719
 * Also the case QName == xmlns:??? is handled independently as a namespace
3720
 * definition.
3721
 *
3722
 * Returns the attribute name, and the value in *value.
3723
 */
3724
3725
static const xmlChar *
3726
0
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3727
0
    const xmlChar *name;
3728
0
    xmlChar *val = NULL;
3729
3730
0
    *value = NULL;
3731
0
    name = htmlParseHTMLName(ctxt);
3732
0
    if (name == NULL) {
3733
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3734
0
               "error parsing attribute name\n", NULL, NULL);
3735
0
        return(NULL);
3736
0
    }
3737
3738
    /*
3739
     * read the value
3740
     */
3741
0
    SKIP_BLANKS;
3742
0
    if (CUR == '=') {
3743
0
        NEXT;
3744
0
  SKIP_BLANKS;
3745
0
  val = htmlParseAttValue(ctxt);
3746
0
    }
3747
3748
0
    *value = val;
3749
0
    return(name);
3750
0
}
3751
3752
/**
3753
 * htmlCheckEncoding:
3754
 * @ctxt:  an HTML parser context
3755
 * @attvalue: the attribute value
3756
 *
3757
 * Checks an http-equiv attribute from a Meta tag to detect
3758
 * the encoding
3759
 * If a new encoding is detected the parser is switched to decode
3760
 * it and pass UTF8
3761
 */
3762
static void
3763
0
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3764
0
    const xmlChar *encoding;
3765
3766
0
    if (!attvalue)
3767
0
  return;
3768
3769
0
    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3770
0
    if (encoding != NULL) {
3771
0
  encoding += 7;
3772
0
    }
3773
    /*
3774
     * skip blank
3775
     */
3776
0
    if (encoding && IS_BLANK_CH(*encoding))
3777
0
  encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3778
0
    if (encoding && *encoding == '=') {
3779
0
  encoding ++;
3780
0
  xmlSetDeclaredEncoding(ctxt, xmlStrdup(encoding));
3781
0
    }
3782
0
}
3783
3784
/**
3785
 * htmlCheckMeta:
3786
 * @ctxt:  an HTML parser context
3787
 * @atts:  the attributes values
3788
 *
3789
 * Checks an attributes from a Meta tag
3790
 */
3791
static void
3792
0
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3793
0
    int i;
3794
0
    const xmlChar *att, *value;
3795
0
    int http = 0;
3796
0
    const xmlChar *content = NULL;
3797
3798
0
    if ((ctxt == NULL) || (atts == NULL))
3799
0
  return;
3800
3801
0
    i = 0;
3802
0
    att = atts[i++];
3803
0
    while (att != NULL) {
3804
0
  value = atts[i++];
3805
0
  if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3806
0
   && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3807
0
      http = 1;
3808
0
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3809
0
      xmlSetDeclaredEncoding(ctxt, xmlStrdup(value));
3810
0
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3811
0
      content = value;
3812
0
  att = atts[i++];
3813
0
    }
3814
0
    if ((http) && (content != NULL))
3815
0
  htmlCheckEncoding(ctxt, content);
3816
3817
0
}
3818
3819
/**
3820
 * htmlParseStartTag:
3821
 * @ctxt:  an HTML parser context
3822
 *
3823
 * parse a start of tag either for rule element or
3824
 * EmptyElement. In both case we don't parse the tag closing chars.
3825
 *
3826
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3827
 *
3828
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3829
 *
3830
 * With namespace:
3831
 *
3832
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3833
 *
3834
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3835
 *
3836
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3837
 */
3838
3839
static int
3840
0
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3841
0
    const xmlChar *name;
3842
0
    const xmlChar *attname;
3843
0
    xmlChar *attvalue;
3844
0
    const xmlChar **atts;
3845
0
    int nbatts = 0;
3846
0
    int maxatts;
3847
0
    int meta = 0;
3848
0
    int i;
3849
0
    int discardtag = 0;
3850
3851
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3852
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3853
0
         "htmlParseStartTag: context error\n", NULL, NULL);
3854
0
  return -1;
3855
0
    }
3856
0
    if (ctxt->instate == XML_PARSER_EOF)
3857
0
        return(-1);
3858
0
    if (CUR != '<') return -1;
3859
0
    NEXT;
3860
3861
0
    atts = ctxt->atts;
3862
0
    maxatts = ctxt->maxatts;
3863
3864
0
    GROW;
3865
0
    name = htmlParseHTMLName(ctxt);
3866
0
    if (name == NULL) {
3867
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3868
0
               "htmlParseStartTag: invalid element name\n",
3869
0
         NULL, NULL);
3870
  /* Dump the bogus tag like browsers do */
3871
0
  while ((CUR != 0) && (CUR != '>') &&
3872
0
               (ctxt->instate != XML_PARSER_EOF))
3873
0
      NEXT;
3874
0
        return -1;
3875
0
    }
3876
0
    if (xmlStrEqual(name, BAD_CAST"meta"))
3877
0
  meta = 1;
3878
3879
    /*
3880
     * Check for auto-closure of HTML elements.
3881
     */
3882
0
    htmlAutoClose(ctxt, name);
3883
3884
    /*
3885
     * Check for implied HTML elements.
3886
     */
3887
0
    htmlCheckImplied(ctxt, name);
3888
3889
    /*
3890
     * Avoid html at any level > 0, head at any level != 1
3891
     * or any attempt to recurse body
3892
     */
3893
0
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3894
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3895
0
               "htmlParseStartTag: misplaced <html> tag\n",
3896
0
         name, NULL);
3897
0
  discardtag = 1;
3898
0
  ctxt->depth++;
3899
0
    }
3900
0
    if ((ctxt->nameNr != 1) &&
3901
0
  (xmlStrEqual(name, BAD_CAST"head"))) {
3902
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3903
0
               "htmlParseStartTag: misplaced <head> tag\n",
3904
0
         name, NULL);
3905
0
  discardtag = 1;
3906
0
  ctxt->depth++;
3907
0
    }
3908
0
    if (xmlStrEqual(name, BAD_CAST"body")) {
3909
0
  int indx;
3910
0
  for (indx = 0;indx < ctxt->nameNr;indx++) {
3911
0
      if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3912
0
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3913
0
                 "htmlParseStartTag: misplaced <body> tag\n",
3914
0
           name, NULL);
3915
0
    discardtag = 1;
3916
0
    ctxt->depth++;
3917
0
      }
3918
0
  }
3919
0
    }
3920
3921
    /*
3922
     * Now parse the attributes, it ends up with the ending
3923
     *
3924
     * (S Attribute)* S?
3925
     */
3926
0
    SKIP_BLANKS;
3927
0
    while ((CUR != 0) &&
3928
0
           (CUR != '>') &&
3929
0
     ((CUR != '/') || (NXT(1) != '>')) &&
3930
0
           (ctxt->instate != XML_PARSER_EOF)) {
3931
0
  GROW;
3932
0
  attname = htmlParseAttribute(ctxt, &attvalue);
3933
0
        if (attname != NULL) {
3934
3935
      /*
3936
       * Well formedness requires at most one declaration of an attribute
3937
       */
3938
0
      for (i = 0; i < nbatts;i += 2) {
3939
0
          if (xmlStrEqual(atts[i], attname)) {
3940
0
        htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3941
0
                     "Attribute %s redefined\n", attname, NULL);
3942
0
        if (attvalue != NULL)
3943
0
      xmlFree(attvalue);
3944
0
        goto failed;
3945
0
    }
3946
0
      }
3947
3948
      /*
3949
       * Add the pair to atts
3950
       */
3951
0
      if (atts == NULL) {
3952
0
          maxatts = 22; /* allow for 10 attrs by default */
3953
0
          atts = (const xmlChar **)
3954
0
           xmlMalloc(maxatts * sizeof(xmlChar *));
3955
0
    if (atts == NULL) {
3956
0
        htmlErrMemory(ctxt, NULL);
3957
0
        if (attvalue != NULL)
3958
0
      xmlFree(attvalue);
3959
0
        goto failed;
3960
0
    }
3961
0
    ctxt->atts = atts;
3962
0
    ctxt->maxatts = maxatts;
3963
0
      } else if (nbatts + 4 > maxatts) {
3964
0
          const xmlChar **n;
3965
3966
0
          maxatts *= 2;
3967
0
          n = (const xmlChar **) xmlRealloc((void *) atts,
3968
0
               maxatts * sizeof(const xmlChar *));
3969
0
    if (n == NULL) {
3970
0
        htmlErrMemory(ctxt, NULL);
3971
0
        if (attvalue != NULL)
3972
0
      xmlFree(attvalue);
3973
0
        goto failed;
3974
0
    }
3975
0
    atts = n;
3976
0
    ctxt->atts = atts;
3977
0
    ctxt->maxatts = maxatts;
3978
0
      }
3979
0
      atts[nbatts++] = attname;
3980
0
      atts[nbatts++] = attvalue;
3981
0
      atts[nbatts] = NULL;
3982
0
      atts[nbatts + 1] = NULL;
3983
0
  }
3984
0
  else {
3985
0
      if (attvalue != NULL)
3986
0
          xmlFree(attvalue);
3987
      /* Dump the bogus attribute string up to the next blank or
3988
       * the end of the tag. */
3989
0
      while ((CUR != 0) &&
3990
0
             !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3991
0
       ((CUR != '/') || (NXT(1) != '>')) &&
3992
0
                   (ctxt->instate != XML_PARSER_EOF))
3993
0
    NEXT;
3994
0
  }
3995
3996
0
failed:
3997
0
  SKIP_BLANKS;
3998
0
    }
3999
4000
    /*
4001
     * Handle specific association to the META tag
4002
     */
4003
0
    if (meta && (nbatts != 0))
4004
0
  htmlCheckMeta(ctxt, atts);
4005
4006
    /*
4007
     * SAX: Start of Element !
4008
     */
4009
0
    if (!discardtag) {
4010
0
  htmlnamePush(ctxt, name);
4011
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4012
0
      if (nbatts != 0)
4013
0
    ctxt->sax->startElement(ctxt->userData, name, atts);
4014
0
      else
4015
0
    ctxt->sax->startElement(ctxt->userData, name, NULL);
4016
0
  }
4017
0
    }
4018
4019
0
    if (atts != NULL) {
4020
0
        for (i = 1;i < nbatts;i += 2) {
4021
0
      if (atts[i] != NULL)
4022
0
    xmlFree((xmlChar *) atts[i]);
4023
0
  }
4024
0
    }
4025
4026
0
    return(discardtag);
4027
0
}
4028
4029
/**
4030
 * htmlParseEndTag:
4031
 * @ctxt:  an HTML parser context
4032
 *
4033
 * parse an end of tag
4034
 *
4035
 * [42] ETag ::= '</' Name S? '>'
4036
 *
4037
 * With namespace
4038
 *
4039
 * [NS 9] ETag ::= '</' QName S? '>'
4040
 *
4041
 * Returns 1 if the current level should be closed.
4042
 */
4043
4044
static int
4045
htmlParseEndTag(htmlParserCtxtPtr ctxt)
4046
0
{
4047
0
    const xmlChar *name;
4048
0
    const xmlChar *oldname;
4049
0
    int i, ret;
4050
4051
0
    if ((CUR != '<') || (NXT(1) != '/')) {
4052
0
        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4053
0
               "htmlParseEndTag: '</' not found\n", NULL, NULL);
4054
0
        return (0);
4055
0
    }
4056
0
    SKIP(2);
4057
4058
0
    name = htmlParseHTMLName(ctxt);
4059
0
    if (name == NULL)
4060
0
        return (0);
4061
    /*
4062
     * We should definitely be at the ending "S? '>'" part
4063
     */
4064
0
    SKIP_BLANKS;
4065
0
    if (CUR != '>') {
4066
0
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4067
0
               "End tag : expected '>'\n", NULL, NULL);
4068
        /* Skip to next '>' */
4069
0
        while ((CUR != 0) && (CUR != '>'))
4070
0
            NEXT;
4071
0
    }
4072
0
    if (CUR == '>')
4073
0
        NEXT;
4074
4075
    /*
4076
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
4077
     * out now.
4078
     */
4079
0
    if ((ctxt->depth > 0) &&
4080
0
        (xmlStrEqual(name, BAD_CAST "html") ||
4081
0
         xmlStrEqual(name, BAD_CAST "body") ||
4082
0
   xmlStrEqual(name, BAD_CAST "head"))) {
4083
0
  ctxt->depth--;
4084
0
  return (0);
4085
0
    }
4086
4087
    /*
4088
     * If the name read is not one of the element in the parsing stack
4089
     * then return, it's just an error.
4090
     */
4091
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4092
0
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4093
0
            break;
4094
0
    }
4095
0
    if (i < 0) {
4096
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4097
0
               "Unexpected end tag : %s\n", name, NULL);
4098
0
        return (0);
4099
0
    }
4100
4101
4102
    /*
4103
     * Check for auto-closure of HTML elements.
4104
     */
4105
4106
0
    htmlAutoCloseOnClose(ctxt, name);
4107
4108
    /*
4109
     * Well formedness constraints, opening and closing must match.
4110
     * With the exception that the autoclose may have popped stuff out
4111
     * of the stack.
4112
     */
4113
0
    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4114
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4115
0
                     "Opening and ending tag mismatch: %s and %s\n",
4116
0
                     name, ctxt->name);
4117
0
    }
4118
4119
    /*
4120
     * SAX: End of Tag
4121
     */
4122
0
    oldname = ctxt->name;
4123
0
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4124
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4125
0
            ctxt->sax->endElement(ctxt->userData, name);
4126
0
  htmlNodeInfoPop(ctxt);
4127
0
        htmlnamePop(ctxt);
4128
0
        ret = 1;
4129
0
    } else {
4130
0
        ret = 0;
4131
0
    }
4132
4133
0
    return (ret);
4134
0
}
4135
4136
4137
/**
4138
 * htmlParseReference:
4139
 * @ctxt:  an HTML parser context
4140
 *
4141
 * parse and handle entity references in content,
4142
 * this will end-up in a call to character() since this is either a
4143
 * CharRef, or a predefined entity.
4144
 */
4145
static void
4146
0
htmlParseReference(htmlParserCtxtPtr ctxt) {
4147
0
    const htmlEntityDesc * ent;
4148
0
    xmlChar out[6];
4149
0
    const xmlChar *name;
4150
0
    if (CUR != '&') return;
4151
4152
0
    if (NXT(1) == '#') {
4153
0
  unsigned int c;
4154
0
  int bits, i = 0;
4155
4156
0
  c = htmlParseCharRef(ctxt);
4157
0
  if (c == 0)
4158
0
      return;
4159
4160
0
        if      (c <    0x80) { out[i++]= c;                bits= -6; }
4161
0
        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4162
0
        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4163
0
        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4164
4165
0
        for ( ; bits >= 0; bits-= 6) {
4166
0
            out[i++]= ((c >> bits) & 0x3F) | 0x80;
4167
0
        }
4168
0
  out[i] = 0;
4169
4170
0
  htmlCheckParagraph(ctxt);
4171
0
  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4172
0
      ctxt->sax->characters(ctxt->userData, out, i);
4173
0
    } else {
4174
0
  ent = htmlParseEntityRef(ctxt, &name);
4175
0
  if (name == NULL) {
4176
0
      htmlCheckParagraph(ctxt);
4177
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4178
0
          ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4179
0
      return;
4180
0
  }
4181
0
  if ((ent == NULL) || !(ent->value > 0)) {
4182
0
      htmlCheckParagraph(ctxt);
4183
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4184
0
    ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4185
0
    ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4186
    /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4187
0
      }
4188
0
  } else {
4189
0
      unsigned int c;
4190
0
      int bits, i = 0;
4191
4192
0
      c = ent->value;
4193
0
      if      (c <    0x80)
4194
0
              { out[i++]= c;                bits= -6; }
4195
0
      else if (c <   0x800)
4196
0
              { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4197
0
      else if (c < 0x10000)
4198
0
              { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4199
0
      else
4200
0
              { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4201
4202
0
      for ( ; bits >= 0; bits-= 6) {
4203
0
    out[i++]= ((c >> bits) & 0x3F) | 0x80;
4204
0
      }
4205
0
      out[i] = 0;
4206
4207
0
      htmlCheckParagraph(ctxt);
4208
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4209
0
    ctxt->sax->characters(ctxt->userData, out, i);
4210
0
  }
4211
0
    }
4212
0
}
4213
4214
/**
4215
 * htmlParseContent:
4216
 * @ctxt:  an HTML parser context
4217
 *
4218
 * Parse a content: comment, sub-element, reference or text.
4219
 * Kept for compatibility with old code
4220
 */
4221
4222
static void
4223
0
htmlParseContent(htmlParserCtxtPtr ctxt) {
4224
0
    xmlChar *currentNode;
4225
0
    int depth;
4226
0
    const xmlChar *name;
4227
4228
0
    currentNode = xmlStrdup(ctxt->name);
4229
0
    depth = ctxt->nameNr;
4230
0
    while (1) {
4231
0
        GROW;
4232
4233
0
        if (ctxt->instate == XML_PARSER_EOF)
4234
0
            break;
4235
4236
  /*
4237
   * Our tag or one of it's parent or children is ending.
4238
   */
4239
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4240
0
      if (htmlParseEndTag(ctxt) &&
4241
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4242
0
    if (currentNode != NULL)
4243
0
        xmlFree(currentNode);
4244
0
    return;
4245
0
      }
4246
0
      continue; /* while */
4247
0
        }
4248
4249
0
  else if ((CUR == '<') &&
4250
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4251
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4252
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4253
0
      if (name == NULL) {
4254
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4255
0
       "htmlParseStartTag: invalid element name\n",
4256
0
       NULL, NULL);
4257
          /* Dump the bogus tag like browsers do */
4258
0
                while ((CUR != 0) && (CUR != '>'))
4259
0
              NEXT;
4260
4261
0
          if (currentNode != NULL)
4262
0
              xmlFree(currentNode);
4263
0
          return;
4264
0
      }
4265
4266
0
      if (ctxt->name != NULL) {
4267
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4268
0
              htmlAutoClose(ctxt, name);
4269
0
              continue;
4270
0
          }
4271
0
      }
4272
0
  }
4273
4274
  /*
4275
   * Has this node been popped out during parsing of
4276
   * the next element
4277
   */
4278
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4279
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4280
0
       {
4281
0
      if (currentNode != NULL) xmlFree(currentNode);
4282
0
      return;
4283
0
  }
4284
4285
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4286
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4287
      /*
4288
       * Handle SCRIPT/STYLE separately
4289
       */
4290
0
      htmlParseScript(ctxt);
4291
0
  }
4292
4293
0
        else if ((CUR == '<') && (NXT(1) == '!')) {
4294
            /*
4295
             * Sometimes DOCTYPE arrives in the middle of the document
4296
             */
4297
0
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4298
0
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4299
0
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4300
0
                (UPP(8) == 'E')) {
4301
0
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4302
0
                             "Misplaced DOCTYPE declaration\n",
4303
0
                             BAD_CAST "DOCTYPE" , NULL);
4304
0
                htmlParseDocTypeDecl(ctxt);
4305
0
            }
4306
            /*
4307
             * First case :  a comment
4308
             */
4309
0
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4310
0
                htmlParseComment(ctxt);
4311
0
            }
4312
0
            else {
4313
0
                htmlSkipBogusComment(ctxt);
4314
0
            }
4315
0
        }
4316
4317
        /*
4318
         * Second case : a Processing Instruction.
4319
         */
4320
0
        else if ((CUR == '<') && (NXT(1) == '?')) {
4321
0
            htmlParsePI(ctxt);
4322
0
        }
4323
4324
        /*
4325
         * Third case :  a sub-element.
4326
         */
4327
0
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4328
0
            htmlParseElement(ctxt);
4329
0
        }
4330
0
        else if (CUR == '<') {
4331
0
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4332
0
                (ctxt->sax->characters != NULL))
4333
0
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4334
0
            NEXT;
4335
0
        }
4336
4337
        /*
4338
         * Fourth case : a reference. If if has not been resolved,
4339
         *    parsing returns it's Name, create the node
4340
         */
4341
0
        else if (CUR == '&') {
4342
0
            htmlParseReference(ctxt);
4343
0
        }
4344
4345
        /*
4346
         * Fifth case : end of the resource
4347
         */
4348
0
        else if (CUR == 0) {
4349
0
            htmlAutoCloseOnEnd(ctxt);
4350
0
            break;
4351
0
        }
4352
4353
        /*
4354
         * Last case, text. Note that References are handled directly.
4355
         */
4356
0
        else {
4357
0
            htmlParseCharData(ctxt);
4358
0
        }
4359
4360
0
        SHRINK;
4361
0
        GROW;
4362
0
    }
4363
0
    if (currentNode != NULL) xmlFree(currentNode);
4364
0
}
4365
4366
/**
4367
 * htmlParseElement:
4368
 * @ctxt:  an HTML parser context
4369
 *
4370
 * DEPRECATED: Internal function, don't use.
4371
 *
4372
 * parse an HTML element, this is highly recursive
4373
 * this is kept for compatibility with previous code versions
4374
 *
4375
 * [39] element ::= EmptyElemTag | STag content ETag
4376
 *
4377
 * [41] Attribute ::= Name Eq AttValue
4378
 */
4379
4380
void
4381
0
htmlParseElement(htmlParserCtxtPtr ctxt) {
4382
0
    const xmlChar *name;
4383
0
    xmlChar *currentNode = NULL;
4384
0
    const htmlElemDesc * info;
4385
0
    htmlParserNodeInfo node_info;
4386
0
    int failed;
4387
0
    int depth;
4388
0
    const xmlChar *oldptr;
4389
4390
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4391
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4392
0
         "htmlParseElement: context error\n", NULL, NULL);
4393
0
  return;
4394
0
    }
4395
4396
0
    if (ctxt->instate == XML_PARSER_EOF)
4397
0
        return;
4398
4399
    /* Capture start position */
4400
0
    if (ctxt->record_info) {
4401
0
        node_info.begin_pos = ctxt->input->consumed +
4402
0
                          (CUR_PTR - ctxt->input->base);
4403
0
  node_info.begin_line = ctxt->input->line;
4404
0
    }
4405
4406
0
    failed = htmlParseStartTag(ctxt);
4407
0
    name = ctxt->name;
4408
0
    if ((failed == -1) || (name == NULL)) {
4409
0
  if (CUR == '>')
4410
0
      NEXT;
4411
0
        return;
4412
0
    }
4413
4414
    /*
4415
     * Lookup the info for that element.
4416
     */
4417
0
    info = htmlTagLookup(name);
4418
0
    if (info == NULL) {
4419
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4420
0
               "Tag %s invalid\n", name, NULL);
4421
0
    }
4422
4423
    /*
4424
     * Check for an Empty Element labeled the XML/SGML way
4425
     */
4426
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4427
0
        SKIP(2);
4428
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4429
0
      ctxt->sax->endElement(ctxt->userData, name);
4430
0
  htmlnamePop(ctxt);
4431
0
  return;
4432
0
    }
4433
4434
0
    if (CUR == '>') {
4435
0
        NEXT;
4436
0
    } else {
4437
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4438
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4439
4440
  /*
4441
   * end of parsing of this node.
4442
   */
4443
0
  if (xmlStrEqual(name, ctxt->name)) {
4444
0
      nodePop(ctxt);
4445
0
      htmlnamePop(ctxt);
4446
0
  }
4447
4448
  /*
4449
   * Capture end position and add node
4450
   */
4451
0
  if (ctxt->record_info) {
4452
0
     node_info.end_pos = ctxt->input->consumed +
4453
0
            (CUR_PTR - ctxt->input->base);
4454
0
     node_info.end_line = ctxt->input->line;
4455
0
     node_info.node = ctxt->node;
4456
0
     xmlParserAddNodeInfo(ctxt, &node_info);
4457
0
  }
4458
0
  return;
4459
0
    }
4460
4461
    /*
4462
     * Check for an Empty Element from DTD definition
4463
     */
4464
0
    if ((info != NULL) && (info->empty)) {
4465
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4466
0
      ctxt->sax->endElement(ctxt->userData, name);
4467
0
  htmlnamePop(ctxt);
4468
0
  return;
4469
0
    }
4470
4471
    /*
4472
     * Parse the content of the element:
4473
     */
4474
0
    currentNode = xmlStrdup(ctxt->name);
4475
0
    depth = ctxt->nameNr;
4476
0
    while (CUR != 0) {
4477
0
  oldptr = ctxt->input->cur;
4478
0
  htmlParseContent(ctxt);
4479
0
  if (oldptr==ctxt->input->cur) break;
4480
0
  if (ctxt->nameNr < depth) break;
4481
0
    }
4482
4483
    /*
4484
     * Capture end position and add node
4485
     */
4486
0
    if ( currentNode != NULL && ctxt->record_info ) {
4487
0
       node_info.end_pos = ctxt->input->consumed +
4488
0
                          (CUR_PTR - ctxt->input->base);
4489
0
       node_info.end_line = ctxt->input->line;
4490
0
       node_info.node = ctxt->node;
4491
0
       xmlParserAddNodeInfo(ctxt, &node_info);
4492
0
    }
4493
0
    if (CUR == 0) {
4494
0
  htmlAutoCloseOnEnd(ctxt);
4495
0
    }
4496
4497
0
    if (currentNode != NULL)
4498
0
  xmlFree(currentNode);
4499
0
}
4500
4501
static void
4502
0
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4503
    /*
4504
     * Capture end position and add node
4505
     */
4506
0
    if ( ctxt->node != NULL && ctxt->record_info ) {
4507
0
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4508
0
                                (CUR_PTR - ctxt->input->base);
4509
0
       ctxt->nodeInfo->end_line = ctxt->input->line;
4510
0
       ctxt->nodeInfo->node = ctxt->node;
4511
0
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4512
0
       htmlNodeInfoPop(ctxt);
4513
0
    }
4514
0
    if (CUR == 0) {
4515
0
       htmlAutoCloseOnEnd(ctxt);
4516
0
    }
4517
0
}
4518
4519
/**
4520
 * htmlParseElementInternal:
4521
 * @ctxt:  an HTML parser context
4522
 *
4523
 * parse an HTML element, new version, non recursive
4524
 *
4525
 * [39] element ::= EmptyElemTag | STag content ETag
4526
 *
4527
 * [41] Attribute ::= Name Eq AttValue
4528
 */
4529
4530
static void
4531
0
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4532
0
    const xmlChar *name;
4533
0
    const htmlElemDesc * info;
4534
0
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4535
0
    int failed;
4536
4537
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4538
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4539
0
         "htmlParseElementInternal: context error\n", NULL, NULL);
4540
0
  return;
4541
0
    }
4542
4543
0
    if (ctxt->instate == XML_PARSER_EOF)
4544
0
        return;
4545
4546
    /* Capture start position */
4547
0
    if (ctxt->record_info) {
4548
0
        node_info.begin_pos = ctxt->input->consumed +
4549
0
                          (CUR_PTR - ctxt->input->base);
4550
0
  node_info.begin_line = ctxt->input->line;
4551
0
    }
4552
4553
0
    failed = htmlParseStartTag(ctxt);
4554
0
    name = ctxt->name;
4555
0
    if ((failed == -1) || (name == NULL)) {
4556
0
  if (CUR == '>')
4557
0
      NEXT;
4558
0
        return;
4559
0
    }
4560
4561
    /*
4562
     * Lookup the info for that element.
4563
     */
4564
0
    info = htmlTagLookup(name);
4565
0
    if (info == NULL) {
4566
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4567
0
               "Tag %s invalid\n", name, NULL);
4568
0
    }
4569
4570
    /*
4571
     * Check for an Empty Element labeled the XML/SGML way
4572
     */
4573
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4574
0
        SKIP(2);
4575
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4576
0
      ctxt->sax->endElement(ctxt->userData, name);
4577
0
  htmlnamePop(ctxt);
4578
0
  return;
4579
0
    }
4580
4581
0
    if (CUR == '>') {
4582
0
        NEXT;
4583
0
    } else {
4584
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4585
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4586
4587
  /*
4588
   * end of parsing of this node.
4589
   */
4590
0
  if (xmlStrEqual(name, ctxt->name)) {
4591
0
      nodePop(ctxt);
4592
0
      htmlnamePop(ctxt);
4593
0
  }
4594
4595
0
        if (ctxt->record_info)
4596
0
            htmlNodeInfoPush(ctxt, &node_info);
4597
0
        htmlParserFinishElementParsing(ctxt);
4598
0
  return;
4599
0
    }
4600
4601
    /*
4602
     * Check for an Empty Element from DTD definition
4603
     */
4604
0
    if ((info != NULL) && (info->empty)) {
4605
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4606
0
      ctxt->sax->endElement(ctxt->userData, name);
4607
0
  htmlnamePop(ctxt);
4608
0
  return;
4609
0
    }
4610
4611
0
    if (ctxt->record_info)
4612
0
        htmlNodeInfoPush(ctxt, &node_info);
4613
0
}
4614
4615
/**
4616
 * htmlParseContentInternal:
4617
 * @ctxt:  an HTML parser context
4618
 *
4619
 * Parse a content: comment, sub-element, reference or text.
4620
 * New version for non recursive htmlParseElementInternal
4621
 */
4622
4623
static void
4624
0
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4625
0
    xmlChar *currentNode;
4626
0
    int depth;
4627
0
    const xmlChar *name;
4628
4629
0
    depth = ctxt->nameNr;
4630
0
    if (depth <= 0) {
4631
0
        currentNode = NULL;
4632
0
    } else {
4633
0
        currentNode = xmlStrdup(ctxt->name);
4634
0
        if (currentNode == NULL) {
4635
0
            htmlErrMemory(ctxt, NULL);
4636
0
            return;
4637
0
        }
4638
0
    }
4639
0
    while (1) {
4640
0
        GROW;
4641
4642
0
        if (ctxt->instate == XML_PARSER_EOF)
4643
0
            break;
4644
4645
  /*
4646
   * Our tag or one of it's parent or children is ending.
4647
   */
4648
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4649
0
      if (htmlParseEndTag(ctxt) &&
4650
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4651
0
    if (currentNode != NULL)
4652
0
        xmlFree(currentNode);
4653
4654
0
          depth = ctxt->nameNr;
4655
0
                if (depth <= 0) {
4656
0
                    currentNode = NULL;
4657
0
                } else {
4658
0
                    currentNode = xmlStrdup(ctxt->name);
4659
0
                    if (currentNode == NULL) {
4660
0
                        htmlErrMemory(ctxt, NULL);
4661
0
                        break;
4662
0
                    }
4663
0
                }
4664
0
      }
4665
0
      continue; /* while */
4666
0
        }
4667
4668
0
  else if ((CUR == '<') &&
4669
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4670
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4671
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4672
0
      if (name == NULL) {
4673
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4674
0
       "htmlParseStartTag: invalid element name\n",
4675
0
       NULL, NULL);
4676
          /* Dump the bogus tag like browsers do */
4677
0
          while ((CUR == 0) && (CUR != '>'))
4678
0
              NEXT;
4679
4680
0
          htmlParserFinishElementParsing(ctxt);
4681
0
          if (currentNode != NULL)
4682
0
              xmlFree(currentNode);
4683
4684
0
          currentNode = xmlStrdup(ctxt->name);
4685
0
                if (currentNode == NULL) {
4686
0
                    htmlErrMemory(ctxt, NULL);
4687
0
                    break;
4688
0
                }
4689
0
          depth = ctxt->nameNr;
4690
0
          continue;
4691
0
      }
4692
4693
0
      if (ctxt->name != NULL) {
4694
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4695
0
              htmlAutoClose(ctxt, name);
4696
0
              continue;
4697
0
          }
4698
0
      }
4699
0
  }
4700
4701
  /*
4702
   * Has this node been popped out during parsing of
4703
   * the next element
4704
   */
4705
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4706
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4707
0
       {
4708
0
      htmlParserFinishElementParsing(ctxt);
4709
0
      if (currentNode != NULL) xmlFree(currentNode);
4710
4711
0
      currentNode = xmlStrdup(ctxt->name);
4712
0
            if (currentNode == NULL) {
4713
0
                htmlErrMemory(ctxt, NULL);
4714
0
                break;
4715
0
            }
4716
0
      depth = ctxt->nameNr;
4717
0
      continue;
4718
0
  }
4719
4720
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4721
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4722
      /*
4723
       * Handle SCRIPT/STYLE separately
4724
       */
4725
0
      htmlParseScript(ctxt);
4726
0
  }
4727
4728
0
        else if ((CUR == '<') && (NXT(1) == '!')) {
4729
            /*
4730
             * Sometimes DOCTYPE arrives in the middle of the document
4731
             */
4732
0
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4733
0
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4734
0
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4735
0
                (UPP(8) == 'E')) {
4736
0
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4737
0
                             "Misplaced DOCTYPE declaration\n",
4738
0
                             BAD_CAST "DOCTYPE" , NULL);
4739
0
                htmlParseDocTypeDecl(ctxt);
4740
0
            }
4741
            /*
4742
             * First case :  a comment
4743
             */
4744
0
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4745
0
                htmlParseComment(ctxt);
4746
0
            }
4747
0
            else {
4748
0
                htmlSkipBogusComment(ctxt);
4749
0
            }
4750
0
        }
4751
4752
        /*
4753
         * Second case : a Processing Instruction.
4754
         */
4755
0
        else if ((CUR == '<') && (NXT(1) == '?')) {
4756
0
            htmlParsePI(ctxt);
4757
0
        }
4758
4759
        /*
4760
         * Third case :  a sub-element.
4761
         */
4762
0
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4763
0
            htmlParseElementInternal(ctxt);
4764
0
            if (currentNode != NULL) xmlFree(currentNode);
4765
4766
0
            currentNode = xmlStrdup(ctxt->name);
4767
0
            if (currentNode == NULL) {
4768
0
                htmlErrMemory(ctxt, NULL);
4769
0
                break;
4770
0
            }
4771
0
            depth = ctxt->nameNr;
4772
0
        }
4773
0
        else if (CUR == '<') {
4774
0
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4775
0
                (ctxt->sax->characters != NULL))
4776
0
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4777
0
            NEXT;
4778
0
        }
4779
4780
        /*
4781
         * Fourth case : a reference. If if has not been resolved,
4782
         *    parsing returns it's Name, create the node
4783
         */
4784
0
        else if (CUR == '&') {
4785
0
            htmlParseReference(ctxt);
4786
0
        }
4787
4788
        /*
4789
         * Fifth case : end of the resource
4790
         */
4791
0
        else if (CUR == 0) {
4792
0
            htmlAutoCloseOnEnd(ctxt);
4793
0
            break;
4794
0
        }
4795
4796
        /*
4797
         * Last case, text. Note that References are handled directly.
4798
         */
4799
0
        else {
4800
0
            htmlParseCharData(ctxt);
4801
0
        }
4802
4803
0
        SHRINK;
4804
0
        GROW;
4805
0
    }
4806
0
    if (currentNode != NULL) xmlFree(currentNode);
4807
0
}
4808
4809
/**
4810
 * htmlParseContent:
4811
 * @ctxt:  an HTML parser context
4812
 *
4813
 * Parse a content: comment, sub-element, reference or text.
4814
 * This is the entry point when called from parser.c
4815
 */
4816
4817
void
4818
0
__htmlParseContent(void *ctxt) {
4819
0
    if (ctxt != NULL)
4820
0
  htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4821
0
}
4822
4823
/**
4824
 * htmlParseDocument:
4825
 * @ctxt:  an HTML parser context
4826
 *
4827
 * parse an HTML document (and build a tree if using the standard SAX
4828
 * interface).
4829
 *
4830
 * Returns 0, -1 in case of error. the parser context is augmented
4831
 *                as a result of the parsing.
4832
 */
4833
4834
int
4835
0
htmlParseDocument(htmlParserCtxtPtr ctxt) {
4836
0
    xmlDtdPtr dtd;
4837
4838
0
    xmlInitParser();
4839
4840
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4841
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4842
0
         "htmlParseDocument: context error\n", NULL, NULL);
4843
0
  return(XML_ERR_INTERNAL_ERROR);
4844
0
    }
4845
4846
    /*
4847
     * SAX: beginning of the document processing.
4848
     */
4849
0
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4850
0
        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4851
4852
0
    xmlDetectEncoding(ctxt);
4853
4854
    /*
4855
     * This is wrong but matches long-standing behavior. In most cases,
4856
     * a document starting with an XML declaration will specify UTF-8.
4857
     */
4858
0
    if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
4859
0
        (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0))
4860
0
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
4861
4862
    /*
4863
     * Wipe out everything which is before the first '<'
4864
     */
4865
0
    SKIP_BLANKS;
4866
0
    if (CUR == 0) {
4867
0
  htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4868
0
               "Document is empty\n", NULL, NULL);
4869
0
    }
4870
4871
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4872
0
  ctxt->sax->startDocument(ctxt->userData);
4873
4874
4875
    /*
4876
     * Parse possible comments and PIs before any content
4877
     */
4878
0
    while (((CUR == '<') && (NXT(1) == '!') &&
4879
0
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4880
0
     ((CUR == '<') && (NXT(1) == '?'))) {
4881
0
        htmlParseComment(ctxt);
4882
0
        htmlParsePI(ctxt);
4883
0
  SKIP_BLANKS;
4884
0
    }
4885
4886
4887
    /*
4888
     * Then possibly doc type declaration(s) and more Misc
4889
     * (doctypedecl Misc*)?
4890
     */
4891
0
    if ((CUR == '<') && (NXT(1) == '!') &&
4892
0
  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4893
0
  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4894
0
  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4895
0
  (UPP(8) == 'E')) {
4896
0
  htmlParseDocTypeDecl(ctxt);
4897
0
    }
4898
0
    SKIP_BLANKS;
4899
4900
    /*
4901
     * Parse possible comments and PIs before any content
4902
     */
4903
0
    while (((CUR == '<') && (NXT(1) == '!') &&
4904
0
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4905
0
     ((CUR == '<') && (NXT(1) == '?'))) {
4906
0
        htmlParseComment(ctxt);
4907
0
        htmlParsePI(ctxt);
4908
0
  SKIP_BLANKS;
4909
0
    }
4910
4911
    /*
4912
     * Time to start parsing the tree itself
4913
     */
4914
0
    htmlParseContentInternal(ctxt);
4915
4916
    /*
4917
     * autoclose
4918
     */
4919
0
    if (CUR == 0)
4920
0
  htmlAutoCloseOnEnd(ctxt);
4921
4922
4923
    /*
4924
     * SAX: end of the document processing.
4925
     */
4926
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4927
0
        ctxt->sax->endDocument(ctxt->userData);
4928
4929
0
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4930
0
  dtd = xmlGetIntSubset(ctxt->myDoc);
4931
0
  if (dtd == NULL)
4932
0
      ctxt->myDoc->intSubset =
4933
0
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4934
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4935
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4936
0
    }
4937
0
    if (! ctxt->wellFormed) return(-1);
4938
0
    return(0);
4939
0
}
4940
4941
4942
/************************************************************************
4943
 *                  *
4944
 *      Parser contexts handling      *
4945
 *                  *
4946
 ************************************************************************/
4947
4948
/**
4949
 * htmlInitParserCtxt:
4950
 * @ctxt:  an HTML parser context
4951
 * @sax:  SAX handler
4952
 * @userData:  user data
4953
 *
4954
 * Initialize a parser context
4955
 *
4956
 * Returns 0 in case of success and -1 in case of error
4957
 */
4958
4959
static int
4960
htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
4961
                   void *userData)
4962
0
{
4963
0
    if (ctxt == NULL) return(-1);
4964
0
    memset(ctxt, 0, sizeof(htmlParserCtxt));
4965
4966
0
    ctxt->dict = xmlDictCreate();
4967
0
    if (ctxt->dict == NULL) {
4968
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4969
0
  return(-1);
4970
0
    }
4971
4972
0
    if (ctxt->sax == NULL)
4973
0
        ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4974
0
    if (ctxt->sax == NULL) {
4975
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4976
0
  return(-1);
4977
0
    }
4978
0
    if (sax == NULL) {
4979
0
        memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
4980
0
        xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
4981
0
        ctxt->userData = ctxt;
4982
0
    } else {
4983
0
        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
4984
0
        ctxt->userData = userData ? userData : ctxt;
4985
0
    }
4986
4987
    /* Allocate the Input stack */
4988
0
    ctxt->inputTab = (htmlParserInputPtr *)
4989
0
                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4990
0
    if (ctxt->inputTab == NULL) {
4991
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4992
0
  ctxt->inputNr = 0;
4993
0
  ctxt->inputMax = 0;
4994
0
  ctxt->input = NULL;
4995
0
  return(-1);
4996
0
    }
4997
0
    ctxt->inputNr = 0;
4998
0
    ctxt->inputMax = 5;
4999
0
    ctxt->input = NULL;
5000
0
    ctxt->version = NULL;
5001
0
    ctxt->encoding = NULL;
5002
0
    ctxt->standalone = -1;
5003
0
    ctxt->instate = XML_PARSER_START;
5004
5005
    /* Allocate the Node stack */
5006
0
    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5007
0
    if (ctxt->nodeTab == NULL) {
5008
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5009
0
  ctxt->nodeNr = 0;
5010
0
  ctxt->nodeMax = 0;
5011
0
  ctxt->node = NULL;
5012
0
  ctxt->inputNr = 0;
5013
0
  ctxt->inputMax = 0;
5014
0
  ctxt->input = NULL;
5015
0
  return(-1);
5016
0
    }
5017
0
    ctxt->nodeNr = 0;
5018
0
    ctxt->nodeMax = 10;
5019
0
    ctxt->node = NULL;
5020
5021
    /* Allocate the Name stack */
5022
0
    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5023
0
    if (ctxt->nameTab == NULL) {
5024
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5025
0
  ctxt->nameNr = 0;
5026
0
  ctxt->nameMax = 0;
5027
0
  ctxt->name = NULL;
5028
0
  ctxt->nodeNr = 0;
5029
0
  ctxt->nodeMax = 0;
5030
0
  ctxt->node = NULL;
5031
0
  ctxt->inputNr = 0;
5032
0
  ctxt->inputMax = 0;
5033
0
  ctxt->input = NULL;
5034
0
  return(-1);
5035
0
    }
5036
0
    ctxt->nameNr = 0;
5037
0
    ctxt->nameMax = 10;
5038
0
    ctxt->name = NULL;
5039
5040
0
    ctxt->nodeInfoTab = NULL;
5041
0
    ctxt->nodeInfoNr  = 0;
5042
0
    ctxt->nodeInfoMax = 0;
5043
5044
0
    ctxt->myDoc = NULL;
5045
0
    ctxt->wellFormed = 1;
5046
0
    ctxt->replaceEntities = 0;
5047
0
    ctxt->linenumbers = xmlLineNumbersDefaultValue;
5048
0
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5049
0
    ctxt->html = 1;
5050
0
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5051
0
    ctxt->vctxt.userData = ctxt;
5052
0
    ctxt->vctxt.error = xmlParserValidityError;
5053
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
5054
0
    ctxt->record_info = 0;
5055
0
    ctxt->validate = 0;
5056
0
    ctxt->checkIndex = 0;
5057
0
    ctxt->catalogs = NULL;
5058
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
5059
0
    return(0);
5060
0
}
5061
5062
/**
5063
 * htmlFreeParserCtxt:
5064
 * @ctxt:  an HTML parser context
5065
 *
5066
 * Free all the memory used by a parser context. However the parsed
5067
 * document in ctxt->myDoc is not freed.
5068
 */
5069
5070
void
5071
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5072
0
{
5073
0
    xmlFreeParserCtxt(ctxt);
5074
0
}
5075
5076
/**
5077
 * htmlNewParserCtxt:
5078
 *
5079
 * Allocate and initialize a new parser context.
5080
 *
5081
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5082
 */
5083
5084
htmlParserCtxtPtr
5085
htmlNewParserCtxt(void)
5086
0
{
5087
0
    return(htmlNewSAXParserCtxt(NULL, NULL));
5088
0
}
5089
5090
/**
5091
 * htmlNewSAXParserCtxt:
5092
 * @sax:  SAX handler
5093
 * @userData:  user data
5094
 *
5095
 * Allocate and initialize a new SAX parser context. If userData is NULL,
5096
 * the parser context will be passed as user data.
5097
 *
5098
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5099
 */
5100
5101
htmlParserCtxtPtr
5102
htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5103
0
{
5104
0
    xmlParserCtxtPtr ctxt;
5105
5106
0
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5107
0
    if (ctxt == NULL) {
5108
0
        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5109
0
  return(NULL);
5110
0
    }
5111
0
    memset(ctxt, 0, sizeof(xmlParserCtxt));
5112
0
    if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5113
0
        htmlFreeParserCtxt(ctxt);
5114
0
  return(NULL);
5115
0
    }
5116
0
    return(ctxt);
5117
0
}
5118
5119
/**
5120
 * htmlCreateMemoryParserCtxt:
5121
 * @buffer:  a pointer to a char array
5122
 * @size:  the size of the array
5123
 *
5124
 * Create a parser context for an HTML in-memory document.
5125
 *
5126
 * Returns the new parser context or NULL
5127
 */
5128
htmlParserCtxtPtr
5129
0
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5130
0
    xmlParserCtxtPtr ctxt;
5131
0
    xmlParserInputPtr input;
5132
0
    xmlParserInputBufferPtr buf;
5133
5134
0
    if (buffer == NULL)
5135
0
  return(NULL);
5136
0
    if (size <= 0)
5137
0
  return(NULL);
5138
5139
0
    ctxt = htmlNewParserCtxt();
5140
0
    if (ctxt == NULL)
5141
0
  return(NULL);
5142
5143
0
    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5144
0
    if (buf == NULL) {
5145
0
  xmlFreeParserCtxt(ctxt);
5146
0
        return(NULL);
5147
0
    }
5148
5149
0
    input = xmlNewInputStream(ctxt);
5150
0
    if (input == NULL) {
5151
0
  xmlFreeParserInputBuffer(buf);
5152
0
  xmlFreeParserCtxt(ctxt);
5153
0
  return(NULL);
5154
0
    }
5155
5156
0
    input->filename = NULL;
5157
0
    input->buf = buf;
5158
0
    xmlBufResetInput(buf->buffer, input);
5159
5160
0
    inputPush(ctxt, input);
5161
0
    return(ctxt);
5162
0
}
5163
5164
/**
5165
 * htmlCreateDocParserCtxt:
5166
 * @str:  a pointer to an array of xmlChar
5167
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5168
 *
5169
 * Create a parser context for an HTML document.
5170
 *
5171
 * TODO: check the need to add encoding handling there
5172
 *
5173
 * Returns the new parser context or NULL
5174
 */
5175
static htmlParserCtxtPtr
5176
0
htmlCreateDocParserCtxt(const xmlChar *str, const char *encoding) {
5177
0
    xmlParserCtxtPtr ctxt;
5178
0
    xmlParserInputPtr input;
5179
0
    xmlParserInputBufferPtr buf;
5180
5181
0
    if (str == NULL)
5182
0
  return(NULL);
5183
5184
0
    ctxt = htmlNewParserCtxt();
5185
0
    if (ctxt == NULL)
5186
0
  return(NULL);
5187
5188
0
    buf = xmlParserInputBufferCreateString(str);
5189
0
    if (buf == NULL) {
5190
0
  xmlFreeParserCtxt(ctxt);
5191
0
        return(NULL);
5192
0
    }
5193
5194
0
    input = xmlNewInputStream(ctxt);
5195
0
    if (input == NULL) {
5196
0
  xmlFreeParserInputBuffer(buf);
5197
0
  xmlFreeParserCtxt(ctxt);
5198
0
  return(NULL);
5199
0
    }
5200
5201
0
    input->filename = NULL;
5202
0
    input->buf = buf;
5203
0
    xmlBufResetInput(buf->buffer, input);
5204
5205
0
    inputPush(ctxt, input);
5206
5207
0
    if (encoding != NULL) {
5208
0
  xmlCharEncoding enc;
5209
0
  xmlCharEncodingHandlerPtr handler;
5210
5211
0
  enc = xmlParseCharEncoding(encoding);
5212
  /*
5213
   * registered set of known encodings
5214
   */
5215
0
  if (enc != XML_CHAR_ENCODING_ERROR) {
5216
0
      xmlSwitchEncoding(ctxt, enc);
5217
0
      if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5218
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5219
0
                 "Unsupported encoding %s\n",
5220
0
           (const xmlChar *) encoding, NULL);
5221
0
      }
5222
0
  } else {
5223
      /*
5224
       * fallback for unknown encodings
5225
       */
5226
0
      handler = xmlFindCharEncodingHandler((const char *) encoding);
5227
0
      if (handler != NULL) {
5228
0
    xmlSwitchToEncoding(ctxt, handler);
5229
0
      } else {
5230
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5231
0
                 "Unsupported encoding %s\n",
5232
0
           (const xmlChar *) encoding, NULL);
5233
0
      }
5234
0
  }
5235
0
    }
5236
5237
0
    return(ctxt);
5238
0
}
5239
5240
#ifdef LIBXML_PUSH_ENABLED
5241
/************************************************************************
5242
 *                  *
5243
 *  Progressive parsing interfaces        *
5244
 *                  *
5245
 ************************************************************************/
5246
5247
/**
5248
 * htmlParseLookupSequence:
5249
 * @ctxt:  an HTML parser context
5250
 * @first:  the first char to lookup
5251
 * @next:  the next char to lookup or zero
5252
 * @third:  the next char to lookup or zero
5253
 * @ignoreattrval: skip over attribute values
5254
 *
5255
 * Try to find if a sequence (first, next, third) or  just (first next) or
5256
 * (first) is available in the input stream.
5257
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5258
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5259
 * parser, do not use liberally.
5260
 * This is basically similar to xmlParseLookupSequence()
5261
 *
5262
 * Returns the index to the current parsing point if the full sequence
5263
 *      is available, -1 otherwise.
5264
 */
5265
static int
5266
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5267
                        xmlChar next, xmlChar third, int ignoreattrval)
5268
0
{
5269
0
    size_t base, len;
5270
0
    htmlParserInputPtr in;
5271
0
    const xmlChar *buf;
5272
0
    int quote;
5273
5274
0
    in = ctxt->input;
5275
0
    if (in == NULL)
5276
0
        return (-1);
5277
5278
0
    base = ctxt->checkIndex;
5279
0
    quote = ctxt->endCheckState;
5280
5281
0
    buf = in->cur;
5282
0
    len = in->end - in->cur;
5283
5284
    /* take into account the sequence length */
5285
0
    if (third)
5286
0
        len -= 2;
5287
0
    else if (next)
5288
0
        len--;
5289
0
    for (; base < len; base++) {
5290
0
        if (base >= INT_MAX / 2) {
5291
0
            ctxt->checkIndex = 0;
5292
0
            ctxt->endCheckState = 0;
5293
0
            return (base - 2);
5294
0
        }
5295
0
        if (ignoreattrval) {
5296
0
            if (quote) {
5297
0
                if (buf[base] == quote)
5298
0
                    quote = 0;
5299
0
                continue;
5300
0
            }
5301
0
            if (buf[base] == '"' || buf[base] == '\'') {
5302
0
                quote = buf[base];
5303
0
                continue;
5304
0
            }
5305
0
        }
5306
0
        if (buf[base] == first) {
5307
0
            if (third != 0) {
5308
0
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
5309
0
                    continue;
5310
0
            } else if (next != 0) {
5311
0
                if (buf[base + 1] != next)
5312
0
                    continue;
5313
0
            }
5314
0
            ctxt->checkIndex = 0;
5315
0
            ctxt->endCheckState = 0;
5316
0
            return (base);
5317
0
        }
5318
0
    }
5319
0
    ctxt->checkIndex = base;
5320
0
    ctxt->endCheckState = quote;
5321
0
    return (-1);
5322
0
}
5323
5324
/**
5325
 * htmlParseLookupCommentEnd:
5326
 * @ctxt: an HTML parser context
5327
 *
5328
 * Try to find a comment end tag in the input stream
5329
 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5330
 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5331
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5332
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5333
 * parser, do not use liberally.
5334
 * This wraps to htmlParseLookupSequence()
5335
 *
5336
 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5337
 */
5338
static int
5339
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5340
0
{
5341
0
    int mark = 0;
5342
0
    int offset;
5343
5344
0
    while (1) {
5345
0
  mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5346
0
  if (mark < 0)
5347
0
            break;
5348
0
        if ((NXT(mark+2) == '>') ||
5349
0
      ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5350
0
            ctxt->checkIndex = 0;
5351
0
      break;
5352
0
  }
5353
0
        offset = (NXT(mark+2) == '!') ? 3 : 2;
5354
0
        if (mark + offset >= ctxt->input->end - ctxt->input->cur) {
5355
0
      ctxt->checkIndex = mark;
5356
0
            return(-1);
5357
0
        }
5358
0
  ctxt->checkIndex = mark + 1;
5359
0
    }
5360
0
    return mark;
5361
0
}
5362
5363
5364
/**
5365
 * htmlParseTryOrFinish:
5366
 * @ctxt:  an HTML parser context
5367
 * @terminate:  last chunk indicator
5368
 *
5369
 * Try to progress on parsing
5370
 *
5371
 * Returns zero in all cases: the result is never updated, so it does not report whether parsing progressed.
5372
 */
5373
static int
5374
0
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
    /*
     * State machine of the push parser: loops on ctxt->instate, parsing
     * one construct per iteration from ctxt->input, and jumps to the
     * "done" label when the buffered data is not sufficient to go on or
     * when the document has ended.
     */
5375
0
    int ret = 0;
    /* ret is never reassigned below: the function always returns 0. */
5376
0
    htmlParserInputPtr in;
5377
0
    ptrdiff_t avail = 0;
5378
0
    xmlChar cur, next;
5379
5380
0
    htmlParserNodeInfo node_info;
5381
5382
0
    while (1) {
5383
5384
0
  in = ctxt->input;
5385
0
  if (in == NULL) break;
5386
0
  avail = in->end - in->cur;
  /*
   * Input exhausted on the last chunk: close the elements still open
   * and, if none remains, signal the end of the document.
   */
5387
0
  if ((avail == 0) && (terminate)) {
5388
0
      htmlAutoCloseOnEnd(ctxt);
5389
0
      if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5390
    /*
5391
     * SAX: end of the document processing.
5392
     */
5393
0
    ctxt->instate = XML_PARSER_EOF;
5394
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5395
0
        ctxt->sax->endDocument(ctxt->userData);
5396
0
      }
5397
0
  }
5398
0
        if (avail < 1)
5399
0
      goto done;
5400
        /*
5401
         * This is done to make progress and avoid an infinite loop
5402
         * if a parsing attempt was aborted by hitting a NUL byte. After
5403
         * changing htmlCurrentChar, this probably isn't necessary anymore.
5404
         * We should consider removing this check.
5405
         */
5406
0
  cur = in->cur[0];
5407
0
  if (cur == 0) {
5408
0
      SKIP(1);
5409
0
      continue;
5410
0
  }
5411
5412
0
        switch (ctxt->instate) {
5413
0
            case XML_PARSER_EOF:
5414
          /*
5415
     * Document parsing is done !
5416
     */
5417
0
          goto done;
5418
0
            case XML_PARSER_START:
5419
                /*
5420
                 * This is wrong but matches long-standing behavior. In most
5421
                 * cases, a document starting with an XML declaration will
5422
                 * specify UTF-8.
5423
                 */
5424
0
                if (((ctxt->input->flags & XML_INPUT_HAS_ENCODING) == 0) &&
5425
0
                    (xmlStrncmp(ctxt->input->cur, BAD_CAST "<?xm", 4) == 0)) {
5426
0
                    xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_UTF8);
5427
0
                }
5428
5429
          /*
5430
     * Very first chars read from the document flow.
5431
     */
5432
0
    cur = in->cur[0];
5433
0
    if (IS_BLANK_CH(cur)) {
5434
0
        SKIP_BLANKS;
5435
0
                    avail = in->end - in->cur;
5436
0
    }
5437
0
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5438
0
        ctxt->sax->setDocumentLocator(ctxt->userData,
5439
0
              &xmlDefaultSAXLocator);
5440
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5441
0
              (!ctxt->disableSAX))
5442
0
        ctxt->sax->startDocument(ctxt->userData);
5443
0
                if (ctxt->instate == XML_PARSER_EOF)
5444
0
                    goto done;
5445
5446
0
    cur = in->cur[0];
5447
0
    next = in->cur[1];
    /*
     * A leading "<!DOCTYPE" (case-insensitive via UPP) is parsed here;
     * unless this is the last chunk, wait until its closing '>' (outside
     * of quoted values) is buffered.
     */
5448
0
    if ((cur == '<') && (next == '!') &&
5449
0
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5450
0
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5451
0
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5452
0
        (UPP(8) == 'E')) {
5453
0
        if ((!terminate) &&
5454
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5455
0
      goto done;
5456
0
        htmlParseDocTypeDecl(ctxt);
5457
0
                    if (ctxt->instate == XML_PARSER_EOF)
5458
0
                        goto done;
5459
0
        ctxt->instate = XML_PARSER_PROLOG;
5460
0
                } else {
5461
0
        ctxt->instate = XML_PARSER_MISC;
5462
0
    }
5463
0
    break;
    /* MISC: comments and PIs before a DOCTYPE; a DOCTYPE moves to PROLOG. */
5464
0
            case XML_PARSER_MISC:
5465
0
    SKIP_BLANKS;
5466
0
                avail = in->end - in->cur;
5467
    /*
5468
     * no chars in buffer
5469
     */
5470
0
    if (avail < 1)
5471
0
        goto done;
5472
    /*
5473
     * not enough chars in buffer
5474
     */
5475
0
    if (avail < 2) {
5476
0
        if (!terminate)
5477
0
      goto done;
5478
0
        else
5479
0
      next = ' ';
5480
0
    } else {
5481
0
        next = in->cur[1];
5482
0
    }
5483
0
    cur = in->cur[0];
5484
0
          if ((cur == '<') && (next == '!') &&
5485
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5486
0
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5487
0
      goto done;
5488
0
        htmlParseComment(ctxt);
5489
0
                    if (ctxt->instate == XML_PARSER_EOF)
5490
0
                        goto done;
5491
0
        ctxt->instate = XML_PARSER_MISC;
5492
0
          } else if ((cur == '<') && (next == '?')) {
5493
0
        if ((!terminate) &&
5494
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5495
0
      goto done;
5496
0
        htmlParsePI(ctxt);
5497
0
                    if (ctxt->instate == XML_PARSER_EOF)
5498
0
                        goto done;
5499
0
        ctxt->instate = XML_PARSER_MISC;
5500
0
    } else if ((cur == '<') && (next == '!') &&
5501
0
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5502
0
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5503
0
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5504
0
        (UPP(8) == 'E')) {
5505
0
        if ((!terminate) &&
5506
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5507
0
      goto done;
5508
0
        htmlParseDocTypeDecl(ctxt);
5509
0
                    if (ctxt->instate == XML_PARSER_EOF)
5510
0
                        goto done;
5511
0
        ctxt->instate = XML_PARSER_PROLOG;
    /*
     * "<!" that matched none of the tests above with fewer than 9 bytes
     * buffered (the length of "<!DOCTYPE"): wait for more input.
     */
5512
0
    } else if ((cur == '<') && (next == '!') &&
5513
0
               (avail < 9)) {
5514
0
        goto done;
5515
0
    } else {
5516
0
        ctxt->instate = XML_PARSER_CONTENT;
5517
0
    }
5518
0
    break;
    /* PROLOG: comments and PIs after the DOCTYPE, then on to CONTENT. */
5519
0
            case XML_PARSER_PROLOG:
5520
0
    SKIP_BLANKS;
5521
0
                avail = in->end - in->cur;
5522
0
    if (avail < 2)
5523
0
        goto done;
5524
0
    cur = in->cur[0];
5525
0
    next = in->cur[1];
5526
0
    if ((cur == '<') && (next == '!') &&
5527
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5528
0
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5529
0
      goto done;
5530
0
        htmlParseComment(ctxt);
5531
0
                    if (ctxt->instate == XML_PARSER_EOF)
5532
0
                        goto done;
5533
0
        ctxt->instate = XML_PARSER_PROLOG;
5534
0
          } else if ((cur == '<') && (next == '?')) {
5535
0
        if ((!terminate) &&
5536
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5537
0
      goto done;
5538
0
        htmlParsePI(ctxt);
5539
0
                    if (ctxt->instate == XML_PARSER_EOF)
5540
0
                        goto done;
5541
0
        ctxt->instate = XML_PARSER_PROLOG;
    /*
     * "<!" with fewer than 4 bytes buffered (the length of "<!--"):
     * wait for more input.
     */
5542
0
    } else if ((cur == '<') && (next == '!') &&
5543
0
               (avail < 4)) {
5544
0
        goto done;
5545
0
    } else {
5546
0
        ctxt->instate = XML_PARSER_CONTENT;
5547
0
    }
5548
0
    break;
    /* EPILOG: entered from END_TAG once no element remains open. */
5549
0
            case XML_PARSER_EPILOG:
5550
0
                avail = in->end - in->cur;
5551
0
    if (avail < 1)
5552
0
        goto done;
5553
0
    cur = in->cur[0];
5554
0
    if (IS_BLANK_CH(cur)) {
5555
0
        htmlParseCharData(ctxt);
5556
0
        goto done;
5557
0
    }
5558
0
    if (avail < 2)
5559
0
        goto done;
5560
0
    next = in->cur[1];
5561
0
          if ((cur == '<') && (next == '!') &&
5562
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5563
0
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5564
0
      goto done;
5565
0
        htmlParseComment(ctxt);
5566
0
                    if (ctxt->instate == XML_PARSER_EOF)
5567
0
                        goto done;
5568
0
        ctxt->instate = XML_PARSER_EPILOG;
5569
0
          } else if ((cur == '<') && (next == '?')) {
5570
0
        if ((!terminate) &&
5571
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5572
0
      goto done;
5573
0
        htmlParsePI(ctxt);
5574
0
                    if (ctxt->instate == XML_PARSER_EOF)
5575
0
                        goto done;
5576
0
        ctxt->instate = XML_PARSER_EPILOG;
5577
0
    } else if ((cur == '<') && (next == '!') &&
5578
0
               (avail < 4)) {
5579
0
        goto done;
5580
0
    } else {
        /*
         * Anything else in the epilog: record XML_ERR_DOCUMENT_END,
         * mark the document as not well-formed and end the parse.
         */
5581
0
        ctxt->errNo = XML_ERR_DOCUMENT_END;
5582
0
        ctxt->wellFormed = 0;
5583
0
        ctxt->instate = XML_PARSER_EOF;
5584
0
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5585
0
      ctxt->sax->endDocument(ctxt->userData);
5586
0
        goto done;
5587
0
    }
5588
0
    break;
5589
0
            case XML_PARSER_START_TAG: {
5590
0
          const xmlChar *name;
5591
0
    int failed;
5592
0
    const htmlElemDesc * info;
5593
5594
    /*
5595
     * no chars in buffer
5596
     */
5597
0
    if (avail < 1)
5598
0
        goto done;
5599
    /*
5600
     * not enough chars in buffer
5601
     */
5602
0
    if (avail < 2) {
5603
0
        if (!terminate)
5604
0
      goto done;
5605
0
        else
5606
0
      next = ' ';
5607
0
    } else {
5608
0
        next = in->cur[1];
5609
0
    }
5610
0
    cur = in->cur[0];
5611
0
          if (cur != '<') {
5612
0
        ctxt->instate = XML_PARSER_CONTENT;
5613
0
        break;
5614
0
    }
5615
0
    if (next == '/') {
5616
0
        ctxt->instate = XML_PARSER_END_TAG;
5617
0
        ctxt->checkIndex = 0;
5618
0
        break;
5619
0
    }
    /*
     * Unless this is the last chunk, wait until the '>' closing the
     * tag is buffered; quoted attribute values are skipped by the lookup.
     */
5620
0
    if ((!terminate) &&
5621
0
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5622
0
        goto done;
5623
5624
                /* Capture start position */
5625
0
          if (ctxt->record_info) {
5626
0
               node_info.begin_pos = ctxt->input->consumed +
5627
0
                                  (CUR_PTR - ctxt->input->base);
5628
0
               node_info.begin_line = ctxt->input->line;
5629
0
          }
5630
5631
5632
0
    failed = htmlParseStartTag(ctxt);
5633
0
    name = ctxt->name;
5634
0
    if ((failed == -1) ||
5635
0
        (name == NULL)) {
5636
0
        if (CUR == '>')
5637
0
      NEXT;
5638
0
        break;
5639
0
    }
5640
5641
    /*
5642
     * Lookup the info for that element.
5643
     */
5644
0
    info = htmlTagLookup(name);
5645
0
    if (info == NULL) {
5646
0
        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5647
0
                     "Tag %s invalid\n", name, NULL);
5648
0
    }
5649
5650
    /*
5651
     * Check for an Empty Element labeled the XML/SGML way
5652
     */
5653
0
    if ((CUR == '/') && (NXT(1) == '>')) {
5654
0
        SKIP(2);
5655
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5656
0
      ctxt->sax->endElement(ctxt->userData, name);
5657
0
        htmlnamePop(ctxt);
5658
0
                    if (ctxt->instate == XML_PARSER_EOF)
5659
0
                        goto done;
5660
0
        ctxt->instate = XML_PARSER_CONTENT;
5661
0
        break;
5662
0
    }
5663
5664
0
    if (CUR == '>') {
5665
0
        NEXT;
5666
0
    } else {
5667
0
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5668
0
                     "Couldn't find end of Start Tag %s\n",
5669
0
         name, NULL);
5670
5671
        /*
5672
         * end of parsing of this node.
5673
         */
5674
0
        if (xmlStrEqual(name, ctxt->name)) {
5675
0
      nodePop(ctxt);
5676
0
      htmlnamePop(ctxt);
5677
0
        }
5678
5679
0
        if (ctxt->record_info)
5680
0
            htmlNodeInfoPush(ctxt, &node_info);
5681
5682
0
                    if (ctxt->instate == XML_PARSER_EOF)
5683
0
                        goto done;
5684
0
        ctxt->instate = XML_PARSER_CONTENT;
5685
0
        break;
5686
0
    }
5687
5688
    /*
5689
     * Check for an Empty Element from DTD definition
5690
     */
5691
0
    if ((info != NULL) && (info->empty)) {
5692
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5693
0
      ctxt->sax->endElement(ctxt->userData, name);
5694
0
        htmlnamePop(ctxt);
5695
0
    }
5696
5697
0
                if (ctxt->record_info)
5698
0
              htmlNodeInfoPush(ctxt, &node_info);
5699
5700
0
                if (ctxt->instate == XML_PARSER_EOF)
5701
0
                    goto done;
5702
0
    ctxt->instate = XML_PARSER_CONTENT;
5703
0
                break;
5704
0
      }
5705
0
            case XML_PARSER_CONTENT: {
5706
0
    xmlChar chr[2] = { 0, 0 };
5707
5708
                /*
5709
     * Handle preparsed entities and charRef
5710
     */
5711
0
    if (ctxt->token != 0) {
5712
0
        chr[0] = ctxt->token;
5713
0
        htmlCheckParagraph(ctxt);
5714
0
        if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5715
0
      ctxt->sax->characters(ctxt->userData, chr, 1);
5716
0
        ctxt->token = 0;
5717
0
        ctxt->checkIndex = 0;
5718
0
    }
    /*
     * Exactly one byte left on the last chunk: if it starts neither
     * markup nor a reference, report it directly as character data (or
     * as ignorable whitespace when blanks are not kept) and consume it.
     */
5719
0
    if ((avail == 1) && (terminate)) {
5720
0
        cur = in->cur[0];
5721
0
        if ((cur != '<') && (cur != '&')) {
5722
0
      if (ctxt->sax != NULL) {
5723
0
                            chr[0] = cur;
5724
0
          if (IS_BLANK_CH(cur)) {
5725
0
        if (ctxt->keepBlanks) {
5726
0
            if (ctxt->sax->characters != NULL)
5727
0
          ctxt->sax->characters(
5728
0
            ctxt->userData, chr, 1);
5729
0
        } else {
5730
0
            if (ctxt->sax->ignorableWhitespace != NULL)
5731
0
          ctxt->sax->ignorableWhitespace(
5732
0
            ctxt->userData, chr, 1);
5733
0
        }
5734
0
          } else {
5735
0
        htmlCheckParagraph(ctxt);
5736
0
        if (ctxt->sax->characters != NULL)
5737
0
            ctxt->sax->characters(
5738
0
              ctxt->userData, chr, 1);
5739
0
          }
5740
0
      }
5741
0
      ctxt->token = 0;
5742
0
      ctxt->checkIndex = 0;
5743
0
      in->cur++;
5744
0
      break;
5745
0
        }
5746
0
    }
5747
0
    if (avail < 2)
5748
0
        goto done;
5749
0
    cur = in->cur[0];
5750
0
    next = in->cur[1];
5751
0
    if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5752
0
        (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5753
        /*
5754
         * Handle SCRIPT/STYLE separately
5755
         */
5756
0
        if (!terminate) {
5757
0
            int idx;
5758
0
      xmlChar val;
5759
5760
      /*
       * More chunks may follow: require a buffered "</" followed by
       * at least one more byte before parsing the script content.
       */
0
      idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5761
0
      if (idx < 0)
5762
0
          goto done;
5763
0
            val = in->cur[idx + 2];
5764
0
      if (val == 0) { /* bad cut of input */
5765
                            /*
5766
                             * FIXME: htmlParseScript checks for additional
5767
                             * characters after '</'.
5768
                             */
5769
0
                            ctxt->checkIndex = idx;
5770
0
          goto done;
5771
0
                        }
5772
0
        }
5773
0
        htmlParseScript(ctxt);
5774
0
                    if (ctxt->instate == XML_PARSER_EOF)
5775
0
                        goto done;
        /* cur and next still hold the bytes read before htmlParseScript(). */
5776
0
        if ((cur == '<') && (next == '/')) {
5777
0
      ctxt->instate = XML_PARSER_END_TAG;
5778
0
      ctxt->checkIndex = 0;
5779
0
      break;
5780
0
        }
5781
0
    } else if ((cur == '<') && (next == '!')) {
5782
0
                    if (avail < 4)
5783
0
                        goto done;
5784
                    /*
5785
                     * Sometimes DOCTYPE arrives in the middle of the document
5786
                     */
5787
0
                    if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5788
0
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5789
0
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5790
0
                        (UPP(8) == 'E')) {
5791
0
                        if ((!terminate) &&
5792
0
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5793
0
                            goto done;
5794
0
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5795
0
                                     "Misplaced DOCTYPE declaration\n",
5796
0
                                     BAD_CAST "DOCTYPE" , NULL);
5797
0
                        htmlParseDocTypeDecl(ctxt);
5798
0
                    } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
5799
0
                        if ((!terminate) &&
5800
0
                            (htmlParseLookupCommentEnd(ctxt) < 0))
5801
0
                            goto done;
5802
0
                        htmlParseComment(ctxt);
5803
0
                        if (ctxt->instate == XML_PARSER_EOF)
5804
0
                            goto done;
5805
0
                        ctxt->instate = XML_PARSER_CONTENT;
5806
0
                    } else {
                        /* "<!" that is neither a DOCTYPE nor a comment */
5807
0
                        if ((!terminate) &&
5808
0
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5809
0
                            goto done;
5810
0
                        htmlSkipBogusComment(ctxt);
5811
0
                    }
5812
0
                } else if ((cur == '<') && (next == '?')) {
5813
0
                    if ((!terminate) &&
5814
0
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5815
0
                        goto done;
5816
0
                    htmlParsePI(ctxt);
5817
0
                    if (ctxt->instate == XML_PARSER_EOF)
5818
0
                        goto done;
5819
0
                    ctxt->instate = XML_PARSER_CONTENT;
5820
0
                } else if ((cur == '<') && (next == '/')) {
5821
0
                    ctxt->instate = XML_PARSER_END_TAG;
5822
0
                    ctxt->checkIndex = 0;
5823
0
                    break;
5824
0
                } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
5825
0
                    if ((!terminate) && (next == 0))
5826
0
                        goto done;
5827
0
                    ctxt->instate = XML_PARSER_START_TAG;
5828
0
                    ctxt->checkIndex = 0;
5829
0
                    break;
5830
0
                } else if (cur == '<') {
                    /* '<' that starts no markup: emit it as a literal character */
5831
0
                    if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
5832
0
                        (ctxt->sax->characters != NULL))
5833
0
                        ctxt->sax->characters(ctxt->userData,
5834
0
                                              BAD_CAST "<", 1);
5835
0
                    NEXT;
5836
0
                } else {
5837
                    /*
5838
                     * check that the text sequence is complete
5839
                     * before handing out the data to the parser
5840
                     * to avoid problems with erroneous end of
5841
                     * data detection.
5842
                     */
5843
0
                    if ((!terminate) &&
5844
0
                        (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
5845
0
                        goto done;
5846
0
                    ctxt->checkIndex = 0;
5847
0
                    while ((ctxt->instate != XML_PARSER_EOF) &&
5848
0
                           (cur != '<') && (in->cur < in->end)) {
5849
0
                        if (cur == '&') {
5850
0
                            htmlParseReference(ctxt);
5851
0
                        } else {
5852
0
                            htmlParseCharData(ctxt);
5853
0
                        }
5854
0
                        cur = in->cur[0];
5855
0
                    }
5856
0
    }
5857
5858
0
    break;
5859
0
      }
5860
0
            case XML_PARSER_END_TAG:
5861
0
    if (avail < 2)
5862
0
        goto done;
5863
0
    if ((!terminate) &&
5864
0
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5865
0
        goto done;
5866
0
    htmlParseEndTag(ctxt);
5867
0
                if (ctxt->instate == XML_PARSER_EOF)
5868
0
                    goto done;
    /* No element left open: what follows belongs to the epilog. */
5869
0
    if (ctxt->nameNr == 0) {
5870
0
        ctxt->instate = XML_PARSER_EPILOG;
5871
0
    } else {
5872
0
        ctxt->instate = XML_PARSER_CONTENT;
5873
0
    }
5874
0
    ctxt->checkIndex = 0;
5875
0
          break;
5876
0
      default:
5877
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5878
0
           "HPP: internal error\n", NULL, NULL);
5879
0
    ctxt->instate = XML_PARSER_EOF;
5880
0
    break;
5881
0
  }
5882
0
    }
5883
0
done:
    /*
     * Same end-of-input handling as at the top of the loop, for the case
     * where the data ran out while a state was being processed.
     */
5884
0
    if ((avail == 0) && (terminate)) {
5885
0
  htmlAutoCloseOnEnd(ctxt);
5886
0
  if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5887
      /*
5888
       * SAX: end of the document processing.
5889
       */
5890
0
      ctxt->instate = XML_PARSER_EOF;
5891
0
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5892
0
    ctxt->sax->endDocument(ctxt->userData);
5893
0
  }
5894
0
    }
    /*
     * Unless HTML_PARSE_NODEFDTD is set, give a document that has no
     * internal subset the HTML 4.0 Transitional one once parsing is
     * terminating, finished, or in the epilog.
     */
5895
0
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
5896
0
  ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
5897
0
   (ctxt->instate == XML_PARSER_EPILOG))) {
5898
0
  xmlDtdPtr dtd;
5899
0
  dtd = xmlGetIntSubset(ctxt->myDoc);
5900
0
  if (dtd == NULL)
5901
0
      ctxt->myDoc->intSubset =
5902
0
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5903
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5904
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5905
0
    }
5906
0
    return(ret);
5907
0
}
5908
5909
/**
5910
 * htmlParseChunk:
5911
 * @ctxt:  an HTML parser context
5912
 * @chunk:  an char array
5913
 * @size:  the size in byte of the chunk
5914
 * @terminate:  last chunk indicator
5915
 *
5916
 * Parse a Chunk of memory
5917
 *
5918
 * Returns zero if no error, the xmlParserErrors otherwise.
5919
 */
5920
int
5921
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
5922
0
              int terminate) {
5923
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
5924
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5925
0
         "htmlParseChunk: context error\n", NULL, NULL);
5926
0
  return(XML_ERR_INTERNAL_ERROR);
5927
0
    }
5928
0
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
5929
0
        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
5930
0
  size_t pos = ctxt->input->cur - ctxt->input->base;
5931
0
  int res;
5932
5933
0
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
5934
0
        xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
5935
0
  if (res < 0) {
5936
0
            htmlParseErr(ctxt, ctxt->input->buf->error,
5937
0
                         "xmlParserInputBufferPush failed", NULL, NULL);
5938
0
            xmlHaltParser(ctxt);
5939
0
      return (ctxt->errNo);
5940
0
  }
5941
0
    }
5942
0
    htmlParseTryOrFinish(ctxt, terminate);
5943
0
    if (terminate) {
5944
0
  if ((ctxt->instate != XML_PARSER_EOF) &&
5945
0
      (ctxt->instate != XML_PARSER_EPILOG) &&
5946
0
      (ctxt->instate != XML_PARSER_MISC)) {
5947
0
      ctxt->errNo = XML_ERR_DOCUMENT_END;
5948
0
      ctxt->wellFormed = 0;
5949
0
  }
5950
0
  if (ctxt->instate != XML_PARSER_EOF) {
5951
0
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5952
0
    ctxt->sax->endDocument(ctxt->userData);
5953
0
  }
5954
0
  ctxt->instate = XML_PARSER_EOF;
5955
0
    }
5956
0
    return((xmlParserErrors) ctxt->errNo);
5957
0
}
5958
5959
/************************************************************************
5960
 *                  *
5961
 *      User entry points       *
5962
 *                  *
5963
 ************************************************************************/
5964
5965
/**
5966
 * htmlCreatePushParserCtxt:
5967
 * @sax:  a SAX handler
5968
 * @user_data:  The user data returned on SAX callbacks
5969
 * @chunk:  a pointer to an array of chars
5970
 * @size:  number of chars in the array
5971
 * @filename:  an optional file name or URI
5972
 * @enc:  an optional encoding
5973
 *
5974
 * Create a parser context for using the HTML parser in push mode
5975
 * The value of @filename is used for fetching external entities
5976
 * and error/warning reports.
5977
 *
5978
 * Returns the new parser context or NULL
5979
 */
5980
htmlParserCtxtPtr
5981
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
5982
                         const char *chunk, int size, const char *filename,
5983
0
       xmlCharEncoding enc) {
5984
0
    htmlParserCtxtPtr ctxt;
5985
0
    htmlParserInputPtr inputStream;
5986
0
    xmlParserInputBufferPtr buf;
5987
5988
0
    xmlInitParser();
5989
5990
0
    buf = xmlAllocParserInputBuffer(XML_CHAR_ENCODING_NONE);
5991
0
    if (buf == NULL) return(NULL);
5992
5993
0
    ctxt = htmlNewSAXParserCtxt(sax, user_data);
5994
0
    if (ctxt == NULL) {
5995
0
  xmlFreeParserInputBuffer(buf);
5996
0
  return(NULL);
5997
0
    }
5998
0
    if (filename == NULL) {
5999
0
  ctxt->directory = NULL;
6000
0
    } else {
6001
0
        ctxt->directory = xmlParserGetDirectory(filename);
6002
0
    }
6003
6004
0
    inputStream = htmlNewInputStream(ctxt);
6005
0
    if (inputStream == NULL) {
6006
0
  xmlFreeParserCtxt(ctxt);
6007
0
  xmlFreeParserInputBuffer(buf);
6008
0
  return(NULL);
6009
0
    }
6010
6011
0
    if (filename == NULL)
6012
0
  inputStream->filename = NULL;
6013
0
    else
6014
0
  inputStream->filename = (char *)
6015
0
      xmlCanonicPath((const xmlChar *) filename);
6016
0
    inputStream->buf = buf;
6017
0
    xmlBufResetInput(buf->buffer, inputStream);
6018
6019
0
    inputPush(ctxt, inputStream);
6020
6021
0
    if (enc != XML_CHAR_ENCODING_NONE)
6022
0
        xmlSwitchEncoding(ctxt, enc);
6023
6024
0
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6025
0
        (ctxt->input->buf != NULL))  {
6026
0
  size_t pos = ctxt->input->cur - ctxt->input->base;
6027
0
        int res;
6028
6029
0
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6030
0
        xmlBufUpdateInput(ctxt->input->buf->buffer, ctxt->input, pos);
6031
0
        if (res < 0) {
6032
0
            htmlParseErr(ctxt, ctxt->input->buf->error,
6033
0
                         "xmlParserInputBufferPush failed\n", NULL, NULL);
6034
0
            xmlHaltParser(ctxt);
6035
0
        }
6036
0
    }
6037
0
    ctxt->progressive = 1;
6038
6039
0
    return(ctxt);
6040
0
}
6041
#endif /* LIBXML_PUSH_ENABLED */
6042
6043
/**
6044
 * htmlSAXParseDoc:
6045
 * @cur:  a pointer to an array of xmlChar
6046
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6047
 * @sax:  the SAX handler block
6048
 * @userData: if using SAX, this pointer will be provided on callbacks.
6049
 *
6050
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
6051
 *
6052
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6053
 * to handle parse events. If sax is NULL, fallback to the default DOM
6054
 * behavior and return a tree.
6055
 *
6056
 * Returns the resulting document tree unless SAX is NULL or the document is
6057
 *     not well formed.
6058
 */
6059
6060
htmlDocPtr
6061
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6062
0
                htmlSAXHandlerPtr sax, void *userData) {
6063
0
    htmlDocPtr ret;
6064
0
    htmlParserCtxtPtr ctxt;
6065
6066
0
    xmlInitParser();
6067
6068
0
    if (cur == NULL) return(NULL);
6069
6070
6071
0
    ctxt = htmlCreateDocParserCtxt(cur, encoding);
6072
0
    if (ctxt == NULL) return(NULL);
6073
0
    if (sax != NULL) {
6074
0
        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6075
0
        ctxt->sax = sax;
6076
0
        ctxt->userData = userData;
6077
0
    }
6078
6079
0
    htmlParseDocument(ctxt);
6080
0
    ret = ctxt->myDoc;
6081
0
    if (sax != NULL) {
6082
0
  ctxt->sax = NULL;
6083
0
  ctxt->userData = NULL;
6084
0
    }
6085
0
    htmlFreeParserCtxt(ctxt);
6086
6087
0
    return(ret);
6088
0
}
6089
6090
/**
6091
 * htmlParseDoc:
6092
 * @cur:  a pointer to an array of xmlChar
6093
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6094
 *
6095
 * parse an HTML in-memory document and build a tree.
6096
 *
6097
 * Returns the resulting document tree
6098
 */
6099
6100
htmlDocPtr
6101
0
htmlParseDoc(const xmlChar *cur, const char *encoding) {
6102
0
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6103
0
}
6104
6105
6106
/**
6107
 * htmlCreateFileParserCtxt:
6108
 * @filename:  the filename
6109
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6110
 *
6111
 * Create a parser context for a file content.
6112
 * Automatic support for ZLIB/Compress compressed document is provided
6113
 * by default if found at compile-time.
6114
 *
6115
 * Returns the new parser context or NULL
6116
 */
6117
htmlParserCtxtPtr
6118
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6119
0
{
6120
0
    htmlParserCtxtPtr ctxt;
6121
0
    htmlParserInputPtr inputStream;
6122
0
    char *canonicFilename;
6123
6124
0
    if (filename == NULL)
6125
0
        return(NULL);
6126
6127
0
    ctxt = htmlNewParserCtxt();
6128
0
    if (ctxt == NULL) {
6129
0
  return(NULL);
6130
0
    }
6131
0
    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6132
0
    if (canonicFilename == NULL) {
6133
0
  xmlFreeParserCtxt(ctxt);
6134
0
  return(NULL);
6135
0
    }
6136
6137
0
    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6138
0
    xmlFree(canonicFilename);
6139
0
    if (inputStream == NULL) {
6140
0
  xmlFreeParserCtxt(ctxt);
6141
0
  return(NULL);
6142
0
    }
6143
6144
0
    inputPush(ctxt, inputStream);
6145
6146
    /* set encoding */
6147
0
    if (encoding) {
6148
0
        xmlCharEncodingHandlerPtr hdlr;
6149
6150
0
        hdlr = xmlFindCharEncodingHandler(encoding);
6151
0
        if (hdlr != NULL) {
6152
0
            xmlSwitchToEncoding(ctxt, hdlr);
6153
0
        }
6154
0
    }
6155
6156
0
    return(ctxt);
6157
0
}
6158
6159
/**
6160
 * htmlSAXParseFile:
6161
 * @filename:  the filename
6162
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6163
 * @sax:  the SAX handler block
6164
 * @userData: if using SAX, this pointer will be provided on callbacks.
6165
 *
6166
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
6167
 *
6168
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6169
 * compressed document is provided by default if found at compile-time.
6170
 * It use the given SAX function block to handle the parsing callback.
6171
 * If sax is NULL, fallback to the default DOM tree building routines.
6172
 *
6173
 * Returns the resulting document tree unless SAX is NULL or the document is
6174
 *     not well formed.
6175
 */
6176
6177
htmlDocPtr
6178
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6179
0
                 void *userData) {
6180
0
    htmlDocPtr ret;
6181
0
    htmlParserCtxtPtr ctxt;
6182
0
    htmlSAXHandlerPtr oldsax = NULL;
6183
6184
0
    xmlInitParser();
6185
6186
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6187
0
    if (ctxt == NULL) return(NULL);
6188
0
    if (sax != NULL) {
6189
0
  oldsax = ctxt->sax;
6190
0
        ctxt->sax = sax;
6191
0
        ctxt->userData = userData;
6192
0
    }
6193
6194
0
    htmlParseDocument(ctxt);
6195
6196
0
    ret = ctxt->myDoc;
6197
0
    if (sax != NULL) {
6198
0
        ctxt->sax = oldsax;
6199
0
        ctxt->userData = NULL;
6200
0
    }
6201
0
    htmlFreeParserCtxt(ctxt);
6202
6203
0
    return(ret);
6204
0
}
6205
6206
/**
6207
 * htmlParseFile:
6208
 * @filename:  the filename
6209
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6210
 *
6211
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6212
 * compressed document is provided by default if found at compile-time.
6213
 *
6214
 * Returns the resulting document tree
6215
 */
6216
6217
htmlDocPtr
6218
0
htmlParseFile(const char *filename, const char *encoding) {
6219
0
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6220
0
}
6221
6222
/**
6223
 * htmlHandleOmittedElem:
6224
 * @val:  int 0 or 1
6225
 *
6226
 * Set and return the previous value for handling HTML omitted tags.
6227
 *
6228
 * Returns the last value for 0 for no handling, 1 for auto insertion.
6229
 */
6230
6231
int
6232
0
htmlHandleOmittedElem(int val) {
6233
0
    int old = htmlOmittedDefaultValue;
6234
6235
0
    htmlOmittedDefaultValue = val;
6236
0
    return(old);
6237
0
}
6238
6239
/**
6240
 * htmlElementAllowedHere:
6241
 * @parent: HTML parent element
6242
 * @elt: HTML element
6243
 *
6244
 * Checks whether an HTML element may be a direct child of a parent element.
6245
 * Note - doesn't check for deprecated elements
6246
 *
6247
 * Returns 1 if allowed; 0 otherwise.
6248
 */
6249
int
6250
0
htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6251
0
  const char** p ;
6252
6253
0
  if ( ! elt || ! parent || ! parent->subelts )
6254
0
  return 0 ;
6255
6256
0
  for ( p = parent->subelts; *p; ++p )
6257
0
    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6258
0
      return 1 ;
6259
6260
0
  return 0 ;
6261
0
}
6262
/**
6263
 * htmlElementStatusHere:
6264
 * @parent: HTML parent element
6265
 * @elt: HTML element
6266
 *
6267
 * Checks whether an HTML element may be a direct child of a parent element.
6268
 * and if so whether it is valid or deprecated.
6269
 *
6270
 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6271
 */
6272
htmlStatus
6273
0
htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6274
0
  if ( ! parent || ! elt )
6275
0
    return HTML_INVALID ;
6276
0
  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6277
0
    return HTML_INVALID ;
6278
6279
0
  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6280
0
}
6281
/**
6282
 * htmlAttrAllowed:
6283
 * @elt: HTML element
6284
 * @attr: HTML attribute
6285
 * @legacy: whether to allow deprecated attributes
6286
 *
6287
 * Checks whether an attribute is valid for an element
6288
 * Has full knowledge of Required and Deprecated attributes
6289
 *
6290
 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6291
 */
6292
htmlStatus
6293
0
htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6294
0
  const char** p ;
6295
6296
0
  if ( !elt || ! attr )
6297
0
  return HTML_INVALID ;
6298
6299
0
  if ( elt->attrs_req )
6300
0
    for ( p = elt->attrs_req; *p; ++p)
6301
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6302
0
        return HTML_REQUIRED ;
6303
6304
0
  if ( elt->attrs_opt )
6305
0
    for ( p = elt->attrs_opt; *p; ++p)
6306
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6307
0
        return HTML_VALID ;
6308
6309
0
  if ( legacy && elt->attrs_depr )
6310
0
    for ( p = elt->attrs_depr; *p; ++p)
6311
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6312
0
        return HTML_DEPRECATED ;
6313
6314
0
  return HTML_INVALID ;
6315
0
}
6316
/**
6317
 * htmlNodeStatus:
6318
 * @node: an htmlNodePtr in a tree
6319
 * @legacy: whether to allow deprecated elements (YES is faster here
6320
 *  for Element nodes)
6321
 *
6322
 * Checks whether the tree node is valid.  Experimental (the author
6323
 *     only uses the HTML enhancements in a SAX parser)
6324
 *
6325
 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6326
 *  legacy allowed) or htmlElementStatusHere (otherwise).
6327
 *  for Attribute nodes, a return from htmlAttrAllowed
6328
 *  for other nodes, HTML_NA (no checks performed)
6329
 */
6330
htmlStatus
6331
0
htmlNodeStatus(const htmlNodePtr node, int legacy) {
6332
0
  if ( ! node )
6333
0
    return HTML_INVALID ;
6334
6335
0
  switch ( node->type ) {
6336
0
    case XML_ELEMENT_NODE:
6337
0
      return legacy
6338
0
  ? ( htmlElementAllowedHere (
6339
0
    htmlTagLookup(node->parent->name) , node->name
6340
0
    ) ? HTML_VALID : HTML_INVALID )
6341
0
  : htmlElementStatusHere(
6342
0
    htmlTagLookup(node->parent->name) ,
6343
0
    htmlTagLookup(node->name) )
6344
0
  ;
6345
0
    case XML_ATTRIBUTE_NODE:
6346
0
      return htmlAttrAllowed(
6347
0
  htmlTagLookup(node->parent->name) , node->name, legacy) ;
6348
0
    default: return HTML_NA ;
6349
0
  }
6350
0
}
6351
/************************************************************************
6352
 *                  *
6353
 *  New set (2.6.0) of simpler and more flexible APIs   *
6354
 *                  *
6355
 ************************************************************************/
6356
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A NULL @str is ignored; when "dict" is NULL the string
 * is always freed.
 *
 * The body is wrapped in do { } while (0) so that the macro behaves as a
 * single statement: use it as "DICT_FREE(x);" and it is safe inside an
 * unbraced if/else.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6367
6368
/**
6369
 * htmlCtxtReset:
6370
 * @ctxt: an HTML parser context
6371
 *
6372
 * Reset a parser context
6373
 */
6374
void
6375
htmlCtxtReset(htmlParserCtxtPtr ctxt)
6376
0
{
6377
0
    xmlParserInputPtr input;
6378
0
    xmlDictPtr dict;
6379
6380
0
    if (ctxt == NULL)
6381
0
        return;
6382
6383
0
    xmlInitParser();
6384
0
    dict = ctxt->dict;
6385
6386
0
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6387
0
        xmlFreeInputStream(input);
6388
0
    }
6389
0
    ctxt->inputNr = 0;
6390
0
    ctxt->input = NULL;
6391
6392
0
    ctxt->spaceNr = 0;
6393
0
    if (ctxt->spaceTab != NULL) {
6394
0
  ctxt->spaceTab[0] = -1;
6395
0
  ctxt->space = &ctxt->spaceTab[0];
6396
0
    } else {
6397
0
  ctxt->space = NULL;
6398
0
    }
6399
6400
6401
0
    ctxt->nodeNr = 0;
6402
0
    ctxt->node = NULL;
6403
6404
0
    ctxt->nameNr = 0;
6405
0
    ctxt->name = NULL;
6406
6407
0
    ctxt->nsNr = 0;
6408
6409
0
    DICT_FREE(ctxt->version);
6410
0
    ctxt->version = NULL;
6411
0
    DICT_FREE(ctxt->encoding);
6412
0
    ctxt->encoding = NULL;
6413
0
    DICT_FREE(ctxt->directory);
6414
0
    ctxt->directory = NULL;
6415
0
    DICT_FREE(ctxt->extSubURI);
6416
0
    ctxt->extSubURI = NULL;
6417
0
    DICT_FREE(ctxt->extSubSystem);
6418
0
    ctxt->extSubSystem = NULL;
6419
0
    if (ctxt->myDoc != NULL)
6420
0
        xmlFreeDoc(ctxt->myDoc);
6421
0
    ctxt->myDoc = NULL;
6422
6423
0
    ctxt->standalone = -1;
6424
0
    ctxt->hasExternalSubset = 0;
6425
0
    ctxt->hasPErefs = 0;
6426
0
    ctxt->html = 1;
6427
0
    ctxt->external = 0;
6428
0
    ctxt->instate = XML_PARSER_START;
6429
0
    ctxt->token = 0;
6430
6431
0
    ctxt->wellFormed = 1;
6432
0
    ctxt->nsWellFormed = 1;
6433
0
    ctxt->disableSAX = 0;
6434
0
    ctxt->valid = 1;
6435
0
    ctxt->vctxt.userData = ctxt;
6436
0
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
6437
0
    ctxt->vctxt.error = xmlParserValidityError;
6438
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
6439
0
    ctxt->record_info = 0;
6440
0
    ctxt->checkIndex = 0;
6441
0
    ctxt->endCheckState = 0;
6442
0
    ctxt->inSubset = 0;
6443
0
    ctxt->errNo = XML_ERR_OK;
6444
0
    ctxt->depth = 0;
6445
0
    ctxt->catalogs = NULL;
6446
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
6447
6448
0
    if (ctxt->attsDefault != NULL) {
6449
0
        xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6450
0
        ctxt->attsDefault = NULL;
6451
0
    }
6452
0
    if (ctxt->attsSpecial != NULL) {
6453
0
        xmlHashFree(ctxt->attsSpecial, NULL);
6454
0
        ctxt->attsSpecial = NULL;
6455
0
    }
6456
6457
0
    ctxt->nbErrors = 0;
6458
0
    ctxt->nbWarnings = 0;
6459
0
    if (ctxt->lastError.code != XML_ERR_OK)
6460
0
        xmlResetError(&ctxt->lastError);
6461
0
}
6462
6463
/**
6464
 * htmlCtxtUseOptions:
6465
 * @ctxt: an HTML parser context
6466
 * @options:  a combination of htmlParserOption(s)
6467
 *
6468
 * Applies the options to the parser context
6469
 *
6470
 * Returns 0 in case of success, the set of unknown or unimplemented options
6471
 *         in case of error.
6472
 */
6473
int
6474
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6475
0
{
6476
0
    if (ctxt == NULL)
6477
0
        return(-1);
6478
6479
0
    if (options & HTML_PARSE_NOWARNING) {
6480
0
        ctxt->sax->warning = NULL;
6481
0
        ctxt->vctxt.warning = NULL;
6482
0
        options -= XML_PARSE_NOWARNING;
6483
0
  ctxt->options |= XML_PARSE_NOWARNING;
6484
0
    }
6485
0
    if (options & HTML_PARSE_NOERROR) {
6486
0
        ctxt->sax->error = NULL;
6487
0
        ctxt->vctxt.error = NULL;
6488
0
        ctxt->sax->fatalError = NULL;
6489
0
        options -= XML_PARSE_NOERROR;
6490
0
  ctxt->options |= XML_PARSE_NOERROR;
6491
0
    }
6492
0
    if (options & HTML_PARSE_PEDANTIC) {
6493
0
        ctxt->pedantic = 1;
6494
0
        options -= XML_PARSE_PEDANTIC;
6495
0
  ctxt->options |= XML_PARSE_PEDANTIC;
6496
0
    } else
6497
0
        ctxt->pedantic = 0;
6498
0
    if (options & XML_PARSE_NOBLANKS) {
6499
0
        ctxt->keepBlanks = 0;
6500
0
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6501
0
        options -= XML_PARSE_NOBLANKS;
6502
0
  ctxt->options |= XML_PARSE_NOBLANKS;
6503
0
    } else
6504
0
        ctxt->keepBlanks = 1;
6505
0
    if (options & HTML_PARSE_RECOVER) {
6506
0
        ctxt->recovery = 1;
6507
0
  options -= HTML_PARSE_RECOVER;
6508
0
    } else
6509
0
        ctxt->recovery = 0;
6510
0
    if (options & HTML_PARSE_COMPACT) {
6511
0
  ctxt->options |= HTML_PARSE_COMPACT;
6512
0
        options -= HTML_PARSE_COMPACT;
6513
0
    }
6514
0
    if (options & XML_PARSE_HUGE) {
6515
0
  ctxt->options |= XML_PARSE_HUGE;
6516
0
        options -= XML_PARSE_HUGE;
6517
0
    }
6518
0
    if (options & HTML_PARSE_NODEFDTD) {
6519
0
  ctxt->options |= HTML_PARSE_NODEFDTD;
6520
0
        options -= HTML_PARSE_NODEFDTD;
6521
0
    }
6522
0
    if (options & HTML_PARSE_IGNORE_ENC) {
6523
0
  ctxt->options |= HTML_PARSE_IGNORE_ENC;
6524
0
        options -= HTML_PARSE_IGNORE_ENC;
6525
0
    }
6526
0
    if (options & HTML_PARSE_NOIMPLIED) {
6527
0
        ctxt->options |= HTML_PARSE_NOIMPLIED;
6528
0
        options -= HTML_PARSE_NOIMPLIED;
6529
0
    }
6530
0
    ctxt->dictNames = 0;
6531
0
    ctxt->linenumbers = 1;
6532
0
    return (options);
6533
0
}
6534
6535
/**
6536
 * htmlDoRead:
6537
 * @ctxt:  an HTML parser context
6538
 * @URL:  the base URL to use for the document
6539
 * @encoding:  the document encoding, or NULL
6540
 * @options:  a combination of htmlParserOption(s)
6541
 * @reuse:  keep the context for reuse
6542
 *
6543
 * Common front-end for the htmlRead functions
6544
 *
6545
 * Returns the resulting document tree or NULL
6546
 */
6547
static htmlDocPtr
6548
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6549
          int options, int reuse)
6550
0
{
6551
0
    htmlDocPtr ret;
6552
6553
0
    htmlCtxtUseOptions(ctxt, options);
6554
0
    ctxt->html = 1;
6555
0
    if (encoding != NULL) {
6556
0
        xmlCharEncodingHandlerPtr hdlr;
6557
6558
0
  hdlr = xmlFindCharEncodingHandler(encoding);
6559
0
  if (hdlr != NULL) {
6560
0
      xmlSwitchToEncoding(ctxt, hdlr);
6561
0
        }
6562
0
    }
6563
0
    if ((URL != NULL) && (ctxt->input != NULL) &&
6564
0
        (ctxt->input->filename == NULL))
6565
0
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6566
0
    htmlParseDocument(ctxt);
6567
0
    ret = ctxt->myDoc;
6568
0
    ctxt->myDoc = NULL;
6569
0
    if (!reuse) {
6570
0
        if ((ctxt->dictNames) &&
6571
0
      (ret != NULL) &&
6572
0
      (ret->dict == ctxt->dict))
6573
0
      ctxt->dict = NULL;
6574
0
  xmlFreeParserCtxt(ctxt);
6575
0
    }
6576
0
    return (ret);
6577
0
}
6578
6579
/**
6580
 * htmlReadDoc:
6581
 * @cur:  a pointer to a zero terminated string
6582
 * @URL:  the base URL to use for the document
6583
 * @encoding:  the document encoding, or NULL
6584
 * @options:  a combination of htmlParserOption(s)
6585
 *
6586
 * parse an XML in-memory document and build a tree.
6587
 *
6588
 * Returns the resulting document tree
6589
 */
6590
htmlDocPtr
6591
htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6592
0
{
6593
0
    htmlParserCtxtPtr ctxt;
6594
6595
0
    if (cur == NULL)
6596
0
        return (NULL);
6597
6598
0
    xmlInitParser();
6599
0
    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6600
0
    if (ctxt == NULL)
6601
0
        return (NULL);
6602
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6603
0
}
6604
6605
/**
6606
 * htmlReadFile:
6607
 * @filename:  a file or URL
6608
 * @encoding:  the document encoding, or NULL
6609
 * @options:  a combination of htmlParserOption(s)
6610
 *
6611
 * parse an XML file from the filesystem or the network.
6612
 *
6613
 * Returns the resulting document tree
6614
 */
6615
htmlDocPtr
6616
htmlReadFile(const char *filename, const char *encoding, int options)
6617
0
{
6618
0
    htmlParserCtxtPtr ctxt;
6619
6620
0
    xmlInitParser();
6621
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6622
0
    if (ctxt == NULL)
6623
0
        return (NULL);
6624
0
    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6625
0
}
6626
6627
/**
6628
 * htmlReadMemory:
6629
 * @buffer:  a pointer to a char array
6630
 * @size:  the size of the array
6631
 * @URL:  the base URL to use for the document
6632
 * @encoding:  the document encoding, or NULL
6633
 * @options:  a combination of htmlParserOption(s)
6634
 *
6635
 * parse an XML in-memory document and build a tree.
6636
 *
6637
 * Returns the resulting document tree
6638
 */
6639
htmlDocPtr
6640
htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6641
0
{
6642
0
    htmlParserCtxtPtr ctxt;
6643
6644
0
    xmlInitParser();
6645
0
    ctxt = htmlCreateMemoryParserCtxt(buffer, size);
6646
0
    if (ctxt == NULL)
6647
0
        return (NULL);
6648
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6649
0
}
6650
6651
/**
6652
 * htmlReadFd:
6653
 * @fd:  an open file descriptor
6654
 * @URL:  the base URL to use for the document
6655
 * @encoding:  the document encoding, or NULL
6656
 * @options:  a combination of htmlParserOption(s)
6657
 *
6658
 * parse an HTML from a file descriptor and build a tree.
6659
 * NOTE that the file descriptor will not be closed when the
6660
 *      reader is closed or reset.
6661
 *
6662
 * Returns the resulting document tree
6663
 */
6664
htmlDocPtr
6665
htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6666
0
{
6667
0
    htmlParserCtxtPtr ctxt;
6668
0
    xmlParserInputBufferPtr input;
6669
0
    htmlParserInputPtr stream;
6670
6671
0
    if (fd < 0)
6672
0
        return (NULL);
6673
6674
0
    xmlInitParser();
6675
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6676
0
    if (input == NULL)
6677
0
        return (NULL);
6678
0
    input->closecallback = NULL;
6679
0
    ctxt = htmlNewParserCtxt();
6680
0
    if (ctxt == NULL) {
6681
0
        xmlFreeParserInputBuffer(input);
6682
0
        return (NULL);
6683
0
    }
6684
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6685
0
    if (stream == NULL) {
6686
0
        xmlFreeParserInputBuffer(input);
6687
0
  htmlFreeParserCtxt(ctxt);
6688
0
        return (NULL);
6689
0
    }
6690
0
    inputPush(ctxt, stream);
6691
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6692
0
}
6693
6694
/**
6695
 * htmlReadIO:
6696
 * @ioread:  an I/O read function
6697
 * @ioclose:  an I/O close function
6698
 * @ioctx:  an I/O handler
6699
 * @URL:  the base URL to use for the document
6700
 * @encoding:  the document encoding, or NULL
6701
 * @options:  a combination of htmlParserOption(s)
6702
 *
6703
 * parse an HTML document from I/O functions and source and build a tree.
6704
 *
6705
 * Returns the resulting document tree
6706
 */
6707
htmlDocPtr
6708
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6709
          void *ioctx, const char *URL, const char *encoding, int options)
6710
0
{
6711
0
    htmlParserCtxtPtr ctxt;
6712
0
    xmlParserInputBufferPtr input;
6713
0
    xmlParserInputPtr stream;
6714
6715
0
    if (ioread == NULL)
6716
0
        return (NULL);
6717
0
    xmlInitParser();
6718
6719
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6720
0
                                         XML_CHAR_ENCODING_NONE);
6721
0
    if (input == NULL) {
6722
0
        if (ioclose != NULL)
6723
0
            ioclose(ioctx);
6724
0
        return (NULL);
6725
0
    }
6726
0
    ctxt = htmlNewParserCtxt();
6727
0
    if (ctxt == NULL) {
6728
0
        xmlFreeParserInputBuffer(input);
6729
0
        return (NULL);
6730
0
    }
6731
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6732
0
    if (stream == NULL) {
6733
0
        xmlFreeParserInputBuffer(input);
6734
0
  xmlFreeParserCtxt(ctxt);
6735
0
        return (NULL);
6736
0
    }
6737
0
    inputPush(ctxt, stream);
6738
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6739
0
}
6740
6741
/**
6742
 * htmlCtxtReadDoc:
6743
 * @ctxt:  an HTML parser context
6744
 * @str:  a pointer to a zero terminated string
6745
 * @URL:  the base URL to use for the document
6746
 * @encoding:  the document encoding, or NULL
6747
 * @options:  a combination of htmlParserOption(s)
6748
 *
6749
 * parse an XML in-memory document and build a tree.
6750
 * This reuses the existing @ctxt parser context
6751
 *
6752
 * Returns the resulting document tree
6753
 */
6754
htmlDocPtr
6755
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar *str,
6756
               const char *URL, const char *encoding, int options)
6757
0
{
6758
0
    xmlParserInputBufferPtr input;
6759
0
    xmlParserInputPtr stream;
6760
6761
0
    if (ctxt == NULL)
6762
0
        return (NULL);
6763
0
    if (str == NULL)
6764
0
        return (NULL);
6765
0
    xmlInitParser();
6766
6767
0
    htmlCtxtReset(ctxt);
6768
6769
0
    input = xmlParserInputBufferCreateString(str);
6770
0
    if (input == NULL) {
6771
0
  return(NULL);
6772
0
    }
6773
6774
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6775
0
    if (stream == NULL) {
6776
0
  xmlFreeParserInputBuffer(input);
6777
0
  return(NULL);
6778
0
    }
6779
6780
0
    inputPush(ctxt, stream);
6781
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6782
0
}
6783
6784
/**
6785
 * htmlCtxtReadFile:
6786
 * @ctxt:  an HTML parser context
6787
 * @filename:  a file or URL
6788
 * @encoding:  the document encoding, or NULL
6789
 * @options:  a combination of htmlParserOption(s)
6790
 *
6791
 * parse an XML file from the filesystem or the network.
6792
 * This reuses the existing @ctxt parser context
6793
 *
6794
 * Returns the resulting document tree
6795
 */
6796
htmlDocPtr
6797
htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
6798
                const char *encoding, int options)
6799
0
{
6800
0
    xmlParserInputPtr stream;
6801
6802
0
    if (filename == NULL)
6803
0
        return (NULL);
6804
0
    if (ctxt == NULL)
6805
0
        return (NULL);
6806
0
    xmlInitParser();
6807
6808
0
    htmlCtxtReset(ctxt);
6809
6810
0
    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
6811
0
    if (stream == NULL) {
6812
0
        return (NULL);
6813
0
    }
6814
0
    inputPush(ctxt, stream);
6815
0
    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
6816
0
}
6817
6818
/**
6819
 * htmlCtxtReadMemory:
6820
 * @ctxt:  an HTML parser context
6821
 * @buffer:  a pointer to a char array
6822
 * @size:  the size of the array
6823
 * @URL:  the base URL to use for the document
6824
 * @encoding:  the document encoding, or NULL
6825
 * @options:  a combination of htmlParserOption(s)
6826
 *
6827
 * parse an XML in-memory document and build a tree.
6828
 * This reuses the existing @ctxt parser context
6829
 *
6830
 * Returns the resulting document tree
6831
 */
6832
htmlDocPtr
6833
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
6834
                  const char *URL, const char *encoding, int options)
6835
0
{
6836
0
    xmlParserInputBufferPtr input;
6837
0
    xmlParserInputPtr stream;
6838
6839
0
    if (ctxt == NULL)
6840
0
        return (NULL);
6841
0
    if (buffer == NULL)
6842
0
        return (NULL);
6843
0
    xmlInitParser();
6844
6845
0
    htmlCtxtReset(ctxt);
6846
6847
0
    input = xmlParserInputBufferCreateStatic(buffer, size,
6848
0
                                             XML_CHAR_ENCODING_NONE);
6849
0
    if (input == NULL) {
6850
0
  return(NULL);
6851
0
    }
6852
6853
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6854
0
    if (stream == NULL) {
6855
0
  xmlFreeParserInputBuffer(input);
6856
0
  return(NULL);
6857
0
    }
6858
6859
0
    inputPush(ctxt, stream);
6860
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6861
0
}
6862
6863
/**
6864
 * htmlCtxtReadFd:
6865
 * @ctxt:  an HTML parser context
6866
 * @fd:  an open file descriptor
6867
 * @URL:  the base URL to use for the document
6868
 * @encoding:  the document encoding, or NULL
6869
 * @options:  a combination of htmlParserOption(s)
6870
 *
6871
 * parse an XML from a file descriptor and build a tree.
6872
 * This reuses the existing @ctxt parser context
6873
 *
6874
 * Returns the resulting document tree
6875
 */
6876
htmlDocPtr
6877
htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
6878
              const char *URL, const char *encoding, int options)
6879
0
{
6880
0
    xmlParserInputBufferPtr input;
6881
0
    xmlParserInputPtr stream;
6882
6883
0
    if (fd < 0)
6884
0
        return (NULL);
6885
0
    if (ctxt == NULL)
6886
0
        return (NULL);
6887
0
    xmlInitParser();
6888
6889
0
    htmlCtxtReset(ctxt);
6890
6891
6892
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6893
0
    if (input == NULL)
6894
0
        return (NULL);
6895
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6896
0
    if (stream == NULL) {
6897
0
        xmlFreeParserInputBuffer(input);
6898
0
        return (NULL);
6899
0
    }
6900
0
    inputPush(ctxt, stream);
6901
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6902
0
}
6903
6904
/**
6905
 * htmlCtxtReadIO:
6906
 * @ctxt:  an HTML parser context
6907
 * @ioread:  an I/O read function
6908
 * @ioclose:  an I/O close function
6909
 * @ioctx:  an I/O handler
6910
 * @URL:  the base URL to use for the document
6911
 * @encoding:  the document encoding, or NULL
6912
 * @options:  a combination of htmlParserOption(s)
6913
 *
6914
 * parse an HTML document from I/O functions and source and build a tree.
6915
 * This reuses the existing @ctxt parser context
6916
 *
6917
 * Returns the resulting document tree
6918
 */
6919
htmlDocPtr
6920
htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
6921
              xmlInputCloseCallback ioclose, void *ioctx,
6922
        const char *URL,
6923
              const char *encoding, int options)
6924
0
{
6925
0
    xmlParserInputBufferPtr input;
6926
0
    xmlParserInputPtr stream;
6927
6928
0
    if (ioread == NULL)
6929
0
        return (NULL);
6930
0
    if (ctxt == NULL)
6931
0
        return (NULL);
6932
0
    xmlInitParser();
6933
6934
0
    htmlCtxtReset(ctxt);
6935
6936
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6937
0
                                         XML_CHAR_ENCODING_NONE);
6938
0
    if (input == NULL) {
6939
0
        if (ioclose != NULL)
6940
0
            ioclose(ioctx);
6941
0
        return (NULL);
6942
0
    }
6943
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6944
0
    if (stream == NULL) {
6945
0
        xmlFreeParserInputBuffer(input);
6946
0
        return (NULL);
6947
0
    }
6948
0
    inputPush(ctxt, stream);
6949
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
6950
0
}
6951
6952
#endif /* LIBXML_HTML_ENABLED */