Coverage Report

Created: 2023-12-13 20:03

/src/libxml2/HTMLparser.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLparser.c : an HTML 4.0 non-verifying parser
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
#define IN_LIBXML
10
#include "libxml.h"
11
#ifdef LIBXML_HTML_ENABLED
12
13
#include <string.h>
14
#include <ctype.h>
15
#include <stdlib.h>
16
17
#include <libxml/xmlmemory.h>
18
#include <libxml/tree.h>
19
#include <libxml/parser.h>
20
#include <libxml/parserInternals.h>
21
#include <libxml/xmlerror.h>
22
#include <libxml/HTMLparser.h>
23
#include <libxml/HTMLtree.h>
24
#include <libxml/entities.h>
25
#include <libxml/encoding.h>
26
#include <libxml/valid.h>
27
#include <libxml/xmlIO.h>
28
#include <libxml/globals.h>
29
#include <libxml/uri.h>
30
31
#include "private/buf.h"
32
#include "private/enc.h"
33
#include "private/error.h"
34
#include "private/html.h"
35
#include "private/parser.h"
36
#include "private/tree.h"
37
38
#define HTML_MAX_NAMELEN 1000
39
0
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
40
0
#define HTML_PARSER_BUFFER_SIZE 100
41
42
/* #define DEBUG */
43
/* #define DEBUG_PUSH */
44
45
static int htmlOmittedDefaultValue = 1;
46
47
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
48
           xmlChar end, xmlChar  end2, xmlChar end3);
49
static void htmlParseComment(htmlParserCtxtPtr ctxt);
50
51
/************************************************************************
52
 *                  *
53
 *    Some factorized error routines        *
54
 *                  *
55
 ************************************************************************/
56
57
/**
58
 * htmlErrMemory:
59
 * @ctxt:  an HTML parser context
60
 * @extra:  extra information
61
 *
62
 * Handle a redefinition of attribute error
63
 */
64
static void
65
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
66
0
{
67
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
68
0
        (ctxt->instate == XML_PARSER_EOF))
69
0
  return;
70
0
    if (ctxt != NULL) {
71
0
        ctxt->errNo = XML_ERR_NO_MEMORY;
72
0
        ctxt->instate = XML_PARSER_EOF;
73
0
        ctxt->disableSAX = 1;
74
0
    }
75
0
    if (extra)
76
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
77
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
78
0
                        NULL, NULL, 0, 0,
79
0
                        "Memory allocation failed : %s\n", extra);
80
0
    else
81
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
82
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
83
0
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
84
0
}
85
86
/**
87
 * htmlParseErr:
88
 * @ctxt:  an HTML parser context
89
 * @error:  the error number
90
 * @msg:  the error message
91
 * @str1:  string infor
92
 * @str2:  string infor
93
 *
94
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
95
 */
96
static void LIBXML_ATTR_FORMAT(3,0)
97
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
98
             const char *msg, const xmlChar *str1, const xmlChar *str2)
99
0
{
100
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
101
0
        (ctxt->instate == XML_PARSER_EOF))
102
0
  return;
103
0
    if (ctxt != NULL)
104
0
  ctxt->errNo = error;
105
0
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
106
0
                    XML_ERR_ERROR, NULL, 0,
107
0
        (const char *) str1, (const char *) str2,
108
0
        NULL, 0, 0,
109
0
        msg, str1, str2);
110
0
    if (ctxt != NULL)
111
0
  ctxt->wellFormed = 0;
112
0
}
113
114
/**
115
 * htmlParseErrInt:
116
 * @ctxt:  an HTML parser context
117
 * @error:  the error number
118
 * @msg:  the error message
119
 * @val:  integer info
120
 *
121
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
122
 */
123
static void LIBXML_ATTR_FORMAT(3,0)
124
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
125
             const char *msg, int val)
126
0
{
127
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
128
0
        (ctxt->instate == XML_PARSER_EOF))
129
0
  return;
130
0
    if (ctxt != NULL)
131
0
  ctxt->errNo = error;
132
0
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
133
0
                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
134
0
        NULL, val, 0, msg, val);
135
0
    if (ctxt != NULL)
136
0
  ctxt->wellFormed = 0;
137
0
}
138
139
/************************************************************************
140
 *                  *
141
 *  Parser stacks related functions and macros    *
142
 *                  *
143
 ************************************************************************/
144
145
/**
146
 * htmlnamePush:
147
 * @ctxt:  an HTML parser context
148
 * @value:  the element name
149
 *
150
 * Pushes a new element name on top of the name stack
151
 *
152
 * Returns 0 in case of error, the index in the stack otherwise
153
 */
154
static int
155
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
156
0
{
157
0
    if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
158
0
        ctxt->html = 3;
159
0
    if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
160
0
        ctxt->html = 10;
161
0
    if (ctxt->nameNr >= ctxt->nameMax) {
162
0
        ctxt->nameMax *= 2;
163
0
        ctxt->nameTab = (const xmlChar * *)
164
0
                         xmlRealloc((xmlChar * *)ctxt->nameTab,
165
0
                                    ctxt->nameMax *
166
0
                                    sizeof(ctxt->nameTab[0]));
167
0
        if (ctxt->nameTab == NULL) {
168
0
            htmlErrMemory(ctxt, NULL);
169
0
            return (0);
170
0
        }
171
0
    }
172
0
    ctxt->nameTab[ctxt->nameNr] = value;
173
0
    ctxt->name = value;
174
0
    return (ctxt->nameNr++);
175
0
}
176
/**
177
 * htmlnamePop:
178
 * @ctxt: an HTML parser context
179
 *
180
 * Pops the top element name from the name stack
181
 *
182
 * Returns the name just removed
183
 */
184
static const xmlChar *
185
htmlnamePop(htmlParserCtxtPtr ctxt)
186
0
{
187
0
    const xmlChar *ret;
188
189
0
    if (ctxt->nameNr <= 0)
190
0
        return (NULL);
191
0
    ctxt->nameNr--;
192
0
    if (ctxt->nameNr < 0)
193
0
        return (NULL);
194
0
    if (ctxt->nameNr > 0)
195
0
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
196
0
    else
197
0
        ctxt->name = NULL;
198
0
    ret = ctxt->nameTab[ctxt->nameNr];
199
0
    ctxt->nameTab[ctxt->nameNr] = NULL;
200
0
    return (ret);
201
0
}
202
203
/**
204
 * htmlNodeInfoPush:
205
 * @ctxt:  an HTML parser context
206
 * @value:  the node info
207
 *
208
 * Pushes a new element name on top of the node info stack
209
 *
210
 * Returns 0 in case of error, the index in the stack otherwise
211
 */
212
static int
213
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
214
0
{
215
0
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
216
0
        if (ctxt->nodeInfoMax == 0)
217
0
                ctxt->nodeInfoMax = 5;
218
0
        ctxt->nodeInfoMax *= 2;
219
0
        ctxt->nodeInfoTab = (htmlParserNodeInfo *)
220
0
                         xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
221
0
                                    ctxt->nodeInfoMax *
222
0
                                    sizeof(ctxt->nodeInfoTab[0]));
223
0
        if (ctxt->nodeInfoTab == NULL) {
224
0
            htmlErrMemory(ctxt, NULL);
225
0
            return (0);
226
0
        }
227
0
    }
228
0
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
229
0
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
230
0
    return (ctxt->nodeInfoNr++);
231
0
}
232
233
/**
234
 * htmlNodeInfoPop:
235
 * @ctxt:  an HTML parser context
236
 *
237
 * Pops the top element name from the node info stack
238
 *
239
 * Returns 0 in case of error, the pointer to NodeInfo otherwise
240
 */
241
static htmlParserNodeInfo *
242
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
243
0
{
244
0
    if (ctxt->nodeInfoNr <= 0)
245
0
        return (NULL);
246
0
    ctxt->nodeInfoNr--;
247
0
    if (ctxt->nodeInfoNr < 0)
248
0
        return (NULL);
249
0
    if (ctxt->nodeInfoNr > 0)
250
0
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
251
0
    else
252
0
        ctxt->nodeInfo = NULL;
253
0
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
254
0
}
255
256
/*
257
 * Macros for accessing the content. Those should be used only by the parser,
258
 * and not exported.
259
 *
260
 * Dirty macros, i.e. one need to make assumption on the context to use them
261
 *
262
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
263
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
264
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
265
 *           in UNICODE mode. This should be used internally by the parser
266
 *           only to compare to ASCII values otherwise it would break when
267
 *           running with UTF-8 encoding.
268
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
269
 *           to compare on ASCII based substring.
270
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
271
 *           it should be used only to compare on ASCII based substring.
272
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
273
 *           strings without newlines within the parser.
274
 *
275
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
276
 *
277
 *   NEXT    Skip to the next character, this does the proper decoding
278
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
279
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
280
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
281
 */
282
283
0
#define UPPER (toupper(*ctxt->input->cur))
284
285
0
#define SKIP(val) ctxt->input->cur += (val),ctxt->input->col+=(val)
286
287
0
#define NXT(val) ctxt->input->cur[(val)]
288
289
0
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
290
291
0
#define CUR_PTR ctxt->input->cur
292
0
#define BASE_PTR ctxt->input->base
293
294
0
#define SHRINK if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) && \
295
0
       (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK)) \
296
0
  xmlParserInputShrink(ctxt->input)
297
298
0
#define GROW if ((ctxt->progressive == 0) &&       \
299
0
     (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))  \
300
0
  xmlParserInputGrow(ctxt->input, INPUT_CHUNK)
301
302
0
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
303
304
/* Imported from XML */
305
306
0
#define CUR (*ctxt->input->cur)
307
0
#define NEXT xmlNextChar(ctxt)
308
309
0
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
310
311
312
0
#define NEXTL(l) do {             \
313
0
    if (*(ctxt->input->cur) == '\n') {         \
314
0
  ctxt->input->line++; ctxt->input->col = 1;      \
315
0
    } else ctxt->input->col++;           \
316
0
    ctxt->token = 0; ctxt->input->cur += l;       \
317
0
  } while (0)
318
319
/************
320
    \
321
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
322
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
323
 ************/
324
325
0
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
326
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
327
328
#define COPY_BUF(l,b,i,v)           \
329
0
    if (l == 1) b[i++] = v;           \
330
0
    else i += xmlCopyChar(l,&b[i],v)
331
332
/**
333
 * htmlFindEncoding:
334
 * @the HTML parser context
335
 *
336
 * Ty to find and encoding in the current data available in the input
337
 * buffer this is needed to try to switch to the proper encoding when
338
 * one face a character error.
339
 * That's an heuristic, since it's operating outside of parsing it could
340
 * try to use a meta which had been commented out, that's the reason it
341
 * should only be used in case of error, not as a default.
342
 *
343
 * Returns an encoding string or NULL if not found, the string need to
344
 *   be freed
345
 */
346
static xmlChar *
347
0
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
348
0
    const xmlChar *start, *cur, *end;
349
350
0
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
351
0
        (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
352
0
        (ctxt->input->buf->encoder != NULL))
353
0
        return(NULL);
354
0
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
355
0
        return(NULL);
356
357
0
    start = ctxt->input->cur;
358
0
    end = ctxt->input->end;
359
    /* we also expect the input buffer to be zero terminated */
360
0
    if (*end != 0)
361
0
        return(NULL);
362
363
0
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
364
0
    if (cur == NULL)
365
0
        return(NULL);
366
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
367
0
    if (cur == NULL)
368
0
        return(NULL);
369
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
370
0
    if (cur == NULL)
371
0
        return(NULL);
372
0
    cur += 8;
373
0
    start = cur;
374
0
    while (((*cur >= 'A') && (*cur <= 'Z')) ||
375
0
           ((*cur >= 'a') && (*cur <= 'z')) ||
376
0
           ((*cur >= '0') && (*cur <= '9')) ||
377
0
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
378
0
           cur++;
379
0
    if (cur == start)
380
0
        return(NULL);
381
0
    return(xmlStrndup(start, cur - start));
382
0
}
383
384
/**
385
 * htmlCurrentChar:
386
 * @ctxt:  the HTML parser context
387
 * @len:  pointer to the length of the char read
388
 *
389
 * The current char value, if using UTF-8 this may actually span multiple
390
 * bytes in the input buffer. Implement the end of line normalization:
391
 * 2.11 End-of-Line Handling
392
 * If the encoding is unspecified, in the case we find an ISO-Latin-1
393
 * char, then the encoding converter is plugged in automatically.
394
 *
395
 * Returns the current char value and its length
396
 */
397
398
static int
399
0
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
400
0
    const unsigned char *cur;
401
0
    unsigned char c;
402
0
    unsigned int val;
403
404
0
    if (ctxt->instate == XML_PARSER_EOF)
405
0
  return(0);
406
407
0
    if (ctxt->token != 0) {
408
0
  *len = 0;
409
0
  return(ctxt->token);
410
0
    }
411
0
    if (ctxt->charset != XML_CHAR_ENCODING_UTF8) {
412
0
        xmlChar * guess;
413
0
        xmlCharEncodingHandlerPtr handler;
414
415
        /*
416
         * Assume it's a fixed length encoding (1) with
417
         * a compatible encoding for the ASCII set, since
418
         * HTML constructs only use < 128 chars
419
         */
420
0
        if (*ctxt->input->cur < 0x80) {
421
0
            *len = 1;
422
0
            if ((*ctxt->input->cur == 0) &&
423
0
                (ctxt->input->cur < ctxt->input->end)) {
424
0
                htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
425
0
                                "Char 0x%X out of allowed range\n", 0);
426
0
                return(' ');
427
0
            }
428
0
            return(*ctxt->input->cur);
429
0
        }
430
431
        /*
432
         * Humm this is bad, do an automatic flow conversion
433
         */
434
0
        guess = htmlFindEncoding(ctxt);
435
0
        if (guess == NULL) {
436
0
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
437
0
        } else {
438
0
            if (ctxt->input->encoding != NULL)
439
0
                xmlFree((xmlChar *) ctxt->input->encoding);
440
0
            ctxt->input->encoding = guess;
441
0
            handler = xmlFindCharEncodingHandler((const char *) guess);
442
0
            if (handler != NULL) {
443
                /*
444
                 * Don't use UTF-8 encoder which isn't required and
445
                 * can produce invalid UTF-8.
446
                 */
447
0
                if (!xmlStrEqual(BAD_CAST handler->name, BAD_CAST "UTF-8"))
448
0
                    xmlSwitchToEncoding(ctxt, handler);
449
0
            } else {
450
0
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
451
0
                             "Unsupported encoding %s", guess, NULL);
452
0
            }
453
0
        }
454
0
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
455
0
    }
456
457
    /*
458
     * We are supposed to handle UTF8, check it's valid
459
     * From rfc2044: encoding of the Unicode values on UTF-8:
460
     *
461
     * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
462
     * 0000 0000-0000 007F   0xxxxxxx
463
     * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
464
     * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
465
     *
466
     * Check for the 0x110000 limit too
467
     */
468
0
    cur = ctxt->input->cur;
469
0
    c = *cur;
470
0
    if (c & 0x80) {
471
0
        if ((c & 0x40) == 0)
472
0
            goto encoding_error;
473
0
        if (cur[1] == 0) {
474
0
            xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
475
0
            cur = ctxt->input->cur;
476
0
        }
477
0
        if ((cur[1] & 0xc0) != 0x80)
478
0
            goto encoding_error;
479
0
        if ((c & 0xe0) == 0xe0) {
480
481
0
            if (cur[2] == 0) {
482
0
                xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
483
0
                cur = ctxt->input->cur;
484
0
            }
485
0
            if ((cur[2] & 0xc0) != 0x80)
486
0
                goto encoding_error;
487
0
            if ((c & 0xf0) == 0xf0) {
488
0
                if (cur[3] == 0) {
489
0
                    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
490
0
                    cur = ctxt->input->cur;
491
0
                }
492
0
                if (((c & 0xf8) != 0xf0) ||
493
0
                    ((cur[3] & 0xc0) != 0x80))
494
0
                    goto encoding_error;
495
                /* 4-byte code */
496
0
                *len = 4;
497
0
                val = (cur[0] & 0x7) << 18;
498
0
                val |= (cur[1] & 0x3f) << 12;
499
0
                val |= (cur[2] & 0x3f) << 6;
500
0
                val |= cur[3] & 0x3f;
501
0
                if (val < 0x10000)
502
0
                    goto encoding_error;
503
0
            } else {
504
              /* 3-byte code */
505
0
                *len = 3;
506
0
                val = (cur[0] & 0xf) << 12;
507
0
                val |= (cur[1] & 0x3f) << 6;
508
0
                val |= cur[2] & 0x3f;
509
0
                if (val < 0x800)
510
0
                    goto encoding_error;
511
0
            }
512
0
        } else {
513
          /* 2-byte code */
514
0
            *len = 2;
515
0
            val = (cur[0] & 0x1f) << 6;
516
0
            val |= cur[1] & 0x3f;
517
0
            if (val < 0x80)
518
0
                goto encoding_error;
519
0
        }
520
0
        if (!IS_CHAR(val)) {
521
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
522
0
                            "Char 0x%X out of allowed range\n", val);
523
0
        }
524
0
        return(val);
525
0
    } else {
526
0
        if ((*ctxt->input->cur == 0) &&
527
0
            (ctxt->input->cur < ctxt->input->end)) {
528
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
529
0
                            "Char 0x%X out of allowed range\n", 0);
530
0
            *len = 1;
531
0
            return(' ');
532
0
        }
533
        /* 1-byte code */
534
0
        *len = 1;
535
0
        return(*ctxt->input->cur);
536
0
    }
537
538
0
encoding_error:
539
    /*
540
     * If we detect an UTF8 error that probably mean that the
541
     * input encoding didn't get properly advertised in the
542
     * declaration header. Report the error and switch the encoding
543
     * to ISO-Latin-1 (if you don't like this policy, just declare the
544
     * encoding !)
545
     */
546
0
    {
547
0
        char buffer[150];
548
549
0
  if (ctxt->input->end - ctxt->input->cur >= 4) {
550
0
      snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
551
0
          ctxt->input->cur[0], ctxt->input->cur[1],
552
0
          ctxt->input->cur[2], ctxt->input->cur[3]);
553
0
  } else {
554
0
      snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
555
0
  }
556
0
  htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
557
0
         "Input is not proper UTF-8, indicate encoding !\n",
558
0
         BAD_CAST buffer, NULL);
559
0
    }
560
561
    /*
562
     * Don't switch encodings twice. Note that if there's an encoder, we
563
     * shouldn't receive invalid UTF-8 anyway.
564
     *
565
     * Note that if ctxt->input->buf == NULL, switching encodings is
566
     * impossible, see Gitlab issue #34.
567
     */
568
0
    if ((ctxt->input->buf != NULL) &&
569
0
        (ctxt->input->buf->encoder == NULL))
570
0
        xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
571
0
    *len = 1;
572
0
    return(*ctxt->input->cur);
573
0
}
574
575
/**
576
 * htmlSkipBlankChars:
577
 * @ctxt:  the HTML parser context
578
 *
579
 * skip all blanks character found at that point in the input streams.
580
 *
581
 * Returns the number of space chars skipped
582
 */
583
584
static int
585
0
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
586
0
    int res = 0;
587
588
0
    while (IS_BLANK_CH(*(ctxt->input->cur))) {
589
0
  if ((*ctxt->input->cur == 0) &&
590
0
      (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
591
0
    xmlPopInput(ctxt);
592
0
  } else {
593
0
      if (*(ctxt->input->cur) == '\n') {
594
0
    ctxt->input->line++; ctxt->input->col = 1;
595
0
      } else ctxt->input->col++;
596
0
      ctxt->input->cur++;
597
0
      if (*ctxt->input->cur == 0)
598
0
    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
599
0
  }
600
0
  if (res < INT_MAX)
601
0
      res++;
602
0
    }
603
0
    return(res);
604
0
}
605
606
607
608
/************************************************************************
609
 *                  *
610
 *  The list of HTML elements and their properties    *
611
 *                  *
612
 ************************************************************************/
613
614
/*
615
 *  Start Tag: 1 means the start tag can be omitted
616
 *  End Tag:   1 means the end tag can be omitted
617
 *             2 means it's forbidden (empty elements)
618
 *             3 means the tag is stylistic and should be closed easily
619
 *  Depr:      this element is deprecated
620
 *  DTD:       1 means that this element is valid only in the Loose DTD
621
 *             2 means that this element is valid only in the Frameset DTD
622
 *
623
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
624
  , subElements , impliedsubelt , Attributes, userdata
625
 */
626
627
/* Definitions and a couple of vars for HTML Elements */
628
629
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
630
#define NB_FONTSTYLE 8
631
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
632
#define NB_PHRASE 10
633
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
634
#define NB_SPECIAL 16
635
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
636
#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
637
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
638
#define NB_BLOCK NB_HEADING + NB_LIST + 14
639
#define FORMCTRL "input", "select", "textarea", "label", "button"
640
#define NB_FORMCTRL 5
641
#define PCDATA
642
#define NB_PCDATA 0
643
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
644
#define NB_HEADING 6
645
#define LIST "ul", "ol", "dir", "menu"
646
#define NB_LIST 4
647
#define MODIFIER
648
#define NB_MODIFIER 0
649
#define FLOW BLOCK,INLINE
650
#define NB_FLOW NB_BLOCK + NB_INLINE
651
#define EMPTY NULL
652
653
654
static const char* const html_flow[] = { FLOW, NULL } ;
655
static const char* const html_inline[] = { INLINE, NULL } ;
656
657
/* placeholders: elts with content but no subelements */
658
static const char* const html_pcdata[] = { NULL } ;
659
#define html_cdata html_pcdata
660
661
662
/* ... and for HTML Attributes */
663
664
#define COREATTRS "id", "class", "style", "title"
665
#define NB_COREATTRS 4
666
#define I18N "lang", "dir"
667
#define NB_I18N 2
668
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
669
#define NB_EVENTS 9
670
#define ATTRS COREATTRS,I18N,EVENTS
671
/* Number of entries in ATTRS: the core, i18n and event attribute lists. */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
672
#define CELLHALIGN "align", "char", "charoff"
673
#define NB_CELLHALIGN 3
674
#define CELLVALIGN "valign"
675
#define NB_CELLVALIGN 1
676
677
static const char* const html_attrs[] = { ATTRS, NULL } ;
678
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
679
static const char* const core_attrs[] = { COREATTRS, NULL } ;
680
static const char* const i18n_attrs[] = { I18N, NULL } ;
681
682
683
/* Other declarations that should go inline ... */
684
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
685
  "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
686
  "tabindex", "onfocus", "onblur", NULL } ;
687
static const char* const target_attr[] = { "target", NULL } ;
688
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
689
static const char* const alt_attr[] = { "alt", NULL } ;
690
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
691
static const char* const href_attrs[] = { "href", NULL } ;
692
static const char* const clear_attrs[] = { "clear", NULL } ;
693
static const char* const inline_p[] = { INLINE, "p", NULL } ;
694
695
static const char* const flow_param[] = { FLOW, "param", NULL } ;
696
static const char* const applet_attrs[] = { COREATTRS , "codebase",
697
    "archive", "alt", "name", "height", "width", "align",
698
    "hspace", "vspace", NULL } ;
699
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
700
  "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
701
static const char* const basefont_attrs[] =
702
  { "id", "size", "color", "face", NULL } ;
703
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
704
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
705
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
706
static const char* const body_depr[] = { "background", "bgcolor", "text",
707
  "link", "vlink", "alink", NULL } ;
708
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
709
  "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
710
711
712
static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
713
static const char* const col_elt[] = { "col", NULL } ;
714
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
715
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
716
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
717
static const char* const compact_attr[] = { "compact", NULL } ;
718
static const char* const label_attr[] = { "label", NULL } ;
719
static const char* const fieldset_contents[] = { FLOW, "legend" } ;
720
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
721
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
722
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
723
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
724
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
725
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
726
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
727
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
728
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
729
static const char* const version_attr[] = { "version", NULL } ;
730
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
731
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
732
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
733
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
734
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
735
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
736
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
737
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
738
static const char* const align_attr[] = { "align", NULL } ;
739
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
740
/*
 * NULL-terminated name lists (element names for content models, attribute
 * names for attribute sets). Each list in this run is referenced from an
 * html40ElementTable entry through the DECL cast.
 * NOTE(review): the upper-case tokens (BLOCK, FLOW, MODIFIER, PHRASE, ATTRS,
 * I18N, CELLHALIGN, CELLVALIGN) are presumably macros defined elsewhere that
 * expand to runs of string literals - confirm at their definitions before
 * editing a list that uses them.
 */
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
741
static const char* const name_attr[] = { "name", NULL } ;
742
static const char* const action_attr[] = { "action", NULL } ;
743
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
744
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
745
static const char* const content_attr[] = { "content", NULL } ;
746
static const char* const type_attr[] = { "type", NULL } ;
747
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
748
static const char* const object_contents[] = { FLOW, "param", NULL } ;
749
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
750
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
751
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
752
static const char* const option_elt[] = { "option", NULL } ;
753
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
754
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
755
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
756
static const char* const width_attr[] = { "width", NULL } ;
757
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
758
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
759
static const char* const language_attr[] = { "language", NULL } ;
760
static const char* const select_content[] = { "optgroup", "option", NULL } ;
761
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
762
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
763
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
764
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
765
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
766
static const char* const tr_elt[] = { "tr", NULL } ;
767
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
768
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
769
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
770
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
771
static const char* const tr_contents[] = { "th", "td", NULL } ;
772
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
773
static const char* const li_elt[] = { "li", NULL } ;
774
static const char* const ul_depr[] = { "type", "compact", NULL} ;
775
static const char* const dir_attr[] = { "dir", NULL} ;
776
777
/*
 * DECL casts a name list (declared `static const char* const []`, e.g.
 * map_contents) to `const char **` where it is used as an initializer in
 * the element table below.
 */
#define DECL (const char**)
778
779
/*
 * One descriptor per known HTML element.
 *
 * htmlTagLookup() finds entries with bsearch(), comparing the requested
 * tag against desc->name with xmlStrcasecmp(); the entries must therefore
 * stay sorted by element name.
 *
 * NOTE(review): the initializers are positional. The meaning of the
 * numeric columns and of the five trailing pointer columns is fixed by the
 * htmlElemDesc declaration, which is not part of this file - confirm the
 * field order there before editing or adding an entry.
 */
static const htmlElemDesc
780
html40ElementTable[] = {
781
{ "a",    0, 0, 0, 0, 0, 0, 1, "anchor ",
782
  DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
783
},
784
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
785
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
786
},
787
{ "acronym",  0, 0, 0, 0, 0, 0, 1, "",
788
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
789
},
790
{ "address",  0, 0, 0, 0, 0, 0, 0, "information on author ",
791
  DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
792
},
793
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
794
  DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
795
},
796
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
797
  EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
798
},
799
{ "b",    0, 3, 0, 0, 0, 0, 1, "bold text style",
800
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
801
},
802
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
803
  EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
804
},
805
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
806
  EMPTY , NULL , NULL, DECL basefont_attrs, NULL
807
},
808
{ "bdo",  0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
809
  DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
810
},
811
{ "big",  0, 3, 0, 0, 0, 0, 1, "large text style",
812
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
813
},
814
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
815
  DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
816
},
817
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
818
  DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
819
},
820
{ "br",   0, 2, 2, 1, 0, 0, 1, "forced line break ",
821
  EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
822
},
823
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
824
  DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
825
},
826
{ "caption",  0, 0, 0, 0, 0, 0, 0, "table caption ",
827
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
828
},
829
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
830
  DECL html_flow , NULL , NULL, DECL html_attrs, NULL
831
},
832
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
833
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
834
},
835
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
836
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
837
},
838
{ "col",  0, 2, 2, 1, 0, 0, 0, "table column ",
839
  EMPTY , NULL , DECL col_attrs , NULL, NULL
840
},
841
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
842
  DECL col_elt , "col" , DECL col_attrs , NULL, NULL
843
},
844
{ "dd",   0, 1, 0, 0, 0, 0, 0, "definition description ",
845
  DECL html_flow , NULL , DECL html_attrs, NULL, NULL
846
},
847
{ "del",  0, 0, 0, 0, 0, 0, 2, "deleted text ",
848
  DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
849
},
850
{ "dfn",  0, 0, 0, 0, 0, 0, 1, "instance definition",
851
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
852
},
853
{ "dir",  0, 0, 0, 0, 1, 1, 0, "directory list",
854
  DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
855
},
856
{ "div",  0, 0, 0, 0, 0, 0, 0, "generic language/style container",
857
  DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
858
},
859
{ "dl",   0, 0, 0, 0, 0, 0, 0, "definition list ",
860
  DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
861
},
862
{ "dt",   0, 1, 0, 0, 0, 0, 0, "definition term ",
863
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
864
},
865
{ "em",   0, 3, 0, 0, 0, 0, 1, "emphasis",
866
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
867
},
868
{ "embed",  0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
869
  EMPTY, NULL, DECL embed_attrs, NULL, NULL
870
},
871
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
872
  DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
873
},
874
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
875
  DECL html_inline, NULL, NULL, DECL font_attrs, NULL
876
},
877
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
878
  DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
879
},
880
{ "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
881
  EMPTY, NULL, NULL, DECL frame_attrs, NULL
882
},
883
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
884
  DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
885
},
886
{ "h1",   0, 0, 0, 0, 0, 0, 0, "heading ",
887
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
888
},
889
{ "h2",   0, 0, 0, 0, 0, 0, 0, "heading ",
890
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
891
},
892
{ "h3",   0, 0, 0, 0, 0, 0, 0, "heading ",
893
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
894
},
895
{ "h4",   0, 0, 0, 0, 0, 0, 0, "heading ",
896
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
897
},
898
{ "h5",   0, 0, 0, 0, 0, 0, 0, "heading ",
899
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
900
},
901
{ "h6",   0, 0, 0, 0, 0, 0, 0, "heading ",
902
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
903
},
904
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
905
  DECL head_contents, NULL, DECL head_attrs, NULL, NULL
906
},
907
{ "hr",   0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
908
  EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
909
},
910
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
911
  DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
912
},
913
{ "i",    0, 3, 0, 0, 0, 0, 1, "italic text style",
914
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
915
},
916
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
917
  DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
918
},
919
{ "img",  0, 2, 2, 1, 0, 0, 1, "embedded image ",
920
  EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
921
},
922
{ "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
923
  EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
924
},
925
{ "ins",  0, 0, 0, 0, 0, 0, 2, "inserted text",
926
  DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
927
},
928
{ "isindex",  0, 2, 2, 1, 1, 1, 0, "single line prompt ",
929
  EMPTY, NULL, NULL, DECL prompt_attrs, NULL
930
},
931
{ "kbd",  0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
932
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
933
},
934
{ "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
935
  DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
936
},
937
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
938
  DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
939
},
940
{ "li",   0, 1, 1, 0, 0, 0, 0, "list item ",
941
  DECL html_flow, NULL, DECL html_attrs, NULL, NULL
942
},
943
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
944
  EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
945
},
946
{ "map",  0, 0, 0, 0, 0, 0, 2, "client-side image map ",
947
  DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
948
},
949
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
950
  DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
951
},
952
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
953
  EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
954
},
955
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
956
  DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
957
},
958
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
959
  DECL html_flow, "div", DECL html_attrs, NULL, NULL
960
},
961
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
962
  DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
963
},
964
{ "ol",   0, 0, 0, 0, 0, 0, 0, "ordered list ",
965
  DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
966
},
967
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
968
  DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
969
},
970
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
971
  DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
972
},
973
{ "p",    0, 1, 0, 0, 0, 0, 0, "paragraph ",
974
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
975
},
976
{ "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
977
  EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
978
},
979
{ "pre",  0, 0, 0, 0, 0, 0, 0, "preformatted text ",
980
  DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
981
},
982
{ "q",    0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
983
  DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
984
},
985
{ "s",    0, 3, 0, 0, 1, 1, 1, "strike-through text style",
986
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
987
},
988
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
989
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
990
},
991
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
992
  DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
993
},
994
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
995
  DECL select_content, NULL, DECL select_attrs, NULL, NULL
996
},
997
{ "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
998
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
999
},
1000
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
1001
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1002
},
1003
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
1004
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1005
},
1006
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
1007
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1008
},
1009
{ "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
1010
  DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1011
},
1012
{ "sub",  0, 3, 0, 0, 0, 0, 1, "subscript",
1013
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1014
},
1015
{ "sup",  0, 3, 0, 0, 0, 0, 1, "superscript ",
1016
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1017
},
1018
{ "table",  0, 0, 0, 0, 0, 0, 0, "",
1019
  DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1020
},
1021
{ "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
1022
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1023
},
1024
{ "td",   0, 0, 0, 0, 0, 0, 0, "table data cell",
1025
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1026
},
1027
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1028
  DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1029
},
1030
{ "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
1031
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1032
},
1033
{ "th",   0, 1, 0, 0, 0, 0, 0, "table header cell",
1034
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1035
},
1036
{ "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
1037
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1038
},
1039
{ "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
1040
  DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1041
},
1042
{ "tr",   0, 0, 0, 0, 0, 0, 0, "table row ",
1043
  DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1044
},
1045
{ "tt",   0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1046
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1047
},
1048
{ "u",    0, 3, 0, 0, 1, 1, 1, "underlined text style",
1049
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1050
},
1051
{ "ul",   0, 0, 0, 0, 0, 0, 0, "unordered list ",
1052
  DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1053
},
1054
{ "var",  0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1055
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1056
}
1057
};
1058
1059
/*
 * One auto-close rule: a start tag named newTag implies the end of a
 * currently open element named oldTag.
 */
typedef struct {
    const char *oldTag;     /* name of the element that gets closed */
    const char *newTag;     /* name of the start tag that closes it */
} htmlStartCloseEntry;

/*
 * start tags that imply the end of current element
 *
 * htmlCheckAutoClose() searches this table with bsearch() and a strcmp()
 * based comparator, so the rows must stay sorted by oldTag first and by
 * newTag second. Rows sharing an oldTag are grouped on consecutive lines.
 */
static const htmlStartCloseEntry htmlStartClose[] = {
    /* a */
    { "a", "a" }, { "a", "fieldset" }, { "a", "table" }, { "a", "td" },
    { "a", "th" },
    /* address */
    { "address", "dd" }, { "address", "dl" }, { "address", "dt" },
    { "address", "form" }, { "address", "li" }, { "address", "ul" },
    /* b */
    { "b", "center" }, { "b", "p" }, { "b", "td" }, { "b", "th" },
    /* big */
    { "big", "p" },
    /* caption */
    { "caption", "col" }, { "caption", "colgroup" }, { "caption", "tbody" },
    { "caption", "tfoot" }, { "caption", "thead" }, { "caption", "tr" },
    /* col */
    { "col", "col" }, { "col", "colgroup" }, { "col", "tbody" },
    { "col", "tfoot" }, { "col", "thead" }, { "col", "tr" },
    /* colgroup */
    { "colgroup", "colgroup" }, { "colgroup", "tbody" },
    { "colgroup", "tfoot" }, { "colgroup", "thead" }, { "colgroup", "tr" },
    /* dd */
    { "dd", "dt" },
    /* dir */
    { "dir", "dd" }, { "dir", "dl" }, { "dir", "dt" }, { "dir", "form" },
    { "dir", "ul" },
    /* dl */
    { "dl", "form" }, { "dl", "li" },
    /* dt */
    { "dt", "dd" }, { "dt", "dl" },
    /* font */
    { "font", "center" }, { "font", "td" }, { "font", "th" },
    /* form */
    { "form", "form" },
    /* h1 */
    { "h1", "fieldset" }, { "h1", "form" }, { "h1", "li" }, { "h1", "p" },
    { "h1", "table" },
    /* h2 */
    { "h2", "fieldset" }, { "h2", "form" }, { "h2", "li" }, { "h2", "p" },
    { "h2", "table" },
    /* h3 */
    { "h3", "fieldset" }, { "h3", "form" }, { "h3", "li" }, { "h3", "p" },
    { "h3", "table" },
    /* h4 */
    { "h4", "fieldset" }, { "h4", "form" }, { "h4", "li" }, { "h4", "p" },
    { "h4", "table" },
    /* h5 */
    { "h5", "fieldset" }, { "h5", "form" }, { "h5", "li" }, { "h5", "p" },
    { "h5", "table" },
    /* h6 */
    { "h6", "fieldset" }, { "h6", "form" }, { "h6", "li" }, { "h6", "p" },
    { "h6", "table" },
    /* head */
    { "head", "a" }, { "head", "abbr" }, { "head", "acronym" },
    { "head", "address" }, { "head", "b" }, { "head", "bdo" },
    { "head", "big" }, { "head", "blockquote" }, { "head", "body" },
    { "head", "br" }, { "head", "center" }, { "head", "cite" },
    { "head", "code" }, { "head", "dd" }, { "head", "dfn" },
    { "head", "dir" }, { "head", "div" }, { "head", "dl" },
    { "head", "dt" }, { "head", "em" }, { "head", "fieldset" },
    { "head", "font" }, { "head", "form" }, { "head", "frameset" },
    { "head", "h1" }, { "head", "h2" }, { "head", "h3" },
    { "head", "h4" }, { "head", "h5" }, { "head", "h6" },
    { "head", "hr" }, { "head", "i" }, { "head", "iframe" },
    { "head", "img" }, { "head", "kbd" }, { "head", "li" },
    { "head", "listing" }, { "head", "map" }, { "head", "menu" },
    { "head", "ol" }, { "head", "p" }, { "head", "pre" },
    { "head", "q" }, { "head", "s" }, { "head", "samp" },
    { "head", "small" }, { "head", "span" }, { "head", "strike" },
    { "head", "strong" }, { "head", "sub" }, { "head", "sup" },
    { "head", "table" }, { "head", "tt" }, { "head", "u" },
    { "head", "ul" }, { "head", "var" }, { "head", "xmp" },
    /* hr */
    { "hr", "form" },
    /* i */
    { "i", "center" }, { "i", "p" }, { "i", "td" }, { "i", "th" },
    /* legend */
    { "legend", "fieldset" },
    /* li */
    { "li", "li" },
    /* link */
    { "link", "body" }, { "link", "frameset" },
    /* listing */
    { "listing", "dd" }, { "listing", "dl" }, { "listing", "dt" },
    { "listing", "fieldset" }, { "listing", "form" }, { "listing", "li" },
    { "listing", "table" }, { "listing", "ul" },
    /* menu */
    { "menu", "dd" }, { "menu", "dl" }, { "menu", "dt" }, { "menu", "form" },
    { "menu", "ul" },
    /* ol */
    { "ol", "form" }, { "ol", "ul" },
    /* option */
    { "option", "optgroup" }, { "option", "option" },
    /* p */
    { "p", "address" }, { "p", "blockquote" }, { "p", "body" },
    { "p", "caption" }, { "p", "center" }, { "p", "col" },
    { "p", "colgroup" }, { "p", "dd" }, { "p", "dir" },
    { "p", "div" }, { "p", "dl" }, { "p", "dt" },
    { "p", "fieldset" }, { "p", "form" }, { "p", "frameset" },
    { "p", "h1" }, { "p", "h2" }, { "p", "h3" },
    { "p", "h4" }, { "p", "h5" }, { "p", "h6" },
    { "p", "head" }, { "p", "hr" }, { "p", "li" },
    { "p", "listing" }, { "p", "menu" }, { "p", "ol" },
    { "p", "p" }, { "p", "pre" }, { "p", "table" },
    { "p", "tbody" }, { "p", "td" }, { "p", "tfoot" },
    { "p", "th" }, { "p", "title" }, { "p", "tr" },
    { "p", "ul" }, { "p", "xmp" },
    /* pre */
    { "pre", "dd" }, { "pre", "dl" }, { "pre", "dt" },
    { "pre", "fieldset" }, { "pre", "form" }, { "pre", "li" },
    { "pre", "table" }, { "pre", "ul" },
    /* s */
    { "s", "p" },
    /* script */
    { "script", "noscript" },
    /* small */
    { "small", "p" },
    /* span */
    { "span", "td" }, { "span", "th" },
    /* strike */
    { "strike", "p" },
    /* style */
    { "style", "body" }, { "style", "frameset" },
    /* tbody */
    { "tbody", "tbody" }, { "tbody", "tfoot" },
    /* td */
    { "td", "tbody" }, { "td", "td" }, { "td", "tfoot" }, { "td", "th" },
    { "td", "tr" },
    /* tfoot */
    { "tfoot", "tbody" },
    /* th */
    { "th", "tbody" }, { "th", "td" }, { "th", "tfoot" }, { "th", "th" },
    { "th", "tr" },
    /* thead */
    { "thead", "tbody" }, { "thead", "tfoot" },
    /* title */
    { "title", "body" }, { "title", "frameset" },
    /* tr */
    { "tr", "tbody" }, { "tr", "tfoot" }, { "tr", "tr" },
    /* tt */
    { "tt", "p" },
    /* u */
    { "u", "p" }, { "u", "td" }, { "u", "th" },
    /* ul */
    { "ul", "address" }, { "ul", "form" }, { "ul", "menu" }, { "ul", "ol" },
    { "ul", "pre" },
    /* xmp */
    { "xmp", "dd" }, { "xmp", "dl" }, { "xmp", "dt" },
    { "xmp", "fieldset" }, { "xmp", "form" }, { "xmp", "li" },
    { "xmp", "table" }, { "xmp", "ul" }
};
1320
1321
/*
 * HTML elements which are supposed not to have CDATA content and where a
 * p element will be implied. The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1333
1334
/*
 * HTML attributes whose content is of type %Script;.
 * The array has no NULL terminator.
 *
 * NOTE: when adding ones, check htmlIsScriptAttribute() since
 *       it assumes the name starts with 'on'
 */
static const char *const htmlScriptAttributes[] = {
    /* mouse */
    "onclick", "ondblclick", "onmousedown", "onmouseup",
    "onmouseover", "onmousemove", "onmouseout",
    /* keyboard */
    "onkeypress", "onkeydown", "onkeyup",
    /* document and form */
    "onload", "onunload", "onfocus", "onblur",
    "onsubmit", "onreset", "onchange", "onselect"
};
1359
1360
/*
 * End-tag priorities, used by the HTML parser to cope with broken pages.
 * Giving elements different priorities lets the parser decide what to do
 * with a stray end tag: an end tag may only close elements whose priority
 * is lower than or equal to its own.
 */

typedef struct {
    const char *name;       /* element name; NULL in the final row */
    int priority;           /* end-tag priority for that element */
} elementPriority;

/*
 * The row with a NULL name terminates the table and carries the priority
 * applied to every element that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 }, { "th",    160 },
    { "tr",    170 },
    { "thead", 180 }, { "tbody", 180 }, { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 }, { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1387
1388
/************************************************************************
1389
 *                  *
1390
 *  functions to handle HTML specific data      *
1391
 *                  *
1392
 ************************************************************************/
1393
1394
/**
 * htmlInitAutoClose:
 *
 * DEPRECATED: This function will be made private. Call xmlInitParser to
 * initialize the library.
 *
 * Kept as an empty stub: the body performs no work.
 */
void
htmlInitAutoClose(void)
{
}
1405
1406
static int
1407
0
htmlCompareTags(const void *key, const void *member) {
1408
0
    const xmlChar *tag = (const xmlChar *) key;
1409
0
    const htmlElemDesc *desc = (const htmlElemDesc *) member;
1410
1411
0
    return(xmlStrcasecmp(tag, BAD_CAST desc->name));
1412
0
}
1413
1414
/**
1415
 * htmlTagLookup:
1416
 * @tag:  The tag name in lowercase
1417
 *
1418
 * Lookup the HTML tag in the ElementTable
1419
 *
1420
 * Returns the related htmlElemDescPtr or NULL if not found.
1421
 */
1422
const htmlElemDesc *
1423
0
htmlTagLookup(const xmlChar *tag) {
1424
0
    if (tag == NULL)
1425
0
        return(NULL);
1426
1427
0
    return((const htmlElemDesc *) bsearch(tag, html40ElementTable,
1428
0
                sizeof(html40ElementTable) / sizeof(htmlElemDesc),
1429
0
                sizeof(htmlElemDesc), htmlCompareTags));
1430
0
}
1431
1432
/**
1433
 * htmlGetEndPriority:
1434
 * @name: The name of the element to look up the priority for.
1435
 *
1436
 * Return value: The "endtag" priority.
1437
 **/
1438
static int
1439
0
htmlGetEndPriority (const xmlChar *name) {
1440
0
    int i = 0;
1441
1442
0
    while ((htmlEndPriority[i].name != NULL) &&
1443
0
     (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1444
0
  i++;
1445
1446
0
    return(htmlEndPriority[i].priority);
1447
0
}
1448
1449
1450
static int
1451
0
htmlCompareStartClose(const void *vkey, const void *member) {
1452
0
    const htmlStartCloseEntry *key = (const htmlStartCloseEntry *) vkey;
1453
0
    const htmlStartCloseEntry *entry = (const htmlStartCloseEntry *) member;
1454
0
    int ret;
1455
1456
0
    ret = strcmp(key->oldTag, entry->oldTag);
1457
0
    if (ret == 0)
1458
0
        ret = strcmp(key->newTag, entry->newTag);
1459
1460
0
    return(ret);
1461
0
}
1462
1463
/**
1464
 * htmlCheckAutoClose:
1465
 * @newtag:  The new tag name
1466
 * @oldtag:  The old tag name
1467
 *
1468
 * Checks whether the new tag is one of the registered valid tags for
1469
 * closing old.
1470
 *
1471
 * Returns 0 if no, 1 if yes.
1472
 */
1473
static int
1474
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1475
0
{
1476
0
    htmlStartCloseEntry key;
1477
0
    void *res;
1478
1479
0
    key.oldTag = (const char *) oldtag;
1480
0
    key.newTag = (const char *) newtag;
1481
0
    res = bsearch(&key, htmlStartClose,
1482
0
            sizeof(htmlStartClose) / sizeof(htmlStartCloseEntry),
1483
0
            sizeof(htmlStartCloseEntry), htmlCompareStartClose);
1484
0
    return(res != NULL);
1485
0
}
1486
1487
/**
1488
 * htmlAutoCloseOnClose:
1489
 * @ctxt:  an HTML parser context
1490
 * @newtag:  The new tag name
1491
 * @force:  force the tag closure
1492
 *
1493
 * The HTML DTD allows an ending tag to implicitly close other tags.
1494
 */
1495
static void
1496
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1497
0
{
1498
0
    const htmlElemDesc *info;
1499
0
    int i, priority;
1500
1501
0
    priority = htmlGetEndPriority(newtag);
1502
1503
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1504
1505
0
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1506
0
            break;
1507
        /*
1508
         * A misplaced endtag can only close elements with lower
1509
         * or equal priority, so if we find an element with higher
1510
         * priority before we find an element with
1511
         * matching name, we just ignore this endtag
1512
         */
1513
0
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1514
0
            return;
1515
0
    }
1516
0
    if (i < 0)
1517
0
        return;
1518
1519
0
    while (!xmlStrEqual(newtag, ctxt->name)) {
1520
0
        info = htmlTagLookup(ctxt->name);
1521
0
        if ((info != NULL) && (info->endTag == 3)) {
1522
0
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1523
0
                   "Opening and ending tag mismatch: %s and %s\n",
1524
0
       newtag, ctxt->name);
1525
0
        }
1526
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1527
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1528
0
  htmlnamePop(ctxt);
1529
0
    }
1530
0
}
1531
1532
/**
1533
 * htmlAutoCloseOnEnd:
1534
 * @ctxt:  an HTML parser context
1535
 *
1536
 * Close all remaining tags at the end of the stream
1537
 */
1538
static void
1539
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1540
0
{
1541
0
    int i;
1542
1543
0
    if (ctxt->nameNr == 0)
1544
0
        return;
1545
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1546
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1547
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1548
0
  htmlnamePop(ctxt);
1549
0
    }
1550
0
}
1551
1552
/**
1553
 * htmlAutoClose:
1554
 * @ctxt:  an HTML parser context
1555
 * @newtag:  The new tag name or NULL
1556
 *
1557
 * The HTML DTD allows a tag to implicitly close other tags.
1558
 * The list is kept in htmlStartClose array. This function is
1559
 * called when a new tag has been detected and generates the
1560
 * appropriates closes if possible/needed.
1561
 * If newtag is NULL this mean we are at the end of the resource
1562
 * and we should check
1563
 */
1564
static void
1565
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1566
0
{
1567
0
    while ((newtag != NULL) && (ctxt->name != NULL) &&
1568
0
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1569
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1570
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1571
0
  htmlnamePop(ctxt);
1572
0
    }
1573
0
    if (newtag == NULL) {
1574
0
        htmlAutoCloseOnEnd(ctxt);
1575
0
        return;
1576
0
    }
1577
0
    while ((newtag == NULL) && (ctxt->name != NULL) &&
1578
0
           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1579
0
            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1580
0
            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1581
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1582
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1583
0
  htmlnamePop(ctxt);
1584
0
    }
1585
0
}
1586
1587
/**
1588
 * htmlAutoCloseTag:
1589
 * @doc:  the HTML document
1590
 * @name:  The tag name
1591
 * @elem:  the HTML element
1592
 *
1593
 * The HTML DTD allows a tag to implicitly close other tags.
1594
 * The list is kept in htmlStartClose array. This function checks
1595
 * if the element or one of it's children would autoclose the
1596
 * given tag.
1597
 *
1598
 * Returns 1 if autoclose, 0 otherwise
1599
 */
1600
int
1601
0
htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1602
0
    htmlNodePtr child;
1603
1604
0
    if (elem == NULL) return(1);
1605
0
    if (xmlStrEqual(name, elem->name)) return(0);
1606
0
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1607
0
    child = elem->children;
1608
0
    while (child != NULL) {
1609
0
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1610
0
  child = child->next;
1611
0
    }
1612
0
    return(0);
1613
0
}
1614
1615
/**
1616
 * htmlIsAutoClosed:
1617
 * @doc:  the HTML document
1618
 * @elem:  the HTML element
1619
 *
1620
 * The HTML DTD allows a tag to implicitly close other tags.
1621
 * The list is kept in htmlStartClose array. This function checks
1622
 * if a tag is autoclosed by one of it's child
1623
 *
1624
 * Returns 1 if autoclosed, 0 otherwise
1625
 */
1626
int
1627
0
htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1628
0
    htmlNodePtr child;
1629
1630
0
    if (elem == NULL) return(1);
1631
0
    child = elem->children;
1632
0
    while (child != NULL) {
1633
0
  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1634
0
  child = child->next;
1635
0
    }
1636
0
    return(0);
1637
0
}
1638
1639
/**
1640
 * htmlCheckImplied:
1641
 * @ctxt:  an HTML parser context
1642
 * @newtag:  The new tag name
1643
 *
1644
 * The HTML DTD allows a tag to exists only implicitly
1645
 * called when a new tag has been detected and generates the
1646
 * appropriates implicit tags if missing
1647
 */
1648
static void
1649
0
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1650
0
    int i;
1651
1652
0
    if (ctxt->options & HTML_PARSE_NOIMPLIED)
1653
0
        return;
1654
0
    if (!htmlOmittedDefaultValue)
1655
0
  return;
1656
0
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1657
0
  return;
1658
0
    if (ctxt->nameNr <= 0) {
1659
0
  htmlnamePush(ctxt, BAD_CAST"html");
1660
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1661
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1662
0
    }
1663
0
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1664
0
        return;
1665
0
    if ((ctxt->nameNr <= 1) &&
1666
0
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1667
0
   (xmlStrEqual(newtag, BAD_CAST"style")) ||
1668
0
   (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1669
0
   (xmlStrEqual(newtag, BAD_CAST"link")) ||
1670
0
   (xmlStrEqual(newtag, BAD_CAST"title")) ||
1671
0
   (xmlStrEqual(newtag, BAD_CAST"base")))) {
1672
0
        if (ctxt->html >= 3) {
1673
            /* we already saw or generated an <head> before */
1674
0
            return;
1675
0
        }
1676
        /*
1677
         * dropped OBJECT ... i you put it first BODY will be
1678
         * assumed !
1679
         */
1680
0
        htmlnamePush(ctxt, BAD_CAST"head");
1681
0
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1682
0
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1683
0
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1684
0
         (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1685
0
         (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1686
0
        if (ctxt->html >= 10) {
1687
            /* we already saw or generated a <body> before */
1688
0
            return;
1689
0
        }
1690
0
  for (i = 0;i < ctxt->nameNr;i++) {
1691
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1692
0
    return;
1693
0
      }
1694
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1695
0
    return;
1696
0
      }
1697
0
  }
1698
1699
0
  htmlnamePush(ctxt, BAD_CAST"body");
1700
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1701
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1702
0
    }
1703
0
}
1704
1705
/**
1706
 * htmlCheckParagraph
1707
 * @ctxt:  an HTML parser context
1708
 *
1709
 * Check whether a p element need to be implied before inserting
1710
 * characters in the current element.
1711
 *
1712
 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1713
 *         in case of error.
1714
 */
1715
1716
static int
1717
0
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1718
0
    const xmlChar *tag;
1719
0
    int i;
1720
1721
0
    if (ctxt == NULL)
1722
0
  return(-1);
1723
0
    tag = ctxt->name;
1724
0
    if (tag == NULL) {
1725
0
  htmlAutoClose(ctxt, BAD_CAST"p");
1726
0
  htmlCheckImplied(ctxt, BAD_CAST"p");
1727
0
  htmlnamePush(ctxt, BAD_CAST"p");
1728
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1729
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1730
0
  return(1);
1731
0
    }
1732
0
    if (!htmlOmittedDefaultValue)
1733
0
  return(0);
1734
0
    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1735
0
  if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1736
0
      htmlAutoClose(ctxt, BAD_CAST"p");
1737
0
      htmlCheckImplied(ctxt, BAD_CAST"p");
1738
0
      htmlnamePush(ctxt, BAD_CAST"p");
1739
0
      if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1740
0
    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1741
0
      return(1);
1742
0
  }
1743
0
    }
1744
0
    return(0);
1745
0
}
1746
1747
/**
1748
 * htmlIsScriptAttribute:
1749
 * @name:  an attribute name
1750
 *
1751
 * Check if an attribute is of content type Script
1752
 *
1753
 * Returns 1 is the attribute is a script 0 otherwise
1754
 */
1755
int
1756
0
htmlIsScriptAttribute(const xmlChar *name) {
1757
0
    unsigned int i;
1758
1759
0
    if (name == NULL)
1760
0
      return(0);
1761
    /*
1762
     * all script attributes start with 'on'
1763
     */
1764
0
    if ((name[0] != 'o') || (name[1] != 'n'))
1765
0
      return(0);
1766
0
    for (i = 0;
1767
0
   i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1768
0
   i++) {
1769
0
  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1770
0
      return(1);
1771
0
    }
1772
0
    return(0);
1773
0
}
1774
1775
/************************************************************************
1776
 *                  *
1777
 *  The list of HTML predefined entities      *
1778
 *                  *
1779
 ************************************************************************/
1780
1781
1782
static const htmlEntityDesc  html40EntitiesTable[] = {
1783
/*
1784
 * the 4 absolute ones, plus apostrophe.
1785
 */
1786
{ 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
1787
{ 38, "amp",  "ampersand, U+0026 ISOnum" },
1788
{ 39, "apos", "single quote" },
1789
{ 60, "lt", "less-than sign, U+003C ISOnum" },
1790
{ 62, "gt", "greater-than sign, U+003E ISOnum" },
1791
1792
/*
1793
 * A bunch still in the 128-255 range
1794
 * Replacing them really depends on the charset used.
1795
 */
1796
{ 160,  "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
1797
{ 161,  "iexcl","inverted exclamation mark, U+00A1 ISOnum" },
1798
{ 162,  "cent", "cent sign, U+00A2 ISOnum" },
1799
{ 163,  "pound","pound sign, U+00A3 ISOnum" },
1800
{ 164,  "curren","currency sign, U+00A4 ISOnum" },
1801
{ 165,  "yen",  "yen sign = yuan sign, U+00A5 ISOnum" },
1802
{ 166,  "brvbar","broken bar = broken vertical bar, U+00A6 ISOnum" },
1803
{ 167,  "sect", "section sign, U+00A7 ISOnum" },
1804
{ 168,  "uml",  "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
1805
{ 169,  "copy", "copyright sign, U+00A9 ISOnum" },
1806
{ 170,  "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
1807
{ 171,  "laquo","left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
1808
{ 172,  "not",  "not sign, U+00AC ISOnum" },
1809
{ 173,  "shy",  "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
1810
{ 174,  "reg",  "registered sign = registered trade mark sign, U+00AE ISOnum" },
1811
{ 175,  "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
1812
{ 176,  "deg",  "degree sign, U+00B0 ISOnum" },
1813
{ 177,  "plusmn","plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
1814
{ 178,  "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
1815
{ 179,  "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
1816
{ 180,  "acute","acute accent = spacing acute, U+00B4 ISOdia" },
1817
{ 181,  "micro","micro sign, U+00B5 ISOnum" },
1818
{ 182,  "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
1819
{ 183,  "middot","middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
1820
{ 184,  "cedil","cedilla = spacing cedilla, U+00B8 ISOdia" },
1821
{ 185,  "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
1822
{ 186,  "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
1823
{ 187,  "raquo","right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
1824
{ 188,  "frac14","vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
1825
{ 189,  "frac12","vulgar fraction one half = fraction one half, U+00BD ISOnum" },
1826
{ 190,  "frac34","vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
1827
{ 191,  "iquest","inverted question mark = turned question mark, U+00BF ISOnum" },
1828
{ 192,  "Agrave","latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
1829
{ 193,  "Aacute","latin capital letter A with acute, U+00C1 ISOlat1" },
1830
{ 194,  "Acirc","latin capital letter A with circumflex, U+00C2 ISOlat1" },
1831
{ 195,  "Atilde","latin capital letter A with tilde, U+00C3 ISOlat1" },
1832
{ 196,  "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
1833
{ 197,  "Aring","latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
1834
{ 198,  "AElig","latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
1835
{ 199,  "Ccedil","latin capital letter C with cedilla, U+00C7 ISOlat1" },
1836
{ 200,  "Egrave","latin capital letter E with grave, U+00C8 ISOlat1" },
1837
{ 201,  "Eacute","latin capital letter E with acute, U+00C9 ISOlat1" },
1838
{ 202,  "Ecirc","latin capital letter E with circumflex, U+00CA ISOlat1" },
1839
{ 203,  "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
1840
{ 204,  "Igrave","latin capital letter I with grave, U+00CC ISOlat1" },
1841
{ 205,  "Iacute","latin capital letter I with acute, U+00CD ISOlat1" },
1842
{ 206,  "Icirc","latin capital letter I with circumflex, U+00CE ISOlat1" },
1843
{ 207,  "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
1844
{ 208,  "ETH",  "latin capital letter ETH, U+00D0 ISOlat1" },
1845
{ 209,  "Ntilde","latin capital letter N with tilde, U+00D1 ISOlat1" },
1846
{ 210,  "Ograve","latin capital letter O with grave, U+00D2 ISOlat1" },
1847
{ 211,  "Oacute","latin capital letter O with acute, U+00D3 ISOlat1" },
1848
{ 212,  "Ocirc","latin capital letter O with circumflex, U+00D4 ISOlat1" },
1849
{ 213,  "Otilde","latin capital letter O with tilde, U+00D5 ISOlat1" },
1850
{ 214,  "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
1851
{ 215,  "times","multiplication sign, U+00D7 ISOnum" },
1852
{ 216,  "Oslash","latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
1853
{ 217,  "Ugrave","latin capital letter U with grave, U+00D9 ISOlat1" },
1854
{ 218,  "Uacute","latin capital letter U with acute, U+00DA ISOlat1" },
1855
{ 219,  "Ucirc","latin capital letter U with circumflex, U+00DB ISOlat1" },
1856
{ 220,  "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
1857
{ 221,  "Yacute","latin capital letter Y with acute, U+00DD ISOlat1" },
1858
{ 222,  "THORN","latin capital letter THORN, U+00DE ISOlat1" },
1859
{ 223,  "szlig","latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
1860
{ 224,  "agrave","latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
1861
{ 225,  "aacute","latin small letter a with acute, U+00E1 ISOlat1" },
1862
{ 226,  "acirc","latin small letter a with circumflex, U+00E2 ISOlat1" },
1863
{ 227,  "atilde","latin small letter a with tilde, U+00E3 ISOlat1" },
1864
{ 228,  "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
1865
{ 229,  "aring","latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
1866
{ 230,  "aelig","latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
1867
{ 231,  "ccedil","latin small letter c with cedilla, U+00E7 ISOlat1" },
1868
{ 232,  "egrave","latin small letter e with grave, U+00E8 ISOlat1" },
1869
{ 233,  "eacute","latin small letter e with acute, U+00E9 ISOlat1" },
1870
{ 234,  "ecirc","latin small letter e with circumflex, U+00EA ISOlat1" },
1871
{ 235,  "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
1872
{ 236,  "igrave","latin small letter i with grave, U+00EC ISOlat1" },
1873
{ 237,  "iacute","latin small letter i with acute, U+00ED ISOlat1" },
1874
{ 238,  "icirc","latin small letter i with circumflex, U+00EE ISOlat1" },
1875
{ 239,  "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
1876
{ 240,  "eth",  "latin small letter eth, U+00F0 ISOlat1" },
1877
{ 241,  "ntilde","latin small letter n with tilde, U+00F1 ISOlat1" },
1878
{ 242,  "ograve","latin small letter o with grave, U+00F2 ISOlat1" },
1879
{ 243,  "oacute","latin small letter o with acute, U+00F3 ISOlat1" },
1880
{ 244,  "ocirc","latin small letter o with circumflex, U+00F4 ISOlat1" },
1881
{ 245,  "otilde","latin small letter o with tilde, U+00F5 ISOlat1" },
1882
{ 246,  "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
1883
{ 247,  "divide","division sign, U+00F7 ISOnum" },
1884
{ 248,  "oslash","latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
1885
{ 249,  "ugrave","latin small letter u with grave, U+00F9 ISOlat1" },
1886
{ 250,  "uacute","latin small letter u with acute, U+00FA ISOlat1" },
1887
{ 251,  "ucirc","latin small letter u with circumflex, U+00FB ISOlat1" },
1888
{ 252,  "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
1889
{ 253,  "yacute","latin small letter y with acute, U+00FD ISOlat1" },
1890
{ 254,  "thorn","latin small letter thorn with, U+00FE ISOlat1" },
1891
{ 255,  "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },
1892
1893
{ 338,  "OElig","latin capital ligature OE, U+0152 ISOlat2" },
1894
{ 339,  "oelig","latin small ligature oe, U+0153 ISOlat2" },
1895
{ 352,  "Scaron","latin capital letter S with caron, U+0160 ISOlat2" },
1896
{ 353,  "scaron","latin small letter s with caron, U+0161 ISOlat2" },
1897
{ 376,  "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },
1898
1899
/*
1900
 * Anything below should really be kept as entity references
1901
 */
1902
{ 402,  "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },
1903
1904
{ 710,  "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
1905
{ 732,  "tilde","small tilde, U+02DC ISOdia" },
1906
1907
{ 913,  "Alpha","greek capital letter alpha, U+0391" },
1908
{ 914,  "Beta", "greek capital letter beta, U+0392" },
1909
{ 915,  "Gamma","greek capital letter gamma, U+0393 ISOgrk3" },
1910
{ 916,  "Delta","greek capital letter delta, U+0394 ISOgrk3" },
1911
{ 917,  "Epsilon","greek capital letter epsilon, U+0395" },
1912
{ 918,  "Zeta", "greek capital letter zeta, U+0396" },
1913
{ 919,  "Eta",  "greek capital letter eta, U+0397" },
1914
{ 920,  "Theta","greek capital letter theta, U+0398 ISOgrk3" },
1915
{ 921,  "Iota", "greek capital letter iota, U+0399" },
1916
{ 922,  "Kappa","greek capital letter kappa, U+039A" },
1917
{ 923,  "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
1918
{ 924,  "Mu", "greek capital letter mu, U+039C" },
1919
{ 925,  "Nu", "greek capital letter nu, U+039D" },
1920
{ 926,  "Xi", "greek capital letter xi, U+039E ISOgrk3" },
1921
{ 927,  "Omicron","greek capital letter omicron, U+039F" },
1922
{ 928,  "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
1923
{ 929,  "Rho",  "greek capital letter rho, U+03A1" },
1924
{ 931,  "Sigma","greek capital letter sigma, U+03A3 ISOgrk3" },
1925
{ 932,  "Tau",  "greek capital letter tau, U+03A4" },
1926
{ 933,  "Upsilon","greek capital letter upsilon, U+03A5 ISOgrk3" },
1927
{ 934,  "Phi",  "greek capital letter phi, U+03A6 ISOgrk3" },
1928
{ 935,  "Chi",  "greek capital letter chi, U+03A7" },
1929
{ 936,  "Psi",  "greek capital letter psi, U+03A8 ISOgrk3" },
1930
{ 937,  "Omega","greek capital letter omega, U+03A9 ISOgrk3" },
1931
1932
{ 945,  "alpha","greek small letter alpha, U+03B1 ISOgrk3" },
1933
{ 946,  "beta", "greek small letter beta, U+03B2 ISOgrk3" },
1934
{ 947,  "gamma","greek small letter gamma, U+03B3 ISOgrk3" },
1935
{ 948,  "delta","greek small letter delta, U+03B4 ISOgrk3" },
1936
{ 949,  "epsilon","greek small letter epsilon, U+03B5 ISOgrk3" },
1937
{ 950,  "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
1938
{ 951,  "eta",  "greek small letter eta, U+03B7 ISOgrk3" },
1939
{ 952,  "theta","greek small letter theta, U+03B8 ISOgrk3" },
1940
{ 953,  "iota", "greek small letter iota, U+03B9 ISOgrk3" },
1941
{ 954,  "kappa","greek small letter kappa, U+03BA ISOgrk3" },
1942
{ 955,  "lambda","greek small letter lambda, U+03BB ISOgrk3" },
1943
{ 956,  "mu", "greek small letter mu, U+03BC ISOgrk3" },
1944
{ 957,  "nu", "greek small letter nu, U+03BD ISOgrk3" },
1945
{ 958,  "xi", "greek small letter xi, U+03BE ISOgrk3" },
1946
{ 959,  "omicron","greek small letter omicron, U+03BF NEW" },
1947
{ 960,  "pi", "greek small letter pi, U+03C0 ISOgrk3" },
1948
{ 961,  "rho",  "greek small letter rho, U+03C1 ISOgrk3" },
1949
{ 962,  "sigmaf","greek small letter final sigma, U+03C2 ISOgrk3" },
1950
{ 963,  "sigma","greek small letter sigma, U+03C3 ISOgrk3" },
1951
{ 964,  "tau",  "greek small letter tau, U+03C4 ISOgrk3" },
1952
{ 965,  "upsilon","greek small letter upsilon, U+03C5 ISOgrk3" },
1953
{ 966,  "phi",  "greek small letter phi, U+03C6 ISOgrk3" },
1954
{ 967,  "chi",  "greek small letter chi, U+03C7 ISOgrk3" },
1955
{ 968,  "psi",  "greek small letter psi, U+03C8 ISOgrk3" },
1956
{ 969,  "omega","greek small letter omega, U+03C9 ISOgrk3" },
1957
{ 977,  "thetasym","greek small letter theta symbol, U+03D1 NEW" },
1958
{ 978,  "upsih","greek upsilon with hook symbol, U+03D2 NEW" },
1959
{ 982,  "piv",  "greek pi symbol, U+03D6 ISOgrk3" },
1960
1961
{ 8194, "ensp", "en space, U+2002 ISOpub" },
1962
{ 8195, "emsp", "em space, U+2003 ISOpub" },
1963
{ 8201, "thinsp","thin space, U+2009 ISOpub" },
1964
{ 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
1965
{ 8205, "zwj",  "zero width joiner, U+200D NEW RFC 2070" },
1966
{ 8206, "lrm",  "left-to-right mark, U+200E NEW RFC 2070" },
1967
{ 8207, "rlm",  "right-to-left mark, U+200F NEW RFC 2070" },
1968
{ 8211, "ndash","en dash, U+2013 ISOpub" },
1969
{ 8212, "mdash","em dash, U+2014 ISOpub" },
1970
{ 8216, "lsquo","left single quotation mark, U+2018 ISOnum" },
1971
{ 8217, "rsquo","right single quotation mark, U+2019 ISOnum" },
1972
{ 8218, "sbquo","single low-9 quotation mark, U+201A NEW" },
1973
{ 8220, "ldquo","left double quotation mark, U+201C ISOnum" },
1974
{ 8221, "rdquo","right double quotation mark, U+201D ISOnum" },
1975
{ 8222, "bdquo","double low-9 quotation mark, U+201E NEW" },
1976
{ 8224, "dagger","dagger, U+2020 ISOpub" },
1977
{ 8225, "Dagger","double dagger, U+2021 ISOpub" },
1978
1979
{ 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
1980
{ 8230, "hellip","horizontal ellipsis = three dot leader, U+2026 ISOpub" },
1981
1982
{ 8240, "permil","per mille sign, U+2030 ISOtech" },
1983
1984
{ 8242, "prime","prime = minutes = feet, U+2032 ISOtech" },
1985
{ 8243, "Prime","double prime = seconds = inches, U+2033 ISOtech" },
1986
1987
{ 8249, "lsaquo","single left-pointing angle quotation mark, U+2039 ISO proposed" },
1988
{ 8250, "rsaquo","single right-pointing angle quotation mark, U+203A ISO proposed" },
1989
1990
{ 8254, "oline","overline = spacing overscore, U+203E NEW" },
1991
{ 8260, "frasl","fraction slash, U+2044 NEW" },
1992
1993
{ 8364, "euro", "euro sign, U+20AC NEW" },
1994
1995
{ 8465, "image","blackletter capital I = imaginary part, U+2111 ISOamso" },
1996
{ 8472, "weierp","script capital P = power set = Weierstrass p, U+2118 ISOamso" },
1997
{ 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
1998
{ 8482, "trade","trade mark sign, U+2122 ISOnum" },
1999
{ 8501, "alefsym","alef symbol = first transfinite cardinal, U+2135 NEW" },
2000
{ 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
2001
{ 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
2002
{ 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
2003
{ 8595, "darr", "downwards arrow, U+2193 ISOnum" },
2004
{ 8596, "harr", "left right arrow, U+2194 ISOamsa" },
2005
{ 8629, "crarr","downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
2006
{ 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
2007
{ 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
2008
{ 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
2009
{ 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
2010
{ 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },
2011
2012
{ 8704, "forall","for all, U+2200 ISOtech" },
2013
{ 8706, "part", "partial differential, U+2202 ISOtech" },
2014
{ 8707, "exist","there exists, U+2203 ISOtech" },
2015
{ 8709, "empty","empty set = null set = diameter, U+2205 ISOamso" },
2016
{ 8711, "nabla","nabla = backward difference, U+2207 ISOtech" },
2017
{ 8712, "isin", "element of, U+2208 ISOtech" },
2018
{ 8713, "notin","not an element of, U+2209 ISOtech" },
2019
{ 8715, "ni", "contains as member, U+220B ISOtech" },
2020
{ 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
2021
{ 8721, "sum",  "n-ary summation, U+2211 ISOamsb" },
2022
{ 8722, "minus","minus sign, U+2212 ISOtech" },
2023
{ 8727, "lowast","asterisk operator, U+2217 ISOtech" },
2024
{ 8730, "radic","square root = radical sign, U+221A ISOtech" },
2025
{ 8733, "prop", "proportional to, U+221D ISOtech" },
2026
{ 8734, "infin","infinity, U+221E ISOtech" },
2027
{ 8736, "ang",  "angle, U+2220 ISOamso" },
2028
{ 8743, "and",  "logical and = wedge, U+2227 ISOtech" },
2029
{ 8744, "or", "logical or = vee, U+2228 ISOtech" },
2030
{ 8745, "cap",  "intersection = cap, U+2229 ISOtech" },
2031
{ 8746, "cup",  "union = cup, U+222A ISOtech" },
2032
{ 8747, "int",  "integral, U+222B ISOtech" },
2033
{ 8756, "there4","therefore, U+2234 ISOtech" },
2034
{ 8764, "sim",  "tilde operator = varies with = similar to, U+223C ISOtech" },
2035
{ 8773, "cong", "approximately equal to, U+2245 ISOtech" },
2036
{ 8776, "asymp","almost equal to = asymptotic to, U+2248 ISOamsr" },
2037
{ 8800, "ne", "not equal to, U+2260 ISOtech" },
2038
{ 8801, "equiv","identical to, U+2261 ISOtech" },
2039
{ 8804, "le", "less-than or equal to, U+2264 ISOtech" },
2040
{ 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
2041
{ 8834, "sub",  "subset of, U+2282 ISOtech" },
2042
{ 8835, "sup",  "superset of, U+2283 ISOtech" },
2043
{ 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
2044
{ 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
2045
{ 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
2046
{ 8853, "oplus","circled plus = direct sum, U+2295 ISOamsb" },
2047
{ 8855, "otimes","circled times = vector product, U+2297 ISOamsb" },
2048
{ 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
2049
{ 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
2050
{ 8968, "lceil","left ceiling = apl upstile, U+2308 ISOamsc" },
2051
{ 8969, "rceil","right ceiling, U+2309 ISOamsc" },
2052
{ 8970, "lfloor","left floor = apl downstile, U+230A ISOamsc" },
2053
{ 8971, "rfloor","right floor, U+230B ISOamsc" },
2054
{ 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
2055
{ 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
2056
{ 9674, "loz",  "lozenge, U+25CA ISOpub" },
2057
2058
{ 9824, "spades","black spade suit, U+2660 ISOpub" },
2059
{ 9827, "clubs","black club suit = shamrock, U+2663 ISOpub" },
2060
{ 9829, "hearts","black heart suit = valentine, U+2665 ISOpub" },
2061
{ 9830, "diams","black diamond suit, U+2666 ISOpub" },
2062
2063
};
2064
2065
/************************************************************************
2066
 *                  *
2067
 *    Commodity functions to handle entities      *
2068
 *                  *
2069
 ************************************************************************/
2070
2071
/*
 * Macro used to grow the current buffer.
 *
 * Doubles buffer##_size and reallocates @buffer to that size. It
 * expects a variable named ctxt to be in scope at the expansion site.
 * If the reallocation fails, the error is reported with
 * htmlErrMemory, the old buffer is freed and the enclosing function
 * returns NULL.
 */
#define growBuffer(buffer) {                                        \
    xmlChar *grownBuf;                                              \
    buffer##_size *= 2;                                             \
    grownBuf = (xmlChar *) xmlRealloc(buffer, buffer##_size);       \
    if (grownBuf != NULL) {                                         \
        buffer = grownBuf;                                          \
    } else {                                                        \
        htmlErrMemory(ctxt, "growing buffer\n");                    \
        xmlFree(buffer);                                            \
        return(NULL);                                               \
    }                                                               \
}
2085
2086
/**
2087
 * htmlEntityLookup:
2088
 * @name: the entity name
2089
 *
2090
 * Lookup the given entity in EntitiesTable
2091
 *
2092
 * TODO: the linear scan is really ugly, an hash table is really needed.
2093
 *
2094
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2095
 */
2096
const htmlEntityDesc *
2097
0
htmlEntityLookup(const xmlChar *name) {
2098
0
    unsigned int i;
2099
2100
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2101
0
                    sizeof(html40EntitiesTable[0]));i++) {
2102
0
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
2103
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2104
0
  }
2105
0
    }
2106
0
    return(NULL);
2107
0
}
2108
2109
/**
2110
 * htmlEntityValueLookup:
2111
 * @value: the entity's unicode value
2112
 *
2113
 * Lookup the given entity in EntitiesTable
2114
 *
2115
 * TODO: the linear scan is really ugly, an hash table is really needed.
2116
 *
2117
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
2118
 */
2119
const htmlEntityDesc *
2120
0
htmlEntityValueLookup(unsigned int value) {
2121
0
    unsigned int i;
2122
2123
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
2124
0
                    sizeof(html40EntitiesTable[0]));i++) {
2125
0
        if (html40EntitiesTable[i].value >= value) {
2126
0
      if (html40EntitiesTable[i].value > value)
2127
0
    break;
2128
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
2129
0
  }
2130
0
    }
2131
0
    return(NULL);
2132
0
}
2133
2134
/**
2135
 * UTF8ToHtml:
2136
 * @out:  a pointer to an array of bytes to store the result
2137
 * @outlen:  the length of @out
2138
 * @in:  a pointer to an array of UTF-8 chars
2139
 * @inlen:  the length of @in
2140
 *
2141
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2142
 * plus HTML entities block of chars out.
2143
 *
2144
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2145
 * The value of @inlen after return is the number of octets consumed
2146
 *     as the return value is positive, else unpredictable.
2147
 * The value of @outlen after return is the number of octets consumed.
2148
 */
2149
int
2150
UTF8ToHtml(unsigned char* out, int *outlen,
2151
0
              const unsigned char* in, int *inlen) {
2152
0
    const unsigned char* processed = in;
2153
0
    const unsigned char* outend;
2154
0
    const unsigned char* outstart = out;
2155
0
    const unsigned char* instart = in;
2156
0
    const unsigned char* inend;
2157
0
    unsigned int c, d;
2158
0
    int trailing;
2159
2160
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
2161
0
    if (in == NULL) {
2162
        /*
2163
   * initialization nothing to do
2164
   */
2165
0
  *outlen = 0;
2166
0
  *inlen = 0;
2167
0
  return(0);
2168
0
    }
2169
0
    inend = in + (*inlen);
2170
0
    outend = out + (*outlen);
2171
0
    while (in < inend) {
2172
0
  d = *in++;
2173
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2174
0
  else if (d < 0xC0) {
2175
      /* trailing byte in leading position */
2176
0
      *outlen = out - outstart;
2177
0
      *inlen = processed - instart;
2178
0
      return(-2);
2179
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2180
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2181
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2182
0
  else {
2183
      /* no chance for this in Ascii */
2184
0
      *outlen = out - outstart;
2185
0
      *inlen = processed - instart;
2186
0
      return(-2);
2187
0
  }
2188
2189
0
  if (inend - in < trailing) {
2190
0
      break;
2191
0
  }
2192
2193
0
  for ( ; trailing; trailing--) {
2194
0
      if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2195
0
    break;
2196
0
      c <<= 6;
2197
0
      c |= d & 0x3F;
2198
0
  }
2199
2200
  /* assertion: c is a single UTF-4 value */
2201
0
  if (c < 0x80) {
2202
0
      if (out + 1 >= outend)
2203
0
    break;
2204
0
      *out++ = c;
2205
0
  } else {
2206
0
      int len;
2207
0
      const htmlEntityDesc * ent;
2208
0
      const char *cp;
2209
0
      char nbuf[16];
2210
2211
      /*
2212
       * Try to lookup a predefined HTML entity for it
2213
       */
2214
2215
0
      ent = htmlEntityValueLookup(c);
2216
0
      if (ent == NULL) {
2217
0
        snprintf(nbuf, sizeof(nbuf), "#%u", c);
2218
0
        cp = nbuf;
2219
0
      }
2220
0
      else
2221
0
        cp = ent->name;
2222
0
      len = strlen(cp);
2223
0
      if (out + 2 + len >= outend)
2224
0
    break;
2225
0
      *out++ = '&';
2226
0
      memcpy(out, cp, len);
2227
0
      out += len;
2228
0
      *out++ = ';';
2229
0
  }
2230
0
  processed = in;
2231
0
    }
2232
0
    *outlen = out - outstart;
2233
0
    *inlen = processed - instart;
2234
0
    return(0);
2235
0
}
2236
2237
/**
2238
 * htmlEncodeEntities:
2239
 * @out:  a pointer to an array of bytes to store the result
2240
 * @outlen:  the length of @out
2241
 * @in:  a pointer to an array of UTF-8 chars
2242
 * @inlen:  the length of @in
2243
 * @quoteChar: the quote character to escape (' or ") or zero.
2244
 *
2245
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2246
 * plus HTML entities block of chars out.
2247
 *
2248
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2249
 * The value of @inlen after return is the number of octets consumed
2250
 *     as the return value is positive, else unpredictable.
2251
 * The value of @outlen after return is the number of octets consumed.
2252
 */
2253
int
2254
htmlEncodeEntities(unsigned char* out, int *outlen,
2255
0
       const unsigned char* in, int *inlen, int quoteChar) {
2256
0
    const unsigned char* processed = in;
2257
0
    const unsigned char* outend;
2258
0
    const unsigned char* outstart = out;
2259
0
    const unsigned char* instart = in;
2260
0
    const unsigned char* inend;
2261
0
    unsigned int c, d;
2262
0
    int trailing;
2263
2264
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2265
0
        return(-1);
2266
0
    outend = out + (*outlen);
2267
0
    inend = in + (*inlen);
2268
0
    while (in < inend) {
2269
0
  d = *in++;
2270
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2271
0
  else if (d < 0xC0) {
2272
      /* trailing byte in leading position */
2273
0
      *outlen = out - outstart;
2274
0
      *inlen = processed - instart;
2275
0
      return(-2);
2276
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2277
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2278
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2279
0
  else {
2280
      /* no chance for this in Ascii */
2281
0
      *outlen = out - outstart;
2282
0
      *inlen = processed - instart;
2283
0
      return(-2);
2284
0
  }
2285
2286
0
  if (inend - in < trailing)
2287
0
      break;
2288
2289
0
  while (trailing--) {
2290
0
      if (((d= *in++) & 0xC0) != 0x80) {
2291
0
    *outlen = out - outstart;
2292
0
    *inlen = processed - instart;
2293
0
    return(-2);
2294
0
      }
2295
0
      c <<= 6;
2296
0
      c |= d & 0x3F;
2297
0
  }
2298
2299
  /* assertion: c is a single UTF-4 value */
2300
0
  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2301
0
      (c != '&') && (c != '<') && (c != '>')) {
2302
0
      if (out >= outend)
2303
0
    break;
2304
0
      *out++ = c;
2305
0
  } else {
2306
0
      const htmlEntityDesc * ent;
2307
0
      const char *cp;
2308
0
      char nbuf[16];
2309
0
      int len;
2310
2311
      /*
2312
       * Try to lookup a predefined HTML entity for it
2313
       */
2314
0
      ent = htmlEntityValueLookup(c);
2315
0
      if (ent == NULL) {
2316
0
    snprintf(nbuf, sizeof(nbuf), "#%u", c);
2317
0
    cp = nbuf;
2318
0
      }
2319
0
      else
2320
0
    cp = ent->name;
2321
0
      len = strlen(cp);
2322
0
      if (out + 2 + len > outend)
2323
0
    break;
2324
0
      *out++ = '&';
2325
0
      memcpy(out, cp, len);
2326
0
      out += len;
2327
0
      *out++ = ';';
2328
0
  }
2329
0
  processed = in;
2330
0
    }
2331
0
    *outlen = out - outstart;
2332
0
    *inlen = processed - instart;
2333
0
    return(0);
2334
0
}
2335
2336
/************************************************************************
2337
 *                  *
2338
 *    Commodity functions to handle streams     *
2339
 *                  *
2340
 ************************************************************************/
2341
2342
#ifdef LIBXML_PUSH_ENABLED
2343
/**
2344
 * htmlNewInputStream:
2345
 * @ctxt:  an HTML parser context
2346
 *
2347
 * Create a new input stream structure
2348
 * Returns the new input stream or NULL
2349
 */
2350
static htmlParserInputPtr
2351
0
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2352
0
    htmlParserInputPtr input;
2353
2354
0
    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2355
0
    if (input == NULL) {
2356
0
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2357
0
  return(NULL);
2358
0
    }
2359
0
    memset(input, 0, sizeof(htmlParserInput));
2360
0
    input->filename = NULL;
2361
0
    input->directory = NULL;
2362
0
    input->base = NULL;
2363
0
    input->cur = NULL;
2364
0
    input->buf = NULL;
2365
0
    input->line = 1;
2366
0
    input->col = 1;
2367
0
    input->buf = NULL;
2368
0
    input->free = NULL;
2369
0
    input->version = NULL;
2370
0
    input->consumed = 0;
2371
0
    input->length = 0;
2372
0
    return(input);
2373
0
}
2374
#endif
2375
2376
2377
/************************************************************************
2378
 *                  *
2379
 *    Commodity functions, cleanup needed ?     *
2380
 *                  *
2381
 ************************************************************************/
2382
/*
 * Names of all the tags allowing PCDATA in the HTML 4.01 loose DTD,
 * one row per initial letter.
 * NOTE: this information might be better placed in the
 * html40ElementTable array, but it is kept apart to avoid any risk of
 * binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2396
2397
/**
2398
 * areBlanks:
2399
 * @ctxt:  an HTML parser context
2400
 * @str:  a xmlChar *
2401
 * @len:  the size of @str
2402
 *
2403
 * Is this a sequence of blank chars that one can ignore ?
2404
 *
2405
 * Returns 1 if ignorable 0 otherwise.
2406
 */
2407
2408
0
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2409
0
    unsigned int i;
2410
0
    int j;
2411
0
    xmlNodePtr lastChild;
2412
0
    xmlDtdPtr dtd;
2413
2414
0
    for (j = 0;j < len;j++)
2415
0
        if (!(IS_BLANK_CH(str[j]))) return(0);
2416
2417
0
    if (CUR == 0) return(1);
2418
0
    if (CUR != '<') return(0);
2419
0
    if (ctxt->name == NULL)
2420
0
  return(1);
2421
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2422
0
  return(1);
2423
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2424
0
  return(1);
2425
2426
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2427
0
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2428
0
        dtd = xmlGetIntSubset(ctxt->myDoc);
2429
0
        if (dtd != NULL && dtd->ExternalID != NULL) {
2430
0
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2431
0
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2432
0
                return(1);
2433
0
        }
2434
0
    }
2435
2436
0
    if (ctxt->node == NULL) return(0);
2437
0
    lastChild = xmlGetLastChild(ctxt->node);
2438
0
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2439
0
  lastChild = lastChild->prev;
2440
0
    if (lastChild == NULL) {
2441
0
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2442
0
            (ctxt->node->content != NULL)) return(0);
2443
  /* keep ws in constructs like ...<b> </b>...
2444
     for all tags "b" allowing PCDATA */
2445
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2446
0
      if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2447
0
    return(0);
2448
0
      }
2449
0
  }
2450
0
    } else if (xmlNodeIsText(lastChild)) {
2451
0
        return(0);
2452
0
    } else {
2453
  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2454
     for all tags "p" allowing PCDATA */
2455
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2456
0
      if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2457
0
    return(0);
2458
0
      }
2459
0
  }
2460
0
    }
2461
0
    return(1);
2462
0
}
2463
2464
/**
2465
 * htmlNewDocNoDtD:
2466
 * @URI:  URI for the dtd, or NULL
2467
 * @ExternalID:  the external ID of the DTD, or NULL
2468
 *
2469
 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2470
 * are NULL
2471
 *
2472
 * Returns a new document, do not initialize the DTD if not provided
2473
 */
2474
htmlDocPtr
2475
0
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2476
0
    xmlDocPtr cur;
2477
2478
    /*
2479
     * Allocate a new document and fill the fields.
2480
     */
2481
0
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2482
0
    if (cur == NULL) {
2483
0
  htmlErrMemory(NULL, "HTML document creation failed\n");
2484
0
  return(NULL);
2485
0
    }
2486
0
    memset(cur, 0, sizeof(xmlDoc));
2487
2488
0
    cur->type = XML_HTML_DOCUMENT_NODE;
2489
0
    cur->version = NULL;
2490
0
    cur->intSubset = NULL;
2491
0
    cur->doc = cur;
2492
0
    cur->name = NULL;
2493
0
    cur->children = NULL;
2494
0
    cur->extSubset = NULL;
2495
0
    cur->oldNs = NULL;
2496
0
    cur->encoding = NULL;
2497
0
    cur->standalone = 1;
2498
0
    cur->compression = 0;
2499
0
    cur->ids = NULL;
2500
0
    cur->refs = NULL;
2501
0
    cur->_private = NULL;
2502
0
    cur->charset = XML_CHAR_ENCODING_UTF8;
2503
0
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2504
0
    if ((ExternalID != NULL) ||
2505
0
  (URI != NULL))
2506
0
  xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2507
0
    if ((__xmlRegisterCallbacks) && (xmlRegisterNodeDefaultValue))
2508
0
  xmlRegisterNodeDefaultValue((xmlNodePtr)cur);
2509
0
    return(cur);
2510
0
}
2511
2512
/**
2513
 * htmlNewDoc:
2514
 * @URI:  URI for the dtd, or NULL
2515
 * @ExternalID:  the external ID of the DTD, or NULL
2516
 *
2517
 * Creates a new HTML document
2518
 *
2519
 * Returns a new document
2520
 */
2521
htmlDocPtr
2522
0
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2523
0
    if ((URI == NULL) && (ExternalID == NULL))
2524
0
  return(htmlNewDocNoDtD(
2525
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2526
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2527
2528
0
    return(htmlNewDocNoDtD(URI, ExternalID));
2529
0
}
2530
2531
2532
/************************************************************************
2533
 *                  *
2534
 *      The parser itself       *
2535
 *  Relates to http://www.w3.org/TR/html40        *
2536
 *                  *
2537
 ************************************************************************/
2538
2539
/************************************************************************
2540
 *                  *
2541
 *      The parser itself       *
2542
 *                  *
2543
 ************************************************************************/
2544
2545
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2546
2547
static void
2548
0
htmlSkipBogusComment(htmlParserCtxtPtr ctxt) {
2549
0
    int c;
2550
2551
0
    htmlParseErr(ctxt, XML_HTML_INCORRECTLY_OPENED_COMMENT,
2552
0
                 "Incorrectly opened comment\n", NULL, NULL);
2553
2554
0
    do {
2555
0
        c = CUR;
2556
0
        if (c == 0)
2557
0
            break;
2558
0
        NEXT;
2559
0
    } while (c != '>');
2560
0
}
2561
2562
/**
2563
 * htmlParseHTMLName:
2564
 * @ctxt:  an HTML parser context
2565
 *
2566
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2567
 * since HTML names are not case-sensitive.
2568
 *
2569
 * Returns the Tag Name parsed or NULL
2570
 */
2571
2572
static const xmlChar *
2573
0
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2574
0
    int i = 0;
2575
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2576
2577
0
    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2578
0
        (CUR != ':') && (CUR != '.')) return(NULL);
2579
2580
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2581
0
           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2582
0
     (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2583
0
           (CUR == '.'))) {
2584
0
  if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2585
0
        else loc[i] = CUR;
2586
0
  i++;
2587
2588
0
  NEXT;
2589
0
    }
2590
2591
0
    return(xmlDictLookup(ctxt->dict, loc, i));
2592
0
}
2593
2594
2595
/**
2596
 * htmlParseHTMLName_nonInvasive:
2597
 * @ctxt:  an HTML parser context
2598
 *
2599
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2600
 * since HTML names are not case-sensitive, this doesn't consume the data
2601
 * from the stream, it's a look-ahead
2602
 *
2603
 * Returns the Tag Name parsed or NULL
2604
 */
2605
2606
static const xmlChar *
2607
0
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2608
0
    int i = 0;
2609
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2610
2611
0
    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2612
0
        (NXT(1) != ':')) return(NULL);
2613
2614
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2615
0
           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2616
0
     (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2617
0
  if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2618
0
        else loc[i] = NXT(1+i);
2619
0
  i++;
2620
0
    }
2621
2622
0
    return(xmlDictLookup(ctxt->dict, loc, i));
2623
0
}
2624
2625
2626
/**
2627
 * htmlParseName:
2628
 * @ctxt:  an HTML parser context
2629
 *
2630
 * parse an HTML name, this routine is case sensitive.
2631
 *
2632
 * Returns the Name parsed or NULL
2633
 */
2634
2635
static const xmlChar *
2636
0
htmlParseName(htmlParserCtxtPtr ctxt) {
2637
0
    const xmlChar *in;
2638
0
    const xmlChar *ret;
2639
0
    int count = 0;
2640
2641
0
    GROW;
2642
2643
    /*
2644
     * Accelerator for simple ASCII names
2645
     */
2646
0
    in = ctxt->input->cur;
2647
0
    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2648
0
  ((*in >= 0x41) && (*in <= 0x5A)) ||
2649
0
  (*in == '_') || (*in == ':')) {
2650
0
  in++;
2651
0
  while (((*in >= 0x61) && (*in <= 0x7A)) ||
2652
0
         ((*in >= 0x41) && (*in <= 0x5A)) ||
2653
0
         ((*in >= 0x30) && (*in <= 0x39)) ||
2654
0
         (*in == '_') || (*in == '-') ||
2655
0
         (*in == ':') || (*in == '.'))
2656
0
      in++;
2657
2658
0
  if (in == ctxt->input->end)
2659
0
      return(NULL);
2660
2661
0
  if ((*in > 0) && (*in < 0x80)) {
2662
0
      count = in - ctxt->input->cur;
2663
0
      ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2664
0
      ctxt->input->cur = in;
2665
0
      ctxt->input->col += count;
2666
0
      return(ret);
2667
0
  }
2668
0
    }
2669
0
    return(htmlParseNameComplex(ctxt));
2670
0
}
2671
2672
static const xmlChar *
2673
0
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2674
0
    int len = 0, l;
2675
0
    int c;
2676
0
    int count = 0;
2677
0
    const xmlChar *base = ctxt->input->base;
2678
2679
    /*
2680
     * Handler for more complex cases
2681
     */
2682
0
    GROW;
2683
0
    c = CUR_CHAR(l);
2684
0
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2685
0
  (!IS_LETTER(c) && (c != '_') &&
2686
0
         (c != ':'))) {
2687
0
  return(NULL);
2688
0
    }
2689
2690
0
    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2691
0
     ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2692
0
            (c == '.') || (c == '-') ||
2693
0
      (c == '_') || (c == ':') ||
2694
0
      (IS_COMBINING(c)) ||
2695
0
      (IS_EXTENDER(c)))) {
2696
0
  if (count++ > 100) {
2697
0
      count = 0;
2698
0
      GROW;
2699
0
  }
2700
0
  len += l;
2701
0
  NEXTL(l);
2702
0
  c = CUR_CHAR(l);
2703
0
  if (ctxt->input->base != base) {
2704
      /*
2705
       * We changed encoding from an unknown encoding
2706
       * Input buffer changed location, so we better start again
2707
       */
2708
0
      return(htmlParseNameComplex(ctxt));
2709
0
  }
2710
0
    }
2711
2712
0
    if (ctxt->input->cur - ctxt->input->base < len) {
2713
        /* Sanity check */
2714
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2715
0
                     "unexpected change of input buffer", NULL, NULL);
2716
0
        return (NULL);
2717
0
    }
2718
2719
0
    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2720
0
}
2721
2722
2723
/**
2724
 * htmlParseHTMLAttribute:
2725
 * @ctxt:  an HTML parser context
2726
 * @stop:  a char stop value
2727
 *
2728
 * parse an HTML attribute value till the stop (quote), if
2729
 * stop is 0 then it stops at the first space
2730
 *
2731
 * Returns the attribute parsed or NULL
2732
 */
2733
2734
static xmlChar *
2735
0
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2736
0
    xmlChar *buffer = NULL;
2737
0
    int buffer_size = 0;
2738
0
    xmlChar *out = NULL;
2739
0
    const xmlChar *name = NULL;
2740
0
    const xmlChar *cur = NULL;
2741
0
    const htmlEntityDesc * ent;
2742
2743
    /*
2744
     * allocate a translation buffer.
2745
     */
2746
0
    buffer_size = HTML_PARSER_BUFFER_SIZE;
2747
0
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size);
2748
0
    if (buffer == NULL) {
2749
0
  htmlErrMemory(ctxt, "buffer allocation failed\n");
2750
0
  return(NULL);
2751
0
    }
2752
0
    out = buffer;
2753
2754
    /*
2755
     * Ok loop until we reach one of the ending chars
2756
     */
2757
0
    while ((CUR != 0) && (CUR != stop)) {
2758
0
  if ((stop == 0) && (CUR == '>')) break;
2759
0
  if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2760
0
        if (CUR == '&') {
2761
0
      if (NXT(1) == '#') {
2762
0
    unsigned int c;
2763
0
    int bits;
2764
2765
0
    c = htmlParseCharRef(ctxt);
2766
0
    if      (c <    0x80)
2767
0
            { *out++  = c;                bits= -6; }
2768
0
    else if (c <   0x800)
2769
0
            { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2770
0
    else if (c < 0x10000)
2771
0
            { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2772
0
    else
2773
0
            { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2774
2775
0
    for ( ; bits >= 0; bits-= 6) {
2776
0
        *out++  = ((c >> bits) & 0x3F) | 0x80;
2777
0
    }
2778
2779
0
    if (out - buffer > buffer_size - 100) {
2780
0
      int indx = out - buffer;
2781
2782
0
      growBuffer(buffer);
2783
0
      out = &buffer[indx];
2784
0
    }
2785
0
      } else {
2786
0
    ent = htmlParseEntityRef(ctxt, &name);
2787
0
    if (name == NULL) {
2788
0
        *out++ = '&';
2789
0
        if (out - buffer > buffer_size - 100) {
2790
0
      int indx = out - buffer;
2791
2792
0
      growBuffer(buffer);
2793
0
      out = &buffer[indx];
2794
0
        }
2795
0
    } else if (ent == NULL) {
2796
0
        *out++ = '&';
2797
0
        cur = name;
2798
0
        while (*cur != 0) {
2799
0
      if (out - buffer > buffer_size - 100) {
2800
0
          int indx = out - buffer;
2801
2802
0
          growBuffer(buffer);
2803
0
          out = &buffer[indx];
2804
0
      }
2805
0
      *out++ = *cur++;
2806
0
        }
2807
0
    } else {
2808
0
        unsigned int c;
2809
0
        int bits;
2810
2811
0
        if (out - buffer > buffer_size - 100) {
2812
0
      int indx = out - buffer;
2813
2814
0
      growBuffer(buffer);
2815
0
      out = &buffer[indx];
2816
0
        }
2817
0
        c = ent->value;
2818
0
        if      (c <    0x80)
2819
0
      { *out++  = c;                bits= -6; }
2820
0
        else if (c <   0x800)
2821
0
      { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2822
0
        else if (c < 0x10000)
2823
0
      { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2824
0
        else
2825
0
      { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2826
2827
0
        for ( ; bits >= 0; bits-= 6) {
2828
0
      *out++  = ((c >> bits) & 0x3F) | 0x80;
2829
0
        }
2830
0
    }
2831
0
      }
2832
0
  } else {
2833
0
      unsigned int c;
2834
0
      int bits, l;
2835
2836
0
      if (out - buffer > buffer_size - 100) {
2837
0
    int indx = out - buffer;
2838
2839
0
    growBuffer(buffer);
2840
0
    out = &buffer[indx];
2841
0
      }
2842
0
      c = CUR_CHAR(l);
2843
0
      if      (c <    0x80)
2844
0
        { *out++  = c;                bits= -6; }
2845
0
      else if (c <   0x800)
2846
0
        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2847
0
      else if (c < 0x10000)
2848
0
        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2849
0
      else
2850
0
        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2851
2852
0
      for ( ; bits >= 0; bits-= 6) {
2853
0
    *out++  = ((c >> bits) & 0x3F) | 0x80;
2854
0
      }
2855
0
      NEXT;
2856
0
  }
2857
0
    }
2858
0
    *out = 0;
2859
0
    return(buffer);
2860
0
}
2861
2862
/**
2863
 * htmlParseEntityRef:
2864
 * @ctxt:  an HTML parser context
2865
 * @str:  location to store the entity name
2866
 *
2867
 * DEPRECATED: Internal function, don't use.
2868
 *
2869
 * parse an HTML ENTITY references
2870
 *
2871
 * [68] EntityRef ::= '&' Name ';'
2872
 *
2873
 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2874
 *         if non-NULL *str will have to be freed by the caller.
2875
 */
2876
const htmlEntityDesc *
2877
0
htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2878
0
    const xmlChar *name;
2879
0
    const htmlEntityDesc * ent = NULL;
2880
2881
0
    if (str != NULL) *str = NULL;
2882
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2883
2884
0
    if (CUR == '&') {
2885
0
        NEXT;
2886
0
        name = htmlParseName(ctxt);
2887
0
  if (name == NULL) {
2888
0
      htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2889
0
                   "htmlParseEntityRef: no name\n", NULL, NULL);
2890
0
  } else {
2891
0
      GROW;
2892
0
      if (CUR == ';') {
2893
0
          if (str != NULL)
2894
0
        *str = name;
2895
2896
    /*
2897
     * Lookup the entity in the table.
2898
     */
2899
0
    ent = htmlEntityLookup(name);
2900
0
    if (ent != NULL) /* OK that's ugly !!! */
2901
0
        NEXT;
2902
0
      } else {
2903
0
    htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2904
0
                 "htmlParseEntityRef: expecting ';'\n",
2905
0
           NULL, NULL);
2906
0
          if (str != NULL)
2907
0
        *str = name;
2908
0
      }
2909
0
  }
2910
0
    }
2911
0
    return(ent);
2912
0
}
2913
2914
/**
2915
 * htmlParseAttValue:
2916
 * @ctxt:  an HTML parser context
2917
 *
2918
 * parse a value for an attribute
2919
 * Note: the parser won't do substitution of entities here, this
2920
 * will be handled later in xmlStringGetNodeList, unless it was
2921
 * asked for ctxt->replaceEntities != 0
2922
 *
2923
 * Returns the AttValue parsed or NULL.
2924
 */
2925
2926
static xmlChar *
2927
0
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2928
0
    xmlChar *ret = NULL;
2929
2930
0
    if (CUR == '"') {
2931
0
        NEXT;
2932
0
  ret = htmlParseHTMLAttribute(ctxt, '"');
2933
0
        if (CUR != '"') {
2934
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2935
0
                   "AttValue: \" expected\n", NULL, NULL);
2936
0
  } else
2937
0
      NEXT;
2938
0
    } else if (CUR == '\'') {
2939
0
        NEXT;
2940
0
  ret = htmlParseHTMLAttribute(ctxt, '\'');
2941
0
        if (CUR != '\'') {
2942
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2943
0
                   "AttValue: ' expected\n", NULL, NULL);
2944
0
  } else
2945
0
      NEXT;
2946
0
    } else {
2947
        /*
2948
   * That's an HTMLism, the attribute value may not be quoted
2949
   */
2950
0
  ret = htmlParseHTMLAttribute(ctxt, 0);
2951
0
  if (ret == NULL) {
2952
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2953
0
                   "AttValue: no value found\n", NULL, NULL);
2954
0
  }
2955
0
    }
2956
0
    return(ret);
2957
0
}
2958
2959
/**
2960
 * htmlParseSystemLiteral:
2961
 * @ctxt:  an HTML parser context
2962
 *
2963
 * parse an HTML Literal
2964
 *
2965
 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2966
 *
2967
 * Returns the SystemLiteral parsed or NULL
2968
 */
2969
2970
static xmlChar *
2971
0
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2972
0
    size_t len = 0, startPosition = 0;
2973
0
    int err = 0;
2974
0
    int quote;
2975
0
    xmlChar *ret = NULL;
2976
2977
0
    if ((CUR != '"') && (CUR != '\'')) {
2978
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2979
0
               "SystemLiteral \" or ' expected\n", NULL, NULL);
2980
0
        return(NULL);
2981
0
    }
2982
0
    quote = CUR;
2983
0
    NEXT;
2984
2985
0
    if (CUR_PTR < BASE_PTR)
2986
0
        return(ret);
2987
0
    startPosition = CUR_PTR - BASE_PTR;
2988
2989
0
    while ((CUR != 0) && (CUR != quote)) {
2990
        /* TODO: Handle UTF-8 */
2991
0
        if (!IS_CHAR_CH(CUR)) {
2992
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2993
0
                            "Invalid char in SystemLiteral 0x%X\n", CUR);
2994
0
            err = 1;
2995
0
        }
2996
0
        NEXT;
2997
0
        len++;
2998
0
    }
2999
0
    if (CUR != quote) {
3000
0
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3001
0
                     "Unfinished SystemLiteral\n", NULL, NULL);
3002
0
    } else {
3003
0
        NEXT;
3004
0
        if (err == 0)
3005
0
            ret = xmlStrndup((BASE_PTR+startPosition), len);
3006
0
    }
3007
3008
0
    return(ret);
3009
0
}
3010
3011
/**
3012
 * htmlParsePubidLiteral:
3013
 * @ctxt:  an HTML parser context
3014
 *
3015
 * parse an HTML public literal
3016
 *
3017
 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
3018
 *
3019
 * Returns the PubidLiteral parsed or NULL.
3020
 */
3021
3022
static xmlChar *
3023
0
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
3024
0
    size_t len = 0, startPosition = 0;
3025
0
    int err = 0;
3026
0
    int quote;
3027
0
    xmlChar *ret = NULL;
3028
3029
0
    if ((CUR != '"') && (CUR != '\'')) {
3030
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
3031
0
               "PubidLiteral \" or ' expected\n", NULL, NULL);
3032
0
        return(NULL);
3033
0
    }
3034
0
    quote = CUR;
3035
0
    NEXT;
3036
3037
    /*
3038
     * Name ::= (Letter | '_') (NameChar)*
3039
     */
3040
0
    if (CUR_PTR < BASE_PTR)
3041
0
        return(ret);
3042
0
    startPosition = CUR_PTR - BASE_PTR;
3043
3044
0
    while ((CUR != 0) && (CUR != quote)) {
3045
0
        if (!IS_PUBIDCHAR_CH(CUR)) {
3046
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3047
0
                            "Invalid char in PubidLiteral 0x%X\n", CUR);
3048
0
            err = 1;
3049
0
        }
3050
0
        len++;
3051
0
        NEXT;
3052
0
    }
3053
3054
0
    if (CUR != quote) {
3055
0
        htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
3056
0
                     "Unfinished PubidLiteral\n", NULL, NULL);
3057
0
    } else {
3058
0
        NEXT;
3059
0
        if (err == 0)
3060
0
            ret = xmlStrndup((BASE_PTR + startPosition), len);
3061
0
    }
3062
3063
0
    return(ret);
3064
0
}
3065
3066
/**
3067
 * htmlParseScript:
3068
 * @ctxt:  an HTML parser context
3069
 *
3070
 * parse the content of an HTML SCRIPT or STYLE element
3071
 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
3072
 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
3073
 * http://www.w3.org/TR/html4/types.html#type-script
3074
 * http://www.w3.org/TR/html4/types.html#h-6.15
3075
 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
3076
 *
3077
 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
3078
 * element and the value of intrinsic event attributes. User agents must
3079
 * not evaluate script data as HTML markup but instead must pass it on as
3080
 * data to a script engine.
3081
 * NOTES:
3082
 * - The content is passed like CDATA
3083
 * - the attributes for style and scripting "onXXX" are also described
3084
 *   as CDATA but SGML allows entities references in attributes so their
3085
 *   processing is identical as other attributes
3086
 */
3087
static void
3088
0
htmlParseScript(htmlParserCtxtPtr ctxt) {
3089
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
3090
0
    int nbchar = 0;
3091
0
    int cur,l;
3092
3093
0
    SHRINK;
3094
0
    cur = CUR_CHAR(l);
3095
0
    while (cur != 0) {
3096
0
  if ((cur == '<') && (NXT(1) == '/')) {
3097
            /*
3098
             * One should break here, the specification is clear:
3099
             * Authors should therefore escape "</" within the content.
3100
             * Escape mechanisms are specific to each scripting or
3101
             * style sheet language.
3102
             *
3103
             * In recovery mode, only break if end tag match the
3104
             * current tag, effectively ignoring all tags inside the
3105
             * script/style block and treating the entire block as
3106
             * CDATA.
3107
             */
3108
0
            if (ctxt->recovery) {
3109
0
                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
3110
0
           xmlStrlen(ctxt->name)) == 0)
3111
0
                {
3112
0
                    break; /* while */
3113
0
                } else {
3114
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
3115
0
         "Element %s embeds close tag\n",
3116
0
                     ctxt->name, NULL);
3117
0
    }
3118
0
            } else {
3119
0
                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
3120
0
                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
3121
0
                {
3122
0
                    break; /* while */
3123
0
                }
3124
0
            }
3125
0
  }
3126
0
        if (IS_CHAR(cur)) {
3127
0
      COPY_BUF(l,buf,nbchar,cur);
3128
0
        } else {
3129
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3130
0
                            "Invalid char in CDATA 0x%X\n", cur);
3131
0
        }
3132
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3133
0
            buf[nbchar] = 0;
3134
0
      if (ctxt->sax->cdataBlock!= NULL) {
3135
    /*
3136
     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3137
     */
3138
0
    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3139
0
      } else if (ctxt->sax->characters != NULL) {
3140
0
    ctxt->sax->characters(ctxt->userData, buf, nbchar);
3141
0
      }
3142
0
      nbchar = 0;
3143
0
  }
3144
0
  GROW;
3145
0
  NEXTL(l);
3146
0
  cur = CUR_CHAR(l);
3147
0
    }
3148
3149
0
    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3150
0
        buf[nbchar] = 0;
3151
0
  if (ctxt->sax->cdataBlock!= NULL) {
3152
      /*
3153
       * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
3154
       */
3155
0
      ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
3156
0
  } else if (ctxt->sax->characters != NULL) {
3157
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3158
0
  }
3159
0
    }
3160
0
}
3161
3162
3163
/**
3164
 * htmlParseCharDataInternal:
3165
 * @ctxt:  an HTML parser context
3166
 * @readahead: optional read ahead character in ascii range
3167
 *
3168
 * parse a CharData section.
3169
 * if we are within a CDATA section ']]>' marks an end of section.
3170
 *
3171
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3172
 */
3173
3174
static void
3175
0
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3176
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3177
0
    int nbchar = 0;
3178
0
    int cur, l;
3179
0
    int chunk = 0;
3180
3181
0
    if (readahead)
3182
0
        buf[nbchar++] = readahead;
3183
3184
0
    SHRINK;
3185
0
    cur = CUR_CHAR(l);
3186
0
    while (((cur != '<') || (ctxt->token == '<')) &&
3187
0
           ((cur != '&') || (ctxt->token == '&')) &&
3188
0
     (cur != 0)) {
3189
0
  if (!(IS_CHAR(cur))) {
3190
0
      htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3191
0
                  "Invalid char in CDATA 0x%X\n", cur);
3192
0
  } else {
3193
0
      COPY_BUF(l,buf,nbchar,cur);
3194
0
  }
3195
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3196
0
            buf[nbchar] = 0;
3197
3198
      /*
3199
       * Ok the segment is to be consumed as chars.
3200
       */
3201
0
      if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3202
0
    if (areBlanks(ctxt, buf, nbchar)) {
3203
0
        if (ctxt->keepBlanks) {
3204
0
      if (ctxt->sax->characters != NULL)
3205
0
          ctxt->sax->characters(ctxt->userData, buf, nbchar);
3206
0
        } else {
3207
0
      if (ctxt->sax->ignorableWhitespace != NULL)
3208
0
          ctxt->sax->ignorableWhitespace(ctxt->userData,
3209
0
                                         buf, nbchar);
3210
0
        }
3211
0
    } else {
3212
0
        htmlCheckParagraph(ctxt);
3213
0
        if (ctxt->sax->characters != NULL)
3214
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3215
0
    }
3216
0
      }
3217
0
      nbchar = 0;
3218
0
  }
3219
0
  NEXTL(l);
3220
0
        chunk++;
3221
0
        if (chunk > HTML_PARSER_BUFFER_SIZE) {
3222
0
            chunk = 0;
3223
0
            SHRINK;
3224
0
            GROW;
3225
0
        }
3226
0
  cur = CUR_CHAR(l);
3227
0
  if (cur == 0) {
3228
0
      SHRINK;
3229
0
      GROW;
3230
0
      cur = CUR_CHAR(l);
3231
0
  }
3232
0
    }
3233
0
    if (nbchar != 0) {
3234
0
        buf[nbchar] = 0;
3235
3236
  /*
3237
   * Ok the segment is to be consumed as chars.
3238
   */
3239
0
  if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3240
0
      if (areBlanks(ctxt, buf, nbchar)) {
3241
0
    if (ctxt->keepBlanks) {
3242
0
        if (ctxt->sax->characters != NULL)
3243
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3244
0
    } else {
3245
0
        if (ctxt->sax->ignorableWhitespace != NULL)
3246
0
      ctxt->sax->ignorableWhitespace(ctxt->userData,
3247
0
                                     buf, nbchar);
3248
0
    }
3249
0
      } else {
3250
0
    htmlCheckParagraph(ctxt);
3251
0
    if (ctxt->sax->characters != NULL)
3252
0
        ctxt->sax->characters(ctxt->userData, buf, nbchar);
3253
0
      }
3254
0
  }
3255
0
    } else {
3256
  /*
3257
   * Loop detection
3258
   */
3259
0
  if (cur == 0)
3260
0
      ctxt->instate = XML_PARSER_EOF;
3261
0
    }
3262
0
}
3263
3264
/**
3265
 * htmlParseCharData:
3266
 * @ctxt:  an HTML parser context
3267
 *
3268
 * parse a CharData section.
3269
 * if we are within a CDATA section ']]>' marks an end of section.
3270
 *
3271
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3272
 */
3273
3274
static void
3275
0
htmlParseCharData(htmlParserCtxtPtr ctxt) {
3276
0
    htmlParseCharDataInternal(ctxt, 0);
3277
0
}
3278
3279
/**
3280
 * htmlParseExternalID:
3281
 * @ctxt:  an HTML parser context
3282
 * @publicID:  a xmlChar** receiving PubidLiteral
3283
 *
3284
 * Parse an External ID or a Public ID
3285
 *
3286
 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3287
 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3288
 *
3289
 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3290
 *
3291
 * Returns the function returns SystemLiteral and in the second
3292
 *                case publicID receives PubidLiteral, is strict is off
3293
 *                it is possible to return NULL and have publicID set.
3294
 */
3295
3296
static xmlChar *
3297
0
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3298
0
    xmlChar *URI = NULL;
3299
3300
0
    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3301
0
         (UPP(2) == 'S') && (UPP(3) == 'T') &&
3302
0
   (UPP(4) == 'E') && (UPP(5) == 'M')) {
3303
0
        SKIP(6);
3304
0
  if (!IS_BLANK_CH(CUR)) {
3305
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3306
0
                   "Space required after 'SYSTEM'\n", NULL, NULL);
3307
0
  }
3308
0
        SKIP_BLANKS;
3309
0
  URI = htmlParseSystemLiteral(ctxt);
3310
0
  if (URI == NULL) {
3311
0
      htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3312
0
                   "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3313
0
        }
3314
0
    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3315
0
         (UPP(2) == 'B') && (UPP(3) == 'L') &&
3316
0
         (UPP(4) == 'I') && (UPP(5) == 'C')) {
3317
0
        SKIP(6);
3318
0
  if (!IS_BLANK_CH(CUR)) {
3319
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3320
0
                   "Space required after 'PUBLIC'\n", NULL, NULL);
3321
0
  }
3322
0
        SKIP_BLANKS;
3323
0
  *publicID = htmlParsePubidLiteral(ctxt);
3324
0
  if (*publicID == NULL) {
3325
0
      htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3326
0
                   "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3327
0
       NULL, NULL);
3328
0
  }
3329
0
        SKIP_BLANKS;
3330
0
        if ((CUR == '"') || (CUR == '\'')) {
3331
0
      URI = htmlParseSystemLiteral(ctxt);
3332
0
  }
3333
0
    }
3334
0
    return(URI);
3335
0
}
3336
3337
/**
3338
 * xmlParsePI:
3339
 * @ctxt:  an XML parser context
3340
 *
3341
 * parse an XML Processing Instruction.
3342
 *
3343
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3344
 */
3345
static void
3346
0
htmlParsePI(htmlParserCtxtPtr ctxt) {
3347
0
    xmlChar *buf = NULL;
3348
0
    int len = 0;
3349
0
    int size = HTML_PARSER_BUFFER_SIZE;
3350
0
    int cur, l;
3351
0
    const xmlChar *target;
3352
0
    xmlParserInputState state;
3353
0
    int count = 0;
3354
3355
0
    if ((RAW == '<') && (NXT(1) == '?')) {
3356
0
  state = ctxt->instate;
3357
0
        ctxt->instate = XML_PARSER_PI;
3358
  /*
3359
   * this is a Processing Instruction.
3360
   */
3361
0
  SKIP(2);
3362
0
  SHRINK;
3363
3364
  /*
3365
   * Parse the target name and check for special support like
3366
   * namespace.
3367
   */
3368
0
        target = htmlParseName(ctxt);
3369
0
  if (target != NULL) {
3370
0
      if (RAW == '>') {
3371
0
    SKIP(1);
3372
3373
    /*
3374
     * SAX: PI detected.
3375
     */
3376
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3377
0
        (ctxt->sax->processingInstruction != NULL))
3378
0
        ctxt->sax->processingInstruction(ctxt->userData,
3379
0
                                         target, NULL);
3380
0
    ctxt->instate = state;
3381
0
    return;
3382
0
      }
3383
0
      buf = (xmlChar *) xmlMallocAtomic(size);
3384
0
      if (buf == NULL) {
3385
0
    htmlErrMemory(ctxt, NULL);
3386
0
    ctxt->instate = state;
3387
0
    return;
3388
0
      }
3389
0
      cur = CUR;
3390
0
      if (!IS_BLANK(cur)) {
3391
0
    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3392
0
        "ParsePI: PI %s space expected\n", target, NULL);
3393
0
      }
3394
0
            SKIP_BLANKS;
3395
0
      cur = CUR_CHAR(l);
3396
0
      while ((cur != 0) && (cur != '>')) {
3397
0
    if (len + 5 >= size) {
3398
0
        xmlChar *tmp;
3399
3400
0
        size *= 2;
3401
0
        tmp = (xmlChar *) xmlRealloc(buf, size);
3402
0
        if (tmp == NULL) {
3403
0
      htmlErrMemory(ctxt, NULL);
3404
0
      xmlFree(buf);
3405
0
      ctxt->instate = state;
3406
0
      return;
3407
0
        }
3408
0
        buf = tmp;
3409
0
    }
3410
0
    count++;
3411
0
    if (count > 50) {
3412
0
        GROW;
3413
0
        count = 0;
3414
0
    }
3415
0
                if (IS_CHAR(cur)) {
3416
0
        COPY_BUF(l,buf,len,cur);
3417
0
                } else {
3418
0
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3419
0
                                    "Invalid char in processing instruction "
3420
0
                                    "0x%X\n", cur);
3421
0
                }
3422
0
    NEXTL(l);
3423
0
    cur = CUR_CHAR(l);
3424
0
    if (cur == 0) {
3425
0
        SHRINK;
3426
0
        GROW;
3427
0
        cur = CUR_CHAR(l);
3428
0
    }
3429
0
      }
3430
0
      buf[len] = 0;
3431
0
      if (cur != '>') {
3432
0
    htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3433
0
          "ParsePI: PI %s never end ...\n", target, NULL);
3434
0
      } else {
3435
0
    SKIP(1);
3436
3437
    /*
3438
     * SAX: PI detected.
3439
     */
3440
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3441
0
        (ctxt->sax->processingInstruction != NULL))
3442
0
        ctxt->sax->processingInstruction(ctxt->userData,
3443
0
                                         target, buf);
3444
0
      }
3445
0
      xmlFree(buf);
3446
0
  } else {
3447
0
      htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3448
0
                         "PI is not started correctly", NULL, NULL);
3449
0
  }
3450
0
  ctxt->instate = state;
3451
0
    }
3452
0
}
3453
3454
/**
3455
 * htmlParseComment:
3456
 * @ctxt:  an HTML parser context
3457
 *
3458
 * Parse an XML (SGML) comment <!-- .... -->
3459
 *
3460
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3461
 */
3462
static void
3463
0
htmlParseComment(htmlParserCtxtPtr ctxt) {
3464
0
    xmlChar *buf = NULL;
3465
0
    int len;
3466
0
    int size = HTML_PARSER_BUFFER_SIZE;
3467
0
    int q, ql;
3468
0
    int r, rl;
3469
0
    int cur, l;
3470
0
    int next, nl;
3471
0
    xmlParserInputState state;
3472
3473
    /*
3474
     * Check that there is a comment right here.
3475
     */
3476
0
    if ((RAW != '<') || (NXT(1) != '!') ||
3477
0
        (NXT(2) != '-') || (NXT(3) != '-')) return;
3478
3479
0
    state = ctxt->instate;
3480
0
    ctxt->instate = XML_PARSER_COMMENT;
3481
0
    SHRINK;
3482
0
    SKIP(4);
3483
0
    buf = (xmlChar *) xmlMallocAtomic(size);
3484
0
    if (buf == NULL) {
3485
0
        htmlErrMemory(ctxt, "buffer allocation failed\n");
3486
0
  ctxt->instate = state;
3487
0
  return;
3488
0
    }
3489
0
    len = 0;
3490
0
    buf[len] = 0;
3491
0
    q = CUR_CHAR(ql);
3492
0
    if (q == 0)
3493
0
        goto unfinished;
3494
0
    if (q == '>') {
3495
0
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3496
0
        cur = '>';
3497
0
        goto finished;
3498
0
    }
3499
0
    NEXTL(ql);
3500
0
    r = CUR_CHAR(rl);
3501
0
    if (r == 0)
3502
0
        goto unfinished;
3503
0
    if (q == '-' && r == '>') {
3504
0
        htmlParseErr(ctxt, XML_ERR_COMMENT_ABRUPTLY_ENDED, "Comment abruptly ended", NULL, NULL);
3505
0
        cur = '>';
3506
0
        goto finished;
3507
0
    }
3508
0
    NEXTL(rl);
3509
0
    cur = CUR_CHAR(l);
3510
0
    while ((cur != 0) &&
3511
0
           ((cur != '>') ||
3512
0
      (r != '-') || (q != '-'))) {
3513
0
  NEXTL(l);
3514
0
  next = CUR_CHAR(nl);
3515
0
  if (next == 0) {
3516
0
      SHRINK;
3517
0
      GROW;
3518
0
      next = CUR_CHAR(nl);
3519
0
  }
3520
3521
0
  if ((q == '-') && (r == '-') && (cur == '!') && (next == '>')) {
3522
0
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3523
0
           "Comment incorrectly closed by '--!>'", NULL, NULL);
3524
0
    cur = '>';
3525
0
    break;
3526
0
  }
3527
3528
0
  if (len + 5 >= size) {
3529
0
      xmlChar *tmp;
3530
3531
0
      size *= 2;
3532
0
      tmp = (xmlChar *) xmlRealloc(buf, size);
3533
0
      if (tmp == NULL) {
3534
0
          xmlFree(buf);
3535
0
          htmlErrMemory(ctxt, "growing buffer failed\n");
3536
0
    ctxt->instate = state;
3537
0
    return;
3538
0
      }
3539
0
      buf = tmp;
3540
0
  }
3541
0
        if (IS_CHAR(q)) {
3542
0
      COPY_BUF(ql,buf,len,q);
3543
0
        } else {
3544
0
            htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3545
0
                            "Invalid char in comment 0x%X\n", q);
3546
0
        }
3547
3548
0
  q = r;
3549
0
  ql = rl;
3550
0
  r = cur;
3551
0
  rl = l;
3552
0
  cur = next;
3553
0
  l = nl;
3554
0
    }
3555
0
finished:
3556
0
    buf[len] = 0;
3557
0
    if (cur == '>') {
3558
0
        NEXT;
3559
0
  if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3560
0
      (!ctxt->disableSAX))
3561
0
      ctxt->sax->comment(ctxt->userData, buf);
3562
0
  xmlFree(buf);
3563
0
  ctxt->instate = state;
3564
0
  return;
3565
0
    }
3566
3567
0
unfinished:
3568
0
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3569
0
     "Comment not terminated \n<!--%.50s\n", buf, NULL);
3570
0
    xmlFree(buf);
3571
0
}
3572
3573
/**
3574
 * htmlParseCharRef:
3575
 * @ctxt:  an HTML parser context
3576
 *
3577
 * DEPRECATED: Internal function, don't use.
3578
 *
3579
 * parse Reference declarations
3580
 *
3581
 * [66] CharRef ::= '&#' [0-9]+ ';' |
3582
 *                  '&#x' [0-9a-fA-F]+ ';'
3583
 *
3584
 * Returns the value parsed (as an int)
3585
 */
3586
int
3587
0
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3588
0
    int val = 0;
3589
3590
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3591
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3592
0
         "htmlParseCharRef: context error\n",
3593
0
         NULL, NULL);
3594
0
        return(0);
3595
0
    }
3596
0
    if ((CUR == '&') && (NXT(1) == '#') &&
3597
0
        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3598
0
  SKIP(3);
3599
0
  while (CUR != ';') {
3600
0
      if ((CUR >= '0') && (CUR <= '9')) {
3601
0
                if (val < 0x110000)
3602
0
              val = val * 16 + (CUR - '0');
3603
0
            } else if ((CUR >= 'a') && (CUR <= 'f')) {
3604
0
                if (val < 0x110000)
3605
0
              val = val * 16 + (CUR - 'a') + 10;
3606
0
            } else if ((CUR >= 'A') && (CUR <= 'F')) {
3607
0
                if (val < 0x110000)
3608
0
              val = val * 16 + (CUR - 'A') + 10;
3609
0
            } else {
3610
0
          htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3611
0
                 "htmlParseCharRef: missing semicolon\n",
3612
0
           NULL, NULL);
3613
0
    break;
3614
0
      }
3615
0
      NEXT;
3616
0
  }
3617
0
  if (CUR == ';')
3618
0
      NEXT;
3619
0
    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3620
0
  SKIP(2);
3621
0
  while (CUR != ';') {
3622
0
      if ((CUR >= '0') && (CUR <= '9')) {
3623
0
                if (val < 0x110000)
3624
0
              val = val * 10 + (CUR - '0');
3625
0
            } else {
3626
0
          htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3627
0
                 "htmlParseCharRef: missing semicolon\n",
3628
0
           NULL, NULL);
3629
0
    break;
3630
0
      }
3631
0
      NEXT;
3632
0
  }
3633
0
  if (CUR == ';')
3634
0
      NEXT;
3635
0
    } else {
3636
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3637
0
               "htmlParseCharRef: invalid value\n", NULL, NULL);
3638
0
    }
3639
    /*
3640
     * Check the value IS_CHAR ...
3641
     */
3642
0
    if (IS_CHAR(val)) {
3643
0
        return(val);
3644
0
    } else if (val >= 0x110000) {
3645
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHAR,
3646
0
         "htmlParseCharRef: value too large\n", NULL, NULL);
3647
0
    } else {
3648
0
  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3649
0
      "htmlParseCharRef: invalid xmlChar value %d\n",
3650
0
      val);
3651
0
    }
3652
0
    return(0);
3653
0
}
3654
3655
3656
/**
3657
 * htmlParseDocTypeDecl:
3658
 * @ctxt:  an HTML parser context
3659
 *
3660
 * parse a DOCTYPE declaration
3661
 *
3662
 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3663
 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3664
 */
3665
3666
static void
3667
0
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3668
0
    const xmlChar *name;
3669
0
    xmlChar *ExternalID = NULL;
3670
0
    xmlChar *URI = NULL;
3671
3672
    /*
3673
     * We know that '<!DOCTYPE' has been detected.
3674
     */
3675
0
    SKIP(9);
3676
3677
0
    SKIP_BLANKS;
3678
3679
    /*
3680
     * Parse the DOCTYPE name.
3681
     */
3682
0
    name = htmlParseName(ctxt);
3683
0
    if (name == NULL) {
3684
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3685
0
               "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3686
0
         NULL, NULL);
3687
0
    }
3688
    /*
3689
     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3690
     */
3691
3692
0
    SKIP_BLANKS;
3693
3694
    /*
3695
     * Check for SystemID and ExternalID
3696
     */
3697
0
    URI = htmlParseExternalID(ctxt, &ExternalID);
3698
0
    SKIP_BLANKS;
3699
3700
    /*
3701
     * We should be at the end of the DOCTYPE declaration.
3702
     */
3703
0
    if (CUR != '>') {
3704
0
  htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3705
0
               "DOCTYPE improperly terminated\n", NULL, NULL);
3706
        /* Ignore bogus content */
3707
0
        while ((CUR != 0) && (CUR != '>'))
3708
0
            NEXT;
3709
0
    }
3710
0
    if (CUR == '>')
3711
0
        NEXT;
3712
3713
    /*
3714
     * Create or update the document accordingly to the DOCTYPE
3715
     */
3716
0
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3717
0
  (!ctxt->disableSAX))
3718
0
  ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3719
3720
    /*
3721
     * Cleanup, since we don't use all those identifiers
3722
     */
3723
0
    if (URI != NULL) xmlFree(URI);
3724
0
    if (ExternalID != NULL) xmlFree(ExternalID);
3725
0
}
3726
3727
/**
3728
 * htmlParseAttribute:
3729
 * @ctxt:  an HTML parser context
3730
 * @value:  a xmlChar ** used to store the value of the attribute
3731
 *
3732
 * parse an attribute
3733
 *
3734
 * [41] Attribute ::= Name Eq AttValue
3735
 *
3736
 * [25] Eq ::= S? '=' S?
3737
 *
3738
 * With namespace:
3739
 *
3740
 * [NS 11] Attribute ::= QName Eq AttValue
3741
 *
3742
 * Also the case QName == xmlns:??? is handled independently as a namespace
3743
 * definition.
3744
 *
3745
 * Returns the attribute name, and the value in *value.
3746
 */
3747
3748
static const xmlChar *
3749
0
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3750
0
    const xmlChar *name;
3751
0
    xmlChar *val = NULL;
3752
3753
0
    *value = NULL;
3754
0
    name = htmlParseHTMLName(ctxt);
3755
0
    if (name == NULL) {
3756
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3757
0
               "error parsing attribute name\n", NULL, NULL);
3758
0
        return(NULL);
3759
0
    }
3760
3761
    /*
3762
     * read the value
3763
     */
3764
0
    SKIP_BLANKS;
3765
0
    if (CUR == '=') {
3766
0
        NEXT;
3767
0
  SKIP_BLANKS;
3768
0
  val = htmlParseAttValue(ctxt);
3769
0
    }
3770
3771
0
    *value = val;
3772
0
    return(name);
3773
0
}
3774
3775
/**
3776
 * htmlCheckEncodingDirect:
3777
 * @ctxt:  an HTML parser context
3778
 * @attvalue: the attribute value
3779
 *
3780
 * Checks an attribute value to detect
3781
 * the encoding
3782
 * If a new encoding is detected the parser is switched to decode
3783
 * it and pass UTF8
3784
 */
3785
static void
3786
0
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3787
3788
0
    if ((ctxt == NULL) || (encoding == NULL) ||
3789
0
        (ctxt->options & HTML_PARSE_IGNORE_ENC))
3790
0
  return;
3791
3792
    /* do not change encoding */
3793
0
    if (ctxt->input->encoding != NULL)
3794
0
        return;
3795
3796
0
    if (encoding != NULL) {
3797
0
  xmlCharEncoding enc;
3798
0
  xmlCharEncodingHandlerPtr handler;
3799
3800
0
  while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3801
3802
0
  if (ctxt->input->encoding != NULL)
3803
0
      xmlFree((xmlChar *) ctxt->input->encoding);
3804
0
  ctxt->input->encoding = xmlStrdup(encoding);
3805
3806
0
  enc = xmlParseCharEncoding((const char *) encoding);
3807
  /*
3808
   * registered set of known encodings
3809
   */
3810
0
  if (enc != XML_CHAR_ENCODING_ERROR) {
3811
0
      if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3812
0
           (enc == XML_CHAR_ENCODING_UTF16BE) ||
3813
0
     (enc == XML_CHAR_ENCODING_UCS4LE) ||
3814
0
     (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3815
0
    (ctxt->input->buf != NULL) &&
3816
0
    (ctxt->input->buf->encoder == NULL)) {
3817
0
    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3818
0
                 "htmlCheckEncoding: wrong encoding meta\n",
3819
0
           NULL, NULL);
3820
0
      } else {
3821
0
    xmlSwitchEncoding(ctxt, enc);
3822
0
      }
3823
0
      ctxt->charset = XML_CHAR_ENCODING_UTF8;
3824
0
  } else {
3825
      /*
3826
       * fallback for unknown encodings
3827
       */
3828
0
      handler = xmlFindCharEncodingHandler((const char *) encoding);
3829
0
      if (handler != NULL) {
3830
0
    xmlSwitchToEncoding(ctxt, handler);
3831
0
    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3832
0
      } else {
3833
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3834
0
                 "htmlCheckEncoding: unknown encoding %s\n",
3835
0
           encoding, NULL);
3836
0
      }
3837
0
  }
3838
3839
0
  if ((ctxt->input->buf != NULL) &&
3840
0
      (ctxt->input->buf->encoder != NULL) &&
3841
0
      (ctxt->input->buf->raw != NULL) &&
3842
0
      (ctxt->input->buf->buffer != NULL)) {
3843
0
      int nbchars;
3844
0
      int processed;
3845
3846
      /*
3847
       * convert as much as possible to the parser reading buffer.
3848
       */
3849
0
      processed = ctxt->input->cur - ctxt->input->base;
3850
0
      xmlBufShrink(ctxt->input->buf->buffer, processed);
3851
0
      nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3852
0
            xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3853
0
      if (nbchars < 0) {
3854
0
    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3855
0
                 "htmlCheckEncoding: encoder error\n",
3856
0
           NULL, NULL);
3857
0
      }
3858
0
  }
3859
0
    }
3860
0
}
3861
3862
/**
3863
 * htmlCheckEncoding:
3864
 * @ctxt:  an HTML parser context
3865
 * @attvalue: the attribute value
3866
 *
3867
 * Checks an http-equiv attribute from a Meta tag to detect
3868
 * the encoding
3869
 * If a new encoding is detected the parser is switched to decode
3870
 * it and pass UTF8
3871
 */
3872
static void
3873
0
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3874
0
    const xmlChar *encoding;
3875
3876
0
    if (!attvalue)
3877
0
  return;
3878
3879
0
    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3880
0
    if (encoding != NULL) {
3881
0
  encoding += 7;
3882
0
    }
3883
    /*
3884
     * skip blank
3885
     */
3886
0
    if (encoding && IS_BLANK_CH(*encoding))
3887
0
  encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3888
0
    if (encoding && *encoding == '=') {
3889
0
  encoding ++;
3890
0
  htmlCheckEncodingDirect(ctxt, encoding);
3891
0
    }
3892
0
}
3893
3894
/**
3895
 * htmlCheckMeta:
3896
 * @ctxt:  an HTML parser context
3897
 * @atts:  the attributes values
3898
 *
3899
 * Checks an attributes from a Meta tag
3900
 */
3901
static void
3902
0
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3903
0
    int i;
3904
0
    const xmlChar *att, *value;
3905
0
    int http = 0;
3906
0
    const xmlChar *content = NULL;
3907
3908
0
    if ((ctxt == NULL) || (atts == NULL))
3909
0
  return;
3910
3911
0
    i = 0;
3912
0
    att = atts[i++];
3913
0
    while (att != NULL) {
3914
0
  value = atts[i++];
3915
0
  if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3916
0
   && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3917
0
      http = 1;
3918
0
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3919
0
      htmlCheckEncodingDirect(ctxt, value);
3920
0
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3921
0
      content = value;
3922
0
  att = atts[i++];
3923
0
    }
3924
0
    if ((http) && (content != NULL))
3925
0
  htmlCheckEncoding(ctxt, content);
3926
3927
0
}
3928
3929
/**
3930
 * htmlParseStartTag:
3931
 * @ctxt:  an HTML parser context
3932
 *
3933
 * parse a start of tag either for rule element or
3934
 * EmptyElement. In both case we don't parse the tag closing chars.
3935
 *
3936
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3937
 *
3938
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3939
 *
3940
 * With namespace:
3941
 *
3942
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3943
 *
3944
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3945
 *
3946
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3947
 */
3948
3949
static int
3950
0
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3951
0
    const xmlChar *name;
3952
0
    const xmlChar *attname;
3953
0
    xmlChar *attvalue;
3954
0
    const xmlChar **atts;
3955
0
    int nbatts = 0;
3956
0
    int maxatts;
3957
0
    int meta = 0;
3958
0
    int i;
3959
0
    int discardtag = 0;
3960
3961
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3962
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3963
0
         "htmlParseStartTag: context error\n", NULL, NULL);
3964
0
  return -1;
3965
0
    }
3966
0
    if (ctxt->instate == XML_PARSER_EOF)
3967
0
        return(-1);
3968
0
    if (CUR != '<') return -1;
3969
0
    NEXT;
3970
3971
0
    atts = ctxt->atts;
3972
0
    maxatts = ctxt->maxatts;
3973
3974
0
    GROW;
3975
0
    name = htmlParseHTMLName(ctxt);
3976
0
    if (name == NULL) {
3977
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3978
0
               "htmlParseStartTag: invalid element name\n",
3979
0
         NULL, NULL);
3980
  /* Dump the bogus tag like browsers do */
3981
0
  while ((CUR != 0) && (CUR != '>') &&
3982
0
               (ctxt->instate != XML_PARSER_EOF))
3983
0
      NEXT;
3984
0
        return -1;
3985
0
    }
3986
0
    if (xmlStrEqual(name, BAD_CAST"meta"))
3987
0
  meta = 1;
3988
3989
    /*
3990
     * Check for auto-closure of HTML elements.
3991
     */
3992
0
    htmlAutoClose(ctxt, name);
3993
3994
    /*
3995
     * Check for implied HTML elements.
3996
     */
3997
0
    htmlCheckImplied(ctxt, name);
3998
3999
    /*
4000
     * Avoid html at any level > 0, head at any level != 1
4001
     * or any attempt to recurse body
4002
     */
4003
0
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
4004
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4005
0
               "htmlParseStartTag: misplaced <html> tag\n",
4006
0
         name, NULL);
4007
0
  discardtag = 1;
4008
0
  ctxt->depth++;
4009
0
    }
4010
0
    if ((ctxt->nameNr != 1) &&
4011
0
  (xmlStrEqual(name, BAD_CAST"head"))) {
4012
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4013
0
               "htmlParseStartTag: misplaced <head> tag\n",
4014
0
         name, NULL);
4015
0
  discardtag = 1;
4016
0
  ctxt->depth++;
4017
0
    }
4018
0
    if (xmlStrEqual(name, BAD_CAST"body")) {
4019
0
  int indx;
4020
0
  for (indx = 0;indx < ctxt->nameNr;indx++) {
4021
0
      if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
4022
0
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4023
0
                 "htmlParseStartTag: misplaced <body> tag\n",
4024
0
           name, NULL);
4025
0
    discardtag = 1;
4026
0
    ctxt->depth++;
4027
0
      }
4028
0
  }
4029
0
    }
4030
4031
    /*
4032
     * Now parse the attributes, it ends up with the ending
4033
     *
4034
     * (S Attribute)* S?
4035
     */
4036
0
    SKIP_BLANKS;
4037
0
    while ((CUR != 0) &&
4038
0
           (CUR != '>') &&
4039
0
     ((CUR != '/') || (NXT(1) != '>'))) {
4040
0
  GROW;
4041
0
  attname = htmlParseAttribute(ctxt, &attvalue);
4042
0
        if (attname != NULL) {
4043
4044
      /*
4045
       * Well formedness requires at most one declaration of an attribute
4046
       */
4047
0
      for (i = 0; i < nbatts;i += 2) {
4048
0
          if (xmlStrEqual(atts[i], attname)) {
4049
0
        htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
4050
0
                     "Attribute %s redefined\n", attname, NULL);
4051
0
        if (attvalue != NULL)
4052
0
      xmlFree(attvalue);
4053
0
        goto failed;
4054
0
    }
4055
0
      }
4056
4057
      /*
4058
       * Add the pair to atts
4059
       */
4060
0
      if (atts == NULL) {
4061
0
          maxatts = 22; /* allow for 10 attrs by default */
4062
0
          atts = (const xmlChar **)
4063
0
           xmlMalloc(maxatts * sizeof(xmlChar *));
4064
0
    if (atts == NULL) {
4065
0
        htmlErrMemory(ctxt, NULL);
4066
0
        if (attvalue != NULL)
4067
0
      xmlFree(attvalue);
4068
0
        goto failed;
4069
0
    }
4070
0
    ctxt->atts = atts;
4071
0
    ctxt->maxatts = maxatts;
4072
0
      } else if (nbatts + 4 > maxatts) {
4073
0
          const xmlChar **n;
4074
4075
0
          maxatts *= 2;
4076
0
          n = (const xmlChar **) xmlRealloc((void *) atts,
4077
0
               maxatts * sizeof(const xmlChar *));
4078
0
    if (n == NULL) {
4079
0
        htmlErrMemory(ctxt, NULL);
4080
0
        if (attvalue != NULL)
4081
0
      xmlFree(attvalue);
4082
0
        goto failed;
4083
0
    }
4084
0
    atts = n;
4085
0
    ctxt->atts = atts;
4086
0
    ctxt->maxatts = maxatts;
4087
0
      }
4088
0
      atts[nbatts++] = attname;
4089
0
      atts[nbatts++] = attvalue;
4090
0
      atts[nbatts] = NULL;
4091
0
      atts[nbatts + 1] = NULL;
4092
0
  }
4093
0
  else {
4094
0
      if (attvalue != NULL)
4095
0
          xmlFree(attvalue);
4096
      /* Dump the bogus attribute string up to the next blank or
4097
       * the end of the tag. */
4098
0
      while ((CUR != 0) &&
4099
0
             !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
4100
0
       ((CUR != '/') || (NXT(1) != '>')))
4101
0
    NEXT;
4102
0
  }
4103
4104
0
failed:
4105
0
  SKIP_BLANKS;
4106
0
    }
4107
4108
    /*
4109
     * Handle specific association to the META tag
4110
     */
4111
0
    if (meta && (nbatts != 0))
4112
0
  htmlCheckMeta(ctxt, atts);
4113
4114
    /*
4115
     * SAX: Start of Element !
4116
     */
4117
0
    if (!discardtag) {
4118
0
  htmlnamePush(ctxt, name);
4119
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
4120
0
      if (nbatts != 0)
4121
0
    ctxt->sax->startElement(ctxt->userData, name, atts);
4122
0
      else
4123
0
    ctxt->sax->startElement(ctxt->userData, name, NULL);
4124
0
  }
4125
0
    }
4126
4127
0
    if (atts != NULL) {
4128
0
        for (i = 1;i < nbatts;i += 2) {
4129
0
      if (atts[i] != NULL)
4130
0
    xmlFree((xmlChar *) atts[i]);
4131
0
  }
4132
0
    }
4133
4134
0
    return(discardtag);
4135
0
}
4136
4137
/**
4138
 * htmlParseEndTag:
4139
 * @ctxt:  an HTML parser context
4140
 *
4141
 * parse an end of tag
4142
 *
4143
 * [42] ETag ::= '</' Name S? '>'
4144
 *
4145
 * With namespace
4146
 *
4147
 * [NS 9] ETag ::= '</' QName S? '>'
4148
 *
4149
 * Returns 1 if the current level should be closed.
4150
 */
4151
4152
static int
4153
htmlParseEndTag(htmlParserCtxtPtr ctxt)
4154
0
{
4155
0
    const xmlChar *name;
4156
0
    const xmlChar *oldname;
4157
0
    int i, ret;
4158
4159
0
    if ((CUR != '<') || (NXT(1) != '/')) {
4160
0
        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
4161
0
               "htmlParseEndTag: '</' not found\n", NULL, NULL);
4162
0
        return (0);
4163
0
    }
4164
0
    SKIP(2);
4165
4166
0
    name = htmlParseHTMLName(ctxt);
4167
0
    if (name == NULL)
4168
0
        return (0);
4169
    /*
4170
     * We should definitely be at the ending "S? '>'" part
4171
     */
4172
0
    SKIP_BLANKS;
4173
0
    if (CUR != '>') {
4174
0
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4175
0
               "End tag : expected '>'\n", NULL, NULL);
4176
        /* Skip to next '>' */
4177
0
        while ((CUR != 0) && (CUR != '>'))
4178
0
            NEXT;
4179
0
    }
4180
0
    if (CUR == '>')
4181
0
        NEXT;
4182
4183
    /*
4184
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
4185
     * out now.
4186
     */
4187
0
    if ((ctxt->depth > 0) &&
4188
0
        (xmlStrEqual(name, BAD_CAST "html") ||
4189
0
         xmlStrEqual(name, BAD_CAST "body") ||
4190
0
   xmlStrEqual(name, BAD_CAST "head"))) {
4191
0
  ctxt->depth--;
4192
0
  return (0);
4193
0
    }
4194
4195
    /*
4196
     * If the name read is not one of the element in the parsing stack
4197
     * then return, it's just an error.
4198
     */
4199
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4200
0
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4201
0
            break;
4202
0
    }
4203
0
    if (i < 0) {
4204
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4205
0
               "Unexpected end tag : %s\n", name, NULL);
4206
0
        return (0);
4207
0
    }
4208
4209
4210
    /*
4211
     * Check for auto-closure of HTML elements.
4212
     */
4213
4214
0
    htmlAutoCloseOnClose(ctxt, name);
4215
4216
    /*
4217
     * Well formedness constraints, opening and closing must match.
4218
     * With the exception that the autoclose may have popped stuff out
4219
     * of the stack.
4220
     */
4221
0
    if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4222
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4223
0
                     "Opening and ending tag mismatch: %s and %s\n",
4224
0
                     name, ctxt->name);
4225
0
    }
4226
4227
    /*
4228
     * SAX: End of Tag
4229
     */
4230
0
    oldname = ctxt->name;
4231
0
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4232
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4233
0
            ctxt->sax->endElement(ctxt->userData, name);
4234
0
  htmlNodeInfoPop(ctxt);
4235
0
        htmlnamePop(ctxt);
4236
0
        ret = 1;
4237
0
    } else {
4238
0
        ret = 0;
4239
0
    }
4240
4241
0
    return (ret);
4242
0
}
4243
4244
4245
/**
4246
 * htmlParseReference:
4247
 * @ctxt:  an HTML parser context
4248
 *
4249
 * parse and handle entity references in content,
4250
 * this will end-up in a call to character() since this is either a
4251
 * CharRef, or a predefined entity.
4252
 */
4253
static void
4254
0
htmlParseReference(htmlParserCtxtPtr ctxt) {
4255
0
    const htmlEntityDesc * ent;
4256
0
    xmlChar out[6];
4257
0
    const xmlChar *name;
4258
0
    if (CUR != '&') return;
4259
4260
0
    if (NXT(1) == '#') {
4261
0
  unsigned int c;
4262
0
  int bits, i = 0;
4263
4264
0
  c = htmlParseCharRef(ctxt);
4265
0
  if (c == 0)
4266
0
      return;
4267
4268
0
        if      (c <    0x80) { out[i++]= c;                bits= -6; }
4269
0
        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4270
0
        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4271
0
        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4272
4273
0
        for ( ; bits >= 0; bits-= 6) {
4274
0
            out[i++]= ((c >> bits) & 0x3F) | 0x80;
4275
0
        }
4276
0
  out[i] = 0;
4277
4278
0
  htmlCheckParagraph(ctxt);
4279
0
  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4280
0
      ctxt->sax->characters(ctxt->userData, out, i);
4281
0
    } else {
4282
0
  ent = htmlParseEntityRef(ctxt, &name);
4283
0
  if (name == NULL) {
4284
0
      htmlCheckParagraph(ctxt);
4285
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4286
0
          ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4287
0
      return;
4288
0
  }
4289
0
  if ((ent == NULL) || !(ent->value > 0)) {
4290
0
      htmlCheckParagraph(ctxt);
4291
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4292
0
    ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4293
0
    ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4294
    /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4295
0
      }
4296
0
  } else {
4297
0
      unsigned int c;
4298
0
      int bits, i = 0;
4299
4300
0
      c = ent->value;
4301
0
      if      (c <    0x80)
4302
0
              { out[i++]= c;                bits= -6; }
4303
0
      else if (c <   0x800)
4304
0
              { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4305
0
      else if (c < 0x10000)
4306
0
              { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4307
0
      else
4308
0
              { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4309
4310
0
      for ( ; bits >= 0; bits-= 6) {
4311
0
    out[i++]= ((c >> bits) & 0x3F) | 0x80;
4312
0
      }
4313
0
      out[i] = 0;
4314
4315
0
      htmlCheckParagraph(ctxt);
4316
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4317
0
    ctxt->sax->characters(ctxt->userData, out, i);
4318
0
  }
4319
0
    }
4320
0
}
4321
4322
/**
4323
 * htmlParseContent:
4324
 * @ctxt:  an HTML parser context
4325
 *
4326
 * Parse a content: comment, sub-element, reference or text.
4327
 * Kept for compatibility with old code
4328
 */
4329
4330
static void
4331
0
htmlParseContent(htmlParserCtxtPtr ctxt) {
4332
0
    xmlChar *currentNode;
4333
0
    int depth;
4334
0
    const xmlChar *name;
4335
4336
0
    currentNode = xmlStrdup(ctxt->name);
4337
0
    depth = ctxt->nameNr;
4338
0
    while (1) {
4339
0
        GROW;
4340
4341
0
        if (ctxt->instate == XML_PARSER_EOF)
4342
0
            break;
4343
4344
  /*
4345
   * Our tag or one of it's parent or children is ending.
4346
   */
4347
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4348
0
      if (htmlParseEndTag(ctxt) &&
4349
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4350
0
    if (currentNode != NULL)
4351
0
        xmlFree(currentNode);
4352
0
    return;
4353
0
      }
4354
0
      continue; /* while */
4355
0
        }
4356
4357
0
  else if ((CUR == '<') &&
4358
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4359
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4360
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4361
0
      if (name == NULL) {
4362
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4363
0
       "htmlParseStartTag: invalid element name\n",
4364
0
       NULL, NULL);
4365
          /* Dump the bogus tag like browsers do */
4366
0
                while ((CUR != 0) && (CUR != '>'))
4367
0
              NEXT;
4368
4369
0
          if (currentNode != NULL)
4370
0
              xmlFree(currentNode);
4371
0
          return;
4372
0
      }
4373
4374
0
      if (ctxt->name != NULL) {
4375
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4376
0
              htmlAutoClose(ctxt, name);
4377
0
              continue;
4378
0
          }
4379
0
      }
4380
0
  }
4381
4382
  /*
4383
   * Has this node been popped out during parsing of
4384
   * the next element
4385
   */
4386
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4387
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4388
0
       {
4389
0
      if (currentNode != NULL) xmlFree(currentNode);
4390
0
      return;
4391
0
  }
4392
4393
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4394
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4395
      /*
4396
       * Handle SCRIPT/STYLE separately
4397
       */
4398
0
      htmlParseScript(ctxt);
4399
0
  }
4400
4401
0
        else if ((CUR == '<') && (NXT(1) == '!')) {
4402
            /*
4403
             * Sometimes DOCTYPE arrives in the middle of the document
4404
             */
4405
0
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4406
0
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4407
0
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4408
0
                (UPP(8) == 'E')) {
4409
0
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4410
0
                             "Misplaced DOCTYPE declaration\n",
4411
0
                             BAD_CAST "DOCTYPE" , NULL);
4412
0
                htmlParseDocTypeDecl(ctxt);
4413
0
            }
4414
            /*
4415
             * First case :  a comment
4416
             */
4417
0
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4418
0
                htmlParseComment(ctxt);
4419
0
            }
4420
0
            else {
4421
0
                htmlSkipBogusComment(ctxt);
4422
0
            }
4423
0
        }
4424
4425
        /*
4426
         * Second case : a Processing Instruction.
4427
         */
4428
0
        else if ((CUR == '<') && (NXT(1) == '?')) {
4429
0
            htmlParsePI(ctxt);
4430
0
        }
4431
4432
        /*
4433
         * Third case :  a sub-element.
4434
         */
4435
0
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4436
0
            htmlParseElement(ctxt);
4437
0
        }
4438
0
        else if (CUR == '<') {
4439
0
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4440
0
                (ctxt->sax->characters != NULL))
4441
0
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4442
0
            NEXT;
4443
0
        }
4444
4445
        /*
4446
         * Fourth case : a reference. If if has not been resolved,
4447
         *    parsing returns it's Name, create the node
4448
         */
4449
0
        else if (CUR == '&') {
4450
0
            htmlParseReference(ctxt);
4451
0
        }
4452
4453
        /*
4454
         * Fifth case : end of the resource
4455
         */
4456
0
        else if (CUR == 0) {
4457
0
            htmlAutoCloseOnEnd(ctxt);
4458
0
            break;
4459
0
        }
4460
4461
        /*
4462
         * Last case, text. Note that References are handled directly.
4463
         */
4464
0
        else {
4465
0
            htmlParseCharData(ctxt);
4466
0
        }
4467
0
        GROW;
4468
0
    }
4469
0
    if (currentNode != NULL) xmlFree(currentNode);
4470
0
}
4471
4472
/**
4473
 * htmlParseElement:
4474
 * @ctxt:  an HTML parser context
4475
 *
4476
 * DEPRECATED: Internal function, don't use.
4477
 *
4478
 * parse an HTML element, this is highly recursive
4479
 * this is kept for compatibility with previous code versions
4480
 *
4481
 * [39] element ::= EmptyElemTag | STag content ETag
4482
 *
4483
 * [41] Attribute ::= Name Eq AttValue
4484
 */
4485
4486
void
4487
0
htmlParseElement(htmlParserCtxtPtr ctxt) {
4488
0
    const xmlChar *name;
4489
0
    xmlChar *currentNode = NULL;
4490
0
    const htmlElemDesc * info;
4491
0
    htmlParserNodeInfo node_info;
4492
0
    int failed;
4493
0
    int depth;
4494
0
    const xmlChar *oldptr;
4495
4496
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4497
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4498
0
         "htmlParseElement: context error\n", NULL, NULL);
4499
0
  return;
4500
0
    }
4501
4502
0
    if (ctxt->instate == XML_PARSER_EOF)
4503
0
        return;
4504
4505
    /* Capture start position */
4506
0
    if (ctxt->record_info) {
4507
0
        node_info.begin_pos = ctxt->input->consumed +
4508
0
                          (CUR_PTR - ctxt->input->base);
4509
0
  node_info.begin_line = ctxt->input->line;
4510
0
    }
4511
4512
0
    failed = htmlParseStartTag(ctxt);
4513
0
    name = ctxt->name;
4514
0
    if ((failed == -1) || (name == NULL)) {
4515
0
  if (CUR == '>')
4516
0
      NEXT;
4517
0
        return;
4518
0
    }
4519
4520
    /*
4521
     * Lookup the info for that element.
4522
     */
4523
0
    info = htmlTagLookup(name);
4524
0
    if (info == NULL) {
4525
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4526
0
               "Tag %s invalid\n", name, NULL);
4527
0
    }
4528
4529
    /*
4530
     * Check for an Empty Element labeled the XML/SGML way
4531
     */
4532
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4533
0
        SKIP(2);
4534
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4535
0
      ctxt->sax->endElement(ctxt->userData, name);
4536
0
  htmlnamePop(ctxt);
4537
0
  return;
4538
0
    }
4539
4540
0
    if (CUR == '>') {
4541
0
        NEXT;
4542
0
    } else {
4543
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4544
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4545
4546
  /*
4547
   * end of parsing of this node.
4548
   */
4549
0
  if (xmlStrEqual(name, ctxt->name)) {
4550
0
      nodePop(ctxt);
4551
0
      htmlnamePop(ctxt);
4552
0
  }
4553
4554
  /*
4555
   * Capture end position and add node
4556
   */
4557
0
  if (ctxt->record_info) {
4558
0
     node_info.end_pos = ctxt->input->consumed +
4559
0
            (CUR_PTR - ctxt->input->base);
4560
0
     node_info.end_line = ctxt->input->line;
4561
0
     node_info.node = ctxt->node;
4562
0
     xmlParserAddNodeInfo(ctxt, &node_info);
4563
0
  }
4564
0
  return;
4565
0
    }
4566
4567
    /*
4568
     * Check for an Empty Element from DTD definition
4569
     */
4570
0
    if ((info != NULL) && (info->empty)) {
4571
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4572
0
      ctxt->sax->endElement(ctxt->userData, name);
4573
0
  htmlnamePop(ctxt);
4574
0
  return;
4575
0
    }
4576
4577
    /*
4578
     * Parse the content of the element:
4579
     */
4580
0
    currentNode = xmlStrdup(ctxt->name);
4581
0
    depth = ctxt->nameNr;
4582
0
    while (CUR != 0) {
4583
0
  oldptr = ctxt->input->cur;
4584
0
  htmlParseContent(ctxt);
4585
0
  if (oldptr==ctxt->input->cur) break;
4586
0
  if (ctxt->nameNr < depth) break;
4587
0
    }
4588
4589
    /*
4590
     * Capture end position and add node
4591
     */
4592
0
    if ( currentNode != NULL && ctxt->record_info ) {
4593
0
       node_info.end_pos = ctxt->input->consumed +
4594
0
                          (CUR_PTR - ctxt->input->base);
4595
0
       node_info.end_line = ctxt->input->line;
4596
0
       node_info.node = ctxt->node;
4597
0
       xmlParserAddNodeInfo(ctxt, &node_info);
4598
0
    }
4599
0
    if (CUR == 0) {
4600
0
  htmlAutoCloseOnEnd(ctxt);
4601
0
    }
4602
4603
0
    if (currentNode != NULL)
4604
0
  xmlFree(currentNode);
4605
0
}
4606
4607
static void
4608
0
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4609
    /*
4610
     * Capture end position and add node
4611
     */
4612
0
    if ( ctxt->node != NULL && ctxt->record_info ) {
4613
0
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4614
0
                                (CUR_PTR - ctxt->input->base);
4615
0
       ctxt->nodeInfo->end_line = ctxt->input->line;
4616
0
       ctxt->nodeInfo->node = ctxt->node;
4617
0
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4618
0
       htmlNodeInfoPop(ctxt);
4619
0
    }
4620
0
    if (CUR == 0) {
4621
0
       htmlAutoCloseOnEnd(ctxt);
4622
0
    }
4623
0
}
4624
4625
/**
4626
 * htmlParseElementInternal:
4627
 * @ctxt:  an HTML parser context
4628
 *
4629
 * parse an HTML element, new version, non recursive
4630
 *
4631
 * [39] element ::= EmptyElemTag | STag content ETag
4632
 *
4633
 * [41] Attribute ::= Name Eq AttValue
4634
 */
4635
4636
static void
4637
0
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4638
0
    const xmlChar *name;
4639
0
    const htmlElemDesc * info;
4640
0
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4641
0
    int failed;
4642
4643
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4644
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4645
0
         "htmlParseElementInternal: context error\n", NULL, NULL);
4646
0
  return;
4647
0
    }
4648
4649
0
    if (ctxt->instate == XML_PARSER_EOF)
4650
0
        return;
4651
4652
    /* Capture start position */
4653
0
    if (ctxt->record_info) {
4654
0
        node_info.begin_pos = ctxt->input->consumed +
4655
0
                          (CUR_PTR - ctxt->input->base);
4656
0
  node_info.begin_line = ctxt->input->line;
4657
0
    }
4658
4659
0
    failed = htmlParseStartTag(ctxt);
4660
0
    name = ctxt->name;
4661
0
    if ((failed == -1) || (name == NULL)) {
4662
0
  if (CUR == '>')
4663
0
      NEXT;
4664
0
        return;
4665
0
    }
4666
4667
    /*
4668
     * Lookup the info for that element.
4669
     */
4670
0
    info = htmlTagLookup(name);
4671
0
    if (info == NULL) {
4672
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4673
0
               "Tag %s invalid\n", name, NULL);
4674
0
    }
4675
4676
    /*
4677
     * Check for an Empty Element labeled the XML/SGML way
4678
     */
4679
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4680
0
        SKIP(2);
4681
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4682
0
      ctxt->sax->endElement(ctxt->userData, name);
4683
0
  htmlnamePop(ctxt);
4684
0
  return;
4685
0
    }
4686
4687
0
    if (CUR == '>') {
4688
0
        NEXT;
4689
0
    } else {
4690
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4691
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4692
4693
  /*
4694
   * end of parsing of this node.
4695
   */
4696
0
  if (xmlStrEqual(name, ctxt->name)) {
4697
0
      nodePop(ctxt);
4698
0
      htmlnamePop(ctxt);
4699
0
  }
4700
4701
0
        if (ctxt->record_info)
4702
0
            htmlNodeInfoPush(ctxt, &node_info);
4703
0
        htmlParserFinishElementParsing(ctxt);
4704
0
  return;
4705
0
    }
4706
4707
    /*
4708
     * Check for an Empty Element from DTD definition
4709
     */
4710
0
    if ((info != NULL) && (info->empty)) {
4711
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4712
0
      ctxt->sax->endElement(ctxt->userData, name);
4713
0
  htmlnamePop(ctxt);
4714
0
  return;
4715
0
    }
4716
4717
0
    if (ctxt->record_info)
4718
0
        htmlNodeInfoPush(ctxt, &node_info);
4719
0
}
4720
4721
/**
4722
 * htmlParseContentInternal:
4723
 * @ctxt:  an HTML parser context
4724
 *
4725
 * Parse a content: comment, sub-element, reference or text.
4726
 * New version for non recursive htmlParseElementInternal
4727
 */
4728
4729
static void
4730
0
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4731
0
    xmlChar *currentNode;
4732
0
    int depth;
4733
0
    const xmlChar *name;
4734
4735
0
    currentNode = xmlStrdup(ctxt->name);
4736
0
    depth = ctxt->nameNr;
4737
0
    while (1) {
4738
0
        GROW;
4739
4740
0
        if (ctxt->instate == XML_PARSER_EOF)
4741
0
            break;
4742
4743
  /*
4744
   * Our tag or one of it's parent or children is ending.
4745
   */
4746
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4747
0
      if (htmlParseEndTag(ctxt) &&
4748
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4749
0
    if (currentNode != NULL)
4750
0
        xmlFree(currentNode);
4751
4752
0
          currentNode = xmlStrdup(ctxt->name);
4753
0
          depth = ctxt->nameNr;
4754
0
      }
4755
0
      continue; /* while */
4756
0
        }
4757
4758
0
  else if ((CUR == '<') &&
4759
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4760
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4761
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4762
0
      if (name == NULL) {
4763
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4764
0
       "htmlParseStartTag: invalid element name\n",
4765
0
       NULL, NULL);
4766
          /* Dump the bogus tag like browsers do */
4767
0
          while ((CUR == 0) && (CUR != '>'))
4768
0
              NEXT;
4769
4770
0
          htmlParserFinishElementParsing(ctxt);
4771
0
          if (currentNode != NULL)
4772
0
              xmlFree(currentNode);
4773
4774
0
          currentNode = xmlStrdup(ctxt->name);
4775
0
          depth = ctxt->nameNr;
4776
0
          continue;
4777
0
      }
4778
4779
0
      if (ctxt->name != NULL) {
4780
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4781
0
              htmlAutoClose(ctxt, name);
4782
0
              continue;
4783
0
          }
4784
0
      }
4785
0
  }
4786
4787
  /*
4788
   * Has this node been popped out during parsing of
4789
   * the next element
4790
   */
4791
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4792
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4793
0
       {
4794
0
      htmlParserFinishElementParsing(ctxt);
4795
0
      if (currentNode != NULL) xmlFree(currentNode);
4796
4797
0
      currentNode = xmlStrdup(ctxt->name);
4798
0
      depth = ctxt->nameNr;
4799
0
      continue;
4800
0
  }
4801
4802
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4803
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4804
      /*
4805
       * Handle SCRIPT/STYLE separately
4806
       */
4807
0
      htmlParseScript(ctxt);
4808
0
  }
4809
4810
0
        else if ((CUR == '<') && (NXT(1) == '!')) {
4811
            /*
4812
             * Sometimes DOCTYPE arrives in the middle of the document
4813
             */
4814
0
            if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
4815
0
                (UPP(4) == 'C') && (UPP(5) == 'T') &&
4816
0
                (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4817
0
                (UPP(8) == 'E')) {
4818
0
                htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4819
0
                             "Misplaced DOCTYPE declaration\n",
4820
0
                             BAD_CAST "DOCTYPE" , NULL);
4821
0
                htmlParseDocTypeDecl(ctxt);
4822
0
            }
4823
            /*
4824
             * First case :  a comment
4825
             */
4826
0
            else if ((NXT(2) == '-') && (NXT(3) == '-')) {
4827
0
                htmlParseComment(ctxt);
4828
0
            }
4829
0
            else {
4830
0
                htmlSkipBogusComment(ctxt);
4831
0
            }
4832
0
        }
4833
4834
        /*
4835
         * Second case : a Processing Instruction.
4836
         */
4837
0
        else if ((CUR == '<') && (NXT(1) == '?')) {
4838
0
            htmlParsePI(ctxt);
4839
0
        }
4840
4841
        /*
4842
         * Third case :  a sub-element.
4843
         */
4844
0
        else if ((CUR == '<') && IS_ASCII_LETTER(NXT(1))) {
4845
0
            htmlParseElementInternal(ctxt);
4846
0
            if (currentNode != NULL) xmlFree(currentNode);
4847
4848
0
            currentNode = xmlStrdup(ctxt->name);
4849
0
            depth = ctxt->nameNr;
4850
0
        }
4851
0
        else if (CUR == '<') {
4852
0
            if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
4853
0
                (ctxt->sax->characters != NULL))
4854
0
                ctxt->sax->characters(ctxt->userData, BAD_CAST "<", 1);
4855
0
            NEXT;
4856
0
        }
4857
4858
        /*
4859
         * Fourth case : a reference. If if has not been resolved,
4860
         *    parsing returns it's Name, create the node
4861
         */
4862
0
        else if (CUR == '&') {
4863
0
            htmlParseReference(ctxt);
4864
0
        }
4865
4866
        /*
4867
         * Fifth case : end of the resource
4868
         */
4869
0
        else if (CUR == 0) {
4870
0
            htmlAutoCloseOnEnd(ctxt);
4871
0
            break;
4872
0
        }
4873
4874
        /*
4875
         * Last case, text. Note that References are handled directly.
4876
         */
4877
0
        else {
4878
0
            htmlParseCharData(ctxt);
4879
0
        }
4880
0
        GROW;
4881
0
    }
4882
0
    if (currentNode != NULL) xmlFree(currentNode);
4883
0
}
4884
4885
/**
4886
 * htmlParseContent:
4887
 * @ctxt:  an HTML parser context
4888
 *
4889
 * Parse a content: comment, sub-element, reference or text.
4890
 * This is the entry point when called from parser.c
4891
 */
4892
4893
void
4894
0
__htmlParseContent(void *ctxt) {
4895
0
    if (ctxt != NULL)
4896
0
  htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4897
0
}
4898
4899
/**
4900
 * htmlParseDocument:
4901
 * @ctxt:  an HTML parser context
4902
 *
4903
 * parse an HTML document (and build a tree if using the standard SAX
4904
 * interface).
4905
 *
4906
 * Returns 0, -1 in case of error. the parser context is augmented
4907
 *                as a result of the parsing.
4908
 */
4909
4910
int
4911
0
htmlParseDocument(htmlParserCtxtPtr ctxt) {
4912
0
    xmlChar start[4];
4913
0
    xmlCharEncoding enc;
4914
0
    xmlDtdPtr dtd;
4915
4916
0
    xmlInitParser();
4917
4918
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4919
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4920
0
         "htmlParseDocument: context error\n", NULL, NULL);
4921
0
  return(XML_ERR_INTERNAL_ERROR);
4922
0
    }
4923
0
    GROW;
4924
    /*
4925
     * SAX: beginning of the document processing.
4926
     */
4927
0
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4928
0
        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4929
4930
0
    if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4931
0
        ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4932
  /*
4933
   * Get the 4 first bytes and decode the charset
4934
   * if enc != XML_CHAR_ENCODING_NONE
4935
   * plug some encoding conversion routines.
4936
   */
4937
0
  start[0] = RAW;
4938
0
  start[1] = NXT(1);
4939
0
  start[2] = NXT(2);
4940
0
  start[3] = NXT(3);
4941
0
  enc = xmlDetectCharEncoding(&start[0], 4);
4942
0
  if (enc != XML_CHAR_ENCODING_NONE) {
4943
0
      xmlSwitchEncoding(ctxt, enc);
4944
0
  }
4945
0
    }
4946
4947
    /*
4948
     * Wipe out everything which is before the first '<'
4949
     */
4950
0
    SKIP_BLANKS;
4951
0
    if (CUR == 0) {
4952
0
  htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4953
0
               "Document is empty\n", NULL, NULL);
4954
0
    }
4955
4956
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4957
0
  ctxt->sax->startDocument(ctxt->userData);
4958
4959
4960
    /*
4961
     * Parse possible comments and PIs before any content
4962
     */
4963
0
    while (((CUR == '<') && (NXT(1) == '!') &&
4964
0
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4965
0
     ((CUR == '<') && (NXT(1) == '?'))) {
4966
0
        htmlParseComment(ctxt);
4967
0
        htmlParsePI(ctxt);
4968
0
  SKIP_BLANKS;
4969
0
    }
4970
4971
4972
    /*
4973
     * Then possibly doc type declaration(s) and more Misc
4974
     * (doctypedecl Misc*)?
4975
     */
4976
0
    if ((CUR == '<') && (NXT(1) == '!') &&
4977
0
  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4978
0
  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4979
0
  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4980
0
  (UPP(8) == 'E')) {
4981
0
  htmlParseDocTypeDecl(ctxt);
4982
0
    }
4983
0
    SKIP_BLANKS;
4984
4985
    /*
4986
     * Parse possible comments and PIs before any content
4987
     */
4988
0
    while (((CUR == '<') && (NXT(1) == '!') &&
4989
0
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4990
0
     ((CUR == '<') && (NXT(1) == '?'))) {
4991
0
        htmlParseComment(ctxt);
4992
0
        htmlParsePI(ctxt);
4993
0
  SKIP_BLANKS;
4994
0
    }
4995
4996
    /*
4997
     * Time to start parsing the tree itself
4998
     */
4999
0
    htmlParseContentInternal(ctxt);
5000
5001
    /*
5002
     * autoclose
5003
     */
5004
0
    if (CUR == 0)
5005
0
  htmlAutoCloseOnEnd(ctxt);
5006
5007
5008
    /*
5009
     * SAX: end of the document processing.
5010
     */
5011
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5012
0
        ctxt->sax->endDocument(ctxt->userData);
5013
5014
0
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
5015
0
  dtd = xmlGetIntSubset(ctxt->myDoc);
5016
0
  if (dtd == NULL)
5017
0
      ctxt->myDoc->intSubset =
5018
0
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
5019
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
5020
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
5021
0
    }
5022
0
    if (! ctxt->wellFormed) return(-1);
5023
0
    return(0);
5024
0
}
5025
5026
5027
/************************************************************************
5028
 *                  *
5029
 *      Parser contexts handling      *
5030
 *                  *
5031
 ************************************************************************/
5032
5033
/**
5034
 * htmlInitParserCtxt:
5035
 * @ctxt:  an HTML parser context
5036
 * @sax:  SAX handler
5037
 * @userData:  user data
5038
 *
5039
 * Initialize a parser context
5040
 *
5041
 * Returns 0 in case of success and -1 in case of error
5042
 */
5043
5044
static int
5045
htmlInitParserCtxt(htmlParserCtxtPtr ctxt, const htmlSAXHandler *sax,
5046
                   void *userData)
5047
0
{
5048
0
    if (ctxt == NULL) return(-1);
5049
0
    memset(ctxt, 0, sizeof(htmlParserCtxt));
5050
5051
0
    ctxt->dict = xmlDictCreate();
5052
0
    if (ctxt->dict == NULL) {
5053
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5054
0
  return(-1);
5055
0
    }
5056
5057
0
    if (ctxt->sax == NULL)
5058
0
        ctxt->sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
5059
0
    if (ctxt->sax == NULL) {
5060
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5061
0
  return(-1);
5062
0
    }
5063
0
    if (sax == NULL) {
5064
0
        memset(ctxt->sax, 0, sizeof(htmlSAXHandler));
5065
0
        xmlSAX2InitHtmlDefaultSAXHandler(ctxt->sax);
5066
0
        ctxt->userData = ctxt;
5067
0
    } else {
5068
0
        memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
5069
0
        ctxt->userData = userData ? userData : ctxt;
5070
0
    }
5071
5072
    /* Allocate the Input stack */
5073
0
    ctxt->inputTab = (htmlParserInputPtr *)
5074
0
                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
5075
0
    if (ctxt->inputTab == NULL) {
5076
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5077
0
  ctxt->inputNr = 0;
5078
0
  ctxt->inputMax = 0;
5079
0
  ctxt->input = NULL;
5080
0
  return(-1);
5081
0
    }
5082
0
    ctxt->inputNr = 0;
5083
0
    ctxt->inputMax = 5;
5084
0
    ctxt->input = NULL;
5085
0
    ctxt->version = NULL;
5086
0
    ctxt->encoding = NULL;
5087
0
    ctxt->standalone = -1;
5088
0
    ctxt->instate = XML_PARSER_START;
5089
5090
    /* Allocate the Node stack */
5091
0
    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
5092
0
    if (ctxt->nodeTab == NULL) {
5093
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5094
0
  ctxt->nodeNr = 0;
5095
0
  ctxt->nodeMax = 0;
5096
0
  ctxt->node = NULL;
5097
0
  ctxt->inputNr = 0;
5098
0
  ctxt->inputMax = 0;
5099
0
  ctxt->input = NULL;
5100
0
  return(-1);
5101
0
    }
5102
0
    ctxt->nodeNr = 0;
5103
0
    ctxt->nodeMax = 10;
5104
0
    ctxt->node = NULL;
5105
5106
    /* Allocate the Name stack */
5107
0
    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
5108
0
    if (ctxt->nameTab == NULL) {
5109
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
5110
0
  ctxt->nameNr = 0;
5111
0
  ctxt->nameMax = 0;
5112
0
  ctxt->name = NULL;
5113
0
  ctxt->nodeNr = 0;
5114
0
  ctxt->nodeMax = 0;
5115
0
  ctxt->node = NULL;
5116
0
  ctxt->inputNr = 0;
5117
0
  ctxt->inputMax = 0;
5118
0
  ctxt->input = NULL;
5119
0
  return(-1);
5120
0
    }
5121
0
    ctxt->nameNr = 0;
5122
0
    ctxt->nameMax = 10;
5123
0
    ctxt->name = NULL;
5124
5125
0
    ctxt->nodeInfoTab = NULL;
5126
0
    ctxt->nodeInfoNr  = 0;
5127
0
    ctxt->nodeInfoMax = 0;
5128
5129
0
    ctxt->myDoc = NULL;
5130
0
    ctxt->wellFormed = 1;
5131
0
    ctxt->replaceEntities = 0;
5132
0
    ctxt->linenumbers = xmlLineNumbersDefaultValue;
5133
0
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
5134
0
    ctxt->html = 1;
5135
0
    ctxt->vctxt.flags = XML_VCTXT_USE_PCTXT;
5136
0
    ctxt->vctxt.userData = ctxt;
5137
0
    ctxt->vctxt.error = xmlParserValidityError;
5138
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
5139
0
    ctxt->record_info = 0;
5140
0
    ctxt->validate = 0;
5141
0
    ctxt->checkIndex = 0;
5142
0
    ctxt->catalogs = NULL;
5143
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
5144
0
    return(0);
5145
0
}
5146
5147
/**
5148
 * htmlFreeParserCtxt:
5149
 * @ctxt:  an HTML parser context
5150
 *
5151
 * Free all the memory used by a parser context. However the parsed
5152
 * document in ctxt->myDoc is not freed.
5153
 */
5154
5155
void
5156
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
5157
0
{
5158
0
    xmlFreeParserCtxt(ctxt);
5159
0
}
5160
5161
/**
5162
 * htmlNewParserCtxt:
5163
 *
5164
 * Allocate and initialize a new parser context.
5165
 *
5166
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5167
 */
5168
5169
htmlParserCtxtPtr
5170
htmlNewParserCtxt(void)
5171
0
{
5172
0
    return(htmlNewSAXParserCtxt(NULL, NULL));
5173
0
}
5174
5175
/**
5176
 * htmlNewSAXParserCtxt:
5177
 * @sax:  SAX handler
5178
 * @userData:  user data
5179
 *
5180
 * Allocate and initialize a new SAX parser context. If userData is NULL,
5181
 * the parser context will be passed as user data.
5182
 *
5183
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
5184
 */
5185
5186
htmlParserCtxtPtr
5187
htmlNewSAXParserCtxt(const htmlSAXHandler *sax, void *userData)
5188
0
{
5189
0
    xmlParserCtxtPtr ctxt;
5190
5191
0
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
5192
0
    if (ctxt == NULL) {
5193
0
        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
5194
0
  return(NULL);
5195
0
    }
5196
0
    memset(ctxt, 0, sizeof(xmlParserCtxt));
5197
0
    if (htmlInitParserCtxt(ctxt, sax, userData) < 0) {
5198
0
        htmlFreeParserCtxt(ctxt);
5199
0
  return(NULL);
5200
0
    }
5201
0
    return(ctxt);
5202
0
}
5203
5204
/**
5205
 * htmlCreateMemoryParserCtxt:
5206
 * @buffer:  a pointer to a char array
5207
 * @size:  the size of the array
5208
 *
5209
 * Create a parser context for an HTML in-memory document.
5210
 *
5211
 * Returns the new parser context or NULL
5212
 */
5213
htmlParserCtxtPtr
5214
0
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5215
0
    xmlParserCtxtPtr ctxt;
5216
0
    xmlParserInputPtr input;
5217
0
    xmlParserInputBufferPtr buf;
5218
5219
0
    if (buffer == NULL)
5220
0
  return(NULL);
5221
0
    if (size <= 0)
5222
0
  return(NULL);
5223
5224
0
    ctxt = htmlNewParserCtxt();
5225
0
    if (ctxt == NULL)
5226
0
  return(NULL);
5227
5228
0
    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5229
0
    if (buf == NULL) return(NULL);
5230
5231
0
    input = xmlNewInputStream(ctxt);
5232
0
    if (input == NULL) {
5233
0
  xmlFreeParserInputBuffer(buf);
5234
0
  xmlFreeParserCtxt(ctxt);
5235
0
  return(NULL);
5236
0
    }
5237
5238
0
    input->filename = NULL;
5239
0
    input->buf = buf;
5240
0
    xmlBufResetInput(buf->buffer, input);
5241
5242
0
    inputPush(ctxt, input);
5243
0
    return(ctxt);
5244
0
}
5245
5246
/**
5247
 * htmlCreateDocParserCtxt:
5248
 * @cur:  a pointer to an array of xmlChar
5249
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5250
 *
5251
 * Create a parser context for an HTML document.
5252
 *
5253
 * TODO: check the need to add encoding handling there
5254
 *
5255
 * Returns the new parser context or NULL
5256
 */
5257
static htmlParserCtxtPtr
5258
0
htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5259
0
    int len;
5260
0
    htmlParserCtxtPtr ctxt;
5261
5262
0
    if (cur == NULL)
5263
0
  return(NULL);
5264
0
    len = xmlStrlen(cur);
5265
0
    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5266
0
    if (ctxt == NULL)
5267
0
  return(NULL);
5268
5269
0
    if (encoding != NULL) {
5270
0
  xmlCharEncoding enc;
5271
0
  xmlCharEncodingHandlerPtr handler;
5272
5273
0
  if (ctxt->input->encoding != NULL)
5274
0
      xmlFree((xmlChar *) ctxt->input->encoding);
5275
0
  ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5276
5277
0
  enc = xmlParseCharEncoding(encoding);
5278
  /*
5279
   * registered set of known encodings
5280
   */
5281
0
  if (enc != XML_CHAR_ENCODING_ERROR) {
5282
0
      xmlSwitchEncoding(ctxt, enc);
5283
0
      if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5284
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5285
0
                 "Unsupported encoding %s\n",
5286
0
           (const xmlChar *) encoding, NULL);
5287
0
      }
5288
0
  } else {
5289
      /*
5290
       * fallback for unknown encodings
5291
       */
5292
0
      handler = xmlFindCharEncodingHandler((const char *) encoding);
5293
0
      if (handler != NULL) {
5294
0
    xmlSwitchToEncoding(ctxt, handler);
5295
0
      } else {
5296
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5297
0
                 "Unsupported encoding %s\n",
5298
0
           (const xmlChar *) encoding, NULL);
5299
0
      }
5300
0
  }
5301
0
    }
5302
0
    return(ctxt);
5303
0
}
5304
5305
#ifdef LIBXML_PUSH_ENABLED
5306
/************************************************************************
5307
 *                  *
5308
 *  Progressive parsing interfaces        *
5309
 *                  *
5310
 ************************************************************************/
5311
5312
/**
5313
 * htmlParseLookupSequence:
5314
 * @ctxt:  an HTML parser context
5315
 * @first:  the first char to lookup
5316
 * @next:  the next char to lookup or zero
5317
 * @third:  the next char to lookup or zero
5318
 * @ignoreattrval: skip over attribute values
5319
 *
5320
 * Try to find if a sequence (first, next, third) or  just (first next) or
5321
 * (first) is available in the input stream.
5322
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5323
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5324
 * parser, do not use liberally.
5325
 * This is basically similar to xmlParseLookupSequence()
5326
 *
5327
 * Returns the index to the current parsing point if the full sequence
5328
 *      is available, -1 otherwise.
5329
 */
5330
static int
5331
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5332
                        xmlChar next, xmlChar third, int ignoreattrval)
5333
0
{
5334
0
    int base, len;
5335
0
    htmlParserInputPtr in;
5336
0
    const xmlChar *buf;
5337
0
    int invalue = 0;
5338
0
    char valdellim = 0x0;
5339
5340
0
    in = ctxt->input;
5341
0
    if (in == NULL)
5342
0
        return (-1);
5343
5344
0
    base = in->cur - in->base;
5345
0
    if (base < 0)
5346
0
        return (-1);
5347
5348
0
    if (ctxt->checkIndex > base) {
5349
0
        base = ctxt->checkIndex;
5350
        /* Abuse hasPErefs member to restore current state. */
5351
0
        invalue = ctxt->hasPErefs & 1 ? 1 : 0;
5352
0
    }
5353
5354
0
    if (in->buf == NULL) {
5355
0
        buf = in->base;
5356
0
        len = in->length;
5357
0
    } else {
5358
0
        buf = xmlBufContent(in->buf->buffer);
5359
0
        len = xmlBufUse(in->buf->buffer);
5360
0
    }
5361
5362
    /* take into account the sequence length */
5363
0
    if (third)
5364
0
        len -= 2;
5365
0
    else if (next)
5366
0
        len--;
5367
0
    for (; base < len; base++) {
5368
0
        if (ignoreattrval) {
5369
0
            if (buf[base] == '"' || buf[base] == '\'') {
5370
0
                if (invalue) {
5371
0
                    if (buf[base] == valdellim) {
5372
0
                        invalue = 0;
5373
0
                        continue;
5374
0
                    }
5375
0
                } else {
5376
0
                    valdellim = buf[base];
5377
0
                    invalue = 1;
5378
0
                    continue;
5379
0
                }
5380
0
            } else if (invalue) {
5381
0
                continue;
5382
0
            }
5383
0
        }
5384
0
        if (buf[base] == first) {
5385
0
            if (third != 0) {
5386
0
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
5387
0
                    continue;
5388
0
            } else if (next != 0) {
5389
0
                if (buf[base + 1] != next)
5390
0
                    continue;
5391
0
            }
5392
0
            ctxt->checkIndex = 0;
5393
#ifdef DEBUG_PUSH
5394
            if (next == 0)
5395
                xmlGenericError(xmlGenericErrorContext,
5396
                                "HPP: lookup '%c' found at %d\n",
5397
                                first, base);
5398
            else if (third == 0)
5399
                xmlGenericError(xmlGenericErrorContext,
5400
                                "HPP: lookup '%c%c' found at %d\n",
5401
                                first, next, base);
5402
            else
5403
                xmlGenericError(xmlGenericErrorContext,
5404
                                "HPP: lookup '%c%c%c' found at %d\n",
5405
                                first, next, third, base);
5406
#endif
5407
0
            return (base - (in->cur - in->base));
5408
0
        }
5409
0
    }
5410
0
    ctxt->checkIndex = base;
5411
    /* Abuse hasPErefs member to track current state. */
5412
0
    if (invalue)
5413
0
        ctxt->hasPErefs |= 1;
5414
0
    else
5415
0
        ctxt->hasPErefs &= ~1;
5416
#ifdef DEBUG_PUSH
5417
    if (next == 0)
5418
        xmlGenericError(xmlGenericErrorContext,
5419
                        "HPP: lookup '%c' failed\n", first);
5420
    else if (third == 0)
5421
        xmlGenericError(xmlGenericErrorContext,
5422
                        "HPP: lookup '%c%c' failed\n", first, next);
5423
    else
5424
        xmlGenericError(xmlGenericErrorContext,
5425
                        "HPP: lookup '%c%c%c' failed\n", first, next,
5426
                        third);
5427
#endif
5428
0
    return (-1);
5429
0
}
5430
5431
/**
5432
 * htmlParseLookupCommentEnd:
5433
 * @ctxt: an HTML parser context
5434
 *
5435
 * Try to find a comment end tag in the input stream
5436
 * The search includes "-->" as well as WHATWG-recommended incorrectly-closed tags.
5437
 * (See https://html.spec.whatwg.org/multipage/parsing.html#parse-error-incorrectly-closed-comment)
5438
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5439
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5440
 * parser, do not use liberally.
5441
 * This wraps to htmlParseLookupSequence()
5442
 *
5443
 * Returns the index to the current parsing point if the full sequence is available, -1 otherwise.
5444
 */
5445
static int
5446
htmlParseLookupCommentEnd(htmlParserCtxtPtr ctxt)
5447
0
{
5448
0
    int mark = 0;
5449
0
    int cur = CUR_PTR - BASE_PTR;
5450
5451
0
    while (mark >= 0) {
5452
0
  mark = htmlParseLookupSequence(ctxt, '-', '-', 0, 0);
5453
0
  if ((mark < 0) ||
5454
0
      (NXT(mark+2) == '>') ||
5455
0
      ((NXT(mark+2) == '!') && (NXT(mark+3) == '>'))) {
5456
0
      return mark;
5457
0
  }
5458
0
  ctxt->checkIndex = cur + mark + 1;
5459
0
    }
5460
0
    return mark;
5461
0
}
5462
5463
5464
/**
5465
 * htmlParseTryOrFinish:
5466
 * @ctxt:  an HTML parser context
5467
 * @terminate:  last chunk indicator
5468
 *
5469
 * Try to progress on parsing
5470
 *
5471
 * Returns zero if no parsing was possible
5472
 */
5473
static int
5474
0
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5475
0
    int ret = 0;
5476
0
    htmlParserInputPtr in;
5477
0
    ptrdiff_t avail = 0;
5478
0
    xmlChar cur, next;
5479
5480
0
    htmlParserNodeInfo node_info;
5481
5482
#ifdef DEBUG_PUSH
5483
    switch (ctxt->instate) {
5484
  case XML_PARSER_EOF:
5485
      xmlGenericError(xmlGenericErrorContext,
5486
        "HPP: try EOF\n"); break;
5487
  case XML_PARSER_START:
5488
      xmlGenericError(xmlGenericErrorContext,
5489
        "HPP: try START\n"); break;
5490
  case XML_PARSER_MISC:
5491
      xmlGenericError(xmlGenericErrorContext,
5492
        "HPP: try MISC\n");break;
5493
  case XML_PARSER_COMMENT:
5494
      xmlGenericError(xmlGenericErrorContext,
5495
        "HPP: try COMMENT\n");break;
5496
  case XML_PARSER_PROLOG:
5497
      xmlGenericError(xmlGenericErrorContext,
5498
        "HPP: try PROLOG\n");break;
5499
  case XML_PARSER_START_TAG:
5500
      xmlGenericError(xmlGenericErrorContext,
5501
        "HPP: try START_TAG\n");break;
5502
  case XML_PARSER_CONTENT:
5503
      xmlGenericError(xmlGenericErrorContext,
5504
        "HPP: try CONTENT\n");break;
5505
  case XML_PARSER_CDATA_SECTION:
5506
      xmlGenericError(xmlGenericErrorContext,
5507
        "HPP: try CDATA_SECTION\n");break;
5508
  case XML_PARSER_END_TAG:
5509
      xmlGenericError(xmlGenericErrorContext,
5510
        "HPP: try END_TAG\n");break;
5511
  case XML_PARSER_ENTITY_DECL:
5512
      xmlGenericError(xmlGenericErrorContext,
5513
        "HPP: try ENTITY_DECL\n");break;
5514
  case XML_PARSER_ENTITY_VALUE:
5515
      xmlGenericError(xmlGenericErrorContext,
5516
        "HPP: try ENTITY_VALUE\n");break;
5517
  case XML_PARSER_ATTRIBUTE_VALUE:
5518
      xmlGenericError(xmlGenericErrorContext,
5519
        "HPP: try ATTRIBUTE_VALUE\n");break;
5520
  case XML_PARSER_DTD:
5521
      xmlGenericError(xmlGenericErrorContext,
5522
        "HPP: try DTD\n");break;
5523
  case XML_PARSER_EPILOG:
5524
      xmlGenericError(xmlGenericErrorContext,
5525
        "HPP: try EPILOG\n");break;
5526
  case XML_PARSER_PI:
5527
      xmlGenericError(xmlGenericErrorContext,
5528
        "HPP: try PI\n");break;
5529
  case XML_PARSER_SYSTEM_LITERAL:
5530
      xmlGenericError(xmlGenericErrorContext,
5531
        "HPP: try SYSTEM_LITERAL\n");break;
5532
    }
5533
#endif
5534
5535
0
    while (1) {
5536
5537
0
  in = ctxt->input;
5538
0
  if (in == NULL) break;
5539
0
  if (in->buf == NULL)
5540
0
      avail = in->length - (in->cur - in->base);
5541
0
  else
5542
0
      avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5543
0
                    (in->cur - in->base);
5544
0
  if ((avail == 0) && (terminate)) {
5545
0
      htmlAutoCloseOnEnd(ctxt);
5546
0
      if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5547
    /*
5548
     * SAX: end of the document processing.
5549
     */
5550
0
    ctxt->instate = XML_PARSER_EOF;
5551
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5552
0
        ctxt->sax->endDocument(ctxt->userData);
5553
0
      }
5554
0
  }
5555
0
        if (avail < 1)
5556
0
      goto done;
5557
        /*
5558
         * This is done to make progress and avoid an infinite loop
5559
         * if a parsing attempt was aborted by hitting a NUL byte. After
5560
         * changing htmlCurrentChar, this probably isn't necessary anymore.
5561
         * We should consider removing this check.
5562
         */
5563
0
  cur = in->cur[0];
5564
0
  if (cur == 0) {
5565
0
      SKIP(1);
5566
0
      continue;
5567
0
  }
5568
5569
0
        switch (ctxt->instate) {
5570
0
            case XML_PARSER_EOF:
5571
          /*
5572
     * Document parsing is done !
5573
     */
5574
0
          goto done;
5575
0
            case XML_PARSER_START:
5576
          /*
5577
     * Very first chars read from the document flow.
5578
     */
5579
0
    cur = in->cur[0];
5580
0
    if (IS_BLANK_CH(cur)) {
5581
0
        SKIP_BLANKS;
5582
0
        if (in->buf == NULL)
5583
0
      avail = in->length - (in->cur - in->base);
5584
0
        else
5585
0
      avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5586
0
                                (in->cur - in->base);
5587
0
    }
5588
0
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5589
0
        ctxt->sax->setDocumentLocator(ctxt->userData,
5590
0
              &xmlDefaultSAXLocator);
5591
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5592
0
              (!ctxt->disableSAX))
5593
0
        ctxt->sax->startDocument(ctxt->userData);
5594
5595
0
    cur = in->cur[0];
5596
0
    next = in->cur[1];
5597
0
    if ((cur == '<') && (next == '!') &&
5598
0
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5599
0
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5600
0
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5601
0
        (UPP(8) == 'E')) {
5602
0
        if ((!terminate) &&
5603
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5604
0
      goto done;
5605
#ifdef DEBUG_PUSH
5606
        xmlGenericError(xmlGenericErrorContext,
5607
          "HPP: Parsing internal subset\n");
5608
#endif
5609
0
        htmlParseDocTypeDecl(ctxt);
5610
0
        ctxt->instate = XML_PARSER_PROLOG;
5611
#ifdef DEBUG_PUSH
5612
        xmlGenericError(xmlGenericErrorContext,
5613
          "HPP: entering PROLOG\n");
5614
#endif
5615
0
                } else {
5616
0
        ctxt->instate = XML_PARSER_MISC;
5617
#ifdef DEBUG_PUSH
5618
        xmlGenericError(xmlGenericErrorContext,
5619
          "HPP: entering MISC\n");
5620
#endif
5621
0
    }
5622
0
    break;
5623
0
            case XML_PARSER_MISC:
5624
0
    SKIP_BLANKS;
5625
0
    if (in->buf == NULL)
5626
0
        avail = in->length - (in->cur - in->base);
5627
0
    else
5628
0
        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5629
0
                            (in->cur - in->base);
5630
    /*
5631
     * no chars in buffer
5632
     */
5633
0
    if (avail < 1)
5634
0
        goto done;
5635
    /*
5636
     * not enough chars in buffer
5637
     */
5638
0
    if (avail < 2) {
5639
0
        if (!terminate)
5640
0
      goto done;
5641
0
        else
5642
0
      next = ' ';
5643
0
    } else {
5644
0
        next = in->cur[1];
5645
0
    }
5646
0
    cur = in->cur[0];
5647
0
          if ((cur == '<') && (next == '!') &&
5648
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5649
0
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5650
0
      goto done;
5651
#ifdef DEBUG_PUSH
5652
        xmlGenericError(xmlGenericErrorContext,
5653
          "HPP: Parsing Comment\n");
5654
#endif
5655
0
        htmlParseComment(ctxt);
5656
0
        ctxt->instate = XML_PARSER_MISC;
5657
0
          } else if ((cur == '<') && (next == '?')) {
5658
0
        if ((!terminate) &&
5659
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5660
0
      goto done;
5661
#ifdef DEBUG_PUSH
5662
        xmlGenericError(xmlGenericErrorContext,
5663
          "HPP: Parsing PI\n");
5664
#endif
5665
0
        htmlParsePI(ctxt);
5666
0
        ctxt->instate = XML_PARSER_MISC;
5667
0
    } else if ((cur == '<') && (next == '!') &&
5668
0
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5669
0
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5670
0
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5671
0
        (UPP(8) == 'E')) {
5672
0
        if ((!terminate) &&
5673
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5674
0
      goto done;
5675
#ifdef DEBUG_PUSH
5676
        xmlGenericError(xmlGenericErrorContext,
5677
          "HPP: Parsing internal subset\n");
5678
#endif
5679
0
        htmlParseDocTypeDecl(ctxt);
5680
0
        ctxt->instate = XML_PARSER_PROLOG;
5681
#ifdef DEBUG_PUSH
5682
        xmlGenericError(xmlGenericErrorContext,
5683
          "HPP: entering PROLOG\n");
5684
#endif
5685
0
    } else if ((cur == '<') && (next == '!') &&
5686
0
               (avail < 9)) {
5687
0
        goto done;
5688
0
    } else {
5689
0
        ctxt->instate = XML_PARSER_CONTENT;
5690
#ifdef DEBUG_PUSH
5691
        xmlGenericError(xmlGenericErrorContext,
5692
          "HPP: entering START_TAG\n");
5693
#endif
5694
0
    }
5695
0
    break;
5696
0
            case XML_PARSER_PROLOG:
5697
0
    SKIP_BLANKS;
5698
0
    if (in->buf == NULL)
5699
0
        avail = in->length - (in->cur - in->base);
5700
0
    else
5701
0
        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5702
0
                            (in->cur - in->base);
5703
0
    if (avail < 2)
5704
0
        goto done;
5705
0
    cur = in->cur[0];
5706
0
    next = in->cur[1];
5707
0
    if ((cur == '<') && (next == '!') &&
5708
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5709
0
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5710
0
      goto done;
5711
#ifdef DEBUG_PUSH
5712
        xmlGenericError(xmlGenericErrorContext,
5713
          "HPP: Parsing Comment\n");
5714
#endif
5715
0
        htmlParseComment(ctxt);
5716
0
        ctxt->instate = XML_PARSER_PROLOG;
5717
0
          } else if ((cur == '<') && (next == '?')) {
5718
0
        if ((!terminate) &&
5719
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5720
0
      goto done;
5721
#ifdef DEBUG_PUSH
5722
        xmlGenericError(xmlGenericErrorContext,
5723
          "HPP: Parsing PI\n");
5724
#endif
5725
0
        htmlParsePI(ctxt);
5726
0
        ctxt->instate = XML_PARSER_PROLOG;
5727
0
    } else if ((cur == '<') && (next == '!') &&
5728
0
               (avail < 4)) {
5729
0
        goto done;
5730
0
    } else {
5731
0
        ctxt->instate = XML_PARSER_CONTENT;
5732
#ifdef DEBUG_PUSH
5733
        xmlGenericError(xmlGenericErrorContext,
5734
          "HPP: entering START_TAG\n");
5735
#endif
5736
0
    }
5737
0
    break;
5738
0
            case XML_PARSER_EPILOG:
5739
0
    if (in->buf == NULL)
5740
0
        avail = in->length - (in->cur - in->base);
5741
0
    else
5742
0
        avail = (ptrdiff_t)xmlBufUse(in->buf->buffer) -
5743
0
                            (in->cur - in->base);
5744
0
    if (avail < 1)
5745
0
        goto done;
5746
0
    cur = in->cur[0];
5747
0
    if (IS_BLANK_CH(cur)) {
5748
0
        htmlParseCharData(ctxt);
5749
0
        goto done;
5750
0
    }
5751
0
    if (avail < 2)
5752
0
        goto done;
5753
0
    next = in->cur[1];
5754
0
          if ((cur == '<') && (next == '!') &&
5755
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5756
0
        if ((!terminate) && (htmlParseLookupCommentEnd(ctxt) < 0))
5757
0
      goto done;
5758
#ifdef DEBUG_PUSH
5759
        xmlGenericError(xmlGenericErrorContext,
5760
          "HPP: Parsing Comment\n");
5761
#endif
5762
0
        htmlParseComment(ctxt);
5763
0
        ctxt->instate = XML_PARSER_EPILOG;
5764
0
          } else if ((cur == '<') && (next == '?')) {
5765
0
        if ((!terminate) &&
5766
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
5767
0
      goto done;
5768
#ifdef DEBUG_PUSH
5769
        xmlGenericError(xmlGenericErrorContext,
5770
          "HPP: Parsing PI\n");
5771
#endif
5772
0
        htmlParsePI(ctxt);
5773
0
        ctxt->instate = XML_PARSER_EPILOG;
5774
0
    } else if ((cur == '<') && (next == '!') &&
5775
0
               (avail < 4)) {
5776
0
        goto done;
5777
0
    } else {
5778
0
        ctxt->errNo = XML_ERR_DOCUMENT_END;
5779
0
        ctxt->wellFormed = 0;
5780
0
        ctxt->instate = XML_PARSER_EOF;
5781
#ifdef DEBUG_PUSH
5782
        xmlGenericError(xmlGenericErrorContext,
5783
          "HPP: entering EOF\n");
5784
#endif
5785
0
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5786
0
      ctxt->sax->endDocument(ctxt->userData);
5787
0
        goto done;
5788
0
    }
5789
0
    break;
5790
0
            case XML_PARSER_START_TAG: {
5791
0
          const xmlChar *name;
5792
0
    int failed;
5793
0
    const htmlElemDesc * info;
5794
5795
    /*
5796
     * no chars in buffer
5797
     */
5798
0
    if (avail < 1)
5799
0
        goto done;
5800
    /*
5801
     * not enough chars in buffer
5802
     */
5803
0
    if (avail < 2) {
5804
0
        if (!terminate)
5805
0
      goto done;
5806
0
        else
5807
0
      next = ' ';
5808
0
    } else {
5809
0
        next = in->cur[1];
5810
0
    }
5811
0
    cur = in->cur[0];
5812
0
          if (cur != '<') {
5813
0
        ctxt->instate = XML_PARSER_CONTENT;
5814
#ifdef DEBUG_PUSH
5815
        xmlGenericError(xmlGenericErrorContext,
5816
          "HPP: entering CONTENT\n");
5817
#endif
5818
0
        break;
5819
0
    }
5820
0
    if (next == '/') {
5821
0
        ctxt->instate = XML_PARSER_END_TAG;
5822
0
        ctxt->checkIndex = 0;
5823
#ifdef DEBUG_PUSH
5824
        xmlGenericError(xmlGenericErrorContext,
5825
          "HPP: entering END_TAG\n");
5826
#endif
5827
0
        break;
5828
0
    }
5829
0
    if ((!terminate) &&
5830
0
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
5831
0
        goto done;
5832
5833
                /* Capture start position */
5834
0
          if (ctxt->record_info) {
5835
0
               node_info.begin_pos = ctxt->input->consumed +
5836
0
                                  (CUR_PTR - ctxt->input->base);
5837
0
               node_info.begin_line = ctxt->input->line;
5838
0
          }
5839
5840
5841
0
    failed = htmlParseStartTag(ctxt);
5842
0
    name = ctxt->name;
5843
0
    if ((failed == -1) ||
5844
0
        (name == NULL)) {
5845
0
        if (CUR == '>')
5846
0
      NEXT;
5847
0
        break;
5848
0
    }
5849
5850
    /*
5851
     * Lookup the info for that element.
5852
     */
5853
0
    info = htmlTagLookup(name);
5854
0
    if (info == NULL) {
5855
0
        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5856
0
                     "Tag %s invalid\n", name, NULL);
5857
0
    }
5858
5859
    /*
5860
     * Check for an Empty Element labeled the XML/SGML way
5861
     */
5862
0
    if ((CUR == '/') && (NXT(1) == '>')) {
5863
0
        SKIP(2);
5864
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5865
0
      ctxt->sax->endElement(ctxt->userData, name);
5866
0
        htmlnamePop(ctxt);
5867
0
        ctxt->instate = XML_PARSER_CONTENT;
5868
#ifdef DEBUG_PUSH
5869
        xmlGenericError(xmlGenericErrorContext,
5870
          "HPP: entering CONTENT\n");
5871
#endif
5872
0
        break;
5873
0
    }
5874
5875
0
    if (CUR == '>') {
5876
0
        NEXT;
5877
0
    } else {
5878
0
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5879
0
                     "Couldn't find end of Start Tag %s\n",
5880
0
         name, NULL);
5881
5882
        /*
5883
         * end of parsing of this node.
5884
         */
5885
0
        if (xmlStrEqual(name, ctxt->name)) {
5886
0
      nodePop(ctxt);
5887
0
      htmlnamePop(ctxt);
5888
0
        }
5889
5890
0
        if (ctxt->record_info)
5891
0
            htmlNodeInfoPush(ctxt, &node_info);
5892
5893
0
        ctxt->instate = XML_PARSER_CONTENT;
5894
#ifdef DEBUG_PUSH
5895
        xmlGenericError(xmlGenericErrorContext,
5896
          "HPP: entering CONTENT\n");
5897
#endif
5898
0
        break;
5899
0
    }
5900
5901
    /*
5902
     * Check for an Empty Element from DTD definition
5903
     */
5904
0
    if ((info != NULL) && (info->empty)) {
5905
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5906
0
      ctxt->sax->endElement(ctxt->userData, name);
5907
0
        htmlnamePop(ctxt);
5908
0
    }
5909
5910
0
                if (ctxt->record_info)
5911
0
              htmlNodeInfoPush(ctxt, &node_info);
5912
5913
0
    ctxt->instate = XML_PARSER_CONTENT;
5914
#ifdef DEBUG_PUSH
5915
    xmlGenericError(xmlGenericErrorContext,
5916
      "HPP: entering CONTENT\n");
5917
#endif
5918
0
                break;
5919
0
      }
5920
0
            case XML_PARSER_CONTENT: {
5921
0
    xmlChar chr[2] = { 0, 0 };
5922
5923
                /*
5924
     * Handle preparsed entities and charRef
5925
     */
5926
0
    if (ctxt->token != 0) {
5927
0
        chr[0] = ctxt->token;
5928
0
        htmlCheckParagraph(ctxt);
5929
0
        if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5930
0
      ctxt->sax->characters(ctxt->userData, chr, 1);
5931
0
        ctxt->token = 0;
5932
0
        ctxt->checkIndex = 0;
5933
0
    }
5934
0
    if ((avail == 1) && (terminate)) {
5935
0
        cur = in->cur[0];
5936
0
        if ((cur != '<') && (cur != '&')) {
5937
0
      if (ctxt->sax != NULL) {
5938
0
                            chr[0] = cur;
5939
0
          if (IS_BLANK_CH(cur)) {
5940
0
        if (ctxt->keepBlanks) {
5941
0
            if (ctxt->sax->characters != NULL)
5942
0
          ctxt->sax->characters(
5943
0
            ctxt->userData, chr, 1);
5944
0
        } else {
5945
0
            if (ctxt->sax->ignorableWhitespace != NULL)
5946
0
          ctxt->sax->ignorableWhitespace(
5947
0
            ctxt->userData, chr, 1);
5948
0
        }
5949
0
          } else {
5950
0
        htmlCheckParagraph(ctxt);
5951
0
        if (ctxt->sax->characters != NULL)
5952
0
            ctxt->sax->characters(
5953
0
              ctxt->userData, chr, 1);
5954
0
          }
5955
0
      }
5956
0
      ctxt->token = 0;
5957
0
      ctxt->checkIndex = 0;
5958
0
      in->cur++;
5959
0
      break;
5960
0
        }
5961
0
    }
5962
0
    if (avail < 2)
5963
0
        goto done;
5964
0
    cur = in->cur[0];
5965
0
    next = in->cur[1];
5966
0
    if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5967
0
        (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5968
        /*
5969
         * Handle SCRIPT/STYLE separately
5970
         */
5971
0
        if (!terminate) {
5972
0
            int idx;
5973
0
      xmlChar val;
5974
5975
0
      idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0);
5976
0
      if (idx < 0)
5977
0
          goto done;
5978
0
            val = in->cur[idx + 2];
5979
0
      if (val == 0) /* bad cut of input */
5980
0
          goto done;
5981
0
        }
5982
0
        htmlParseScript(ctxt);
5983
0
        if ((cur == '<') && (next == '/')) {
5984
0
      ctxt->instate = XML_PARSER_END_TAG;
5985
0
      ctxt->checkIndex = 0;
5986
#ifdef DEBUG_PUSH
5987
      xmlGenericError(xmlGenericErrorContext,
5988
        "HPP: entering END_TAG\n");
5989
#endif
5990
0
      break;
5991
0
        }
5992
0
    } else if ((cur == '<') && (next == '!')) {
5993
                    /*
5994
                     * Sometimes DOCTYPE arrives in the middle of the document
5995
                     */
5996
0
                    if ((UPP(2) == 'D') && (UPP(3) == 'O') &&
5997
0
                        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5998
0
                        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5999
0
                        (UPP(8) == 'E')) {
6000
0
                        if ((!terminate) &&
6001
0
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 1) < 0))
6002
0
                            goto done;
6003
0
                        htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
6004
0
                                     "Misplaced DOCTYPE declaration\n",
6005
0
                                     BAD_CAST "DOCTYPE" , NULL);
6006
0
                        htmlParseDocTypeDecl(ctxt);
6007
0
                    } else if ((in->cur[2] == '-') && (in->cur[3] == '-')) {
6008
0
                        if ((!terminate) &&
6009
0
                            (htmlParseLookupCommentEnd(ctxt) < 0))
6010
0
                            goto done;
6011
#ifdef DEBUG_PUSH
6012
                        xmlGenericError(xmlGenericErrorContext,
6013
                                "HPP: Parsing Comment\n");
6014
#endif
6015
0
                        htmlParseComment(ctxt);
6016
0
                        ctxt->instate = XML_PARSER_CONTENT;
6017
0
                    } else {
6018
0
                        if ((!terminate) &&
6019
0
                            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6020
0
                            goto done;
6021
0
                        htmlSkipBogusComment(ctxt);
6022
0
                    }
6023
0
                } else if ((cur == '<') && (next == '?')) {
6024
0
                    if ((!terminate) &&
6025
0
                        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6026
0
                        goto done;
6027
#ifdef DEBUG_PUSH
6028
                    xmlGenericError(xmlGenericErrorContext,
6029
                            "HPP: Parsing PI\n");
6030
#endif
6031
0
                    htmlParsePI(ctxt);
6032
0
                    ctxt->instate = XML_PARSER_CONTENT;
6033
0
                } else if ((cur == '<') && (next == '!') && (avail < 4)) {
6034
0
                    goto done;
6035
0
                } else if ((cur == '<') && (next == '/')) {
6036
0
                    ctxt->instate = XML_PARSER_END_TAG;
6037
0
                    ctxt->checkIndex = 0;
6038
#ifdef DEBUG_PUSH
6039
                    xmlGenericError(xmlGenericErrorContext,
6040
                            "HPP: entering END_TAG\n");
6041
#endif
6042
0
                    break;
6043
0
                } else if ((cur == '<') && IS_ASCII_LETTER(next)) {
6044
0
                    if ((!terminate) && (next == 0))
6045
0
                        goto done;
6046
0
                    ctxt->instate = XML_PARSER_START_TAG;
6047
0
                    ctxt->checkIndex = 0;
6048
#ifdef DEBUG_PUSH
6049
                    xmlGenericError(xmlGenericErrorContext,
6050
                            "HPP: entering START_TAG\n");
6051
#endif
6052
0
                    break;
6053
0
                } else if (cur == '<') {
6054
0
                    if ((ctxt->sax != NULL) && (!ctxt->disableSAX) &&
6055
0
                        (ctxt->sax->characters != NULL))
6056
0
                        ctxt->sax->characters(ctxt->userData,
6057
0
                                              BAD_CAST "<", 1);
6058
0
                    NEXT;
6059
0
                } else {
6060
                    /*
6061
                     * check that the text sequence is complete
6062
                     * before handing out the data to the parser
6063
                     * to avoid problems with erroneous end of
6064
                     * data detection.
6065
                     */
6066
0
                    if ((!terminate) &&
6067
0
                        (htmlParseLookupSequence(ctxt, '<', 0, 0, 0) < 0))
6068
0
                        goto done;
6069
0
                    ctxt->checkIndex = 0;
6070
#ifdef DEBUG_PUSH
6071
                    xmlGenericError(xmlGenericErrorContext,
6072
                            "HPP: Parsing char data\n");
6073
#endif
6074
0
                    while ((ctxt->instate != XML_PARSER_EOF) &&
6075
0
                           (cur != '<') && (in->cur < in->end)) {
6076
0
                        if (cur == '&') {
6077
0
                            htmlParseReference(ctxt);
6078
0
                        } else {
6079
0
                            htmlParseCharData(ctxt);
6080
0
                        }
6081
0
                        cur = in->cur[0];
6082
0
                    }
6083
0
    }
6084
6085
0
    break;
6086
0
      }
6087
0
            case XML_PARSER_END_TAG:
6088
0
    if (avail < 2)
6089
0
        goto done;
6090
0
    if ((!terminate) &&
6091
0
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0) < 0))
6092
0
        goto done;
6093
0
    htmlParseEndTag(ctxt);
6094
0
    if (ctxt->nameNr == 0) {
6095
0
        ctxt->instate = XML_PARSER_EPILOG;
6096
0
    } else {
6097
0
        ctxt->instate = XML_PARSER_CONTENT;
6098
0
    }
6099
0
    ctxt->checkIndex = 0;
6100
#ifdef DEBUG_PUSH
6101
    xmlGenericError(xmlGenericErrorContext,
6102
      "HPP: entering CONTENT\n");
6103
#endif
6104
0
          break;
6105
0
            case XML_PARSER_CDATA_SECTION:
6106
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6107
0
      "HPP: internal error, state == CDATA\n",
6108
0
           NULL, NULL);
6109
0
    ctxt->instate = XML_PARSER_CONTENT;
6110
0
    ctxt->checkIndex = 0;
6111
#ifdef DEBUG_PUSH
6112
    xmlGenericError(xmlGenericErrorContext,
6113
      "HPP: entering CONTENT\n");
6114
#endif
6115
0
    break;
6116
0
            case XML_PARSER_DTD:
6117
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6118
0
      "HPP: internal error, state == DTD\n",
6119
0
           NULL, NULL);
6120
0
    ctxt->instate = XML_PARSER_CONTENT;
6121
0
    ctxt->checkIndex = 0;
6122
#ifdef DEBUG_PUSH
6123
    xmlGenericError(xmlGenericErrorContext,
6124
      "HPP: entering CONTENT\n");
6125
#endif
6126
0
    break;
6127
0
            case XML_PARSER_COMMENT:
6128
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6129
0
      "HPP: internal error, state == COMMENT\n",
6130
0
           NULL, NULL);
6131
0
    ctxt->instate = XML_PARSER_CONTENT;
6132
0
    ctxt->checkIndex = 0;
6133
#ifdef DEBUG_PUSH
6134
    xmlGenericError(xmlGenericErrorContext,
6135
      "HPP: entering CONTENT\n");
6136
#endif
6137
0
    break;
6138
0
            case XML_PARSER_PI:
6139
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6140
0
      "HPP: internal error, state == PI\n",
6141
0
           NULL, NULL);
6142
0
    ctxt->instate = XML_PARSER_CONTENT;
6143
0
    ctxt->checkIndex = 0;
6144
#ifdef DEBUG_PUSH
6145
    xmlGenericError(xmlGenericErrorContext,
6146
      "HPP: entering CONTENT\n");
6147
#endif
6148
0
    break;
6149
0
            case XML_PARSER_ENTITY_DECL:
6150
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6151
0
      "HPP: internal error, state == ENTITY_DECL\n",
6152
0
           NULL, NULL);
6153
0
    ctxt->instate = XML_PARSER_CONTENT;
6154
0
    ctxt->checkIndex = 0;
6155
#ifdef DEBUG_PUSH
6156
    xmlGenericError(xmlGenericErrorContext,
6157
      "HPP: entering CONTENT\n");
6158
#endif
6159
0
    break;
6160
0
            case XML_PARSER_ENTITY_VALUE:
6161
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6162
0
      "HPP: internal error, state == ENTITY_VALUE\n",
6163
0
           NULL, NULL);
6164
0
    ctxt->instate = XML_PARSER_CONTENT;
6165
0
    ctxt->checkIndex = 0;
6166
#ifdef DEBUG_PUSH
6167
    xmlGenericError(xmlGenericErrorContext,
6168
      "HPP: entering DTD\n");
6169
#endif
6170
0
    break;
6171
0
            case XML_PARSER_ATTRIBUTE_VALUE:
6172
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6173
0
      "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6174
0
           NULL, NULL);
6175
0
    ctxt->instate = XML_PARSER_START_TAG;
6176
0
    ctxt->checkIndex = 0;
6177
#ifdef DEBUG_PUSH
6178
    xmlGenericError(xmlGenericErrorContext,
6179
      "HPP: entering START_TAG\n");
6180
#endif
6181
0
    break;
6182
0
      case XML_PARSER_SYSTEM_LITERAL:
6183
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6184
0
        "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6185
0
           NULL, NULL);
6186
0
    ctxt->instate = XML_PARSER_CONTENT;
6187
0
    ctxt->checkIndex = 0;
6188
#ifdef DEBUG_PUSH
6189
    xmlGenericError(xmlGenericErrorContext,
6190
      "HPP: entering CONTENT\n");
6191
#endif
6192
0
    break;
6193
0
      case XML_PARSER_IGNORE:
6194
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6195
0
      "HPP: internal error, state == XML_PARSER_IGNORE\n",
6196
0
           NULL, NULL);
6197
0
    ctxt->instate = XML_PARSER_CONTENT;
6198
0
    ctxt->checkIndex = 0;
6199
#ifdef DEBUG_PUSH
6200
    xmlGenericError(xmlGenericErrorContext,
6201
      "HPP: entering CONTENT\n");
6202
#endif
6203
0
    break;
6204
0
      case XML_PARSER_PUBLIC_LITERAL:
6205
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6206
0
      "HPP: internal error, state == XML_PARSER_LITERAL\n",
6207
0
           NULL, NULL);
6208
0
    ctxt->instate = XML_PARSER_CONTENT;
6209
0
    ctxt->checkIndex = 0;
6210
#ifdef DEBUG_PUSH
6211
    xmlGenericError(xmlGenericErrorContext,
6212
      "HPP: entering CONTENT\n");
6213
#endif
6214
0
    break;
6215
6216
0
  }
6217
0
    }
6218
0
done:
6219
0
    if ((avail == 0) && (terminate)) {
6220
0
  htmlAutoCloseOnEnd(ctxt);
6221
0
  if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6222
      /*
6223
       * SAX: end of the document processing.
6224
       */
6225
0
      ctxt->instate = XML_PARSER_EOF;
6226
0
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6227
0
    ctxt->sax->endDocument(ctxt->userData);
6228
0
  }
6229
0
    }
6230
0
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6231
0
  ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6232
0
   (ctxt->instate == XML_PARSER_EPILOG))) {
6233
0
  xmlDtdPtr dtd;
6234
0
  dtd = xmlGetIntSubset(ctxt->myDoc);
6235
0
  if (dtd == NULL)
6236
0
      ctxt->myDoc->intSubset =
6237
0
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6238
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6239
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6240
0
    }
6241
#ifdef DEBUG_PUSH
6242
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6243
#endif
6244
0
    return(ret);
6245
0
}
6246
6247
/**
6248
 * htmlParseChunk:
6249
 * @ctxt:  an HTML parser context
6250
 * @chunk:  an char array
6251
 * @size:  the size in byte of the chunk
6252
 * @terminate:  last chunk indicator
6253
 *
6254
 * Parse a Chunk of memory
6255
 *
6256
 * Returns zero if no error, the xmlParserErrors otherwise.
6257
 */
6258
int
6259
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6260
0
              int terminate) {
6261
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
6262
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6263
0
         "htmlParseChunk: context error\n", NULL, NULL);
6264
0
  return(XML_ERR_INTERNAL_ERROR);
6265
0
    }
6266
0
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6267
0
        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6268
0
  size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6269
0
  size_t cur = ctxt->input->cur - ctxt->input->base;
6270
0
  int res;
6271
6272
0
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6273
0
        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6274
0
  if (res < 0) {
6275
0
      ctxt->errNo = XML_PARSER_EOF;
6276
0
      ctxt->disableSAX = 1;
6277
0
      return (XML_PARSER_EOF);
6278
0
  }
6279
#ifdef DEBUG_PUSH
6280
  xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6281
#endif
6282
6283
#if 0
6284
  if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6285
      htmlParseTryOrFinish(ctxt, terminate);
6286
#endif
6287
0
    } else if (ctxt->instate != XML_PARSER_EOF) {
6288
0
  if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6289
0
      xmlParserInputBufferPtr in = ctxt->input->buf;
6290
0
      if ((in->encoder != NULL) && (in->buffer != NULL) &&
6291
0
        (in->raw != NULL)) {
6292
0
    int nbchars;
6293
0
    size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6294
0
    size_t current = ctxt->input->cur - ctxt->input->base;
6295
6296
0
    nbchars = xmlCharEncInput(in, terminate);
6297
0
    xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6298
0
    if (nbchars < 0) {
6299
0
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6300
0
               "encoder error\n", NULL, NULL);
6301
0
        return(XML_ERR_INVALID_ENCODING);
6302
0
    }
6303
0
      }
6304
0
  }
6305
0
    }
6306
0
    htmlParseTryOrFinish(ctxt, terminate);
6307
0
    if (terminate) {
6308
0
  if ((ctxt->instate != XML_PARSER_EOF) &&
6309
0
      (ctxt->instate != XML_PARSER_EPILOG) &&
6310
0
      (ctxt->instate != XML_PARSER_MISC)) {
6311
0
      ctxt->errNo = XML_ERR_DOCUMENT_END;
6312
0
      ctxt->wellFormed = 0;
6313
0
  }
6314
0
  if (ctxt->instate != XML_PARSER_EOF) {
6315
0
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6316
0
    ctxt->sax->endDocument(ctxt->userData);
6317
0
  }
6318
0
  ctxt->instate = XML_PARSER_EOF;
6319
0
    }
6320
0
    return((xmlParserErrors) ctxt->errNo);
6321
0
}
6322
6323
/************************************************************************
6324
 *                  *
6325
 *      User entry points       *
6326
 *                  *
6327
 ************************************************************************/
6328
6329
/**
6330
 * htmlCreatePushParserCtxt:
6331
 * @sax:  a SAX handler
6332
 * @user_data:  The user data returned on SAX callbacks
6333
 * @chunk:  a pointer to an array of chars
6334
 * @size:  number of chars in the array
6335
 * @filename:  an optional file name or URI
6336
 * @enc:  an optional encoding
6337
 *
6338
 * Create a parser context for using the HTML parser in push mode
6339
 * The value of @filename is used for fetching external entities
6340
 * and error/warning reports.
6341
 *
6342
 * Returns the new parser context or NULL
6343
 */
6344
htmlParserCtxtPtr
6345
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6346
                         const char *chunk, int size, const char *filename,
6347
0
       xmlCharEncoding enc) {
6348
0
    htmlParserCtxtPtr ctxt;
6349
0
    htmlParserInputPtr inputStream;
6350
0
    xmlParserInputBufferPtr buf;
6351
6352
0
    xmlInitParser();
6353
6354
0
    buf = xmlAllocParserInputBuffer(enc);
6355
0
    if (buf == NULL) return(NULL);
6356
6357
0
    ctxt = htmlNewSAXParserCtxt(sax, user_data);
6358
0
    if (ctxt == NULL) {
6359
0
  xmlFreeParserInputBuffer(buf);
6360
0
  return(NULL);
6361
0
    }
6362
0
    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6363
0
  ctxt->charset=XML_CHAR_ENCODING_UTF8;
6364
0
    if (filename == NULL) {
6365
0
  ctxt->directory = NULL;
6366
0
    } else {
6367
0
        ctxt->directory = xmlParserGetDirectory(filename);
6368
0
    }
6369
6370
0
    inputStream = htmlNewInputStream(ctxt);
6371
0
    if (inputStream == NULL) {
6372
0
  xmlFreeParserCtxt(ctxt);
6373
0
  xmlFree(buf);
6374
0
  return(NULL);
6375
0
    }
6376
6377
0
    if (filename == NULL)
6378
0
  inputStream->filename = NULL;
6379
0
    else
6380
0
  inputStream->filename = (char *)
6381
0
      xmlCanonicPath((const xmlChar *) filename);
6382
0
    inputStream->buf = buf;
6383
0
    xmlBufResetInput(buf->buffer, inputStream);
6384
6385
0
    inputPush(ctxt, inputStream);
6386
6387
0
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6388
0
        (ctxt->input->buf != NULL))  {
6389
0
  size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6390
0
  size_t cur = ctxt->input->cur - ctxt->input->base;
6391
6392
0
  xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6393
6394
0
        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6395
#ifdef DEBUG_PUSH
6396
  xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6397
#endif
6398
0
    }
6399
0
    ctxt->progressive = 1;
6400
6401
0
    return(ctxt);
6402
0
}
6403
#endif /* LIBXML_PUSH_ENABLED */
6404
6405
/**
6406
 * htmlSAXParseDoc:
6407
 * @cur:  a pointer to an array of xmlChar
6408
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6409
 * @sax:  the SAX handler block
6410
 * @userData: if using SAX, this pointer will be provided on callbacks.
6411
 *
6412
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadDoc.
6413
 *
6414
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6415
 * to handle parse events. If sax is NULL, fallback to the default DOM
6416
 * behavior and return a tree.
6417
 *
6418
 * Returns the resulting document tree unless SAX is NULL or the document is
6419
 *     not well formed.
6420
 */
6421
6422
htmlDocPtr
6423
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6424
0
                htmlSAXHandlerPtr sax, void *userData) {
6425
0
    htmlDocPtr ret;
6426
0
    htmlParserCtxtPtr ctxt;
6427
6428
0
    xmlInitParser();
6429
6430
0
    if (cur == NULL) return(NULL);
6431
6432
6433
0
    ctxt = htmlCreateDocParserCtxt(cur, encoding);
6434
0
    if (ctxt == NULL) return(NULL);
6435
0
    if (sax != NULL) {
6436
0
        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6437
0
        ctxt->sax = sax;
6438
0
        ctxt->userData = userData;
6439
0
    }
6440
6441
0
    htmlParseDocument(ctxt);
6442
0
    ret = ctxt->myDoc;
6443
0
    if (sax != NULL) {
6444
0
  ctxt->sax = NULL;
6445
0
  ctxt->userData = NULL;
6446
0
    }
6447
0
    htmlFreeParserCtxt(ctxt);
6448
6449
0
    return(ret);
6450
0
}
6451
6452
/**
6453
 * htmlParseDoc:
6454
 * @cur:  a pointer to an array of xmlChar
6455
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6456
 *
6457
 * parse an HTML in-memory document and build a tree.
6458
 *
6459
 * Returns the resulting document tree
6460
 */
6461
6462
htmlDocPtr
6463
0
htmlParseDoc(const xmlChar *cur, const char *encoding) {
6464
0
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6465
0
}
6466
6467
6468
/**
6469
 * htmlCreateFileParserCtxt:
6470
 * @filename:  the filename
6471
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6472
 *
6473
 * Create a parser context for a file content.
6474
 * Automatic support for ZLIB/Compress compressed document is provided
6475
 * by default if found at compile-time.
6476
 *
6477
 * Returns the new parser context or NULL
6478
 */
6479
htmlParserCtxtPtr
6480
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6481
0
{
6482
0
    htmlParserCtxtPtr ctxt;
6483
0
    htmlParserInputPtr inputStream;
6484
0
    char *canonicFilename;
6485
    /* htmlCharEncoding enc; */
6486
0
    xmlChar *content, *content_line = (xmlChar *) "charset=";
6487
6488
0
    if (filename == NULL)
6489
0
        return(NULL);
6490
6491
0
    ctxt = htmlNewParserCtxt();
6492
0
    if (ctxt == NULL) {
6493
0
  return(NULL);
6494
0
    }
6495
0
    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6496
0
    if (canonicFilename == NULL) {
6497
0
  xmlFreeParserCtxt(ctxt);
6498
0
  return(NULL);
6499
0
    }
6500
6501
0
    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6502
0
    xmlFree(canonicFilename);
6503
0
    if (inputStream == NULL) {
6504
0
  xmlFreeParserCtxt(ctxt);
6505
0
  return(NULL);
6506
0
    }
6507
6508
0
    inputPush(ctxt, inputStream);
6509
6510
    /* set encoding */
6511
0
    if (encoding) {
6512
0
        size_t l = strlen(encoding);
6513
6514
0
  if (l < 1000) {
6515
0
      content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6516
0
      if (content) {
6517
0
    strcpy ((char *)content, (char *)content_line);
6518
0
    strcat ((char *)content, (char *)encoding);
6519
0
    htmlCheckEncoding (ctxt, content);
6520
0
    xmlFree (content);
6521
0
      }
6522
0
  }
6523
0
    }
6524
6525
0
    return(ctxt);
6526
0
}
6527
6528
/**
6529
 * htmlSAXParseFile:
6530
 * @filename:  the filename
6531
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6532
 * @sax:  the SAX handler block
6533
 * @userData: if using SAX, this pointer will be provided on callbacks.
6534
 *
6535
 * DEPRECATED: Use htmlNewSAXParserCtxt and htmlCtxtReadFile.
6536
 *
6537
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6538
 * compressed document is provided by default if found at compile-time.
6539
 * It use the given SAX function block to handle the parsing callback.
6540
 * If sax is NULL, fallback to the default DOM tree building routines.
6541
 *
6542
 * Returns the resulting document tree unless SAX is NULL or the document is
6543
 *     not well formed.
6544
 */
6545
6546
htmlDocPtr
6547
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6548
0
                 void *userData) {
6549
0
    htmlDocPtr ret;
6550
0
    htmlParserCtxtPtr ctxt;
6551
0
    htmlSAXHandlerPtr oldsax = NULL;
6552
6553
0
    xmlInitParser();
6554
6555
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6556
0
    if (ctxt == NULL) return(NULL);
6557
0
    if (sax != NULL) {
6558
0
  oldsax = ctxt->sax;
6559
0
        ctxt->sax = sax;
6560
0
        ctxt->userData = userData;
6561
0
    }
6562
6563
0
    htmlParseDocument(ctxt);
6564
6565
0
    ret = ctxt->myDoc;
6566
0
    if (sax != NULL) {
6567
0
        ctxt->sax = oldsax;
6568
0
        ctxt->userData = NULL;
6569
0
    }
6570
0
    htmlFreeParserCtxt(ctxt);
6571
6572
0
    return(ret);
6573
0
}
6574
6575
/**
6576
 * htmlParseFile:
6577
 * @filename:  the filename
6578
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6579
 *
6580
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6581
 * compressed document is provided by default if found at compile-time.
6582
 *
6583
 * Returns the resulting document tree
6584
 */
6585
6586
htmlDocPtr
6587
0
htmlParseFile(const char *filename, const char *encoding) {
6588
0
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6589
0
}
6590
6591
/**
6592
 * htmlHandleOmittedElem:
6593
 * @val:  int 0 or 1
6594
 *
6595
 * Set and return the previous value for handling HTML omitted tags.
6596
 *
6597
 * Returns the last value for 0 for no handling, 1 for auto insertion.
6598
 */
6599
6600
int
6601
0
htmlHandleOmittedElem(int val) {
6602
0
    int old = htmlOmittedDefaultValue;
6603
6604
0
    htmlOmittedDefaultValue = val;
6605
0
    return(old);
6606
0
}
6607
6608
/**
6609
 * htmlElementAllowedHere:
6610
 * @parent: HTML parent element
6611
 * @elt: HTML element
6612
 *
6613
 * Checks whether an HTML element may be a direct child of a parent element.
6614
 * Note - doesn't check for deprecated elements
6615
 *
6616
 * Returns 1 if allowed; 0 otherwise.
6617
 */
6618
int
6619
0
htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6620
0
  const char** p ;
6621
6622
0
  if ( ! elt || ! parent || ! parent->subelts )
6623
0
  return 0 ;
6624
6625
0
  for ( p = parent->subelts; *p; ++p )
6626
0
    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6627
0
      return 1 ;
6628
6629
0
  return 0 ;
6630
0
}
6631
/**
6632
 * htmlElementStatusHere:
6633
 * @parent: HTML parent element
6634
 * @elt: HTML element
6635
 *
6636
 * Checks whether an HTML element may be a direct child of a parent element.
6637
 * and if so whether it is valid or deprecated.
6638
 *
6639
 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6640
 */
6641
htmlStatus
6642
0
htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6643
0
  if ( ! parent || ! elt )
6644
0
    return HTML_INVALID ;
6645
0
  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6646
0
    return HTML_INVALID ;
6647
6648
0
  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6649
0
}
6650
/**
6651
 * htmlAttrAllowed:
6652
 * @elt: HTML element
6653
 * @attr: HTML attribute
6654
 * @legacy: whether to allow deprecated attributes
6655
 *
6656
 * Checks whether an attribute is valid for an element
6657
 * Has full knowledge of Required and Deprecated attributes
6658
 *
6659
 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6660
 */
6661
htmlStatus
6662
0
htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6663
0
  const char** p ;
6664
6665
0
  if ( !elt || ! attr )
6666
0
  return HTML_INVALID ;
6667
6668
0
  if ( elt->attrs_req )
6669
0
    for ( p = elt->attrs_req; *p; ++p)
6670
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6671
0
        return HTML_REQUIRED ;
6672
6673
0
  if ( elt->attrs_opt )
6674
0
    for ( p = elt->attrs_opt; *p; ++p)
6675
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6676
0
        return HTML_VALID ;
6677
6678
0
  if ( legacy && elt->attrs_depr )
6679
0
    for ( p = elt->attrs_depr; *p; ++p)
6680
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6681
0
        return HTML_DEPRECATED ;
6682
6683
0
  return HTML_INVALID ;
6684
0
}
6685
/**
6686
 * htmlNodeStatus:
6687
 * @node: an htmlNodePtr in a tree
6688
 * @legacy: whether to allow deprecated elements (YES is faster here
6689
 *  for Element nodes)
6690
 *
6691
 * Checks whether the tree node is valid.  Experimental (the author
6692
 *     only uses the HTML enhancements in a SAX parser)
6693
 *
6694
 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6695
 *  legacy allowed) or htmlElementStatusHere (otherwise).
6696
 *  for Attribute nodes, a return from htmlAttrAllowed
6697
 *  for other nodes, HTML_NA (no checks performed)
6698
 */
6699
htmlStatus
6700
0
htmlNodeStatus(const htmlNodePtr node, int legacy) {
6701
0
  if ( ! node )
6702
0
    return HTML_INVALID ;
6703
6704
0
  switch ( node->type ) {
6705
0
    case XML_ELEMENT_NODE:
6706
0
      return legacy
6707
0
  ? ( htmlElementAllowedHere (
6708
0
    htmlTagLookup(node->parent->name) , node->name
6709
0
    ) ? HTML_VALID : HTML_INVALID )
6710
0
  : htmlElementStatusHere(
6711
0
    htmlTagLookup(node->parent->name) ,
6712
0
    htmlTagLookup(node->name) )
6713
0
  ;
6714
0
    case XML_ATTRIBUTE_NODE:
6715
0
      return htmlAttrAllowed(
6716
0
  htmlTagLookup(node->parent->name) , node->name, legacy) ;
6717
0
    default: return HTML_NA ;
6718
0
  }
6719
0
}
6720
/************************************************************************
6721
 *                  *
6722
 *  New set (2.6.0) of simpler and more flexible APIs   *
6723
 *                  *
6724
 ************************************************************************/
6725
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named "dict" must be visible where the
 * macro is used. @str is evaluated more than once, so it must not have
 * side effects. The expansion is a single statement and has to be
 * followed by a semicolon.
 */
#define DICT_FREE(str)						\
    do {							\
        if ((str) && ((!dict) ||				\
            (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
            xmlFree((char *)(str));				\
    } while (0)
6736
6737
/**
6738
 * htmlCtxtReset:
6739
 * @ctxt: an HTML parser context
6740
 *
6741
 * Reset a parser context
6742
 */
6743
void
6744
htmlCtxtReset(htmlParserCtxtPtr ctxt)
6745
0
{
6746
0
    xmlParserInputPtr input;
6747
0
    xmlDictPtr dict;
6748
6749
0
    if (ctxt == NULL)
6750
0
        return;
6751
6752
0
    xmlInitParser();
6753
0
    dict = ctxt->dict;
6754
6755
0
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6756
0
        xmlFreeInputStream(input);
6757
0
    }
6758
0
    ctxt->inputNr = 0;
6759
0
    ctxt->input = NULL;
6760
6761
0
    ctxt->spaceNr = 0;
6762
0
    if (ctxt->spaceTab != NULL) {
6763
0
  ctxt->spaceTab[0] = -1;
6764
0
  ctxt->space = &ctxt->spaceTab[0];
6765
0
    } else {
6766
0
  ctxt->space = NULL;
6767
0
    }
6768
6769
6770
0
    ctxt->nodeNr = 0;
6771
0
    ctxt->node = NULL;
6772
6773
0
    ctxt->nameNr = 0;
6774
0
    ctxt->name = NULL;
6775
6776
0
    ctxt->nsNr = 0;
6777
6778
0
    DICT_FREE(ctxt->version);
6779
0
    ctxt->version = NULL;
6780
0
    DICT_FREE(ctxt->encoding);
6781
0
    ctxt->encoding = NULL;
6782
0
    DICT_FREE(ctxt->directory);
6783
0
    ctxt->directory = NULL;
6784
0
    DICT_FREE(ctxt->extSubURI);
6785
0
    ctxt->extSubURI = NULL;
6786
0
    DICT_FREE(ctxt->extSubSystem);
6787
0
    ctxt->extSubSystem = NULL;
6788
0
    if (ctxt->myDoc != NULL)
6789
0
        xmlFreeDoc(ctxt->myDoc);
6790
0
    ctxt->myDoc = NULL;
6791
6792
0
    ctxt->standalone = -1;
6793
0
    ctxt->hasExternalSubset = 0;
6794
0
    ctxt->hasPErefs = 0;
6795
0
    ctxt->html = 1;
6796
0
    ctxt->external = 0;
6797
0
    ctxt->instate = XML_PARSER_START;
6798
0
    ctxt->token = 0;
6799
6800
0
    ctxt->wellFormed = 1;
6801
0
    ctxt->nsWellFormed = 1;
6802
0
    ctxt->disableSAX = 0;
6803
0
    ctxt->valid = 1;
6804
0
    ctxt->vctxt.userData = ctxt;
6805
0
    ctxt->vctxt.error = xmlParserValidityError;
6806
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
6807
0
    ctxt->record_info = 0;
6808
0
    ctxt->checkIndex = 0;
6809
0
    ctxt->inSubset = 0;
6810
0
    ctxt->errNo = XML_ERR_OK;
6811
0
    ctxt->depth = 0;
6812
0
    ctxt->charset = XML_CHAR_ENCODING_NONE;
6813
0
    ctxt->catalogs = NULL;
6814
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
6815
6816
0
    if (ctxt->attsDefault != NULL) {
6817
0
        xmlHashFree(ctxt->attsDefault, xmlHashDefaultDeallocator);
6818
0
        ctxt->attsDefault = NULL;
6819
0
    }
6820
0
    if (ctxt->attsSpecial != NULL) {
6821
0
        xmlHashFree(ctxt->attsSpecial, NULL);
6822
0
        ctxt->attsSpecial = NULL;
6823
0
    }
6824
0
}
6825
6826
/**
6827
 * htmlCtxtUseOptions:
6828
 * @ctxt: an HTML parser context
6829
 * @options:  a combination of htmlParserOption(s)
6830
 *
6831
 * Applies the options to the parser context
6832
 *
6833
 * Returns 0 in case of success, the set of unknown or unimplemented options
6834
 *         in case of error.
6835
 */
6836
int
6837
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6838
0
{
6839
0
    if (ctxt == NULL)
6840
0
        return(-1);
6841
6842
0
    if (options & HTML_PARSE_NOWARNING) {
6843
0
        ctxt->sax->warning = NULL;
6844
0
        ctxt->vctxt.warning = NULL;
6845
0
        options -= XML_PARSE_NOWARNING;
6846
0
  ctxt->options |= XML_PARSE_NOWARNING;
6847
0
    }
6848
0
    if (options & HTML_PARSE_NOERROR) {
6849
0
        ctxt->sax->error = NULL;
6850
0
        ctxt->vctxt.error = NULL;
6851
0
        ctxt->sax->fatalError = NULL;
6852
0
        options -= XML_PARSE_NOERROR;
6853
0
  ctxt->options |= XML_PARSE_NOERROR;
6854
0
    }
6855
0
    if (options & HTML_PARSE_PEDANTIC) {
6856
0
        ctxt->pedantic = 1;
6857
0
        options -= XML_PARSE_PEDANTIC;
6858
0
  ctxt->options |= XML_PARSE_PEDANTIC;
6859
0
    } else
6860
0
        ctxt->pedantic = 0;
6861
0
    if (options & XML_PARSE_NOBLANKS) {
6862
0
        ctxt->keepBlanks = 0;
6863
0
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6864
0
        options -= XML_PARSE_NOBLANKS;
6865
0
  ctxt->options |= XML_PARSE_NOBLANKS;
6866
0
    } else
6867
0
        ctxt->keepBlanks = 1;
6868
0
    if (options & HTML_PARSE_RECOVER) {
6869
0
        ctxt->recovery = 1;
6870
0
  options -= HTML_PARSE_RECOVER;
6871
0
    } else
6872
0
        ctxt->recovery = 0;
6873
0
    if (options & HTML_PARSE_COMPACT) {
6874
0
  ctxt->options |= HTML_PARSE_COMPACT;
6875
0
        options -= HTML_PARSE_COMPACT;
6876
0
    }
6877
0
    if (options & XML_PARSE_HUGE) {
6878
0
  ctxt->options |= XML_PARSE_HUGE;
6879
0
        options -= XML_PARSE_HUGE;
6880
0
    }
6881
0
    if (options & HTML_PARSE_NODEFDTD) {
6882
0
  ctxt->options |= HTML_PARSE_NODEFDTD;
6883
0
        options -= HTML_PARSE_NODEFDTD;
6884
0
    }
6885
0
    if (options & HTML_PARSE_IGNORE_ENC) {
6886
0
  ctxt->options |= HTML_PARSE_IGNORE_ENC;
6887
0
        options -= HTML_PARSE_IGNORE_ENC;
6888
0
    }
6889
0
    if (options & HTML_PARSE_NOIMPLIED) {
6890
0
        ctxt->options |= HTML_PARSE_NOIMPLIED;
6891
0
        options -= HTML_PARSE_NOIMPLIED;
6892
0
    }
6893
0
    ctxt->dictNames = 0;
6894
0
    ctxt->linenumbers = 1;
6895
0
    return (options);
6896
0
}
6897
6898
/**
6899
 * htmlDoRead:
6900
 * @ctxt:  an HTML parser context
6901
 * @URL:  the base URL to use for the document
6902
 * @encoding:  the document encoding, or NULL
6903
 * @options:  a combination of htmlParserOption(s)
6904
 * @reuse:  keep the context for reuse
6905
 *
6906
 * Common front-end for the htmlRead functions
6907
 *
6908
 * Returns the resulting document tree or NULL
6909
 */
6910
static htmlDocPtr
6911
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6912
          int options, int reuse)
6913
0
{
6914
0
    htmlDocPtr ret;
6915
6916
0
    htmlCtxtUseOptions(ctxt, options);
6917
0
    ctxt->html = 1;
6918
0
    if (encoding != NULL) {
6919
0
        xmlCharEncodingHandlerPtr hdlr;
6920
6921
0
  hdlr = xmlFindCharEncodingHandler(encoding);
6922
0
  if (hdlr != NULL) {
6923
0
      xmlSwitchToEncoding(ctxt, hdlr);
6924
0
      if (ctxt->input->encoding != NULL)
6925
0
        xmlFree((xmlChar *) ctxt->input->encoding);
6926
0
            ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6927
0
        }
6928
0
    }
6929
0
    if ((URL != NULL) && (ctxt->input != NULL) &&
6930
0
        (ctxt->input->filename == NULL))
6931
0
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6932
0
    htmlParseDocument(ctxt);
6933
0
    ret = ctxt->myDoc;
6934
0
    ctxt->myDoc = NULL;
6935
0
    if (!reuse) {
6936
0
        if ((ctxt->dictNames) &&
6937
0
      (ret != NULL) &&
6938
0
      (ret->dict == ctxt->dict))
6939
0
      ctxt->dict = NULL;
6940
0
  xmlFreeParserCtxt(ctxt);
6941
0
    }
6942
0
    return (ret);
6943
0
}
6944
6945
/**
6946
 * htmlReadDoc:
6947
 * @cur:  a pointer to a zero terminated string
6948
 * @URL:  the base URL to use for the document
6949
 * @encoding:  the document encoding, or NULL
6950
 * @options:  a combination of htmlParserOption(s)
6951
 *
6952
 * parse an XML in-memory document and build a tree.
6953
 *
6954
 * Returns the resulting document tree
6955
 */
6956
htmlDocPtr
6957
htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6958
0
{
6959
0
    htmlParserCtxtPtr ctxt;
6960
6961
0
    if (cur == NULL)
6962
0
        return (NULL);
6963
6964
0
    xmlInitParser();
6965
0
    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6966
0
    if (ctxt == NULL)
6967
0
        return (NULL);
6968
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6969
0
}
6970
6971
/**
6972
 * htmlReadFile:
6973
 * @filename:  a file or URL
6974
 * @encoding:  the document encoding, or NULL
6975
 * @options:  a combination of htmlParserOption(s)
6976
 *
6977
 * parse an XML file from the filesystem or the network.
6978
 *
6979
 * Returns the resulting document tree
6980
 */
6981
htmlDocPtr
6982
htmlReadFile(const char *filename, const char *encoding, int options)
6983
0
{
6984
0
    htmlParserCtxtPtr ctxt;
6985
6986
0
    xmlInitParser();
6987
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6988
0
    if (ctxt == NULL)
6989
0
        return (NULL);
6990
0
    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6991
0
}
6992
6993
/**
6994
 * htmlReadMemory:
6995
 * @buffer:  a pointer to a char array
6996
 * @size:  the size of the array
6997
 * @URL:  the base URL to use for the document
6998
 * @encoding:  the document encoding, or NULL
6999
 * @options:  a combination of htmlParserOption(s)
7000
 *
7001
 * parse an XML in-memory document and build a tree.
7002
 *
7003
 * Returns the resulting document tree
7004
 */
7005
htmlDocPtr
7006
htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
7007
0
{
7008
0
    htmlParserCtxtPtr ctxt;
7009
7010
0
    xmlInitParser();
7011
0
    ctxt = htmlCreateMemoryParserCtxt(buffer, size);
7012
0
    if (ctxt == NULL)
7013
0
        return (NULL);
7014
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
7015
0
}
7016
7017
/**
7018
 * htmlReadFd:
7019
 * @fd:  an open file descriptor
7020
 * @URL:  the base URL to use for the document
7021
 * @encoding:  the document encoding, or NULL
7022
 * @options:  a combination of htmlParserOption(s)
7023
 *
7024
 * parse an HTML from a file descriptor and build a tree.
7025
 * NOTE that the file descriptor will not be closed when the
7026
 *      reader is closed or reset.
7027
 *
7028
 * Returns the resulting document tree
7029
 */
7030
htmlDocPtr
7031
htmlReadFd(int fd, const char *URL, const char *encoding, int options)
7032
0
{
7033
0
    htmlParserCtxtPtr ctxt;
7034
0
    xmlParserInputBufferPtr input;
7035
0
    htmlParserInputPtr stream;
7036
7037
0
    if (fd < 0)
7038
0
        return (NULL);
7039
7040
0
    xmlInitParser();
7041
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7042
0
    if (input == NULL)
7043
0
        return (NULL);
7044
0
    input->closecallback = NULL;
7045
0
    ctxt = htmlNewParserCtxt();
7046
0
    if (ctxt == NULL) {
7047
0
        xmlFreeParserInputBuffer(input);
7048
0
        return (NULL);
7049
0
    }
7050
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7051
0
    if (stream == NULL) {
7052
0
        xmlFreeParserInputBuffer(input);
7053
0
  htmlFreeParserCtxt(ctxt);
7054
0
        return (NULL);
7055
0
    }
7056
0
    inputPush(ctxt, stream);
7057
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
7058
0
}
7059
7060
/**
7061
 * htmlReadIO:
7062
 * @ioread:  an I/O read function
7063
 * @ioclose:  an I/O close function
7064
 * @ioctx:  an I/O handler
7065
 * @URL:  the base URL to use for the document
7066
 * @encoding:  the document encoding, or NULL
7067
 * @options:  a combination of htmlParserOption(s)
7068
 *
7069
 * parse an HTML document from I/O functions and source and build a tree.
7070
 *
7071
 * Returns the resulting document tree
7072
 */
7073
htmlDocPtr
7074
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
7075
          void *ioctx, const char *URL, const char *encoding, int options)
7076
0
{
7077
0
    htmlParserCtxtPtr ctxt;
7078
0
    xmlParserInputBufferPtr input;
7079
0
    xmlParserInputPtr stream;
7080
7081
0
    if (ioread == NULL)
7082
0
        return (NULL);
7083
0
    xmlInitParser();
7084
7085
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7086
0
                                         XML_CHAR_ENCODING_NONE);
7087
0
    if (input == NULL) {
7088
0
        if (ioclose != NULL)
7089
0
            ioclose(ioctx);
7090
0
        return (NULL);
7091
0
    }
7092
0
    ctxt = htmlNewParserCtxt();
7093
0
    if (ctxt == NULL) {
7094
0
        xmlFreeParserInputBuffer(input);
7095
0
        return (NULL);
7096
0
    }
7097
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7098
0
    if (stream == NULL) {
7099
0
        xmlFreeParserInputBuffer(input);
7100
0
  xmlFreeParserCtxt(ctxt);
7101
0
        return (NULL);
7102
0
    }
7103
0
    inputPush(ctxt, stream);
7104
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
7105
0
}
7106
7107
/**
7108
 * htmlCtxtReadDoc:
7109
 * @ctxt:  an HTML parser context
7110
 * @cur:  a pointer to a zero terminated string
7111
 * @URL:  the base URL to use for the document
7112
 * @encoding:  the document encoding, or NULL
7113
 * @options:  a combination of htmlParserOption(s)
7114
 *
7115
 * parse an XML in-memory document and build a tree.
7116
 * This reuses the existing @ctxt parser context
7117
 *
7118
 * Returns the resulting document tree
7119
 */
7120
htmlDocPtr
7121
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
7122
               const char *URL, const char *encoding, int options)
7123
0
{
7124
0
    if (cur == NULL)
7125
0
        return (NULL);
7126
0
    return (htmlCtxtReadMemory(ctxt, (const char *) cur, xmlStrlen(cur), URL,
7127
0
                               encoding, options));
7128
0
}
7129
7130
/**
7131
 * htmlCtxtReadFile:
7132
 * @ctxt:  an HTML parser context
7133
 * @filename:  a file or URL
7134
 * @encoding:  the document encoding, or NULL
7135
 * @options:  a combination of htmlParserOption(s)
7136
 *
7137
 * parse an XML file from the filesystem or the network.
7138
 * This reuses the existing @ctxt parser context
7139
 *
7140
 * Returns the resulting document tree
7141
 */
7142
htmlDocPtr
7143
htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7144
                const char *encoding, int options)
7145
0
{
7146
0
    xmlParserInputPtr stream;
7147
7148
0
    if (filename == NULL)
7149
0
        return (NULL);
7150
0
    if (ctxt == NULL)
7151
0
        return (NULL);
7152
0
    xmlInitParser();
7153
7154
0
    htmlCtxtReset(ctxt);
7155
7156
0
    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7157
0
    if (stream == NULL) {
7158
0
        return (NULL);
7159
0
    }
7160
0
    inputPush(ctxt, stream);
7161
0
    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7162
0
}
7163
7164
/**
7165
 * htmlCtxtReadMemory:
7166
 * @ctxt:  an HTML parser context
7167
 * @buffer:  a pointer to a char array
7168
 * @size:  the size of the array
7169
 * @URL:  the base URL to use for the document
7170
 * @encoding:  the document encoding, or NULL
7171
 * @options:  a combination of htmlParserOption(s)
7172
 *
7173
 * parse an XML in-memory document and build a tree.
7174
 * This reuses the existing @ctxt parser context
7175
 *
7176
 * Returns the resulting document tree
7177
 */
7178
htmlDocPtr
7179
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7180
                  const char *URL, const char *encoding, int options)
7181
0
{
7182
0
    xmlParserInputBufferPtr input;
7183
0
    xmlParserInputPtr stream;
7184
7185
0
    if (ctxt == NULL)
7186
0
        return (NULL);
7187
0
    if (buffer == NULL)
7188
0
        return (NULL);
7189
0
    xmlInitParser();
7190
7191
0
    htmlCtxtReset(ctxt);
7192
7193
0
    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7194
0
    if (input == NULL) {
7195
0
  return(NULL);
7196
0
    }
7197
7198
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7199
0
    if (stream == NULL) {
7200
0
  xmlFreeParserInputBuffer(input);
7201
0
  return(NULL);
7202
0
    }
7203
7204
0
    inputPush(ctxt, stream);
7205
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7206
0
}
7207
7208
/**
7209
 * htmlCtxtReadFd:
7210
 * @ctxt:  an HTML parser context
7211
 * @fd:  an open file descriptor
7212
 * @URL:  the base URL to use for the document
7213
 * @encoding:  the document encoding, or NULL
7214
 * @options:  a combination of htmlParserOption(s)
7215
 *
7216
 * parse an XML from a file descriptor and build a tree.
7217
 * This reuses the existing @ctxt parser context
7218
 *
7219
 * Returns the resulting document tree
7220
 */
7221
htmlDocPtr
7222
htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7223
              const char *URL, const char *encoding, int options)
7224
0
{
7225
0
    xmlParserInputBufferPtr input;
7226
0
    xmlParserInputPtr stream;
7227
7228
0
    if (fd < 0)
7229
0
        return (NULL);
7230
0
    if (ctxt == NULL)
7231
0
        return (NULL);
7232
0
    xmlInitParser();
7233
7234
0
    htmlCtxtReset(ctxt);
7235
7236
7237
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7238
0
    if (input == NULL)
7239
0
        return (NULL);
7240
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7241
0
    if (stream == NULL) {
7242
0
        xmlFreeParserInputBuffer(input);
7243
0
        return (NULL);
7244
0
    }
7245
0
    inputPush(ctxt, stream);
7246
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7247
0
}
7248
7249
/**
7250
 * htmlCtxtReadIO:
7251
 * @ctxt:  an HTML parser context
7252
 * @ioread:  an I/O read function
7253
 * @ioclose:  an I/O close function
7254
 * @ioctx:  an I/O handler
7255
 * @URL:  the base URL to use for the document
7256
 * @encoding:  the document encoding, or NULL
7257
 * @options:  a combination of htmlParserOption(s)
7258
 *
7259
 * parse an HTML document from I/O functions and source and build a tree.
7260
 * This reuses the existing @ctxt parser context
7261
 *
7262
 * Returns the resulting document tree
7263
 */
7264
htmlDocPtr
7265
htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7266
              xmlInputCloseCallback ioclose, void *ioctx,
7267
        const char *URL,
7268
              const char *encoding, int options)
7269
0
{
7270
0
    xmlParserInputBufferPtr input;
7271
0
    xmlParserInputPtr stream;
7272
7273
0
    if (ioread == NULL)
7274
0
        return (NULL);
7275
0
    if (ctxt == NULL)
7276
0
        return (NULL);
7277
0
    xmlInitParser();
7278
7279
0
    htmlCtxtReset(ctxt);
7280
7281
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7282
0
                                         XML_CHAR_ENCODING_NONE);
7283
0
    if (input == NULL) {
7284
0
        if (ioclose != NULL)
7285
0
            ioclose(ioctx);
7286
0
        return (NULL);
7287
0
    }
7288
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7289
0
    if (stream == NULL) {
7290
0
        xmlFreeParserInputBuffer(input);
7291
0
        return (NULL);
7292
0
    }
7293
0
    inputPush(ctxt, stream);
7294
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7295
0
}
7296
7297
#endif /* LIBXML_HTML_ENABLED */