Coverage Report

Created: 2025-08-04 07:15

/src/libxml2-2.9.7/HTMLparser.c
Line
Count
Source (jump to first uncovered line)
1
/*
2
 * HTMLparser.c : an HTML 4.0 non-verifying parser
3
 *
4
 * See Copyright for the status of this software.
5
 *
6
 * daniel@veillard.com
7
 */
8
9
#define IN_LIBXML
10
#include "libxml.h"
11
#ifdef LIBXML_HTML_ENABLED
12
13
#include <string.h>
14
#ifdef HAVE_CTYPE_H
15
#include <ctype.h>
16
#endif
17
#ifdef HAVE_STDLIB_H
18
#include <stdlib.h>
19
#endif
20
#ifdef HAVE_SYS_STAT_H
21
#include <sys/stat.h>
22
#endif
23
#ifdef HAVE_FCNTL_H
24
#include <fcntl.h>
25
#endif
26
#ifdef HAVE_UNISTD_H
27
#include <unistd.h>
28
#endif
29
#ifdef HAVE_ZLIB_H
30
#include <zlib.h>
31
#endif
32
33
#include <libxml/xmlmemory.h>
34
#include <libxml/tree.h>
35
#include <libxml/parser.h>
36
#include <libxml/parserInternals.h>
37
#include <libxml/xmlerror.h>
38
#include <libxml/HTMLparser.h>
39
#include <libxml/HTMLtree.h>
40
#include <libxml/entities.h>
41
#include <libxml/encoding.h>
42
#include <libxml/valid.h>
43
#include <libxml/xmlIO.h>
44
#include <libxml/globals.h>
45
#include <libxml/uri.h>
46
47
#include "buf.h"
48
#include "enc.h"
49
50
#define HTML_MAX_NAMELEN 1000
51
0
#define HTML_PARSER_BIG_BUFFER_SIZE 1000
52
0
#define HTML_PARSER_BUFFER_SIZE 100
53
54
/* #define DEBUG */
55
/* #define DEBUG_PUSH */
56
57
static int htmlOmittedDefaultValue = 1;
58
59
xmlChar * htmlDecodeEntities(htmlParserCtxtPtr ctxt, int len,
60
           xmlChar end, xmlChar  end2, xmlChar end3);
61
static void htmlParseComment(htmlParserCtxtPtr ctxt);
62
63
/************************************************************************
64
 *                  *
65
 *    Some factorized error routines        *
66
 *                  *
67
 ************************************************************************/
68
69
/**
70
 * htmlErrMemory:
71
 * @ctxt:  an HTML parser context
72
 * @extra:  extra informations
73
 *
74
 * Handle a redefinition of attribute error
75
 */
76
static void
77
htmlErrMemory(xmlParserCtxtPtr ctxt, const char *extra)
78
0
{
79
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
80
0
        (ctxt->instate == XML_PARSER_EOF))
81
0
  return;
82
0
    if (ctxt != NULL) {
83
0
        ctxt->errNo = XML_ERR_NO_MEMORY;
84
0
        ctxt->instate = XML_PARSER_EOF;
85
0
        ctxt->disableSAX = 1;
86
0
    }
87
0
    if (extra)
88
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
89
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, extra,
90
0
                        NULL, NULL, 0, 0,
91
0
                        "Memory allocation failed : %s\n", extra);
92
0
    else
93
0
        __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_PARSER,
94
0
                        XML_ERR_NO_MEMORY, XML_ERR_FATAL, NULL, 0, NULL,
95
0
                        NULL, NULL, 0, 0, "Memory allocation failed\n");
96
0
}
97
98
/**
99
 * htmlParseErr:
100
 * @ctxt:  an HTML parser context
101
 * @error:  the error number
102
 * @msg:  the error message
103
 * @str1:  string infor
104
 * @str2:  string infor
105
 *
106
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
107
 */
108
static void LIBXML_ATTR_FORMAT(3,0)
109
htmlParseErr(xmlParserCtxtPtr ctxt, xmlParserErrors error,
110
             const char *msg, const xmlChar *str1, const xmlChar *str2)
111
0
{
112
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
113
0
        (ctxt->instate == XML_PARSER_EOF))
114
0
  return;
115
0
    if (ctxt != NULL)
116
0
  ctxt->errNo = error;
117
0
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
118
0
                    XML_ERR_ERROR, NULL, 0,
119
0
        (const char *) str1, (const char *) str2,
120
0
        NULL, 0, 0,
121
0
        msg, str1, str2);
122
0
    if (ctxt != NULL)
123
0
  ctxt->wellFormed = 0;
124
0
}
125
126
/**
127
 * htmlParseErrInt:
128
 * @ctxt:  an HTML parser context
129
 * @error:  the error number
130
 * @msg:  the error message
131
 * @val:  integer info
132
 *
133
 * Handle a fatal parser error, i.e. violating Well-Formedness constraints
134
 */
135
static void LIBXML_ATTR_FORMAT(3,0)
136
htmlParseErrInt(xmlParserCtxtPtr ctxt, xmlParserErrors error,
137
             const char *msg, int val)
138
0
{
139
0
    if ((ctxt != NULL) && (ctxt->disableSAX != 0) &&
140
0
        (ctxt->instate == XML_PARSER_EOF))
141
0
  return;
142
0
    if (ctxt != NULL)
143
0
  ctxt->errNo = error;
144
0
    __xmlRaiseError(NULL, NULL, NULL, ctxt, NULL, XML_FROM_HTML, error,
145
0
                    XML_ERR_ERROR, NULL, 0, NULL, NULL,
146
0
        NULL, val, 0, msg, val);
147
0
    if (ctxt != NULL)
148
0
  ctxt->wellFormed = 0;
149
0
}
150
151
/************************************************************************
152
 *                  *
153
 *  Parser stacks related functions and macros    *
154
 *                  *
155
 ************************************************************************/
156
157
/**
158
 * htmlnamePush:
159
 * @ctxt:  an HTML parser context
160
 * @value:  the element name
161
 *
162
 * Pushes a new element name on top of the name stack
163
 *
164
 * Returns 0 in case of error, the index in the stack otherwise
165
 */
166
static int
167
htmlnamePush(htmlParserCtxtPtr ctxt, const xmlChar * value)
168
0
{
169
0
    if ((ctxt->html < 3) && (xmlStrEqual(value, BAD_CAST "head")))
170
0
        ctxt->html = 3;
171
0
    if ((ctxt->html < 10) && (xmlStrEqual(value, BAD_CAST "body")))
172
0
        ctxt->html = 10;
173
0
    if (ctxt->nameNr >= ctxt->nameMax) {
174
0
        ctxt->nameMax *= 2;
175
0
        ctxt->nameTab = (const xmlChar * *)
176
0
                         xmlRealloc((xmlChar * *)ctxt->nameTab,
177
0
                                    ctxt->nameMax *
178
0
                                    sizeof(ctxt->nameTab[0]));
179
0
        if (ctxt->nameTab == NULL) {
180
0
            htmlErrMemory(ctxt, NULL);
181
0
            return (0);
182
0
        }
183
0
    }
184
0
    ctxt->nameTab[ctxt->nameNr] = value;
185
0
    ctxt->name = value;
186
0
    return (ctxt->nameNr++);
187
0
}
188
/**
189
 * htmlnamePop:
190
 * @ctxt: an HTML parser context
191
 *
192
 * Pops the top element name from the name stack
193
 *
194
 * Returns the name just removed
195
 */
196
static const xmlChar *
197
htmlnamePop(htmlParserCtxtPtr ctxt)
198
0
{
199
0
    const xmlChar *ret;
200
201
0
    if (ctxt->nameNr <= 0)
202
0
        return (NULL);
203
0
    ctxt->nameNr--;
204
0
    if (ctxt->nameNr < 0)
205
0
        return (NULL);
206
0
    if (ctxt->nameNr > 0)
207
0
        ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
208
0
    else
209
0
        ctxt->name = NULL;
210
0
    ret = ctxt->nameTab[ctxt->nameNr];
211
0
    ctxt->nameTab[ctxt->nameNr] = NULL;
212
0
    return (ret);
213
0
}
214
215
/**
216
 * htmlNodeInfoPush:
217
 * @ctxt:  an HTML parser context
218
 * @value:  the node info
219
 *
220
 * Pushes a new element name on top of the node info stack
221
 *
222
 * Returns 0 in case of error, the index in the stack otherwise
223
 */
224
static int
225
htmlNodeInfoPush(htmlParserCtxtPtr ctxt, htmlParserNodeInfo *value)
226
0
{
227
0
    if (ctxt->nodeInfoNr >= ctxt->nodeInfoMax) {
228
0
        if (ctxt->nodeInfoMax == 0)
229
0
                ctxt->nodeInfoMax = 5;
230
0
        ctxt->nodeInfoMax *= 2;
231
0
        ctxt->nodeInfoTab = (htmlParserNodeInfo *)
232
0
                         xmlRealloc((htmlParserNodeInfo *)ctxt->nodeInfoTab,
233
0
                                    ctxt->nodeInfoMax *
234
0
                                    sizeof(ctxt->nodeInfoTab[0]));
235
0
        if (ctxt->nodeInfoTab == NULL) {
236
0
            htmlErrMemory(ctxt, NULL);
237
0
            return (0);
238
0
        }
239
0
    }
240
0
    ctxt->nodeInfoTab[ctxt->nodeInfoNr] = *value;
241
0
    ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
242
0
    return (ctxt->nodeInfoNr++);
243
0
}
244
245
/**
246
 * htmlNodeInfoPop:
247
 * @ctxt:  an HTML parser context
248
 *
249
 * Pops the top element name from the node info stack
250
 *
251
 * Returns 0 in case of error, the pointer to NodeInfo otherwise
252
 */
253
static htmlParserNodeInfo *
254
htmlNodeInfoPop(htmlParserCtxtPtr ctxt)
255
0
{
256
0
    if (ctxt->nodeInfoNr <= 0)
257
0
        return (NULL);
258
0
    ctxt->nodeInfoNr--;
259
0
    if (ctxt->nodeInfoNr < 0)
260
0
        return (NULL);
261
0
    if (ctxt->nodeInfoNr > 0)
262
0
        ctxt->nodeInfo = &ctxt->nodeInfoTab[ctxt->nodeInfoNr - 1];
263
0
    else
264
0
        ctxt->nodeInfo = NULL;
265
0
    return &ctxt->nodeInfoTab[ctxt->nodeInfoNr];
266
0
}
267
268
/*
269
 * Macros for accessing the content. Those should be used only by the parser,
270
 * and not exported.
271
 *
272
 * Dirty macros, i.e. one need to make assumption on the context to use them
273
 *
274
 *   CUR_PTR return the current pointer to the xmlChar to be parsed.
275
 *   CUR     returns the current xmlChar value, i.e. a 8 bit value if compiled
276
 *           in ISO-Latin or UTF-8, and the current 16 bit value if compiled
277
 *           in UNICODE mode. This should be used internally by the parser
278
 *           only to compare to ASCII values otherwise it would break when
279
 *           running with UTF-8 encoding.
280
 *   NXT(n)  returns the n'th next xmlChar. Same as CUR it should be used only
281
 *           to compare on ASCII based substring.
282
 *   UPP(n)  returns the n'th next xmlChar converted to uppercase. Same as CUR
283
 *           it should be used only to compare on ASCII based substring.
284
 *   SKIP(n) Skip n xmlChar, and must also be used only to skip ASCII defined
285
 *           strings without newlines within the parser.
286
 *
287
 * Clean macros, not dependent of an ASCII context, expect UTF-8 encoding
288
 *
289
 *   CURRENT Returns the current char value, with the full decoding of
290
 *           UTF-8 if we are using this mode. It returns an int.
291
 *   NEXT    Skip to the next character, this does the proper decoding
292
 *           in UTF-8 mode. It also pop-up unfinished entities on the fly.
293
 *   NEXTL(l) Skip the current unicode character of l xmlChars long.
294
 *   COPY(to) copy one char to *to, increment CUR_PTR and to accordingly
295
 */
296
297
0
#define UPPER (toupper(*ctxt->input->cur))
298
299
0
#define SKIP(val) ctxt->nbChars += (val),ctxt->input->cur += (val),ctxt->input->col+=(val)
300
301
0
#define NXT(val) ctxt->input->cur[(val)]
302
303
0
#define UPP(val) (toupper(ctxt->input->cur[(val)]))
304
305
0
#define CUR_PTR ctxt->input->cur
306
0
#define BASE_PTR ctxt->input->base
307
308
0
/*
 * SHRINK: when more than 2 * INPUT_CHUNK bytes lie behind the cursor and
 * fewer than 2 * INPUT_CHUNK remain ahead of it, call
 * xmlParserInputShrink() on the current input.
 * Wrapped in do { } while (0) so that "if (x) SHRINK; else ..." keeps the
 * else attached to the caller's if. Expects a variable named ctxt in scope.
 */
#define SHRINK do {                                                        \
    if ((ctxt->input->cur - ctxt->input->base > 2 * INPUT_CHUNK) &&        \
        (ctxt->input->end - ctxt->input->cur < 2 * INPUT_CHUNK))           \
        xmlParserInputShrink(ctxt->input);                                 \
  } while (0)
311
312
0
/*
 * GROW: in non-progressive mode, when fewer than INPUT_CHUNK bytes remain
 * ahead of the cursor, call xmlParserInputGrow() on the current input.
 * Wrapped in do { } while (0) so that "if (x) GROW; else ..." keeps the
 * else attached to the caller's if. Expects a variable named ctxt in scope.
 */
#define GROW do {                                                          \
    if ((ctxt->progressive == 0) &&                                        \
        (ctxt->input->end - ctxt->input->cur < INPUT_CHUNK))               \
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);                      \
  } while (0)
315
316
#define CURRENT ((int) (*ctxt->input->cur))
317
318
0
#define SKIP_BLANKS htmlSkipBlankChars(ctxt)
319
320
/* Imported from XML */
321
322
/* #define CUR (ctxt->token ? ctxt->token : (int) (*ctxt->input->cur)) */
323
0
#define CUR ((int) (*ctxt->input->cur))
324
0
#define NEXT xmlNextChar(ctxt)
325
326
0
#define RAW (ctxt->token ? -1 : (*ctxt->input->cur))
327
328
329
0
#define NEXTL(l) do {             \
330
0
    if (*(ctxt->input->cur) == '\n') {         \
331
0
  ctxt->input->line++; ctxt->input->col = 1;      \
332
0
    } else ctxt->input->col++;           \
333
0
    ctxt->token = 0; ctxt->input->cur += l; ctxt->nbChars++;    \
334
0
  } while (0)
335
336
/************
337
    \
338
    if (*ctxt->input->cur == '%') xmlParserHandlePEReference(ctxt); \
339
    if (*ctxt->input->cur == '&') xmlParserHandleReference(ctxt);
340
 ************/
341
342
0
#define CUR_CHAR(l) htmlCurrentChar(ctxt, &l)
343
#define CUR_SCHAR(s, l) xmlStringCurrentChar(ctxt, s, &l)
344
345
/*
 * COPY_BUF(l,b,i,v): append character v, whose encoded length is l, to
 * buffer b at index i and advance i. A length of 1 stores the byte
 * directly; any other length goes through xmlCopyChar().
 * Arguments are parenthesized and the body is a single statement, so
 * expression arguments and surrounding if/else behave as expected.
 * l, b and i are evaluated more than once: pass side-effect-free values.
 */
#define COPY_BUF(l,b,i,v) do {                                             \
    if ((l) == 1) (b)[(i)++] = (xmlChar) (v);                              \
    else (i) += xmlCopyChar((l), &(b)[(i)], (v));                          \
  } while (0)
348
349
/**
350
 * htmlFindEncoding:
351
 * @the HTML parser context
352
 *
353
 * Ty to find and encoding in the current data available in the input
354
 * buffer this is needed to try to switch to the proper encoding when
355
 * one face a character error.
356
 * That's an heuristic, since it's operating outside of parsing it could
357
 * try to use a meta which had been commented out, that's the reason it
358
 * should only be used in case of error, not as a default.
359
 *
360
 * Returns an encoding string or NULL if not found, the string need to
361
 *   be freed
362
 */
363
static xmlChar *
364
0
htmlFindEncoding(xmlParserCtxtPtr ctxt) {
365
0
    const xmlChar *start, *cur, *end;
366
367
0
    if ((ctxt == NULL) || (ctxt->input == NULL) ||
368
0
        (ctxt->input->encoding != NULL) || (ctxt->input->buf == NULL) ||
369
0
        (ctxt->input->buf->encoder != NULL))
370
0
        return(NULL);
371
0
    if ((ctxt->input->cur == NULL) || (ctxt->input->end == NULL))
372
0
        return(NULL);
373
374
0
    start = ctxt->input->cur;
375
0
    end = ctxt->input->end;
376
    /* we also expect the input buffer to be zero terminated */
377
0
    if (*end != 0)
378
0
        return(NULL);
379
380
0
    cur = xmlStrcasestr(start, BAD_CAST "HTTP-EQUIV");
381
0
    if (cur == NULL)
382
0
        return(NULL);
383
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CONTENT");
384
0
    if (cur == NULL)
385
0
        return(NULL);
386
0
    cur = xmlStrcasestr(cur, BAD_CAST  "CHARSET=");
387
0
    if (cur == NULL)
388
0
        return(NULL);
389
0
    cur += 8;
390
0
    start = cur;
391
0
    while (((*cur >= 'A') && (*cur <= 'Z')) ||
392
0
           ((*cur >= 'a') && (*cur <= 'z')) ||
393
0
           ((*cur >= '0') && (*cur <= '9')) ||
394
0
           (*cur == '-') || (*cur == '_') || (*cur == ':') || (*cur == '/'))
395
0
           cur++;
396
0
    if (cur == start)
397
0
        return(NULL);
398
0
    return(xmlStrndup(start, cur - start));
399
0
}
400
401
/**
402
 * htmlCurrentChar:
403
 * @ctxt:  the HTML parser context
404
 * @len:  pointer to the length of the char read
405
 *
406
 * The current char value, if using UTF-8 this may actually span multiple
407
 * bytes in the input buffer. Implement the end of line normalization:
408
 * 2.11 End-of-Line Handling
409
 * If the encoding is unspecified, in the case we find an ISO-Latin-1
410
 * char, then the encoding converter is plugged in automatically.
411
 *
412
 * Returns the current char value and its length
413
 */
414
415
static int
416
0
htmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) {
417
0
    if (ctxt->instate == XML_PARSER_EOF)
418
0
  return(0);
419
420
0
    if (ctxt->token != 0) {
421
0
  *len = 0;
422
0
  return(ctxt->token);
423
0
    }
424
0
    if (ctxt->charset == XML_CHAR_ENCODING_UTF8) {
425
  /*
426
   * We are supposed to handle UTF8, check it's valid
427
   * From rfc2044: encoding of the Unicode values on UTF-8:
428
   *
429
   * UCS-4 range (hex.)           UTF-8 octet sequence (binary)
430
   * 0000 0000-0000 007F   0xxxxxxx
431
   * 0000 0080-0000 07FF   110xxxxx 10xxxxxx
432
   * 0000 0800-0000 FFFF   1110xxxx 10xxxxxx 10xxxxxx
433
   *
434
   * Check for the 0x110000 limit too
435
   */
436
0
  const unsigned char *cur = ctxt->input->cur;
437
0
  unsigned char c;
438
0
  unsigned int val;
439
440
0
  c = *cur;
441
0
  if (c & 0x80) {
442
0
      if (cur[1] == 0) {
443
0
    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
444
0
                cur = ctxt->input->cur;
445
0
            }
446
0
      if ((cur[1] & 0xc0) != 0x80)
447
0
    goto encoding_error;
448
0
      if ((c & 0xe0) == 0xe0) {
449
450
0
    if (cur[2] == 0) {
451
0
        xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
452
0
                    cur = ctxt->input->cur;
453
0
                }
454
0
    if ((cur[2] & 0xc0) != 0x80)
455
0
        goto encoding_error;
456
0
    if ((c & 0xf0) == 0xf0) {
457
0
        if (cur[3] == 0) {
458
0
      xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
459
0
                        cur = ctxt->input->cur;
460
0
                    }
461
0
        if (((c & 0xf8) != 0xf0) ||
462
0
      ((cur[3] & 0xc0) != 0x80))
463
0
      goto encoding_error;
464
        /* 4-byte code */
465
0
        *len = 4;
466
0
        val = (cur[0] & 0x7) << 18;
467
0
        val |= (cur[1] & 0x3f) << 12;
468
0
        val |= (cur[2] & 0x3f) << 6;
469
0
        val |= cur[3] & 0x3f;
470
0
    } else {
471
      /* 3-byte code */
472
0
        *len = 3;
473
0
        val = (cur[0] & 0xf) << 12;
474
0
        val |= (cur[1] & 0x3f) << 6;
475
0
        val |= cur[2] & 0x3f;
476
0
    }
477
0
      } else {
478
        /* 2-byte code */
479
0
    *len = 2;
480
0
    val = (cur[0] & 0x1f) << 6;
481
0
    val |= cur[1] & 0x3f;
482
0
      }
483
0
      if (!IS_CHAR(val)) {
484
0
          htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
485
0
        "Char 0x%X out of allowed range\n", val);
486
0
      }
487
0
      return(val);
488
0
  } else {
489
0
            if ((*ctxt->input->cur == 0) &&
490
0
                (ctxt->input->cur < ctxt->input->end)) {
491
0
                    htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
492
0
        "Char 0x%X out of allowed range\n", 0);
493
0
                *len = 1;
494
0
                return(' ');
495
0
            }
496
      /* 1-byte code */
497
0
      *len = 1;
498
0
      return((int) *ctxt->input->cur);
499
0
  }
500
0
    }
501
    /*
502
     * Assume it's a fixed length encoding (1) with
503
     * a compatible encoding for the ASCII set, since
504
     * XML constructs only use < 128 chars
505
     */
506
0
    *len = 1;
507
0
    if ((int) *ctxt->input->cur < 0x80)
508
0
  return((int) *ctxt->input->cur);
509
510
    /*
511
     * Humm this is bad, do an automatic flow conversion
512
     */
513
0
    {
514
0
        xmlChar * guess;
515
0
        xmlCharEncodingHandlerPtr handler;
516
517
0
        guess = htmlFindEncoding(ctxt);
518
0
        if (guess == NULL) {
519
0
            xmlSwitchEncoding(ctxt, XML_CHAR_ENCODING_8859_1);
520
0
        } else {
521
0
            if (ctxt->input->encoding != NULL)
522
0
                xmlFree((xmlChar *) ctxt->input->encoding);
523
0
            ctxt->input->encoding = guess;
524
0
            handler = xmlFindCharEncodingHandler((const char *) guess);
525
0
            if (handler != NULL) {
526
0
                xmlSwitchToEncoding(ctxt, handler);
527
0
            } else {
528
0
                htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
529
0
                             "Unsupported encoding %s", guess, NULL);
530
0
            }
531
0
        }
532
0
        ctxt->charset = XML_CHAR_ENCODING_UTF8;
533
0
    }
534
535
0
    return(xmlCurrentChar(ctxt, len));
536
537
0
encoding_error:
538
    /*
539
     * If we detect an UTF8 error that probably mean that the
540
     * input encoding didn't get properly advertized in the
541
     * declaration header. Report the error and switch the encoding
542
     * to ISO-Latin-1 (if you don't like this policy, just declare the
543
     * encoding !)
544
     */
545
0
    {
546
0
        char buffer[150];
547
548
0
  if (ctxt->input->end - ctxt->input->cur >= 4) {
549
0
      snprintf(buffer, 149, "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n",
550
0
          ctxt->input->cur[0], ctxt->input->cur[1],
551
0
          ctxt->input->cur[2], ctxt->input->cur[3]);
552
0
  } else {
553
0
      snprintf(buffer, 149, "Bytes: 0x%02X\n", ctxt->input->cur[0]);
554
0
  }
555
0
  htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
556
0
         "Input is not proper UTF-8, indicate encoding !\n",
557
0
         BAD_CAST buffer, NULL);
558
0
    }
559
560
0
    ctxt->charset = XML_CHAR_ENCODING_8859_1;
561
0
    *len = 1;
562
0
    return((int) *ctxt->input->cur);
563
0
}
564
565
/**
566
 * htmlSkipBlankChars:
567
 * @ctxt:  the HTML parser context
568
 *
569
 * skip all blanks character found at that point in the input streams.
570
 *
571
 * Returns the number of space chars skipped
572
 */
573
574
static int
575
0
htmlSkipBlankChars(xmlParserCtxtPtr ctxt) {
576
0
    int res = 0;
577
578
0
    while (IS_BLANK_CH(*(ctxt->input->cur))) {
579
0
  if ((*ctxt->input->cur == 0) &&
580
0
      (xmlParserInputGrow(ctxt->input, INPUT_CHUNK) <= 0)) {
581
0
    xmlPopInput(ctxt);
582
0
  } else {
583
0
      if (*(ctxt->input->cur) == '\n') {
584
0
    ctxt->input->line++; ctxt->input->col = 1;
585
0
      } else ctxt->input->col++;
586
0
      ctxt->input->cur++;
587
0
      ctxt->nbChars++;
588
0
      if (*ctxt->input->cur == 0)
589
0
    xmlParserInputGrow(ctxt->input, INPUT_CHUNK);
590
0
  }
591
0
  res++;
592
0
    }
593
0
    return(res);
594
0
}
595
596
597
598
/************************************************************************
599
 *                  *
600
 *  The list of HTML elements and their properties    *
601
 *                  *
602
 ************************************************************************/
603
604
/*
605
 *  Start Tag: 1 means the start tag can be omitted
606
 *  End Tag:   1 means the end tag can be omitted
607
 *             2 means it's forbidden (empty elements)
608
 *             3 means the tag is stylistic and should be closed easily
609
 *  Depr:      this element is deprecated
610
 *  DTD:       1 means that this element is valid only in the Loose DTD
611
 *             2 means that this element is valid only in the Frameset DTD
612
 *
613
 * Name,Start Tag,End Tag,Save End,Empty,Deprecated,DTD,inline,Description
614
  , subElements , impliedsubelt , Attributes, userdata
615
 */
616
617
/* Definitions and a couple of vars for HTML Elements */
618
619
#define FONTSTYLE "tt", "i", "b", "u", "s", "strike", "big", "small"
620
#define NB_FONTSTYLE 8
621
#define PHRASE "em", "strong", "dfn", "code", "samp", "kbd", "var", "cite", "abbr", "acronym"
622
#define NB_PHRASE 10
623
#define SPECIAL "a", "img", "applet", "embed", "object", "font", "basefont", "br", "script", "map", "q", "sub", "sup", "span", "bdo", "iframe"
624
#define NB_SPECIAL 16
625
#define INLINE FONTSTYLE, PHRASE, SPECIAL, FORMCTRL
626
#define NB_INLINE NB_PCDATA + NB_FONTSTYLE + NB_PHRASE + NB_SPECIAL + NB_FORMCTRL
627
#define BLOCK HEADING, LIST, "pre", "p", "dl", "div", "center", "noscript", "noframes", "blockquote", "form", "isindex", "hr", "table", "fieldset", "address"
628
#define NB_BLOCK NB_HEADING + NB_LIST + 14
629
#define FORMCTRL "input", "select", "textarea", "label", "button"
630
#define NB_FORMCTRL 5
631
#define PCDATA
632
#define NB_PCDATA 0
633
#define HEADING "h1", "h2", "h3", "h4", "h5", "h6"
634
#define NB_HEADING 6
635
#define LIST "ul", "ol", "dir", "menu"
636
#define NB_LIST 4
637
#define MODIFIER
638
#define NB_MODIFIER 0
639
#define FLOW BLOCK,INLINE
640
#define NB_FLOW NB_BLOCK + NB_INLINE
641
#define EMPTY NULL
642
643
644
static const char* const html_flow[] = { FLOW, NULL } ;
645
static const char* const html_inline[] = { INLINE, NULL } ;
646
647
/* placeholders: elts with content but no subelements */
648
static const char* const html_pcdata[] = { NULL } ;
649
#define html_cdata html_pcdata
650
651
652
/* ... and for HTML Attributes */
653
654
#define COREATTRS "id", "class", "style", "title"
655
#define NB_COREATTRS 4
656
#define I18N "lang", "dir"
657
#define NB_I18N 2
658
#define EVENTS "onclick", "ondblclick", "onmousedown", "onmouseup", "onmouseover", "onmouseout", "onkeypress", "onkeydown", "onkeyup"
659
#define NB_EVENTS 9
660
#define ATTRS COREATTRS,I18N,EVENTS
661
/* Number of entries in ATTRS (COREATTRS + I18N + EVENTS). */
#define NB_ATTRS (NB_COREATTRS + NB_I18N + NB_EVENTS)
662
#define CELLHALIGN "align", "char", "charoff"
663
#define NB_CELLHALIGN 3
664
#define CELLVALIGN "valign"
665
#define NB_CELLVALIGN 1
666
667
static const char* const html_attrs[] = { ATTRS, NULL } ;
668
static const char* const core_i18n_attrs[] = { COREATTRS, I18N, NULL } ;
669
static const char* const core_attrs[] = { COREATTRS, NULL } ;
670
static const char* const i18n_attrs[] = { I18N, NULL } ;
671
672
673
/* Other declarations that should go inline ... */
674
static const char* const a_attrs[] = { ATTRS, "charset", "type", "name",
675
  "href", "hreflang", "rel", "rev", "accesskey", "shape", "coords",
676
  "tabindex", "onfocus", "onblur", NULL } ;
677
static const char* const target_attr[] = { "target", NULL } ;
678
static const char* const rows_cols_attr[] = { "rows", "cols", NULL } ;
679
static const char* const alt_attr[] = { "alt", NULL } ;
680
static const char* const src_alt_attrs[] = { "src", "alt", NULL } ;
681
static const char* const href_attrs[] = { "href", NULL } ;
682
static const char* const clear_attrs[] = { "clear", NULL } ;
683
static const char* const inline_p[] = { INLINE, "p", NULL } ;
684
685
static const char* const flow_param[] = { FLOW, "param", NULL } ;
686
static const char* const applet_attrs[] = { COREATTRS , "codebase",
687
    "archive", "alt", "name", "height", "width", "align",
688
    "hspace", "vspace", NULL } ;
689
static const char* const area_attrs[] = { "shape", "coords", "href", "nohref",
690
  "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
691
static const char* const basefont_attrs[] =
692
  { "id", "size", "color", "face", NULL } ;
693
static const char* const quote_attrs[] = { ATTRS, "cite", NULL } ;
694
static const char* const body_contents[] = { FLOW, "ins", "del", NULL } ;
695
static const char* const body_attrs[] = { ATTRS, "onload", "onunload", NULL } ;
696
static const char* const body_depr[] = { "background", "bgcolor", "text",
697
  "link", "vlink", "alink", NULL } ;
698
static const char* const button_attrs[] = { ATTRS, "name", "value", "type",
699
  "disabled", "tabindex", "accesskey", "onfocus", "onblur", NULL } ;
700
701
702
static const char* const col_attrs[] = { ATTRS, "span", "width", CELLHALIGN, CELLVALIGN, NULL } ;
703
static const char* const col_elt[] = { "col", NULL } ;
704
static const char* const edit_attrs[] = { ATTRS, "datetime", "cite", NULL } ;
705
static const char* const compact_attrs[] = { ATTRS, "compact", NULL } ;
706
static const char* const dl_contents[] = { "dt", "dd", NULL } ;
707
static const char* const compact_attr[] = { "compact", NULL } ;
708
static const char* const label_attr[] = { "label", NULL } ;
709
static const char* const fieldset_contents[] = { FLOW, "legend" } ;
710
static const char* const font_attrs[] = { COREATTRS, I18N, "size", "color", "face" , NULL } ;
711
static const char* const form_contents[] = { HEADING, LIST, INLINE, "pre", "p", "div", "center", "noscript", "noframes", "blockquote", "isindex", "hr", "table", "fieldset", "address", NULL } ;
712
static const char* const form_attrs[] = { ATTRS, "method", "enctype", "accept", "name", "onsubmit", "onreset", "accept-charset", NULL } ;
713
static const char* const frame_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "noresize", "scrolling" , NULL } ;
714
static const char* const frameset_attrs[] = { COREATTRS, "rows", "cols", "onload", "onunload", NULL } ;
715
static const char* const frameset_contents[] = { "frameset", "frame", "noframes", NULL } ;
716
static const char* const head_attrs[] = { I18N, "profile", NULL } ;
717
static const char* const head_contents[] = { "title", "isindex", "base", "script", "style", "meta", "link", "object", NULL } ;
718
static const char* const hr_depr[] = { "align", "noshade", "size", "width", NULL } ;
719
static const char* const version_attr[] = { "version", NULL } ;
720
static const char* const html_content[] = { "head", "body", "frameset", NULL } ;
721
static const char* const iframe_attrs[] = { COREATTRS, "longdesc", "name", "src", "frameborder", "marginwidth", "marginheight", "scrolling", "align", "height", "width", NULL } ;
722
static const char* const img_attrs[] = { ATTRS, "longdesc", "name", "height", "width", "usemap", "ismap", NULL } ;
723
static const char* const embed_attrs[] = { COREATTRS, "align", "alt", "border", "code", "codebase", "frameborder", "height", "hidden", "hspace", "name", "palette", "pluginspace", "pluginurl", "src", "type", "units", "vspace", "width", NULL } ;
724
static const char* const input_attrs[] = { ATTRS, "type", "name", "value", "checked", "disabled", "readonly", "size", "maxlength", "src", "alt", "usemap", "ismap", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", "accept", NULL } ;
725
static const char* const prompt_attrs[] = { COREATTRS, I18N, "prompt", NULL } ;
726
static const char* const label_attrs[] = { ATTRS, "for", "accesskey", "onfocus", "onblur", NULL } ;
727
static const char* const legend_attrs[] = { ATTRS, "accesskey", NULL } ;
728
static const char* const align_attr[] = { "align", NULL } ;
729
static const char* const link_attrs[] = { ATTRS, "charset", "href", "hreflang", "type", "rel", "rev", "media", NULL } ;
730
static const char* const map_contents[] = { BLOCK, "area", NULL } ;
731
static const char* const name_attr[] = { "name", NULL } ;
732
static const char* const action_attr[] = { "action", NULL } ;
733
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
734
static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
735
static const char* const content_attr[] = { "content", NULL } ;
736
static const char* const type_attr[] = { "type", NULL } ;
737
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
738
static const char* const object_contents[] = { FLOW, "param", NULL } ;
739
static const char* const object_attrs[] = { ATTRS, "declare", "classid", "codebase", "data", "type", "codetype", "archive", "standby", "height", "width", "usemap", "name", "tabindex", NULL } ;
740
static const char* const object_depr[] = { "align", "border", "hspace", "vspace", NULL } ;
741
static const char* const ol_attrs[] = { "type", "compact", "start", NULL} ;
742
static const char* const option_elt[] = { "option", NULL } ;
743
static const char* const optgroup_attrs[] = { ATTRS, "disabled", NULL } ;
744
static const char* const option_attrs[] = { ATTRS, "disabled", "label", "selected", "value", NULL } ;
745
static const char* const param_attrs[] = { "id", "value", "valuetype", "type", NULL } ;
746
static const char* const width_attr[] = { "width", NULL } ;
747
static const char* const pre_content[] = { PHRASE, "tt", "i", "b", "u", "s", "strike", "a", "br", "script", "map", "q", "span", "bdo", "iframe", NULL } ;
748
static const char* const script_attrs[] = { "charset", "src", "defer", "event", "for", NULL } ;
749
static const char* const language_attr[] = { "language", NULL } ;
750
static const char* const select_content[] = { "optgroup", "option", NULL } ;
751
static const char* const select_attrs[] = { ATTRS, "name", "size", "multiple", "disabled", "tabindex", "onfocus", "onblur", "onchange", NULL } ;
752
static const char* const style_attrs[] = { I18N, "media", "title", NULL } ;
753
static const char* const table_attrs[] = { ATTRS, "summary", "width", "border", "frame", "rules", "cellspacing", "cellpadding", "datapagesize", NULL } ;
754
static const char* const table_depr[] = { "align", "bgcolor", NULL } ;
755
static const char* const table_contents[] = { "caption", "col", "colgroup", "thead", "tfoot", "tbody", "tr", NULL} ;
756
static const char* const tr_elt[] = { "tr", NULL } ;
757
static const char* const talign_attrs[] = { ATTRS, CELLHALIGN, CELLVALIGN, NULL} ;
758
static const char* const th_td_depr[] = { "nowrap", "bgcolor", "width", "height", NULL } ;
759
static const char* const th_td_attr[] = { ATTRS, "abbr", "axis", "headers", "scope", "rowspan", "colspan", CELLHALIGN, CELLVALIGN, NULL } ;
760
static const char* const textarea_attrs[] = { ATTRS, "name", "disabled", "readonly", "tabindex", "accesskey", "onfocus", "onblur", "onselect", "onchange", NULL } ;
761
static const char* const tr_contents[] = { "th", "td", NULL } ;
762
static const char* const bgcolor_attr[] = { "bgcolor", NULL } ;
763
static const char* const li_elt[] = { "li", NULL } ;
764
static const char* const ul_depr[] = { "type", "compact", NULL} ;
765
static const char* const dir_attr[] = { "dir", NULL} ;
766
767
#define DECL (const char**)
768
769
static const htmlElemDesc
770
html40ElementTable[] = {
771
{ "a",    0, 0, 0, 0, 0, 0, 1, "anchor ",
772
  DECL html_inline , NULL , DECL a_attrs , DECL target_attr, NULL
773
},
774
{ "abbr", 0, 0, 0, 0, 0, 0, 1, "abbreviated form",
775
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
776
},
777
{ "acronym",  0, 0, 0, 0, 0, 0, 1, "",
778
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
779
},
780
{ "address",  0, 0, 0, 0, 0, 0, 0, "information on author ",
781
  DECL inline_p  , NULL , DECL html_attrs, NULL, NULL
782
},
783
{ "applet", 0, 0, 0, 0, 1, 1, 2, "java applet ",
784
  DECL flow_param , NULL , NULL , DECL applet_attrs, NULL
785
},
786
{ "area", 0, 2, 2, 1, 0, 0, 0, "client-side image map area ",
787
  EMPTY ,  NULL , DECL area_attrs , DECL target_attr, DECL alt_attr
788
},
789
{ "b",    0, 3, 0, 0, 0, 0, 1, "bold text style",
790
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
791
},
792
{ "base", 0, 2, 2, 1, 0, 0, 0, "document base uri ",
793
  EMPTY , NULL , NULL , DECL target_attr, DECL href_attrs
794
},
795
{ "basefont", 0, 2, 2, 1, 1, 1, 1, "base font size " ,
796
  EMPTY , NULL , NULL, DECL basefont_attrs, NULL
797
},
798
{ "bdo",  0, 0, 0, 0, 0, 0, 1, "i18n bidi over-ride ",
799
  DECL html_inline , NULL , DECL core_i18n_attrs, NULL, DECL dir_attr
800
},
801
{ "big",  0, 3, 0, 0, 0, 0, 1, "large text style",
802
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
803
},
804
{ "blockquote", 0, 0, 0, 0, 0, 0, 0, "long quotation ",
805
  DECL html_flow , NULL , DECL quote_attrs , NULL, NULL
806
},
807
{ "body", 1, 1, 0, 0, 0, 0, 0, "document body ",
808
  DECL body_contents , "div" , DECL body_attrs, DECL body_depr, NULL
809
},
810
{ "br",   0, 2, 2, 1, 0, 0, 1, "forced line break ",
811
  EMPTY , NULL , DECL core_attrs, DECL clear_attrs , NULL
812
},
813
{ "button", 0, 0, 0, 0, 0, 0, 2, "push button ",
814
  DECL html_flow MODIFIER , NULL , DECL button_attrs, NULL, NULL
815
},
816
{ "caption",  0, 0, 0, 0, 0, 0, 0, "table caption ",
817
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
818
},
819
{ "center", 0, 3, 0, 0, 1, 1, 0, "shorthand for div align=center ",
820
  DECL html_flow , NULL , NULL, DECL html_attrs, NULL
821
},
822
{ "cite", 0, 0, 0, 0, 0, 0, 1, "citation",
823
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
824
},
825
{ "code", 0, 0, 0, 0, 0, 0, 1, "computer code fragment",
826
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
827
},
828
{ "col",  0, 2, 2, 1, 0, 0, 0, "table column ",
829
  EMPTY , NULL , DECL col_attrs , NULL, NULL
830
},
831
{ "colgroup", 0, 1, 0, 0, 0, 0, 0, "table column group ",
832
  DECL col_elt , "col" , DECL col_attrs , NULL, NULL
833
},
834
{ "dd",   0, 1, 0, 0, 0, 0, 0, "definition description ",
835
  DECL html_flow , NULL , DECL html_attrs, NULL, NULL
836
},
837
{ "del",  0, 0, 0, 0, 0, 0, 2, "deleted text ",
838
  DECL html_flow , NULL , DECL edit_attrs , NULL, NULL
839
},
840
{ "dfn",  0, 0, 0, 0, 0, 0, 1, "instance definition",
841
  DECL html_inline , NULL , DECL html_attrs, NULL, NULL
842
},
843
{ "dir",  0, 0, 0, 0, 1, 1, 0, "directory list",
844
  DECL blockli_elt, "li" , NULL, DECL compact_attrs, NULL
845
},
846
{ "div",  0, 0, 0, 0, 0, 0, 0, "generic language/style container",
847
  DECL html_flow, NULL, DECL html_attrs, DECL align_attr, NULL
848
},
849
{ "dl",   0, 0, 0, 0, 0, 0, 0, "definition list ",
850
  DECL dl_contents , "dd" , DECL html_attrs, DECL compact_attr, NULL
851
},
852
{ "dt",   0, 1, 0, 0, 0, 0, 0, "definition term ",
853
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
854
},
855
{ "em",   0, 3, 0, 0, 0, 0, 1, "emphasis",
856
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
857
},
858
{ "embed",  0, 1, 0, 0, 1, 1, 1, "generic embedded object ",
859
  EMPTY, NULL, DECL embed_attrs, NULL, NULL
860
},
861
{ "fieldset", 0, 0, 0, 0, 0, 0, 0, "form control group ",
862
  DECL fieldset_contents , NULL, DECL html_attrs, NULL, NULL
863
},
864
{ "font", 0, 3, 0, 0, 1, 1, 1, "local change to font ",
865
  DECL html_inline, NULL, NULL, DECL font_attrs, NULL
866
},
867
{ "form", 0, 0, 0, 0, 0, 0, 0, "interactive form ",
868
  DECL form_contents, "fieldset", DECL form_attrs , DECL target_attr, DECL action_attr
869
},
870
{ "frame",  0, 2, 2, 1, 0, 2, 0, "subwindow " ,
871
  EMPTY, NULL, NULL, DECL frame_attrs, NULL
872
},
873
{ "frameset", 0, 0, 0, 0, 0, 2, 0, "window subdivision" ,
874
  DECL frameset_contents, "noframes" , NULL , DECL frameset_attrs, NULL
875
},
876
{ "h1",   0, 0, 0, 0, 0, 0, 0, "heading ",
877
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
878
},
879
{ "h2",   0, 0, 0, 0, 0, 0, 0, "heading ",
880
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
881
},
882
{ "h3",   0, 0, 0, 0, 0, 0, 0, "heading ",
883
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
884
},
885
{ "h4",   0, 0, 0, 0, 0, 0, 0, "heading ",
886
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
887
},
888
{ "h5",   0, 0, 0, 0, 0, 0, 0, "heading ",
889
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
890
},
891
{ "h6",   0, 0, 0, 0, 0, 0, 0, "heading ",
892
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
893
},
894
{ "head", 1, 1, 0, 0, 0, 0, 0, "document head ",
895
  DECL head_contents, NULL, DECL head_attrs, NULL, NULL
896
},
897
{ "hr",   0, 2, 2, 1, 0, 0, 0, "horizontal rule " ,
898
  EMPTY, NULL, DECL html_attrs, DECL hr_depr, NULL
899
},
900
{ "html", 1, 1, 0, 0, 0, 0, 0, "document root element ",
901
  DECL html_content , NULL , DECL i18n_attrs, DECL version_attr, NULL
902
},
903
{ "i",    0, 3, 0, 0, 0, 0, 1, "italic text style",
904
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
905
},
906
{ "iframe", 0, 0, 0, 0, 0, 1, 2, "inline subwindow ",
907
  DECL html_flow, NULL, NULL, DECL iframe_attrs, NULL
908
},
909
{ "img",  0, 2, 2, 1, 0, 0, 1, "embedded image ",
910
  EMPTY, NULL, DECL img_attrs, DECL align_attr, DECL src_alt_attrs
911
},
912
{ "input",  0, 2, 2, 1, 0, 0, 1, "form control ",
913
  EMPTY, NULL, DECL input_attrs , DECL align_attr, NULL
914
},
915
{ "ins",  0, 0, 0, 0, 0, 0, 2, "inserted text",
916
  DECL html_flow, NULL, DECL edit_attrs, NULL, NULL
917
},
918
{ "isindex",  0, 2, 2, 1, 1, 1, 0, "single line prompt ",
919
  EMPTY, NULL, NULL, DECL prompt_attrs, NULL
920
},
921
{ "kbd",  0, 0, 0, 0, 0, 0, 1, "text to be entered by the user",
922
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
923
},
924
{ "label",  0, 0, 0, 0, 0, 0, 1, "form field label text ",
925
  DECL html_inline MODIFIER, NULL, DECL label_attrs , NULL, NULL
926
},
927
{ "legend", 0, 0, 0, 0, 0, 0, 0, "fieldset legend ",
928
  DECL html_inline, NULL, DECL legend_attrs , DECL align_attr, NULL
929
},
930
{ "li",   0, 1, 1, 0, 0, 0, 0, "list item ",
931
  DECL html_flow, NULL, DECL html_attrs, NULL, NULL
932
},
933
{ "link", 0, 2, 2, 1, 0, 0, 0, "a media-independent link ",
934
  EMPTY, NULL, DECL link_attrs, DECL target_attr, NULL
935
},
936
{ "map",  0, 0, 0, 0, 0, 0, 2, "client-side image map ",
937
  DECL map_contents , NULL, DECL html_attrs , NULL, DECL name_attr
938
},
939
{ "menu", 0, 0, 0, 0, 1, 1, 0, "menu list ",
940
  DECL blockli_elt , NULL, NULL, DECL compact_attrs, NULL
941
},
942
{ "meta", 0, 2, 2, 1, 0, 0, 0, "generic metainformation ",
943
  EMPTY, NULL, DECL meta_attrs , NULL , DECL content_attr
944
},
945
{ "noframes", 0, 0, 0, 0, 0, 2, 0, "alternate content container for non frame-based rendering ",
946
  DECL noframes_content, "body" , DECL html_attrs, NULL, NULL
947
},
948
{ "noscript", 0, 0, 0, 0, 0, 0, 0, "alternate content container for non script-based rendering ",
949
  DECL html_flow, "div", DECL html_attrs, NULL, NULL
950
},
951
{ "object", 0, 0, 0, 0, 0, 0, 2, "generic embedded object ",
952
  DECL object_contents , "div" , DECL object_attrs, DECL object_depr, NULL
953
},
954
{ "ol",   0, 0, 0, 0, 0, 0, 0, "ordered list ",
955
  DECL li_elt , "li" , DECL html_attrs, DECL ol_attrs, NULL
956
},
957
{ "optgroup", 0, 0, 0, 0, 0, 0, 0, "option group ",
958
  DECL option_elt , "option", DECL optgroup_attrs, NULL, DECL label_attr
959
},
960
{ "option", 0, 1, 0, 0, 0, 0, 0, "selectable choice " ,
961
  DECL html_pcdata, NULL, DECL option_attrs, NULL, NULL
962
},
963
{ "p",    0, 1, 0, 0, 0, 0, 0, "paragraph ",
964
  DECL html_inline, NULL, DECL html_attrs, DECL align_attr, NULL
965
},
966
{ "param",  0, 2, 2, 1, 0, 0, 0, "named property value ",
967
  EMPTY, NULL, DECL param_attrs, NULL, DECL name_attr
968
},
969
{ "pre",  0, 0, 0, 0, 0, 0, 0, "preformatted text ",
970
  DECL pre_content, NULL, DECL html_attrs, DECL width_attr, NULL
971
},
972
{ "q",    0, 0, 0, 0, 0, 0, 1, "short inline quotation ",
973
  DECL html_inline, NULL, DECL quote_attrs, NULL, NULL
974
},
975
{ "s",    0, 3, 0, 0, 1, 1, 1, "strike-through text style",
976
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
977
},
978
{ "samp", 0, 0, 0, 0, 0, 0, 1, "sample program output, scripts, etc.",
979
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
980
},
981
{ "script", 0, 0, 0, 0, 0, 0, 2, "script statements ",
982
  DECL html_cdata, NULL, DECL script_attrs, DECL language_attr, DECL type_attr
983
},
984
{ "select", 0, 0, 0, 0, 0, 0, 1, "option selector ",
985
  DECL select_content, NULL, DECL select_attrs, NULL, NULL
986
},
987
{ "small",  0, 3, 0, 0, 0, 0, 1, "small text style",
988
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
989
},
990
{ "span", 0, 0, 0, 0, 0, 0, 1, "generic language/style container ",
991
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
992
},
993
{ "strike", 0, 3, 0, 0, 1, 1, 1, "strike-through text",
994
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
995
},
996
{ "strong", 0, 3, 0, 0, 0, 0, 1, "strong emphasis",
997
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
998
},
999
{ "style",  0, 0, 0, 0, 0, 0, 0, "style info ",
1000
  DECL html_cdata, NULL, DECL style_attrs, NULL, DECL type_attr
1001
},
1002
{ "sub",  0, 3, 0, 0, 0, 0, 1, "subscript",
1003
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1004
},
1005
{ "sup",  0, 3, 0, 0, 0, 0, 1, "superscript ",
1006
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1007
},
1008
{ "table",  0, 0, 0, 0, 0, 0, 0, "",
1009
  DECL table_contents , "tr" , DECL table_attrs , DECL table_depr, NULL
1010
},
1011
{ "tbody",  1, 0, 0, 0, 0, 0, 0, "table body ",
1012
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1013
},
1014
{ "td",   0, 0, 0, 0, 0, 0, 0, "table data cell",
1015
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1016
},
1017
{ "textarea", 0, 0, 0, 0, 0, 0, 1, "multi-line text field ",
1018
  DECL html_pcdata, NULL, DECL textarea_attrs, NULL, DECL rows_cols_attr
1019
},
1020
{ "tfoot",  0, 1, 0, 0, 0, 0, 0, "table footer ",
1021
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1022
},
1023
{ "th",   0, 1, 0, 0, 0, 0, 0, "table header cell",
1024
  DECL html_flow, NULL, DECL th_td_attr, DECL th_td_depr, NULL
1025
},
1026
{ "thead",  0, 1, 0, 0, 0, 0, 0, "table header ",
1027
  DECL tr_elt , "tr" , DECL talign_attrs, NULL, NULL
1028
},
1029
{ "title",  0, 0, 0, 0, 0, 0, 0, "document title ",
1030
  DECL html_pcdata, NULL, DECL i18n_attrs, NULL, NULL
1031
},
1032
{ "tr",   0, 0, 0, 0, 0, 0, 0, "table row ",
1033
  DECL tr_contents , "td" , DECL talign_attrs, DECL bgcolor_attr, NULL
1034
},
1035
{ "tt",   0, 3, 0, 0, 0, 0, 1, "teletype or monospaced text style",
1036
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1037
},
1038
{ "u",    0, 3, 0, 0, 1, 1, 1, "underlined text style",
1039
  DECL html_inline, NULL, NULL, DECL html_attrs, NULL
1040
},
1041
{ "ul",   0, 0, 0, 0, 0, 0, 0, "unordered list ",
1042
  DECL li_elt , "li" , DECL html_attrs, DECL ul_depr, NULL
1043
},
1044
{ "var",  0, 0, 0, 0, 0, 0, 1, "instance of a variable or program argument",
1045
  DECL html_inline, NULL, DECL html_attrs, NULL, NULL
1046
}
1047
};
1048
1049
/*
1050
 * start tags that imply the end of current element
1051
 */
1052
static const char * const htmlStartClose[] = {
1053
"form",   "form", "p", "hr", "h1", "h2", "h3", "h4", "h5", "h6",
1054
    "dl", "ul", "ol", "menu", "dir", "address", "pre",
1055
    "listing", "xmp", "head", NULL,
1056
"head",   "p", NULL,
1057
"title",  "p", NULL,
1058
"body",   "head", "style", "link", "title", "p", NULL,
1059
"frameset", "head", "style", "link", "title", "p", NULL,
1060
"li",   "p", "h1", "h2", "h3", "h4", "h5", "h6", "dl", "address",
1061
    "pre", "listing", "xmp", "head", "li", NULL,
1062
"hr",   "p", "head", NULL,
1063
"h1",   "p", "head", NULL,
1064
"h2",   "p", "head", NULL,
1065
"h3",   "p", "head", NULL,
1066
"h4",   "p", "head", NULL,
1067
"h5",   "p", "head", NULL,
1068
"h6",   "p", "head", NULL,
1069
"dir",    "p", "head", NULL,
1070
"address",  "p", "head", "ul", NULL,
1071
"pre",    "p", "head", "ul", NULL,
1072
"listing",  "p", "head", NULL,
1073
"xmp",    "p", "head", NULL,
1074
"blockquote", "p", "head", NULL,
1075
"dl",   "p", "dt", "menu", "dir", "address", "pre", "listing",
1076
    "xmp", "head", NULL,
1077
"dt",   "p", "menu", "dir", "address", "pre", "listing", "xmp",
1078
                "head", "dd", NULL,
1079
"dd",   "p", "menu", "dir", "address", "pre", "listing", "xmp",
1080
                "head", "dt", NULL,
1081
"ul",   "p", "head", "ol", "menu", "dir", "address", "pre",
1082
    "listing", "xmp", NULL,
1083
"ol",   "p", "head", "ul", NULL,
1084
"menu",   "p", "head", "ul", NULL,
1085
"p",    "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
1086
"div",    "p", "head", NULL,
1087
"noscript", "p", NULL,
1088
"center", "font", "b", "i", "p", "head", NULL,
1089
"a",    "a", "head", NULL,
1090
"caption",  "p", NULL,
1091
"colgroup", "caption", "colgroup", "col", "p", NULL,
1092
"col",    "caption", "col", "p", NULL,
1093
"table",  "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre",
1094
    "listing", "xmp", "a", NULL,
1095
"th",   "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1096
"td",   "th", "td", "p", "span", "font", "a", "b", "i", "u", NULL,
1097
"tr",   "th", "td", "tr", "caption", "col", "colgroup", "p", NULL,
1098
"thead",  "caption", "col", "colgroup", NULL,
1099
"tfoot",  "th", "td", "tr", "caption", "col", "colgroup", "thead",
1100
    "tbody", "p", NULL,
1101
"tbody",  "th", "td", "tr", "caption", "col", "colgroup", "thead",
1102
    "tfoot", "tbody", "p", NULL,
1103
"optgroup", "option", NULL,
1104
"option", "option", NULL,
1105
"fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6",
1106
    "pre", "listing", "xmp", "a", NULL,
1107
/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */
1108
"tt",   "head", NULL,
1109
"i",    "head", NULL,
1110
"b",    "head", NULL,
1111
"u",    "head", NULL,
1112
"s",    "head", NULL,
1113
"strike", "head", NULL,
1114
"big",    "head", NULL,
1115
"small",  "head", NULL,
1116
1117
"em",   "head", NULL,
1118
"strong", "head", NULL,
1119
"dfn",    "head", NULL,
1120
"code",   "head", NULL,
1121
"samp",   "head", NULL,
1122
"kbd",    "head", NULL,
1123
"var",    "head", NULL,
1124
"cite",   "head", NULL,
1125
"abbr",   "head", NULL,
1126
"acronym",  "head", NULL,
1127
1128
/* "a" */
1129
"img",    "head", NULL,
1130
/* "applet" */
1131
/* "embed" */
1132
/* "object" */
1133
"font",   "head", NULL,
1134
/* "basefont" */
1135
"br",   "head", NULL,
1136
/* "script" */
1137
"map",    "head", NULL,
1138
"q",    "head", NULL,
1139
"sub",    "head", NULL,
1140
"sup",    "head", NULL,
1141
"span",   "head", NULL,
1142
"bdo",    "head", NULL,
1143
"iframe", "head", NULL,
1144
NULL
1145
};
1146
1147
/*
 * HTML elements that are not supposed to hold character data directly;
 * htmlCheckParagraph() implies a <p> when text shows up inside one of
 * them. The list is NULL-terminated.
 *
 * TODO: extend that list by reading the HTML SGML DTD on
 *       implied paragraph
 */
static const char *const htmlNoContentElements[] = { "html", "head", NULL };
1159
1160
/*
 * The HTML attributes whose content is of type %Script;.
 * The array has no terminator: htmlIsScriptAttribute() sizes it with
 * sizeof.
 * NOTE: when adding entries, check htmlIsScriptAttribute() since
 *       it assumes every name starts with 'on'.
 */
static const char *const htmlScriptAttributes[] = {
    "onclick", "ondblclick",
    "onmousedown", "onmouseup", "onmouseover", "onmousemove", "onmouseout",
    "onkeypress", "onkeydown", "onkeyup",
    "onload", "onunload",
    "onfocus", "onblur",
    "onsubmit", "onreset",
    "onchange", "onselect"
};
1185
1186
/*
 * End-tag priorities, used by the parser to cope with broken HTML pages.
 * Giving elements different priorities lets the parser decide what to do
 * with an unexpected end tag: an end tag may only close elements whose
 * priority is lower than or equal to its own.
 */

/* One (element name, priority) pair. */
typedef struct {
    const char *name;
    int priority;
} elementPriority;

/*
 * The entry with a NULL name ends the table; htmlGetEndPriority() returns
 * its priority for every name that is not listed.
 */
static const elementPriority htmlEndPriority[] = {
    { "div",   150 },
    { "td",    160 },
    { "th",    160 },
    { "tr",    170 },
    { "thead", 180 },
    { "tbody", 180 },
    { "tfoot", 180 },
    { "table", 190 },
    { "head",  200 },
    { "body",  200 },
    { "html",  220 },
    { NULL,    100 }    /* Default priority */
};
1213
1214
/*
 * Index into htmlStartClose: htmlInitAutoClose() stores here a pointer to
 * the first string of each group, and sets the flag once that is done.
 */
static const char **htmlStartCloseIndex[100];
static int htmlStartCloseIndexinitialized = 0;
1216
1217
/************************************************************************
1218
 *                  *
1219
 *  functions to handle HTML specific data      *
1220
 *                  *
1221
 ************************************************************************/
1222
1223
/**
1224
 * htmlInitAutoClose:
1225
 *
1226
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1227
 * This is not reentrant. Call xmlInitParser() once before processing in
1228
 * case of use in multithreaded programs.
1229
 */
1230
void
1231
14
htmlInitAutoClose(void) {
1232
14
    int indx, i = 0;
1233
1234
14
    if (htmlStartCloseIndexinitialized) return;
1235
1236
1.41k
    for (indx = 0;indx < 100;indx ++) htmlStartCloseIndex[indx] = NULL;
1237
14
    indx = 0;
1238
1.00k
    while ((htmlStartClose[i] != NULL) && (indx < 100 - 1)) {
1239
994
        htmlStartCloseIndex[indx++] = (const char**) &htmlStartClose[i];
1240
5.50k
  while (htmlStartClose[i] != NULL) i++;
1241
994
  i++;
1242
994
    }
1243
14
    htmlStartCloseIndexinitialized = 1;
1244
14
}
1245
1246
/**
1247
 * htmlTagLookup:
1248
 * @tag:  The tag name in lowercase
1249
 *
1250
 * Lookup the HTML tag in the ElementTable
1251
 *
1252
 * Returns the related htmlElemDescPtr or NULL if not found.
1253
 */
1254
const htmlElemDesc *
1255
0
htmlTagLookup(const xmlChar *tag) {
1256
0
    unsigned int i;
1257
1258
0
    for (i = 0; i < (sizeof(html40ElementTable) /
1259
0
                     sizeof(html40ElementTable[0]));i++) {
1260
0
        if (!xmlStrcasecmp(tag, BAD_CAST html40ElementTable[i].name))
1261
0
      return((htmlElemDescPtr) &html40ElementTable[i]);
1262
0
    }
1263
0
    return(NULL);
1264
0
}
1265
1266
/**
1267
 * htmlGetEndPriority:
1268
 * @name: The name of the element to look up the priority for.
1269
 *
1270
 * Return value: The "endtag" priority.
1271
 **/
1272
static int
1273
0
htmlGetEndPriority (const xmlChar *name) {
1274
0
    int i = 0;
1275
1276
0
    while ((htmlEndPriority[i].name != NULL) &&
1277
0
     (!xmlStrEqual((const xmlChar *)htmlEndPriority[i].name, name)))
1278
0
  i++;
1279
1280
0
    return(htmlEndPriority[i].priority);
1281
0
}
1282
1283
1284
/**
1285
 * htmlCheckAutoClose:
1286
 * @newtag:  The new tag name
1287
 * @oldtag:  The old tag name
1288
 *
1289
 * Checks whether the new tag is one of the registered valid tags for
1290
 * closing old.
1291
 * Initialize the htmlStartCloseIndex for fast lookup of closing tags names.
1292
 *
1293
 * Returns 0 if no, 1 if yes.
1294
 */
1295
static int
1296
htmlCheckAutoClose(const xmlChar * newtag, const xmlChar * oldtag)
1297
0
{
1298
0
    int i, indx;
1299
0
    const char **closed = NULL;
1300
1301
0
    if (htmlStartCloseIndexinitialized == 0)
1302
0
        htmlInitAutoClose();
1303
1304
    /* inefficient, but not a big deal */
1305
0
    for (indx = 0; indx < 100; indx++) {
1306
0
        closed = htmlStartCloseIndex[indx];
1307
0
        if (closed == NULL)
1308
0
            return (0);
1309
0
        if (xmlStrEqual(BAD_CAST * closed, newtag))
1310
0
            break;
1311
0
    }
1312
1313
0
    i = closed - htmlStartClose;
1314
0
    i++;
1315
0
    while (htmlStartClose[i] != NULL) {
1316
0
        if (xmlStrEqual(BAD_CAST htmlStartClose[i], oldtag)) {
1317
0
            return (1);
1318
0
        }
1319
0
        i++;
1320
0
    }
1321
0
    return (0);
1322
0
}
1323
1324
/**
1325
 * htmlAutoCloseOnClose:
1326
 * @ctxt:  an HTML parser context
1327
 * @newtag:  The new tag name
1328
 * @force:  force the tag closure
1329
 *
1330
 * The HTML DTD allows an ending tag to implicitly close other tags.
1331
 */
1332
static void
1333
htmlAutoCloseOnClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1334
0
{
1335
0
    const htmlElemDesc *info;
1336
0
    int i, priority;
1337
1338
0
    priority = htmlGetEndPriority(newtag);
1339
1340
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1341
1342
0
        if (xmlStrEqual(newtag, ctxt->nameTab[i]))
1343
0
            break;
1344
        /*
1345
         * A missplaced endtag can only close elements with lower
1346
         * or equal priority, so if we find an element with higher
1347
         * priority before we find an element with
1348
         * matching name, we just ignore this endtag
1349
         */
1350
0
        if (htmlGetEndPriority(ctxt->nameTab[i]) > priority)
1351
0
            return;
1352
0
    }
1353
0
    if (i < 0)
1354
0
        return;
1355
1356
0
    while (!xmlStrEqual(newtag, ctxt->name)) {
1357
0
        info = htmlTagLookup(ctxt->name);
1358
0
        if ((info != NULL) && (info->endTag == 3)) {
1359
0
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
1360
0
                   "Opening and ending tag mismatch: %s and %s\n",
1361
0
       newtag, ctxt->name);
1362
0
        }
1363
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1364
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1365
0
  htmlnamePop(ctxt);
1366
0
    }
1367
0
}
1368
1369
/**
1370
 * htmlAutoCloseOnEnd:
1371
 * @ctxt:  an HTML parser context
1372
 *
1373
 * Close all remaining tags at the end of the stream
1374
 */
1375
static void
1376
htmlAutoCloseOnEnd(htmlParserCtxtPtr ctxt)
1377
0
{
1378
0
    int i;
1379
1380
0
    if (ctxt->nameNr == 0)
1381
0
        return;
1382
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
1383
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1384
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1385
0
  htmlnamePop(ctxt);
1386
0
    }
1387
0
}
1388
1389
/**
1390
 * htmlAutoClose:
1391
 * @ctxt:  an HTML parser context
1392
 * @newtag:  The new tag name or NULL
1393
 *
1394
 * The HTML DTD allows a tag to implicitly close other tags.
1395
 * The list is kept in htmlStartClose array. This function is
1396
 * called when a new tag has been detected and generates the
1397
 * appropriates closes if possible/needed.
1398
 * If newtag is NULL this mean we are at the end of the resource
1399
 * and we should check
1400
 */
1401
static void
1402
htmlAutoClose(htmlParserCtxtPtr ctxt, const xmlChar * newtag)
1403
0
{
1404
0
    while ((newtag != NULL) && (ctxt->name != NULL) &&
1405
0
           (htmlCheckAutoClose(newtag, ctxt->name))) {
1406
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1407
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1408
0
  htmlnamePop(ctxt);
1409
0
    }
1410
0
    if (newtag == NULL) {
1411
0
        htmlAutoCloseOnEnd(ctxt);
1412
0
        return;
1413
0
    }
1414
0
    while ((newtag == NULL) && (ctxt->name != NULL) &&
1415
0
           ((xmlStrEqual(ctxt->name, BAD_CAST "head")) ||
1416
0
            (xmlStrEqual(ctxt->name, BAD_CAST "body")) ||
1417
0
            (xmlStrEqual(ctxt->name, BAD_CAST "html")))) {
1418
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
1419
0
            ctxt->sax->endElement(ctxt->userData, ctxt->name);
1420
0
  htmlnamePop(ctxt);
1421
0
    }
1422
0
}
1423
1424
/**
1425
 * htmlAutoCloseTag:
1426
 * @doc:  the HTML document
1427
 * @name:  The tag name
1428
 * @elem:  the HTML element
1429
 *
1430
 * The HTML DTD allows a tag to implicitly close other tags.
1431
 * The list is kept in htmlStartClose array. This function checks
1432
 * if the element or one of it's children would autoclose the
1433
 * given tag.
1434
 *
1435
 * Returns 1 if autoclose, 0 otherwise
1436
 */
1437
int
1438
0
htmlAutoCloseTag(htmlDocPtr doc, const xmlChar *name, htmlNodePtr elem) {
1439
0
    htmlNodePtr child;
1440
1441
0
    if (elem == NULL) return(1);
1442
0
    if (xmlStrEqual(name, elem->name)) return(0);
1443
0
    if (htmlCheckAutoClose(elem->name, name)) return(1);
1444
0
    child = elem->children;
1445
0
    while (child != NULL) {
1446
0
        if (htmlAutoCloseTag(doc, name, child)) return(1);
1447
0
  child = child->next;
1448
0
    }
1449
0
    return(0);
1450
0
}
1451
1452
/**
1453
 * htmlIsAutoClosed:
1454
 * @doc:  the HTML document
1455
 * @elem:  the HTML element
1456
 *
1457
 * The HTML DTD allows a tag to implicitly close other tags.
1458
 * The list is kept in htmlStartClose array. This function checks
1459
 * if a tag is autoclosed by one of it's child
1460
 *
1461
 * Returns 1 if autoclosed, 0 otherwise
1462
 */
1463
int
1464
0
htmlIsAutoClosed(htmlDocPtr doc, htmlNodePtr elem) {
1465
0
    htmlNodePtr child;
1466
1467
0
    if (elem == NULL) return(1);
1468
0
    child = elem->children;
1469
0
    while (child != NULL) {
1470
0
  if (htmlAutoCloseTag(doc, elem->name, child)) return(1);
1471
0
  child = child->next;
1472
0
    }
1473
0
    return(0);
1474
0
}
1475
1476
/**
1477
 * htmlCheckImplied:
1478
 * @ctxt:  an HTML parser context
1479
 * @newtag:  The new tag name
1480
 *
1481
 * The HTML DTD allows a tag to exists only implicitly
1482
 * called when a new tag has been detected and generates the
1483
 * appropriates implicit tags if missing
1484
 */
1485
static void
1486
0
htmlCheckImplied(htmlParserCtxtPtr ctxt, const xmlChar *newtag) {
1487
0
    int i;
1488
1489
0
    if (ctxt->options & HTML_PARSE_NOIMPLIED)
1490
0
        return;
1491
0
    if (!htmlOmittedDefaultValue)
1492
0
  return;
1493
0
    if (xmlStrEqual(newtag, BAD_CAST"html"))
1494
0
  return;
1495
0
    if (ctxt->nameNr <= 0) {
1496
0
  htmlnamePush(ctxt, BAD_CAST"html");
1497
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1498
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"html", NULL);
1499
0
    }
1500
0
    if ((xmlStrEqual(newtag, BAD_CAST"body")) || (xmlStrEqual(newtag, BAD_CAST"head")))
1501
0
        return;
1502
0
    if ((ctxt->nameNr <= 1) &&
1503
0
        ((xmlStrEqual(newtag, BAD_CAST"script")) ||
1504
0
   (xmlStrEqual(newtag, BAD_CAST"style")) ||
1505
0
   (xmlStrEqual(newtag, BAD_CAST"meta")) ||
1506
0
   (xmlStrEqual(newtag, BAD_CAST"link")) ||
1507
0
   (xmlStrEqual(newtag, BAD_CAST"title")) ||
1508
0
   (xmlStrEqual(newtag, BAD_CAST"base")))) {
1509
0
        if (ctxt->html >= 3) {
1510
            /* we already saw or generated an <head> before */
1511
0
            return;
1512
0
        }
1513
        /*
1514
         * dropped OBJECT ... i you put it first BODY will be
1515
         * assumed !
1516
         */
1517
0
        htmlnamePush(ctxt, BAD_CAST"head");
1518
0
        if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1519
0
            ctxt->sax->startElement(ctxt->userData, BAD_CAST"head", NULL);
1520
0
    } else if ((!xmlStrEqual(newtag, BAD_CAST"noframes")) &&
1521
0
         (!xmlStrEqual(newtag, BAD_CAST"frame")) &&
1522
0
         (!xmlStrEqual(newtag, BAD_CAST"frameset"))) {
1523
0
        if (ctxt->html >= 10) {
1524
            /* we already saw or generated a <body> before */
1525
0
            return;
1526
0
        }
1527
0
  for (i = 0;i < ctxt->nameNr;i++) {
1528
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"body")) {
1529
0
    return;
1530
0
      }
1531
0
      if (xmlStrEqual(ctxt->nameTab[i], BAD_CAST"head")) {
1532
0
    return;
1533
0
      }
1534
0
  }
1535
1536
0
  htmlnamePush(ctxt, BAD_CAST"body");
1537
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1538
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"body", NULL);
1539
0
    }
1540
0
}
1541
1542
/**
1543
 * htmlCheckParagraph
1544
 * @ctxt:  an HTML parser context
1545
 *
1546
 * Check whether a p element need to be implied before inserting
1547
 * characters in the current element.
1548
 *
1549
 * Returns 1 if a paragraph has been inserted, 0 if not and -1
1550
 *         in case of error.
1551
 */
1552
1553
static int
1554
0
htmlCheckParagraph(htmlParserCtxtPtr ctxt) {
1555
0
    const xmlChar *tag;
1556
0
    int i;
1557
1558
0
    if (ctxt == NULL)
1559
0
  return(-1);
1560
0
    tag = ctxt->name;
1561
0
    if (tag == NULL) {
1562
0
  htmlAutoClose(ctxt, BAD_CAST"p");
1563
0
  htmlCheckImplied(ctxt, BAD_CAST"p");
1564
0
  htmlnamePush(ctxt, BAD_CAST"p");
1565
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1566
0
      ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1567
0
  return(1);
1568
0
    }
1569
0
    if (!htmlOmittedDefaultValue)
1570
0
  return(0);
1571
0
    for (i = 0; htmlNoContentElements[i] != NULL; i++) {
1572
0
  if (xmlStrEqual(tag, BAD_CAST htmlNoContentElements[i])) {
1573
0
      htmlAutoClose(ctxt, BAD_CAST"p");
1574
0
      htmlCheckImplied(ctxt, BAD_CAST"p");
1575
0
      htmlnamePush(ctxt, BAD_CAST"p");
1576
0
      if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL))
1577
0
    ctxt->sax->startElement(ctxt->userData, BAD_CAST"p", NULL);
1578
0
      return(1);
1579
0
  }
1580
0
    }
1581
0
    return(0);
1582
0
}
1583
1584
/**
1585
 * htmlIsScriptAttribute:
1586
 * @name:  an attribute name
1587
 *
1588
 * Check if an attribute is of content type Script
1589
 *
1590
 * Returns 1 is the attribute is a script 0 otherwise
1591
 */
1592
int
1593
0
htmlIsScriptAttribute(const xmlChar *name) {
1594
0
    unsigned int i;
1595
1596
0
    if (name == NULL)
1597
0
      return(0);
1598
    /*
1599
     * all script attributes start with 'on'
1600
     */
1601
0
    if ((name[0] != 'o') || (name[1] != 'n'))
1602
0
      return(0);
1603
0
    for (i = 0;
1604
0
   i < sizeof(htmlScriptAttributes)/sizeof(htmlScriptAttributes[0]);
1605
0
   i++) {
1606
0
  if (xmlStrEqual(name, (const xmlChar *) htmlScriptAttributes[i]))
1607
0
      return(1);
1608
0
    }
1609
0
    return(0);
1610
0
}
1611
1612
/************************************************************************
1613
 *                  *
1614
 *  The list of HTML predefined entities      *
1615
 *                  *
1616
 ************************************************************************/
1617
1618
1619
/*
 * Each row is { code point, entity name, description }.
 * Rows are kept in ascending code point order: htmlEntityValueLookup()
 * depends on that ordering, so new rows must be inserted in place.
 */
static const htmlEntityDesc  html40EntitiesTable[] = {
    /*
     * The 4 absolute ones, plus apostrophe.
     */
    { 34, "quot", "quotation mark = APL quote, U+0022 ISOnum" },
    { 38, "amp", "ampersand, U+0026 ISOnum" },
    { 39, "apos", "single quote" },
    { 60, "lt", "less-than sign, U+003C ISOnum" },
    { 62, "gt", "greater-than sign, U+003E ISOnum" },

    /*
     * A bunch still in the 128-255 range.
     * Whether they should be replaced really depends on the charset used.
     */
    { 160, "nbsp", "no-break space = non-breaking space, U+00A0 ISOnum" },
    { 161, "iexcl", "inverted exclamation mark, U+00A1 ISOnum" },
    { 162, "cent", "cent sign, U+00A2 ISOnum" },
    { 163, "pound", "pound sign, U+00A3 ISOnum" },
    { 164, "curren", "currency sign, U+00A4 ISOnum" },
    { 165, "yen", "yen sign = yuan sign, U+00A5 ISOnum" },
    { 166, "brvbar", "broken bar = broken vertical bar, U+00A6 ISOnum" },
    { 167, "sect", "section sign, U+00A7 ISOnum" },
    { 168, "uml", "diaeresis = spacing diaeresis, U+00A8 ISOdia" },
    { 169, "copy", "copyright sign, U+00A9 ISOnum" },
    { 170, "ordf", "feminine ordinal indicator, U+00AA ISOnum" },
    { 171, "laquo", "left-pointing double angle quotation mark = left pointing guillemet, U+00AB ISOnum" },
    { 172, "not", "not sign, U+00AC ISOnum" },
    { 173, "shy", "soft hyphen = discretionary hyphen, U+00AD ISOnum" },
    { 174, "reg", "registered sign = registered trade mark sign, U+00AE ISOnum" },
    { 175, "macr", "macron = spacing macron = overline = APL overbar, U+00AF ISOdia" },
    { 176, "deg", "degree sign, U+00B0 ISOnum" },
    { 177, "plusmn", "plus-minus sign = plus-or-minus sign, U+00B1 ISOnum" },
    { 178, "sup2", "superscript two = superscript digit two = squared, U+00B2 ISOnum" },
    { 179, "sup3", "superscript three = superscript digit three = cubed, U+00B3 ISOnum" },
    { 180, "acute", "acute accent = spacing acute, U+00B4 ISOdia" },
    { 181, "micro", "micro sign, U+00B5 ISOnum" },
    { 182, "para", "pilcrow sign = paragraph sign, U+00B6 ISOnum" },
    { 183, "middot", "middle dot = Georgian comma Greek middle dot, U+00B7 ISOnum" },
    { 184, "cedil", "cedilla = spacing cedilla, U+00B8 ISOdia" },
    { 185, "sup1", "superscript one = superscript digit one, U+00B9 ISOnum" },
    { 186, "ordm", "masculine ordinal indicator, U+00BA ISOnum" },
    { 187, "raquo", "right-pointing double angle quotation mark right pointing guillemet, U+00BB ISOnum" },
    { 188, "frac14", "vulgar fraction one quarter = fraction one quarter, U+00BC ISOnum" },
    { 189, "frac12", "vulgar fraction one half = fraction one half, U+00BD ISOnum" },
    { 190, "frac34", "vulgar fraction three quarters = fraction three quarters, U+00BE ISOnum" },
    { 191, "iquest", "inverted question mark = turned question mark, U+00BF ISOnum" },
    { 192, "Agrave", "latin capital letter A with grave = latin capital letter A grave, U+00C0 ISOlat1" },
    { 193, "Aacute", "latin capital letter A with acute, U+00C1 ISOlat1" },
    { 194, "Acirc", "latin capital letter A with circumflex, U+00C2 ISOlat1" },
    { 195, "Atilde", "latin capital letter A with tilde, U+00C3 ISOlat1" },
    { 196, "Auml", "latin capital letter A with diaeresis, U+00C4 ISOlat1" },
    { 197, "Aring", "latin capital letter A with ring above = latin capital letter A ring, U+00C5 ISOlat1" },
    { 198, "AElig", "latin capital letter AE = latin capital ligature AE, U+00C6 ISOlat1" },
    { 199, "Ccedil", "latin capital letter C with cedilla, U+00C7 ISOlat1" },
    { 200, "Egrave", "latin capital letter E with grave, U+00C8 ISOlat1" },
    { 201, "Eacute", "latin capital letter E with acute, U+00C9 ISOlat1" },
    { 202, "Ecirc", "latin capital letter E with circumflex, U+00CA ISOlat1" },
    { 203, "Euml", "latin capital letter E with diaeresis, U+00CB ISOlat1" },
    { 204, "Igrave", "latin capital letter I with grave, U+00CC ISOlat1" },
    { 205, "Iacute", "latin capital letter I with acute, U+00CD ISOlat1" },
    { 206, "Icirc", "latin capital letter I with circumflex, U+00CE ISOlat1" },
    { 207, "Iuml", "latin capital letter I with diaeresis, U+00CF ISOlat1" },
    { 208, "ETH", "latin capital letter ETH, U+00D0 ISOlat1" },
    { 209, "Ntilde", "latin capital letter N with tilde, U+00D1 ISOlat1" },
    { 210, "Ograve", "latin capital letter O with grave, U+00D2 ISOlat1" },
    { 211, "Oacute", "latin capital letter O with acute, U+00D3 ISOlat1" },
    { 212, "Ocirc", "latin capital letter O with circumflex, U+00D4 ISOlat1" },
    { 213, "Otilde", "latin capital letter O with tilde, U+00D5 ISOlat1" },
    { 214, "Ouml", "latin capital letter O with diaeresis, U+00D6 ISOlat1" },
    { 215, "times", "multiplication sign, U+00D7 ISOnum" },
    { 216, "Oslash", "latin capital letter O with stroke latin capital letter O slash, U+00D8 ISOlat1" },
    { 217, "Ugrave", "latin capital letter U with grave, U+00D9 ISOlat1" },
    { 218, "Uacute", "latin capital letter U with acute, U+00DA ISOlat1" },
    { 219, "Ucirc", "latin capital letter U with circumflex, U+00DB ISOlat1" },
    { 220, "Uuml", "latin capital letter U with diaeresis, U+00DC ISOlat1" },
    { 221, "Yacute", "latin capital letter Y with acute, U+00DD ISOlat1" },
    { 222, "THORN", "latin capital letter THORN, U+00DE ISOlat1" },
    { 223, "szlig", "latin small letter sharp s = ess-zed, U+00DF ISOlat1" },
    { 224, "agrave", "latin small letter a with grave = latin small letter a grave, U+00E0 ISOlat1" },
    { 225, "aacute", "latin small letter a with acute, U+00E1 ISOlat1" },
    { 226, "acirc", "latin small letter a with circumflex, U+00E2 ISOlat1" },
    { 227, "atilde", "latin small letter a with tilde, U+00E3 ISOlat1" },
    { 228, "auml", "latin small letter a with diaeresis, U+00E4 ISOlat1" },
    { 229, "aring", "latin small letter a with ring above = latin small letter a ring, U+00E5 ISOlat1" },
    { 230, "aelig", "latin small letter ae = latin small ligature ae, U+00E6 ISOlat1" },
    { 231, "ccedil", "latin small letter c with cedilla, U+00E7 ISOlat1" },
    { 232, "egrave", "latin small letter e with grave, U+00E8 ISOlat1" },
    { 233, "eacute", "latin small letter e with acute, U+00E9 ISOlat1" },
    { 234, "ecirc", "latin small letter e with circumflex, U+00EA ISOlat1" },
    { 235, "euml", "latin small letter e with diaeresis, U+00EB ISOlat1" },
    { 236, "igrave", "latin small letter i with grave, U+00EC ISOlat1" },
    { 237, "iacute", "latin small letter i with acute, U+00ED ISOlat1" },
    { 238, "icirc", "latin small letter i with circumflex, U+00EE ISOlat1" },
    { 239, "iuml", "latin small letter i with diaeresis, U+00EF ISOlat1" },
    { 240, "eth", "latin small letter eth, U+00F0 ISOlat1" },
    { 241, "ntilde", "latin small letter n with tilde, U+00F1 ISOlat1" },
    { 242, "ograve", "latin small letter o with grave, U+00F2 ISOlat1" },
    { 243, "oacute", "latin small letter o with acute, U+00F3 ISOlat1" },
    { 244, "ocirc", "latin small letter o with circumflex, U+00F4 ISOlat1" },
    { 245, "otilde", "latin small letter o with tilde, U+00F5 ISOlat1" },
    { 246, "ouml", "latin small letter o with diaeresis, U+00F6 ISOlat1" },
    { 247, "divide", "division sign, U+00F7 ISOnum" },
    { 248, "oslash", "latin small letter o with stroke, = latin small letter o slash, U+00F8 ISOlat1" },
    { 249, "ugrave", "latin small letter u with grave, U+00F9 ISOlat1" },
    { 250, "uacute", "latin small letter u with acute, U+00FA ISOlat1" },
    { 251, "ucirc", "latin small letter u with circumflex, U+00FB ISOlat1" },
    { 252, "uuml", "latin small letter u with diaeresis, U+00FC ISOlat1" },
    { 253, "yacute", "latin small letter y with acute, U+00FD ISOlat1" },
    { 254, "thorn", "latin small letter thorn with, U+00FE ISOlat1" },
    { 255, "yuml", "latin small letter y with diaeresis, U+00FF ISOlat1" },

    { 338, "OElig", "latin capital ligature OE, U+0152 ISOlat2" },
    { 339, "oelig", "latin small ligature oe, U+0153 ISOlat2" },
    { 352, "Scaron", "latin capital letter S with caron, U+0160 ISOlat2" },
    { 353, "scaron", "latin small letter s with caron, U+0161 ISOlat2" },
    { 376, "Yuml", "latin capital letter Y with diaeresis, U+0178 ISOlat2" },

    /*
     * Anything below should really be kept as entity references.
     */
    { 402, "fnof", "latin small f with hook = function = florin, U+0192 ISOtech" },

    { 710, "circ", "modifier letter circumflex accent, U+02C6 ISOpub" },
    { 732, "tilde", "small tilde, U+02DC ISOdia" },

    { 913, "Alpha", "greek capital letter alpha, U+0391" },
    { 914, "Beta", "greek capital letter beta, U+0392" },
    { 915, "Gamma", "greek capital letter gamma, U+0393 ISOgrk3" },
    { 916, "Delta", "greek capital letter delta, U+0394 ISOgrk3" },
    { 917, "Epsilon", "greek capital letter epsilon, U+0395" },
    { 918, "Zeta", "greek capital letter zeta, U+0396" },
    { 919, "Eta", "greek capital letter eta, U+0397" },
    { 920, "Theta", "greek capital letter theta, U+0398 ISOgrk3" },
    { 921, "Iota", "greek capital letter iota, U+0399" },
    { 922, "Kappa", "greek capital letter kappa, U+039A" },
    { 923, "Lambda", "greek capital letter lambda, U+039B ISOgrk3" },
    { 924, "Mu", "greek capital letter mu, U+039C" },
    { 925, "Nu", "greek capital letter nu, U+039D" },
    { 926, "Xi", "greek capital letter xi, U+039E ISOgrk3" },
    { 927, "Omicron", "greek capital letter omicron, U+039F" },
    { 928, "Pi", "greek capital letter pi, U+03A0 ISOgrk3" },
    { 929, "Rho", "greek capital letter rho, U+03A1" },
    { 931, "Sigma", "greek capital letter sigma, U+03A3 ISOgrk3" },
    { 932, "Tau", "greek capital letter tau, U+03A4" },
    { 933, "Upsilon", "greek capital letter upsilon, U+03A5 ISOgrk3" },
    { 934, "Phi", "greek capital letter phi, U+03A6 ISOgrk3" },
    { 935, "Chi", "greek capital letter chi, U+03A7" },
    { 936, "Psi", "greek capital letter psi, U+03A8 ISOgrk3" },
    { 937, "Omega", "greek capital letter omega, U+03A9 ISOgrk3" },

    { 945, "alpha", "greek small letter alpha, U+03B1 ISOgrk3" },
    { 946, "beta", "greek small letter beta, U+03B2 ISOgrk3" },
    { 947, "gamma", "greek small letter gamma, U+03B3 ISOgrk3" },
    { 948, "delta", "greek small letter delta, U+03B4 ISOgrk3" },
    { 949, "epsilon", "greek small letter epsilon, U+03B5 ISOgrk3" },
    { 950, "zeta", "greek small letter zeta, U+03B6 ISOgrk3" },
    { 951, "eta", "greek small letter eta, U+03B7 ISOgrk3" },
    { 952, "theta", "greek small letter theta, U+03B8 ISOgrk3" },
    { 953, "iota", "greek small letter iota, U+03B9 ISOgrk3" },
    { 954, "kappa", "greek small letter kappa, U+03BA ISOgrk3" },
    { 955, "lambda", "greek small letter lambda, U+03BB ISOgrk3" },
    { 956, "mu", "greek small letter mu, U+03BC ISOgrk3" },
    { 957, "nu", "greek small letter nu, U+03BD ISOgrk3" },
    { 958, "xi", "greek small letter xi, U+03BE ISOgrk3" },
    { 959, "omicron", "greek small letter omicron, U+03BF NEW" },
    { 960, "pi", "greek small letter pi, U+03C0 ISOgrk3" },
    { 961, "rho", "greek small letter rho, U+03C1 ISOgrk3" },
    { 962, "sigmaf", "greek small letter final sigma, U+03C2 ISOgrk3" },
    { 963, "sigma", "greek small letter sigma, U+03C3 ISOgrk3" },
    { 964, "tau", "greek small letter tau, U+03C4 ISOgrk3" },
    { 965, "upsilon", "greek small letter upsilon, U+03C5 ISOgrk3" },
    { 966, "phi", "greek small letter phi, U+03C6 ISOgrk3" },
    { 967, "chi", "greek small letter chi, U+03C7 ISOgrk3" },
    { 968, "psi", "greek small letter psi, U+03C8 ISOgrk3" },
    { 969, "omega", "greek small letter omega, U+03C9 ISOgrk3" },
    { 977, "thetasym", "greek small letter theta symbol, U+03D1 NEW" },
    { 978, "upsih", "greek upsilon with hook symbol, U+03D2 NEW" },
    { 982, "piv", "greek pi symbol, U+03D6 ISOgrk3" },

    { 8194, "ensp", "en space, U+2002 ISOpub" },
    { 8195, "emsp", "em space, U+2003 ISOpub" },
    { 8201, "thinsp", "thin space, U+2009 ISOpub" },
    { 8204, "zwnj", "zero width non-joiner, U+200C NEW RFC 2070" },
    { 8205, "zwj", "zero width joiner, U+200D NEW RFC 2070" },
    { 8206, "lrm", "left-to-right mark, U+200E NEW RFC 2070" },
    { 8207, "rlm", "right-to-left mark, U+200F NEW RFC 2070" },
    { 8211, "ndash", "en dash, U+2013 ISOpub" },
    { 8212, "mdash", "em dash, U+2014 ISOpub" },
    { 8216, "lsquo", "left single quotation mark, U+2018 ISOnum" },
    { 8217, "rsquo", "right single quotation mark, U+2019 ISOnum" },
    { 8218, "sbquo", "single low-9 quotation mark, U+201A NEW" },
    { 8220, "ldquo", "left double quotation mark, U+201C ISOnum" },
    { 8221, "rdquo", "right double quotation mark, U+201D ISOnum" },
    { 8222, "bdquo", "double low-9 quotation mark, U+201E NEW" },
    { 8224, "dagger", "dagger, U+2020 ISOpub" },
    { 8225, "Dagger", "double dagger, U+2021 ISOpub" },

    { 8226, "bull", "bullet = black small circle, U+2022 ISOpub" },
    { 8230, "hellip", "horizontal ellipsis = three dot leader, U+2026 ISOpub" },

    { 8240, "permil", "per mille sign, U+2030 ISOtech" },

    { 8242, "prime", "prime = minutes = feet, U+2032 ISOtech" },
    { 8243, "Prime", "double prime = seconds = inches, U+2033 ISOtech" },

    { 8249, "lsaquo", "single left-pointing angle quotation mark, U+2039 ISO proposed" },
    { 8250, "rsaquo", "single right-pointing angle quotation mark, U+203A ISO proposed" },

    { 8254, "oline", "overline = spacing overscore, U+203E NEW" },
    { 8260, "frasl", "fraction slash, U+2044 NEW" },

    { 8364, "euro", "euro sign, U+20AC NEW" },

    { 8465, "image", "blackletter capital I = imaginary part, U+2111 ISOamso" },
    { 8472, "weierp", "script capital P = power set = Weierstrass p, U+2118 ISOamso" },
    { 8476, "real", "blackletter capital R = real part symbol, U+211C ISOamso" },
    { 8482, "trade", "trade mark sign, U+2122 ISOnum" },
    { 8501, "alefsym", "alef symbol = first transfinite cardinal, U+2135 NEW" },
    { 8592, "larr", "leftwards arrow, U+2190 ISOnum" },
    { 8593, "uarr", "upwards arrow, U+2191 ISOnum" },
    { 8594, "rarr", "rightwards arrow, U+2192 ISOnum" },
    { 8595, "darr", "downwards arrow, U+2193 ISOnum" },
    { 8596, "harr", "left right arrow, U+2194 ISOamsa" },
    { 8629, "crarr", "downwards arrow with corner leftwards = carriage return, U+21B5 NEW" },
    { 8656, "lArr", "leftwards double arrow, U+21D0 ISOtech" },
    { 8657, "uArr", "upwards double arrow, U+21D1 ISOamsa" },
    { 8658, "rArr", "rightwards double arrow, U+21D2 ISOtech" },
    { 8659, "dArr", "downwards double arrow, U+21D3 ISOamsa" },
    { 8660, "hArr", "left right double arrow, U+21D4 ISOamsa" },

    { 8704, "forall", "for all, U+2200 ISOtech" },
    { 8706, "part", "partial differential, U+2202 ISOtech" },
    { 8707, "exist", "there exists, U+2203 ISOtech" },
    { 8709, "empty", "empty set = null set = diameter, U+2205 ISOamso" },
    { 8711, "nabla", "nabla = backward difference, U+2207 ISOtech" },
    { 8712, "isin", "element of, U+2208 ISOtech" },
    { 8713, "notin", "not an element of, U+2209 ISOtech" },
    { 8715, "ni", "contains as member, U+220B ISOtech" },
    { 8719, "prod", "n-ary product = product sign, U+220F ISOamsb" },
    { 8721, "sum", "n-ary summation, U+2211 ISOamsb" },
    { 8722, "minus", "minus sign, U+2212 ISOtech" },
    { 8727, "lowast", "asterisk operator, U+2217 ISOtech" },
    { 8730, "radic", "square root = radical sign, U+221A ISOtech" },
    { 8733, "prop", "proportional to, U+221D ISOtech" },
    { 8734, "infin", "infinity, U+221E ISOtech" },
    { 8736, "ang", "angle, U+2220 ISOamso" },
    { 8743, "and", "logical and = wedge, U+2227 ISOtech" },
    { 8744, "or", "logical or = vee, U+2228 ISOtech" },
    { 8745, "cap", "intersection = cap, U+2229 ISOtech" },
    { 8746, "cup", "union = cup, U+222A ISOtech" },
    { 8747, "int", "integral, U+222B ISOtech" },
    { 8756, "there4", "therefore, U+2234 ISOtech" },
    { 8764, "sim", "tilde operator = varies with = similar to, U+223C ISOtech" },
    { 8773, "cong", "approximately equal to, U+2245 ISOtech" },
    { 8776, "asymp", "almost equal to = asymptotic to, U+2248 ISOamsr" },
    { 8800, "ne", "not equal to, U+2260 ISOtech" },
    { 8801, "equiv", "identical to, U+2261 ISOtech" },
    { 8804, "le", "less-than or equal to, U+2264 ISOtech" },
    { 8805, "ge", "greater-than or equal to, U+2265 ISOtech" },
    { 8834, "sub", "subset of, U+2282 ISOtech" },
    { 8835, "sup", "superset of, U+2283 ISOtech" },
    { 8836, "nsub", "not a subset of, U+2284 ISOamsn" },
    { 8838, "sube", "subset of or equal to, U+2286 ISOtech" },
    { 8839, "supe", "superset of or equal to, U+2287 ISOtech" },
    { 8853, "oplus", "circled plus = direct sum, U+2295 ISOamsb" },
    { 8855, "otimes", "circled times = vector product, U+2297 ISOamsb" },
    { 8869, "perp", "up tack = orthogonal to = perpendicular, U+22A5 ISOtech" },
    { 8901, "sdot", "dot operator, U+22C5 ISOamsb" },
    { 8968, "lceil", "left ceiling = apl upstile, U+2308 ISOamsc" },
    { 8969, "rceil", "right ceiling, U+2309 ISOamsc" },
    { 8970, "lfloor", "left floor = apl downstile, U+230A ISOamsc" },
    { 8971, "rfloor", "right floor, U+230B ISOamsc" },
    { 9001, "lang", "left-pointing angle bracket = bra, U+2329 ISOtech" },
    { 9002, "rang", "right-pointing angle bracket = ket, U+232A ISOtech" },
    { 9674, "loz", "lozenge, U+25CA ISOpub" },

    { 9824, "spades", "black spade suit, U+2660 ISOpub" },
    { 9827, "clubs", "black club suit = shamrock, U+2663 ISOpub" },
    { 9829, "hearts", "black heart suit = valentine, U+2665 ISOpub" },
    { 9830, "diams", "black diamond suit, U+2666 ISOpub" },

};
1901
1902
/************************************************************************
1903
 *                  *
1904
 *    Commodity functions to handle entities      *
1905
 *                  *
1906
 ************************************************************************/
1907
1908
/*
 * growBuffer:
 * Macro used to grow the current buffer: doubles <buffer>_size and
 * reallocates <buffer> to that many xmlChar.
 *
 * It expects a variable named `ctxt` and a companion variable named
 * <buffer>_size to be in scope at the expansion site.
 * On allocation failure the error is reported through htmlErrMemory(),
 * the old buffer is freed, and the ENCLOSING FUNCTION returns NULL.
 */
#define growBuffer(buffer) {                                            \
    xmlChar *tmp;                                                       \
    buffer##_size *= 2;                                                 \
    tmp = (xmlChar *) xmlRealloc(buffer, buffer##_size * sizeof(xmlChar)); \
    if (tmp != NULL) {                                                  \
        buffer = tmp;                                                   \
    } else {                                                            \
        htmlErrMemory(ctxt, "growing buffer\n");                        \
        xmlFree(buffer);                                                \
        return(NULL);                                                   \
    }                                                                   \
}
1922
1923
/**
1924
 * htmlEntityLookup:
1925
 * @name: the entity name
1926
 *
1927
 * Lookup the given entity in EntitiesTable
1928
 *
1929
 * TODO: the linear scan is really ugly, an hash table is really needed.
1930
 *
1931
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1932
 */
1933
const htmlEntityDesc *
1934
0
htmlEntityLookup(const xmlChar *name) {
1935
0
    unsigned int i;
1936
1937
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
1938
0
                    sizeof(html40EntitiesTable[0]));i++) {
1939
0
        if (xmlStrEqual(name, BAD_CAST html40EntitiesTable[i].name)) {
1940
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1941
0
  }
1942
0
    }
1943
0
    return(NULL);
1944
0
}
1945
1946
/**
1947
 * htmlEntityValueLookup:
1948
 * @value: the entity's unicode value
1949
 *
1950
 * Lookup the given entity in EntitiesTable
1951
 *
1952
 * TODO: the linear scan is really ugly, an hash table is really needed.
1953
 *
1954
 * Returns the associated htmlEntityDescPtr if found, NULL otherwise.
1955
 */
1956
const htmlEntityDesc *
1957
0
htmlEntityValueLookup(unsigned int value) {
1958
0
    unsigned int i;
1959
1960
0
    for (i = 0;i < (sizeof(html40EntitiesTable)/
1961
0
                    sizeof(html40EntitiesTable[0]));i++) {
1962
0
        if (html40EntitiesTable[i].value >= value) {
1963
0
      if (html40EntitiesTable[i].value > value)
1964
0
    break;
1965
0
            return((htmlEntityDescPtr) &html40EntitiesTable[i]);
1966
0
  }
1967
0
    }
1968
0
    return(NULL);
1969
0
}
1970
1971
/**
1972
 * UTF8ToHtml:
1973
 * @out:  a pointer to an array of bytes to store the result
1974
 * @outlen:  the length of @out
1975
 * @in:  a pointer to an array of UTF-8 chars
1976
 * @inlen:  the length of @in
1977
 *
1978
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
1979
 * plus HTML entities block of chars out.
1980
 *
1981
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
1982
 * The value of @inlen after return is the number of octets consumed
1983
 *     as the return value is positive, else unpredictable.
1984
 * The value of @outlen after return is the number of octets consumed.
1985
 */
1986
int
1987
UTF8ToHtml(unsigned char* out, int *outlen,
1988
0
              const unsigned char* in, int *inlen) {
1989
0
    const unsigned char* processed = in;
1990
0
    const unsigned char* outend;
1991
0
    const unsigned char* outstart = out;
1992
0
    const unsigned char* instart = in;
1993
0
    const unsigned char* inend;
1994
0
    unsigned int c, d;
1995
0
    int trailing;
1996
1997
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL)) return(-1);
1998
0
    if (in == NULL) {
1999
        /*
2000
   * initialization nothing to do
2001
   */
2002
0
  *outlen = 0;
2003
0
  *inlen = 0;
2004
0
  return(0);
2005
0
    }
2006
0
    inend = in + (*inlen);
2007
0
    outend = out + (*outlen);
2008
0
    while (in < inend) {
2009
0
  d = *in++;
2010
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2011
0
  else if (d < 0xC0) {
2012
      /* trailing byte in leading position */
2013
0
      *outlen = out - outstart;
2014
0
      *inlen = processed - instart;
2015
0
      return(-2);
2016
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2017
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2018
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2019
0
  else {
2020
      /* no chance for this in Ascii */
2021
0
      *outlen = out - outstart;
2022
0
      *inlen = processed - instart;
2023
0
      return(-2);
2024
0
  }
2025
2026
0
  if (inend - in < trailing) {
2027
0
      break;
2028
0
  }
2029
2030
0
  for ( ; trailing; trailing--) {
2031
0
      if ((in >= inend) || (((d= *in++) & 0xC0) != 0x80))
2032
0
    break;
2033
0
      c <<= 6;
2034
0
      c |= d & 0x3F;
2035
0
  }
2036
2037
  /* assertion: c is a single UTF-4 value */
2038
0
  if (c < 0x80) {
2039
0
      if (out + 1 >= outend)
2040
0
    break;
2041
0
      *out++ = c;
2042
0
  } else {
2043
0
      int len;
2044
0
      const htmlEntityDesc * ent;
2045
0
      const char *cp;
2046
0
      char nbuf[16];
2047
2048
      /*
2049
       * Try to lookup a predefined HTML entity for it
2050
       */
2051
2052
0
      ent = htmlEntityValueLookup(c);
2053
0
      if (ent == NULL) {
2054
0
        snprintf(nbuf, sizeof(nbuf), "#%u", c);
2055
0
        cp = nbuf;
2056
0
      }
2057
0
      else
2058
0
        cp = ent->name;
2059
0
      len = strlen(cp);
2060
0
      if (out + 2 + len >= outend)
2061
0
    break;
2062
0
      *out++ = '&';
2063
0
      memcpy(out, cp, len);
2064
0
      out += len;
2065
0
      *out++ = ';';
2066
0
  }
2067
0
  processed = in;
2068
0
    }
2069
0
    *outlen = out - outstart;
2070
0
    *inlen = processed - instart;
2071
0
    return(0);
2072
0
}
2073
2074
/**
2075
 * htmlEncodeEntities:
2076
 * @out:  a pointer to an array of bytes to store the result
2077
 * @outlen:  the length of @out
2078
 * @in:  a pointer to an array of UTF-8 chars
2079
 * @inlen:  the length of @in
2080
 * @quoteChar: the quote character to escape (' or ") or zero.
2081
 *
2082
 * Take a block of UTF-8 chars in and try to convert it to an ASCII
2083
 * plus HTML entities block of chars out.
2084
 *
2085
 * Returns 0 if success, -2 if the transcoding fails, or -1 otherwise
2086
 * The value of @inlen after return is the number of octets consumed
2087
 *     as the return value is positive, else unpredictable.
2088
 * The value of @outlen after return is the number of octets consumed.
2089
 */
2090
int
2091
htmlEncodeEntities(unsigned char* out, int *outlen,
2092
0
       const unsigned char* in, int *inlen, int quoteChar) {
2093
0
    const unsigned char* processed = in;
2094
0
    const unsigned char* outend;
2095
0
    const unsigned char* outstart = out;
2096
0
    const unsigned char* instart = in;
2097
0
    const unsigned char* inend;
2098
0
    unsigned int c, d;
2099
0
    int trailing;
2100
2101
0
    if ((out == NULL) || (outlen == NULL) || (inlen == NULL) || (in == NULL))
2102
0
        return(-1);
2103
0
    outend = out + (*outlen);
2104
0
    inend = in + (*inlen);
2105
0
    while (in < inend) {
2106
0
  d = *in++;
2107
0
  if      (d < 0x80)  { c= d; trailing= 0; }
2108
0
  else if (d < 0xC0) {
2109
      /* trailing byte in leading position */
2110
0
      *outlen = out - outstart;
2111
0
      *inlen = processed - instart;
2112
0
      return(-2);
2113
0
        } else if (d < 0xE0)  { c= d & 0x1F; trailing= 1; }
2114
0
        else if (d < 0xF0)  { c= d & 0x0F; trailing= 2; }
2115
0
        else if (d < 0xF8)  { c= d & 0x07; trailing= 3; }
2116
0
  else {
2117
      /* no chance for this in Ascii */
2118
0
      *outlen = out - outstart;
2119
0
      *inlen = processed - instart;
2120
0
      return(-2);
2121
0
  }
2122
2123
0
  if (inend - in < trailing)
2124
0
      break;
2125
2126
0
  while (trailing--) {
2127
0
      if (((d= *in++) & 0xC0) != 0x80) {
2128
0
    *outlen = out - outstart;
2129
0
    *inlen = processed - instart;
2130
0
    return(-2);
2131
0
      }
2132
0
      c <<= 6;
2133
0
      c |= d & 0x3F;
2134
0
  }
2135
2136
  /* assertion: c is a single UTF-4 value */
2137
0
  if ((c < 0x80) && (c != (unsigned int) quoteChar) &&
2138
0
      (c != '&') && (c != '<') && (c != '>')) {
2139
0
      if (out >= outend)
2140
0
    break;
2141
0
      *out++ = c;
2142
0
  } else {
2143
0
      const htmlEntityDesc * ent;
2144
0
      const char *cp;
2145
0
      char nbuf[16];
2146
0
      int len;
2147
2148
      /*
2149
       * Try to lookup a predefined HTML entity for it
2150
       */
2151
0
      ent = htmlEntityValueLookup(c);
2152
0
      if (ent == NULL) {
2153
0
    snprintf(nbuf, sizeof(nbuf), "#%u", c);
2154
0
    cp = nbuf;
2155
0
      }
2156
0
      else
2157
0
    cp = ent->name;
2158
0
      len = strlen(cp);
2159
0
      if (out + 2 + len > outend)
2160
0
    break;
2161
0
      *out++ = '&';
2162
0
      memcpy(out, cp, len);
2163
0
      out += len;
2164
0
      *out++ = ';';
2165
0
  }
2166
0
  processed = in;
2167
0
    }
2168
0
    *outlen = out - outstart;
2169
0
    *inlen = processed - instart;
2170
0
    return(0);
2171
0
}
2172
2173
/************************************************************************
2174
 *                  *
2175
 *    Commodity functions to handle streams     *
2176
 *                  *
2177
 ************************************************************************/
2178
2179
/**
2180
 * htmlNewInputStream:
2181
 * @ctxt:  an HTML parser context
2182
 *
2183
 * Create a new input stream structure
2184
 * Returns the new input stream or NULL
2185
 */
2186
static htmlParserInputPtr
2187
0
htmlNewInputStream(htmlParserCtxtPtr ctxt) {
2188
0
    htmlParserInputPtr input;
2189
2190
0
    input = (xmlParserInputPtr) xmlMalloc(sizeof(htmlParserInput));
2191
0
    if (input == NULL) {
2192
0
        htmlErrMemory(ctxt, "couldn't allocate a new input stream\n");
2193
0
  return(NULL);
2194
0
    }
2195
0
    memset(input, 0, sizeof(htmlParserInput));
2196
0
    input->filename = NULL;
2197
0
    input->directory = NULL;
2198
0
    input->base = NULL;
2199
0
    input->cur = NULL;
2200
0
    input->buf = NULL;
2201
0
    input->line = 1;
2202
0
    input->col = 1;
2203
0
    input->buf = NULL;
2204
0
    input->free = NULL;
2205
0
    input->version = NULL;
2206
0
    input->consumed = 0;
2207
0
    input->length = 0;
2208
0
    return(input);
2209
0
}
2210
2211
2212
/************************************************************************
2213
 *                  *
2214
 *    Commodity functions, cleanup needed ?     *
2215
 *                  *
2216
 ************************************************************************/
2217
/*
 * All tags allowing PCDATA, taken from the HTML 4.01 loose DTD.
 * NOTE: it might be more appropriate to integrate this information
 * into the html40ElementTable array, but that was not done so as not
 * to risk any binary incompatibility.
 */
static const char *allowPCData[] = {
    "a", "abbr", "acronym", "address", "applet",
    "b", "bdo", "big", "blockquote", "body", "button",
    "caption", "center", "cite", "code",
    "dd", "del", "dfn", "div", "dt",
    "em",
    "font", "form",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "i", "iframe", "ins",
    "kbd",
    "label", "legend", "li",
    "noframes", "noscript",
    "object",
    "p", "pre",
    "q",
    "s", "samp", "small", "span", "strike", "strong",
    "td", "th", "tt",
    "u",
    "var"
};
2231
2232
/**
2233
 * areBlanks:
2234
 * @ctxt:  an HTML parser context
2235
 * @str:  a xmlChar *
2236
 * @len:  the size of @str
2237
 *
2238
 * Is this a sequence of blank chars that one can ignore ?
2239
 *
2240
 * Returns 1 if ignorable 0 otherwise.
2241
 */
2242
2243
0
static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
2244
0
    unsigned int i;
2245
0
    int j;
2246
0
    xmlNodePtr lastChild;
2247
0
    xmlDtdPtr dtd;
2248
2249
0
    for (j = 0;j < len;j++)
2250
0
        if (!(IS_BLANK_CH(str[j]))) return(0);
2251
2252
0
    if (CUR == 0) return(1);
2253
0
    if (CUR != '<') return(0);
2254
0
    if (ctxt->name == NULL)
2255
0
  return(1);
2256
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"html"))
2257
0
  return(1);
2258
0
    if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
2259
0
  return(1);
2260
2261
    /* Only strip CDATA children of the body tag for strict HTML DTDs */
2262
0
    if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
2263
0
        dtd = xmlGetIntSubset(ctxt->myDoc);
2264
0
        if (dtd != NULL && dtd->ExternalID != NULL) {
2265
0
            if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
2266
0
                    !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
2267
0
                return(1);
2268
0
        }
2269
0
    }
2270
2271
0
    if (ctxt->node == NULL) return(0);
2272
0
    lastChild = xmlGetLastChild(ctxt->node);
2273
0
    while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
2274
0
  lastChild = lastChild->prev;
2275
0
    if (lastChild == NULL) {
2276
0
        if ((ctxt->node->type != XML_ELEMENT_NODE) &&
2277
0
            (ctxt->node->content != NULL)) return(0);
2278
  /* keep ws in constructs like ...<b> </b>...
2279
     for all tags "b" allowing PCDATA */
2280
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2281
0
      if ( xmlStrEqual(ctxt->name, BAD_CAST allowPCData[i]) ) {
2282
0
    return(0);
2283
0
      }
2284
0
  }
2285
0
    } else if (xmlNodeIsText(lastChild)) {
2286
0
        return(0);
2287
0
    } else {
2288
  /* keep ws in constructs like <p><b>xy</b> <i>z</i><p>
2289
     for all tags "p" allowing PCDATA */
2290
0
  for ( i = 0; i < sizeof(allowPCData)/sizeof(allowPCData[0]); i++ ) {
2291
0
      if ( xmlStrEqual(lastChild->name, BAD_CAST allowPCData[i]) ) {
2292
0
    return(0);
2293
0
      }
2294
0
  }
2295
0
    }
2296
0
    return(1);
2297
0
}
2298
2299
/**
2300
 * htmlNewDocNoDtD:
2301
 * @URI:  URI for the dtd, or NULL
2302
 * @ExternalID:  the external ID of the DTD, or NULL
2303
 *
2304
 * Creates a new HTML document without a DTD node if @URI and @ExternalID
2305
 * are NULL
2306
 *
2307
 * Returns a new document, do not initialize the DTD if not provided
2308
 */
2309
htmlDocPtr
2310
0
htmlNewDocNoDtD(const xmlChar *URI, const xmlChar *ExternalID) {
2311
0
    xmlDocPtr cur;
2312
2313
    /*
2314
     * Allocate a new document and fill the fields.
2315
     */
2316
0
    cur = (xmlDocPtr) xmlMalloc(sizeof(xmlDoc));
2317
0
    if (cur == NULL) {
2318
0
  htmlErrMemory(NULL, "HTML document creation failed\n");
2319
0
  return(NULL);
2320
0
    }
2321
0
    memset(cur, 0, sizeof(xmlDoc));
2322
2323
0
    cur->type = XML_HTML_DOCUMENT_NODE;
2324
0
    cur->version = NULL;
2325
0
    cur->intSubset = NULL;
2326
0
    cur->doc = cur;
2327
0
    cur->name = NULL;
2328
0
    cur->children = NULL;
2329
0
    cur->extSubset = NULL;
2330
0
    cur->oldNs = NULL;
2331
0
    cur->encoding = NULL;
2332
0
    cur->standalone = 1;
2333
0
    cur->compression = 0;
2334
0
    cur->ids = NULL;
2335
0
    cur->refs = NULL;
2336
0
    cur->_private = NULL;
2337
0
    cur->charset = XML_CHAR_ENCODING_UTF8;
2338
0
    cur->properties = XML_DOC_HTML | XML_DOC_USERBUILT;
2339
0
    if ((ExternalID != NULL) ||
2340
0
  (URI != NULL))
2341
0
  xmlCreateIntSubset(cur, BAD_CAST "html", ExternalID, URI);
2342
0
    return(cur);
2343
0
}
2344
2345
/**
2346
 * htmlNewDoc:
2347
 * @URI:  URI for the dtd, or NULL
2348
 * @ExternalID:  the external ID of the DTD, or NULL
2349
 *
2350
 * Creates a new HTML document
2351
 *
2352
 * Returns a new document
2353
 */
2354
htmlDocPtr
2355
0
htmlNewDoc(const xmlChar *URI, const xmlChar *ExternalID) {
2356
0
    if ((URI == NULL) && (ExternalID == NULL))
2357
0
  return(htmlNewDocNoDtD(
2358
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd",
2359
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN"));
2360
2361
0
    return(htmlNewDocNoDtD(URI, ExternalID));
2362
0
}
2363
2364
2365
/************************************************************************
2366
 *                  *
2367
 *      The parser itself       *
2368
 *  Relates to http://www.w3.org/TR/html40        *
2369
 *                  *
2370
 ************************************************************************/
2371
2372
/************************************************************************
2373
 *                  *
2374
 *      The parser itself       *
2375
 *                  *
2376
 ************************************************************************/
2377
2378
static const xmlChar * htmlParseNameComplex(xmlParserCtxtPtr ctxt);
2379
2380
/**
2381
 * htmlParseHTMLName:
2382
 * @ctxt:  an HTML parser context
2383
 *
2384
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2385
 * since HTML names are not case-sensitive.
2386
 *
2387
 * Returns the Tag Name parsed or NULL
2388
 */
2389
2390
static const xmlChar *
2391
0
htmlParseHTMLName(htmlParserCtxtPtr ctxt) {
2392
0
    int i = 0;
2393
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2394
2395
0
    if (!IS_ASCII_LETTER(CUR) && (CUR != '_') &&
2396
0
        (CUR != ':') && (CUR != '.')) return(NULL);
2397
2398
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2399
0
           ((IS_ASCII_LETTER(CUR)) || (IS_ASCII_DIGIT(CUR)) ||
2400
0
     (CUR == ':') || (CUR == '-') || (CUR == '_') ||
2401
0
           (CUR == '.'))) {
2402
0
  if ((CUR >= 'A') && (CUR <= 'Z')) loc[i] = CUR + 0x20;
2403
0
        else loc[i] = CUR;
2404
0
  i++;
2405
2406
0
  NEXT;
2407
0
    }
2408
2409
0
    return(xmlDictLookup(ctxt->dict, loc, i));
2410
0
}
2411
2412
2413
/**
2414
 * htmlParseHTMLName_nonInvasive:
2415
 * @ctxt:  an HTML parser context
2416
 *
2417
 * parse an HTML tag or attribute name, note that we convert it to lowercase
2418
 * since HTML names are not case-sensitive, this doesn't consume the data
2419
 * from the stream, it's a look-ahead
2420
 *
2421
 * Returns the Tag Name parsed or NULL
2422
 */
2423
2424
static const xmlChar *
2425
0
htmlParseHTMLName_nonInvasive(htmlParserCtxtPtr ctxt) {
2426
0
    int i = 0;
2427
0
    xmlChar loc[HTML_PARSER_BUFFER_SIZE];
2428
2429
0
    if (!IS_ASCII_LETTER(NXT(1)) && (NXT(1) != '_') &&
2430
0
        (NXT(1) != ':')) return(NULL);
2431
2432
0
    while ((i < HTML_PARSER_BUFFER_SIZE) &&
2433
0
           ((IS_ASCII_LETTER(NXT(1+i))) || (IS_ASCII_DIGIT(NXT(1+i))) ||
2434
0
     (NXT(1+i) == ':') || (NXT(1+i) == '-') || (NXT(1+i) == '_'))) {
2435
0
  if ((NXT(1+i) >= 'A') && (NXT(1+i) <= 'Z')) loc[i] = NXT(1+i) + 0x20;
2436
0
        else loc[i] = NXT(1+i);
2437
0
  i++;
2438
0
    }
2439
2440
0
    return(xmlDictLookup(ctxt->dict, loc, i));
2441
0
}
2442
2443
2444
/**
2445
 * htmlParseName:
2446
 * @ctxt:  an HTML parser context
2447
 *
2448
 * parse an HTML name, this routine is case sensitive.
2449
 *
2450
 * Returns the Name parsed or NULL
2451
 */
2452
2453
static const xmlChar *
2454
0
htmlParseName(htmlParserCtxtPtr ctxt) {
2455
0
    const xmlChar *in;
2456
0
    const xmlChar *ret;
2457
0
    int count = 0;
2458
2459
0
    GROW;
2460
2461
    /*
2462
     * Accelerator for simple ASCII names
2463
     */
2464
0
    in = ctxt->input->cur;
2465
0
    if (((*in >= 0x61) && (*in <= 0x7A)) ||
2466
0
  ((*in >= 0x41) && (*in <= 0x5A)) ||
2467
0
  (*in == '_') || (*in == ':')) {
2468
0
  in++;
2469
0
  while (((*in >= 0x61) && (*in <= 0x7A)) ||
2470
0
         ((*in >= 0x41) && (*in <= 0x5A)) ||
2471
0
         ((*in >= 0x30) && (*in <= 0x39)) ||
2472
0
         (*in == '_') || (*in == '-') ||
2473
0
         (*in == ':') || (*in == '.'))
2474
0
      in++;
2475
2476
0
  if (in == ctxt->input->end)
2477
0
      return(NULL);
2478
2479
0
  if ((*in > 0) && (*in < 0x80)) {
2480
0
      count = in - ctxt->input->cur;
2481
0
      ret = xmlDictLookup(ctxt->dict, ctxt->input->cur, count);
2482
0
      ctxt->input->cur = in;
2483
0
      ctxt->nbChars += count;
2484
0
      ctxt->input->col += count;
2485
0
      return(ret);
2486
0
  }
2487
0
    }
2488
0
    return(htmlParseNameComplex(ctxt));
2489
0
}
2490
2491
static const xmlChar *
2492
0
htmlParseNameComplex(xmlParserCtxtPtr ctxt) {
2493
0
    int len = 0, l;
2494
0
    int c;
2495
0
    int count = 0;
2496
0
    const xmlChar *base = ctxt->input->base;
2497
2498
    /*
2499
     * Handler for more complex cases
2500
     */
2501
0
    GROW;
2502
0
    c = CUR_CHAR(l);
2503
0
    if ((c == ' ') || (c == '>') || (c == '/') || /* accelerators */
2504
0
  (!IS_LETTER(c) && (c != '_') &&
2505
0
         (c != ':'))) {
2506
0
  return(NULL);
2507
0
    }
2508
2509
0
    while ((c != ' ') && (c != '>') && (c != '/') && /* test bigname.xml */
2510
0
     ((IS_LETTER(c)) || (IS_DIGIT(c)) ||
2511
0
            (c == '.') || (c == '-') ||
2512
0
      (c == '_') || (c == ':') ||
2513
0
      (IS_COMBINING(c)) ||
2514
0
      (IS_EXTENDER(c)))) {
2515
0
  if (count++ > 100) {
2516
0
      count = 0;
2517
0
      GROW;
2518
0
  }
2519
0
  len += l;
2520
0
  NEXTL(l);
2521
0
  c = CUR_CHAR(l);
2522
0
  if (ctxt->input->base != base) {
2523
      /*
2524
       * We changed encoding from an unknown encoding
2525
       * Input buffer changed location, so we better start again
2526
       */
2527
0
      return(htmlParseNameComplex(ctxt));
2528
0
  }
2529
0
    }
2530
2531
0
    if (ctxt->input->cur - ctxt->input->base < len) {
2532
        /* Sanity check */
2533
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
2534
0
                     "unexpected change of input buffer", NULL, NULL);
2535
0
        return (NULL);
2536
0
    }
2537
2538
0
    return(xmlDictLookup(ctxt->dict, ctxt->input->cur - len, len));
2539
0
}
2540
2541
2542
/**
2543
 * htmlParseHTMLAttribute:
2544
 * @ctxt:  an HTML parser context
2545
 * @stop:  a char stop value
2546
 *
2547
 * parse an HTML attribute value till the stop (quote), if
2548
 * stop is 0 then it stops at the first space
2549
 *
2550
 * Returns the attribute parsed or NULL
2551
 */
2552
2553
static xmlChar *
2554
0
htmlParseHTMLAttribute(htmlParserCtxtPtr ctxt, const xmlChar stop) {
2555
0
    xmlChar *buffer = NULL;
2556
0
    int buffer_size = 0;
2557
0
    xmlChar *out = NULL;
2558
0
    const xmlChar *name = NULL;
2559
0
    const xmlChar *cur = NULL;
2560
0
    const htmlEntityDesc * ent;
2561
2562
    /*
2563
     * allocate a translation buffer.
2564
     */
2565
0
    buffer_size = HTML_PARSER_BUFFER_SIZE;
2566
0
    buffer = (xmlChar *) xmlMallocAtomic(buffer_size * sizeof(xmlChar));
2567
0
    if (buffer == NULL) {
2568
0
  htmlErrMemory(ctxt, "buffer allocation failed\n");
2569
0
  return(NULL);
2570
0
    }
2571
0
    out = buffer;
2572
2573
    /*
2574
     * Ok loop until we reach one of the ending chars
2575
     */
2576
0
    while ((CUR != 0) && (CUR != stop)) {
2577
0
  if ((stop == 0) && (CUR == '>')) break;
2578
0
  if ((stop == 0) && (IS_BLANK_CH(CUR))) break;
2579
0
        if (CUR == '&') {
2580
0
      if (NXT(1) == '#') {
2581
0
    unsigned int c;
2582
0
    int bits;
2583
2584
0
    c = htmlParseCharRef(ctxt);
2585
0
    if      (c <    0x80)
2586
0
            { *out++  = c;                bits= -6; }
2587
0
    else if (c <   0x800)
2588
0
            { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2589
0
    else if (c < 0x10000)
2590
0
            { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2591
0
    else
2592
0
            { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2593
2594
0
    for ( ; bits >= 0; bits-= 6) {
2595
0
        *out++  = ((c >> bits) & 0x3F) | 0x80;
2596
0
    }
2597
2598
0
    if (out - buffer > buffer_size - 100) {
2599
0
      int indx = out - buffer;
2600
2601
0
      growBuffer(buffer);
2602
0
      out = &buffer[indx];
2603
0
    }
2604
0
      } else {
2605
0
    ent = htmlParseEntityRef(ctxt, &name);
2606
0
    if (name == NULL) {
2607
0
        *out++ = '&';
2608
0
        if (out - buffer > buffer_size - 100) {
2609
0
      int indx = out - buffer;
2610
2611
0
      growBuffer(buffer);
2612
0
      out = &buffer[indx];
2613
0
        }
2614
0
    } else if (ent == NULL) {
2615
0
        *out++ = '&';
2616
0
        cur = name;
2617
0
        while (*cur != 0) {
2618
0
      if (out - buffer > buffer_size - 100) {
2619
0
          int indx = out - buffer;
2620
2621
0
          growBuffer(buffer);
2622
0
          out = &buffer[indx];
2623
0
      }
2624
0
      *out++ = *cur++;
2625
0
        }
2626
0
    } else {
2627
0
        unsigned int c;
2628
0
        int bits;
2629
2630
0
        if (out - buffer > buffer_size - 100) {
2631
0
      int indx = out - buffer;
2632
2633
0
      growBuffer(buffer);
2634
0
      out = &buffer[indx];
2635
0
        }
2636
0
        c = ent->value;
2637
0
        if      (c <    0x80)
2638
0
      { *out++  = c;                bits= -6; }
2639
0
        else if (c <   0x800)
2640
0
      { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2641
0
        else if (c < 0x10000)
2642
0
      { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2643
0
        else
2644
0
      { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2645
2646
0
        for ( ; bits >= 0; bits-= 6) {
2647
0
      *out++  = ((c >> bits) & 0x3F) | 0x80;
2648
0
        }
2649
0
    }
2650
0
      }
2651
0
  } else {
2652
0
      unsigned int c;
2653
0
      int bits, l;
2654
2655
0
      if (out - buffer > buffer_size - 100) {
2656
0
    int indx = out - buffer;
2657
2658
0
    growBuffer(buffer);
2659
0
    out = &buffer[indx];
2660
0
      }
2661
0
      c = CUR_CHAR(l);
2662
0
      if      (c <    0x80)
2663
0
        { *out++  = c;                bits= -6; }
2664
0
      else if (c <   0x800)
2665
0
        { *out++  =((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
2666
0
      else if (c < 0x10000)
2667
0
        { *out++  =((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
2668
0
      else
2669
0
        { *out++  =((c >> 18) & 0x07) | 0xF0;  bits= 12; }
2670
2671
0
      for ( ; bits >= 0; bits-= 6) {
2672
0
    *out++  = ((c >> bits) & 0x3F) | 0x80;
2673
0
      }
2674
0
      NEXT;
2675
0
  }
2676
0
    }
2677
0
    *out = 0;
2678
0
    return(buffer);
2679
0
}
2680
2681
/**
2682
 * htmlParseEntityRef:
2683
 * @ctxt:  an HTML parser context
2684
 * @str:  location to store the entity name
2685
 *
2686
 * parse an HTML ENTITY references
2687
 *
2688
 * [68] EntityRef ::= '&' Name ';'
2689
 *
2690
 * Returns the associated htmlEntityDescPtr if found, or NULL otherwise,
2691
 *         if non-NULL *str will have to be freed by the caller.
2692
 */
2693
const htmlEntityDesc *
2694
0
htmlParseEntityRef(htmlParserCtxtPtr ctxt, const xmlChar **str) {
2695
0
    const xmlChar *name;
2696
0
    const htmlEntityDesc * ent = NULL;
2697
2698
0
    if (str != NULL) *str = NULL;
2699
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) return(NULL);
2700
2701
0
    if (CUR == '&') {
2702
0
        NEXT;
2703
0
        name = htmlParseName(ctxt);
2704
0
  if (name == NULL) {
2705
0
      htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
2706
0
                   "htmlParseEntityRef: no name\n", NULL, NULL);
2707
0
  } else {
2708
0
      GROW;
2709
0
      if (CUR == ';') {
2710
0
          if (str != NULL)
2711
0
        *str = name;
2712
2713
    /*
2714
     * Lookup the entity in the table.
2715
     */
2716
0
    ent = htmlEntityLookup(name);
2717
0
    if (ent != NULL) /* OK that's ugly !!! */
2718
0
        NEXT;
2719
0
      } else {
2720
0
    htmlParseErr(ctxt, XML_ERR_ENTITYREF_SEMICOL_MISSING,
2721
0
                 "htmlParseEntityRef: expecting ';'\n",
2722
0
           NULL, NULL);
2723
0
          if (str != NULL)
2724
0
        *str = name;
2725
0
      }
2726
0
  }
2727
0
    }
2728
0
    return(ent);
2729
0
}
2730
2731
/**
2732
 * htmlParseAttValue:
2733
 * @ctxt:  an HTML parser context
2734
 *
2735
 * parse a value for an attribute
2736
 * Note: the parser won't do substitution of entities here, this
2737
 * will be handled later in xmlStringGetNodeList, unless it was
2738
 * asked for ctxt->replaceEntities != 0
2739
 *
2740
 * Returns the AttValue parsed or NULL.
2741
 */
2742
2743
static xmlChar *
2744
0
htmlParseAttValue(htmlParserCtxtPtr ctxt) {
2745
0
    xmlChar *ret = NULL;
2746
2747
0
    if (CUR == '"') {
2748
0
        NEXT;
2749
0
  ret = htmlParseHTMLAttribute(ctxt, '"');
2750
0
        if (CUR != '"') {
2751
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2752
0
                   "AttValue: \" expected\n", NULL, NULL);
2753
0
  } else
2754
0
      NEXT;
2755
0
    } else if (CUR == '\'') {
2756
0
        NEXT;
2757
0
  ret = htmlParseHTMLAttribute(ctxt, '\'');
2758
0
        if (CUR != '\'') {
2759
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_NOT_FINISHED,
2760
0
                   "AttValue: ' expected\n", NULL, NULL);
2761
0
  } else
2762
0
      NEXT;
2763
0
    } else {
2764
        /*
2765
   * That's an HTMLism, the attribute value may not be quoted
2766
   */
2767
0
  ret = htmlParseHTMLAttribute(ctxt, 0);
2768
0
  if (ret == NULL) {
2769
0
      htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_WITHOUT_VALUE,
2770
0
                   "AttValue: no value found\n", NULL, NULL);
2771
0
  }
2772
0
    }
2773
0
    return(ret);
2774
0
}
2775
2776
/**
2777
 * htmlParseSystemLiteral:
2778
 * @ctxt:  an HTML parser context
2779
 *
2780
 * parse an HTML Literal
2781
 *
2782
 * [11] SystemLiteral ::= ('"' [^"]* '"') | ("'" [^']* "'")
2783
 *
2784
 * Returns the SystemLiteral parsed or NULL
2785
 */
2786
2787
static xmlChar *
2788
0
htmlParseSystemLiteral(htmlParserCtxtPtr ctxt) {
2789
0
    size_t len = 0, startPosition = 0;
2790
0
    xmlChar *ret = NULL;
2791
2792
0
    if (CUR == '"') {
2793
0
        NEXT;
2794
2795
0
        if (CUR_PTR < BASE_PTR)
2796
0
            return(ret);
2797
0
        startPosition = CUR_PTR - BASE_PTR;
2798
2799
0
  while ((IS_CHAR_CH(CUR)) && (CUR != '"')) {
2800
0
      NEXT;
2801
0
      len++;
2802
0
  }
2803
0
  if (!IS_CHAR_CH(CUR)) {
2804
0
      htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2805
0
       "Unfinished SystemLiteral\n", NULL, NULL);
2806
0
  } else {
2807
0
      ret = xmlStrndup((BASE_PTR+startPosition), len);
2808
0
      NEXT;
2809
0
        }
2810
0
    } else if (CUR == '\'') {
2811
0
        NEXT;
2812
2813
0
        if (CUR_PTR < BASE_PTR)
2814
0
            return(ret);
2815
0
        startPosition = CUR_PTR - BASE_PTR;
2816
2817
0
  while ((IS_CHAR_CH(CUR)) && (CUR != '\'')) {
2818
0
      NEXT;
2819
0
      len++;
2820
0
  }
2821
0
  if (!IS_CHAR_CH(CUR)) {
2822
0
      htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2823
0
       "Unfinished SystemLiteral\n", NULL, NULL);
2824
0
  } else {
2825
0
      ret = xmlStrndup((BASE_PTR+startPosition), len);
2826
0
      NEXT;
2827
0
        }
2828
0
    } else {
2829
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2830
0
               " or ' expected\n", NULL, NULL);
2831
0
    }
2832
2833
0
    return(ret);
2834
0
}
2835
2836
/**
2837
 * htmlParsePubidLiteral:
2838
 * @ctxt:  an HTML parser context
2839
 *
2840
 * parse an HTML public literal
2841
 *
2842
 * [12] PubidLiteral ::= '"' PubidChar* '"' | "'" (PubidChar - "'")* "'"
2843
 *
2844
 * Returns the PubidLiteral parsed or NULL.
2845
 */
2846
2847
static xmlChar *
2848
0
htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
2849
0
    size_t len = 0, startPosition = 0;
2850
0
    xmlChar *ret = NULL;
2851
    /*
2852
     * Name ::= (Letter | '_') (NameChar)*
2853
     */
2854
0
    if (CUR == '"') {
2855
0
        NEXT;
2856
2857
0
        if (CUR_PTR < BASE_PTR)
2858
0
            return(ret);
2859
0
        startPosition = CUR_PTR - BASE_PTR;
2860
2861
0
        while (IS_PUBIDCHAR_CH(CUR)) {
2862
0
            len++;
2863
0
            NEXT;
2864
0
        }
2865
2866
0
  if (CUR != '"') {
2867
0
      htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2868
0
                   "Unfinished PubidLiteral\n", NULL, NULL);
2869
0
  } else {
2870
0
      ret = xmlStrndup((BASE_PTR + startPosition), len);
2871
0
      NEXT;
2872
0
  }
2873
0
    } else if (CUR == '\'') {
2874
0
        NEXT;
2875
2876
0
        if (CUR_PTR < BASE_PTR)
2877
0
            return(ret);
2878
0
        startPosition = CUR_PTR - BASE_PTR;
2879
2880
0
        while ((IS_PUBIDCHAR_CH(CUR)) && (CUR != '\'')){
2881
0
            len++;
2882
0
            NEXT;
2883
0
        }
2884
2885
0
  if (CUR != '\'') {
2886
0
      htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_FINISHED,
2887
0
                   "Unfinished PubidLiteral\n", NULL, NULL);
2888
0
  } else {
2889
0
      ret = xmlStrndup((BASE_PTR + startPosition), len);
2890
0
      NEXT;
2891
0
  }
2892
0
    } else {
2893
0
  htmlParseErr(ctxt, XML_ERR_LITERAL_NOT_STARTED,
2894
0
               "PubidLiteral \" or ' expected\n", NULL, NULL);
2895
0
    }
2896
2897
0
    return(ret);
2898
0
}
2899
2900
/**
2901
 * htmlParseScript:
2902
 * @ctxt:  an HTML parser context
2903
 *
2904
 * parse the content of an HTML SCRIPT or STYLE element
2905
 * http://www.w3.org/TR/html4/sgml/dtd.html#Script
2906
 * http://www.w3.org/TR/html4/sgml/dtd.html#StyleSheet
2907
 * http://www.w3.org/TR/html4/types.html#type-script
2908
 * http://www.w3.org/TR/html4/types.html#h-6.15
2909
 * http://www.w3.org/TR/html4/appendix/notes.html#h-B.3.2.1
2910
 *
2911
 * Script data ( %Script; in the DTD) can be the content of the SCRIPT
2912
 * element and the value of intrinsic event attributes. User agents must
2913
 * not evaluate script data as HTML markup but instead must pass it on as
2914
 * data to a script engine.
2915
 * NOTES:
2916
 * - The content is passed like CDATA
2917
 * - the attributes for style and scripting "onXXX" are also described
2918
 *   as CDATA but SGML allows entities references in attributes so their
2919
 *   processing is identical as other attributes
2920
 */
2921
static void
2922
0
htmlParseScript(htmlParserCtxtPtr ctxt) {
2923
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
2924
0
    int nbchar = 0;
2925
0
    int cur,l;
2926
2927
0
    SHRINK;
2928
0
    cur = CUR_CHAR(l);
2929
0
    while (IS_CHAR_CH(cur)) {
2930
0
  if ((cur == '<') && (NXT(1) == '/')) {
2931
            /*
2932
             * One should break here, the specification is clear:
2933
             * Authors should therefore escape "</" within the content.
2934
             * Escape mechanisms are specific to each scripting or
2935
             * style sheet language.
2936
             *
2937
             * In recovery mode, only break if end tag match the
2938
             * current tag, effectively ignoring all tags inside the
2939
             * script/style block and treating the entire block as
2940
             * CDATA.
2941
             */
2942
0
            if (ctxt->recovery) {
2943
0
                if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
2944
0
           xmlStrlen(ctxt->name)) == 0)
2945
0
                {
2946
0
                    break; /* while */
2947
0
                } else {
2948
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
2949
0
         "Element %s embeds close tag\n",
2950
0
                     ctxt->name, NULL);
2951
0
    }
2952
0
            } else {
2953
0
                if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
2954
0
                    ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
2955
0
                {
2956
0
                    break; /* while */
2957
0
                }
2958
0
            }
2959
0
  }
2960
0
  COPY_BUF(l,buf,nbchar,cur);
2961
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
2962
0
      if (ctxt->sax->cdataBlock!= NULL) {
2963
    /*
2964
     * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2965
     */
2966
0
    ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2967
0
      } else if (ctxt->sax->characters != NULL) {
2968
0
    ctxt->sax->characters(ctxt->userData, buf, nbchar);
2969
0
      }
2970
0
      nbchar = 0;
2971
0
  }
2972
0
  GROW;
2973
0
  NEXTL(l);
2974
0
  cur = CUR_CHAR(l);
2975
0
    }
2976
2977
0
    if ((!(IS_CHAR_CH(cur))) && (!((cur == 0) && (ctxt->progressive)))) {
2978
0
        htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
2979
0
                    "Invalid char in CDATA 0x%X\n", cur);
2980
0
        if (ctxt->input->cur < ctxt->input->end) {
2981
0
            NEXT;
2982
0
        }
2983
0
    }
2984
2985
0
    if ((nbchar != 0) && (ctxt->sax != NULL) && (!ctxt->disableSAX)) {
2986
0
  if (ctxt->sax->cdataBlock!= NULL) {
2987
      /*
2988
       * Insert as CDATA, which is the same as HTML_PRESERVE_NODE
2989
       */
2990
0
      ctxt->sax->cdataBlock(ctxt->userData, buf, nbchar);
2991
0
  } else if (ctxt->sax->characters != NULL) {
2992
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
2993
0
  }
2994
0
    }
2995
0
}
2996
2997
2998
/**
2999
 * htmlParseCharDataInternal:
3000
 * @ctxt:  an HTML parser context
3001
 * @readahead: optional read ahead character in ascii range
3002
 *
3003
 * parse a CharData section.
3004
 * if we are within a CDATA section ']]>' marks an end of section.
3005
 *
3006
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3007
 */
3008
3009
static void
3010
0
htmlParseCharDataInternal(htmlParserCtxtPtr ctxt, int readahead) {
3011
0
    xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 6];
3012
0
    int nbchar = 0;
3013
0
    int cur, l;
3014
0
    int chunk = 0;
3015
3016
0
    if (readahead)
3017
0
        buf[nbchar++] = readahead;
3018
3019
0
    SHRINK;
3020
0
    cur = CUR_CHAR(l);
3021
0
    while (((cur != '<') || (ctxt->token == '<')) &&
3022
0
           ((cur != '&') || (ctxt->token == '&')) &&
3023
0
     (cur != 0)) {
3024
0
  if (!(IS_CHAR(cur))) {
3025
0
      htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3026
0
                  "Invalid char in CDATA 0x%X\n", cur);
3027
0
  } else {
3028
0
      COPY_BUF(l,buf,nbchar,cur);
3029
0
  }
3030
0
  if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
3031
      /*
3032
       * Ok the segment is to be consumed as chars.
3033
       */
3034
0
      if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3035
0
    if (areBlanks(ctxt, buf, nbchar)) {
3036
0
        if (ctxt->keepBlanks) {
3037
0
      if (ctxt->sax->characters != NULL)
3038
0
          ctxt->sax->characters(ctxt->userData, buf, nbchar);
3039
0
        } else {
3040
0
      if (ctxt->sax->ignorableWhitespace != NULL)
3041
0
          ctxt->sax->ignorableWhitespace(ctxt->userData,
3042
0
                                         buf, nbchar);
3043
0
        }
3044
0
    } else {
3045
0
        htmlCheckParagraph(ctxt);
3046
0
        if (ctxt->sax->characters != NULL)
3047
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3048
0
    }
3049
0
      }
3050
0
      nbchar = 0;
3051
0
  }
3052
0
  NEXTL(l);
3053
0
        chunk++;
3054
0
        if (chunk > HTML_PARSER_BUFFER_SIZE) {
3055
0
            chunk = 0;
3056
0
            SHRINK;
3057
0
            GROW;
3058
0
        }
3059
0
  cur = CUR_CHAR(l);
3060
0
  if (cur == 0) {
3061
0
      SHRINK;
3062
0
      GROW;
3063
0
      cur = CUR_CHAR(l);
3064
0
  }
3065
0
    }
3066
0
    if (nbchar != 0) {
3067
0
        buf[nbchar] = 0;
3068
3069
  /*
3070
   * Ok the segment is to be consumed as chars.
3071
   */
3072
0
  if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) {
3073
0
      if (areBlanks(ctxt, buf, nbchar)) {
3074
0
    if (ctxt->keepBlanks) {
3075
0
        if (ctxt->sax->characters != NULL)
3076
0
      ctxt->sax->characters(ctxt->userData, buf, nbchar);
3077
0
    } else {
3078
0
        if (ctxt->sax->ignorableWhitespace != NULL)
3079
0
      ctxt->sax->ignorableWhitespace(ctxt->userData,
3080
0
                                     buf, nbchar);
3081
0
    }
3082
0
      } else {
3083
0
    htmlCheckParagraph(ctxt);
3084
0
    if (ctxt->sax->characters != NULL)
3085
0
        ctxt->sax->characters(ctxt->userData, buf, nbchar);
3086
0
      }
3087
0
  }
3088
0
    } else {
3089
  /*
3090
   * Loop detection
3091
   */
3092
0
  if (cur == 0)
3093
0
      ctxt->instate = XML_PARSER_EOF;
3094
0
    }
3095
0
}
3096
3097
/**
3098
 * htmlParseCharData:
3099
 * @ctxt:  an HTML parser context
3100
 *
3101
 * parse a CharData section.
3102
 * if we are within a CDATA section ']]>' marks an end of section.
3103
 *
3104
 * [14] CharData ::= [^<&]* - ([^<&]* ']]>' [^<&]*)
3105
 */
3106
3107
static void
3108
0
htmlParseCharData(htmlParserCtxtPtr ctxt) {
3109
0
    htmlParseCharDataInternal(ctxt, 0);
3110
0
}
3111
3112
/**
3113
 * htmlParseExternalID:
3114
 * @ctxt:  an HTML parser context
3115
 * @publicID:  a xmlChar** receiving PubidLiteral
3116
 *
3117
 * Parse an External ID or a Public ID
3118
 *
3119
 * [75] ExternalID ::= 'SYSTEM' S SystemLiteral
3120
 *                   | 'PUBLIC' S PubidLiteral S SystemLiteral
3121
 *
3122
 * [83] PublicID ::= 'PUBLIC' S PubidLiteral
3123
 *
3124
 * Returns the function returns SystemLiteral and in the second
3125
 *                case publicID receives PubidLiteral, is strict is off
3126
 *                it is possible to return NULL and have publicID set.
3127
 */
3128
3129
static xmlChar *
3130
0
htmlParseExternalID(htmlParserCtxtPtr ctxt, xmlChar **publicID) {
3131
0
    xmlChar *URI = NULL;
3132
3133
0
    if ((UPPER == 'S') && (UPP(1) == 'Y') &&
3134
0
         (UPP(2) == 'S') && (UPP(3) == 'T') &&
3135
0
   (UPP(4) == 'E') && (UPP(5) == 'M')) {
3136
0
        SKIP(6);
3137
0
  if (!IS_BLANK_CH(CUR)) {
3138
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3139
0
                   "Space required after 'SYSTEM'\n", NULL, NULL);
3140
0
  }
3141
0
        SKIP_BLANKS;
3142
0
  URI = htmlParseSystemLiteral(ctxt);
3143
0
  if (URI == NULL) {
3144
0
      htmlParseErr(ctxt, XML_ERR_URI_REQUIRED,
3145
0
                   "htmlParseExternalID: SYSTEM, no URI\n", NULL, NULL);
3146
0
        }
3147
0
    } else if ((UPPER == 'P') && (UPP(1) == 'U') &&
3148
0
         (UPP(2) == 'B') && (UPP(3) == 'L') &&
3149
0
         (UPP(4) == 'I') && (UPP(5) == 'C')) {
3150
0
        SKIP(6);
3151
0
  if (!IS_BLANK_CH(CUR)) {
3152
0
      htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3153
0
                   "Space required after 'PUBLIC'\n", NULL, NULL);
3154
0
  }
3155
0
        SKIP_BLANKS;
3156
0
  *publicID = htmlParsePubidLiteral(ctxt);
3157
0
  if (*publicID == NULL) {
3158
0
      htmlParseErr(ctxt, XML_ERR_PUBID_REQUIRED,
3159
0
                   "htmlParseExternalID: PUBLIC, no Public Identifier\n",
3160
0
       NULL, NULL);
3161
0
  }
3162
0
        SKIP_BLANKS;
3163
0
        if ((CUR == '"') || (CUR == '\'')) {
3164
0
      URI = htmlParseSystemLiteral(ctxt);
3165
0
  }
3166
0
    }
3167
0
    return(URI);
3168
0
}
3169
3170
/**
3171
 * xmlParsePI:
3172
 * @ctxt:  an XML parser context
3173
 *
3174
 * parse an XML Processing Instruction.
3175
 *
3176
 * [16] PI ::= '<?' PITarget (S (Char* - (Char* '?>' Char*)))? '?>'
3177
 */
3178
static void
3179
0
htmlParsePI(htmlParserCtxtPtr ctxt) {
3180
0
    xmlChar *buf = NULL;
3181
0
    int len = 0;
3182
0
    int size = HTML_PARSER_BUFFER_SIZE;
3183
0
    int cur, l;
3184
0
    const xmlChar *target;
3185
0
    xmlParserInputState state;
3186
0
    int count = 0;
3187
3188
0
    if ((RAW == '<') && (NXT(1) == '?')) {
3189
0
  state = ctxt->instate;
3190
0
        ctxt->instate = XML_PARSER_PI;
3191
  /*
3192
   * this is a Processing Instruction.
3193
   */
3194
0
  SKIP(2);
3195
0
  SHRINK;
3196
3197
  /*
3198
   * Parse the target name and check for special support like
3199
   * namespace.
3200
   */
3201
0
        target = htmlParseName(ctxt);
3202
0
  if (target != NULL) {
3203
0
      if (RAW == '>') {
3204
0
    SKIP(1);
3205
3206
    /*
3207
     * SAX: PI detected.
3208
     */
3209
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3210
0
        (ctxt->sax->processingInstruction != NULL))
3211
0
        ctxt->sax->processingInstruction(ctxt->userData,
3212
0
                                         target, NULL);
3213
0
    ctxt->instate = state;
3214
0
    return;
3215
0
      }
3216
0
      buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3217
0
      if (buf == NULL) {
3218
0
    htmlErrMemory(ctxt, NULL);
3219
0
    ctxt->instate = state;
3220
0
    return;
3221
0
      }
3222
0
      cur = CUR;
3223
0
      if (!IS_BLANK(cur)) {
3224
0
    htmlParseErr(ctxt, XML_ERR_SPACE_REQUIRED,
3225
0
        "ParsePI: PI %s space expected\n", target, NULL);
3226
0
      }
3227
0
            SKIP_BLANKS;
3228
0
      cur = CUR_CHAR(l);
3229
0
      while (IS_CHAR(cur) && (cur != '>')) {
3230
0
    if (len + 5 >= size) {
3231
0
        xmlChar *tmp;
3232
3233
0
        size *= 2;
3234
0
        tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3235
0
        if (tmp == NULL) {
3236
0
      htmlErrMemory(ctxt, NULL);
3237
0
      xmlFree(buf);
3238
0
      ctxt->instate = state;
3239
0
      return;
3240
0
        }
3241
0
        buf = tmp;
3242
0
    }
3243
0
    count++;
3244
0
    if (count > 50) {
3245
0
        GROW;
3246
0
        count = 0;
3247
0
    }
3248
0
    COPY_BUF(l,buf,len,cur);
3249
0
    NEXTL(l);
3250
0
    cur = CUR_CHAR(l);
3251
0
    if (cur == 0) {
3252
0
        SHRINK;
3253
0
        GROW;
3254
0
        cur = CUR_CHAR(l);
3255
0
    }
3256
0
      }
3257
0
      buf[len] = 0;
3258
0
      if (cur != '>') {
3259
0
    htmlParseErr(ctxt, XML_ERR_PI_NOT_FINISHED,
3260
0
          "ParsePI: PI %s never end ...\n", target, NULL);
3261
0
      } else {
3262
0
    SKIP(1);
3263
3264
    /*
3265
     * SAX: PI detected.
3266
     */
3267
0
    if ((ctxt->sax) && (!ctxt->disableSAX) &&
3268
0
        (ctxt->sax->processingInstruction != NULL))
3269
0
        ctxt->sax->processingInstruction(ctxt->userData,
3270
0
                                         target, buf);
3271
0
      }
3272
0
      xmlFree(buf);
3273
0
  } else {
3274
0
      htmlParseErr(ctxt, XML_ERR_PI_NOT_STARTED,
3275
0
                         "PI is not started correctly", NULL, NULL);
3276
0
  }
3277
0
  ctxt->instate = state;
3278
0
    }
3279
0
}
3280
3281
/**
3282
 * htmlParseComment:
3283
 * @ctxt:  an HTML parser context
3284
 *
3285
 * Parse an XML (SGML) comment <!-- .... -->
3286
 *
3287
 * [15] Comment ::= '<!--' ((Char - '-') | ('-' (Char - '-')))* '-->'
3288
 */
3289
static void
3290
0
htmlParseComment(htmlParserCtxtPtr ctxt) {
3291
0
    xmlChar *buf = NULL;
3292
0
    int len;
3293
0
    int size = HTML_PARSER_BUFFER_SIZE;
3294
0
    int q, ql;
3295
0
    int r, rl;
3296
0
    int cur, l;
3297
0
    xmlParserInputState state;
3298
3299
    /*
3300
     * Check that there is a comment right here.
3301
     */
3302
0
    if ((RAW != '<') || (NXT(1) != '!') ||
3303
0
        (NXT(2) != '-') || (NXT(3) != '-')) return;
3304
3305
0
    state = ctxt->instate;
3306
0
    ctxt->instate = XML_PARSER_COMMENT;
3307
0
    SHRINK;
3308
0
    SKIP(4);
3309
0
    buf = (xmlChar *) xmlMallocAtomic(size * sizeof(xmlChar));
3310
0
    if (buf == NULL) {
3311
0
        htmlErrMemory(ctxt, "buffer allocation failed\n");
3312
0
  ctxt->instate = state;
3313
0
  return;
3314
0
    }
3315
0
    len = 0;
3316
0
    buf[len] = 0;
3317
0
    q = CUR_CHAR(ql);
3318
0
    if (!IS_CHAR(q))
3319
0
        goto unfinished;
3320
0
    NEXTL(ql);
3321
0
    r = CUR_CHAR(rl);
3322
0
    if (!IS_CHAR(r))
3323
0
        goto unfinished;
3324
0
    NEXTL(rl);
3325
0
    cur = CUR_CHAR(l);
3326
0
    while (IS_CHAR(cur) &&
3327
0
           ((cur != '>') ||
3328
0
      (r != '-') || (q != '-'))) {
3329
0
  if (len + 5 >= size) {
3330
0
      xmlChar *tmp;
3331
3332
0
      size *= 2;
3333
0
      tmp = (xmlChar *) xmlRealloc(buf, size * sizeof(xmlChar));
3334
0
      if (tmp == NULL) {
3335
0
          xmlFree(buf);
3336
0
          htmlErrMemory(ctxt, "growing buffer failed\n");
3337
0
    ctxt->instate = state;
3338
0
    return;
3339
0
      }
3340
0
      buf = tmp;
3341
0
  }
3342
0
  COPY_BUF(ql,buf,len,q);
3343
0
  q = r;
3344
0
  ql = rl;
3345
0
  r = cur;
3346
0
  rl = l;
3347
0
  NEXTL(l);
3348
0
  cur = CUR_CHAR(l);
3349
0
  if (cur == 0) {
3350
0
      SHRINK;
3351
0
      GROW;
3352
0
      cur = CUR_CHAR(l);
3353
0
  }
3354
0
    }
3355
0
    buf[len] = 0;
3356
0
    if (IS_CHAR(cur)) {
3357
0
        NEXT;
3358
0
  if ((ctxt->sax != NULL) && (ctxt->sax->comment != NULL) &&
3359
0
      (!ctxt->disableSAX))
3360
0
      ctxt->sax->comment(ctxt->userData, buf);
3361
0
  xmlFree(buf);
3362
0
  ctxt->instate = state;
3363
0
  return;
3364
0
    }
3365
3366
0
unfinished:
3367
0
    htmlParseErr(ctxt, XML_ERR_COMMENT_NOT_FINISHED,
3368
0
     "Comment not terminated \n<!--%.50s\n", buf, NULL);
3369
0
    xmlFree(buf);
3370
0
}
3371
3372
/**
3373
 * htmlParseCharRef:
3374
 * @ctxt:  an HTML parser context
3375
 *
3376
 * parse Reference declarations
3377
 *
3378
 * [66] CharRef ::= '&#' [0-9]+ ';' |
3379
 *                  '&#x' [0-9a-fA-F]+ ';'
3380
 *
3381
 * Returns the value parsed (as an int)
3382
 */
3383
int
3384
0
htmlParseCharRef(htmlParserCtxtPtr ctxt) {
3385
0
    int val = 0;
3386
3387
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3388
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3389
0
         "htmlParseCharRef: context error\n",
3390
0
         NULL, NULL);
3391
0
        return(0);
3392
0
    }
3393
0
    if ((CUR == '&') && (NXT(1) == '#') &&
3394
0
        ((NXT(2) == 'x') || NXT(2) == 'X')) {
3395
0
  SKIP(3);
3396
0
  while (CUR != ';') {
3397
0
      if ((CUR >= '0') && (CUR <= '9'))
3398
0
          val = val * 16 + (CUR - '0');
3399
0
      else if ((CUR >= 'a') && (CUR <= 'f'))
3400
0
          val = val * 16 + (CUR - 'a') + 10;
3401
0
      else if ((CUR >= 'A') && (CUR <= 'F'))
3402
0
          val = val * 16 + (CUR - 'A') + 10;
3403
0
      else {
3404
0
          htmlParseErr(ctxt, XML_ERR_INVALID_HEX_CHARREF,
3405
0
                 "htmlParseCharRef: missing semicolon\n",
3406
0
           NULL, NULL);
3407
0
    break;
3408
0
      }
3409
0
      NEXT;
3410
0
  }
3411
0
  if (CUR == ';')
3412
0
      NEXT;
3413
0
    } else if  ((CUR == '&') && (NXT(1) == '#')) {
3414
0
  SKIP(2);
3415
0
  while (CUR != ';') {
3416
0
      if ((CUR >= '0') && (CUR <= '9'))
3417
0
          val = val * 10 + (CUR - '0');
3418
0
      else {
3419
0
          htmlParseErr(ctxt, XML_ERR_INVALID_DEC_CHARREF,
3420
0
                 "htmlParseCharRef: missing semicolon\n",
3421
0
           NULL, NULL);
3422
0
    break;
3423
0
      }
3424
0
      NEXT;
3425
0
  }
3426
0
  if (CUR == ';')
3427
0
      NEXT;
3428
0
    } else {
3429
0
  htmlParseErr(ctxt, XML_ERR_INVALID_CHARREF,
3430
0
               "htmlParseCharRef: invalid value\n", NULL, NULL);
3431
0
    }
3432
    /*
3433
     * Check the value IS_CHAR ...
3434
     */
3435
0
    if (IS_CHAR(val)) {
3436
0
        return(val);
3437
0
    } else {
3438
0
  htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
3439
0
      "htmlParseCharRef: invalid xmlChar value %d\n",
3440
0
      val);
3441
0
    }
3442
0
    return(0);
3443
0
}
3444
3445
3446
/**
3447
 * htmlParseDocTypeDecl:
3448
 * @ctxt:  an HTML parser context
3449
 *
3450
 * parse a DOCTYPE declaration
3451
 *
3452
 * [28] doctypedecl ::= '<!DOCTYPE' S Name (S ExternalID)? S?
3453
 *                      ('[' (markupdecl | PEReference | S)* ']' S?)? '>'
3454
 */
3455
3456
static void
3457
0
htmlParseDocTypeDecl(htmlParserCtxtPtr ctxt) {
3458
0
    const xmlChar *name;
3459
0
    xmlChar *ExternalID = NULL;
3460
0
    xmlChar *URI = NULL;
3461
3462
    /*
3463
     * We know that '<!DOCTYPE' has been detected.
3464
     */
3465
0
    SKIP(9);
3466
3467
0
    SKIP_BLANKS;
3468
3469
    /*
3470
     * Parse the DOCTYPE name.
3471
     */
3472
0
    name = htmlParseName(ctxt);
3473
0
    if (name == NULL) {
3474
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3475
0
               "htmlParseDocTypeDecl : no DOCTYPE name !\n",
3476
0
         NULL, NULL);
3477
0
    }
3478
    /*
3479
     * Check that upper(name) == "HTML" !!!!!!!!!!!!!
3480
     */
3481
3482
0
    SKIP_BLANKS;
3483
3484
    /*
3485
     * Check for SystemID and ExternalID
3486
     */
3487
0
    URI = htmlParseExternalID(ctxt, &ExternalID);
3488
0
    SKIP_BLANKS;
3489
3490
    /*
3491
     * We should be at the end of the DOCTYPE declaration.
3492
     */
3493
0
    if (CUR != '>') {
3494
0
  htmlParseErr(ctxt, XML_ERR_DOCTYPE_NOT_FINISHED,
3495
0
               "DOCTYPE improperly terminated\n", NULL, NULL);
3496
        /* We shouldn't try to resynchronize ... */
3497
0
    }
3498
0
    NEXT;
3499
3500
    /*
3501
     * Create or update the document accordingly to the DOCTYPE
3502
     */
3503
0
    if ((ctxt->sax != NULL) && (ctxt->sax->internalSubset != NULL) &&
3504
0
  (!ctxt->disableSAX))
3505
0
  ctxt->sax->internalSubset(ctxt->userData, name, ExternalID, URI);
3506
3507
    /*
3508
     * Cleanup, since we don't use all those identifiers
3509
     */
3510
0
    if (URI != NULL) xmlFree(URI);
3511
0
    if (ExternalID != NULL) xmlFree(ExternalID);
3512
0
}
3513
3514
/**
3515
 * htmlParseAttribute:
3516
 * @ctxt:  an HTML parser context
3517
 * @value:  a xmlChar ** used to store the value of the attribute
3518
 *
3519
 * parse an attribute
3520
 *
3521
 * [41] Attribute ::= Name Eq AttValue
3522
 *
3523
 * [25] Eq ::= S? '=' S?
3524
 *
3525
 * With namespace:
3526
 *
3527
 * [NS 11] Attribute ::= QName Eq AttValue
3528
 *
3529
 * Also the case QName == xmlns:??? is handled independently as a namespace
3530
 * definition.
3531
 *
3532
 * Returns the attribute name, and the value in *value.
3533
 */
3534
3535
static const xmlChar *
3536
0
htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
3537
0
    const xmlChar *name;
3538
0
    xmlChar *val = NULL;
3539
3540
0
    *value = NULL;
3541
0
    name = htmlParseHTMLName(ctxt);
3542
0
    if (name == NULL) {
3543
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3544
0
               "error parsing attribute name\n", NULL, NULL);
3545
0
        return(NULL);
3546
0
    }
3547
3548
    /*
3549
     * read the value
3550
     */
3551
0
    SKIP_BLANKS;
3552
0
    if (CUR == '=') {
3553
0
        NEXT;
3554
0
  SKIP_BLANKS;
3555
0
  val = htmlParseAttValue(ctxt);
3556
0
    }
3557
3558
0
    *value = val;
3559
0
    return(name);
3560
0
}
3561
3562
/**
3563
 * htmlCheckEncodingDirect:
3564
 * @ctxt:  an HTML parser context
3565
 * @attvalue: the attribute value
3566
 *
3567
 * Checks an attribute value to detect
3568
 * the encoding
3569
 * If a new encoding is detected the parser is switched to decode
3570
 * it and pass UTF8
3571
 */
3572
static void
3573
0
htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
3574
3575
0
    if ((ctxt == NULL) || (encoding == NULL) ||
3576
0
        (ctxt->options & HTML_PARSE_IGNORE_ENC))
3577
0
  return;
3578
3579
    /* do not change encoding */
3580
0
    if (ctxt->input->encoding != NULL)
3581
0
        return;
3582
3583
0
    if (encoding != NULL) {
3584
0
  xmlCharEncoding enc;
3585
0
  xmlCharEncodingHandlerPtr handler;
3586
3587
0
  while ((*encoding == ' ') || (*encoding == '\t')) encoding++;
3588
3589
0
  if (ctxt->input->encoding != NULL)
3590
0
      xmlFree((xmlChar *) ctxt->input->encoding);
3591
0
  ctxt->input->encoding = xmlStrdup(encoding);
3592
3593
0
  enc = xmlParseCharEncoding((const char *) encoding);
3594
  /*
3595
   * registered set of known encodings
3596
   */
3597
0
  if (enc != XML_CHAR_ENCODING_ERROR) {
3598
0
      if (((enc == XML_CHAR_ENCODING_UTF16LE) ||
3599
0
           (enc == XML_CHAR_ENCODING_UTF16BE) ||
3600
0
     (enc == XML_CHAR_ENCODING_UCS4LE) ||
3601
0
     (enc == XML_CHAR_ENCODING_UCS4BE)) &&
3602
0
    (ctxt->input->buf != NULL) &&
3603
0
    (ctxt->input->buf->encoder == NULL)) {
3604
0
    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3605
0
                 "htmlCheckEncoding: wrong encoding meta\n",
3606
0
           NULL, NULL);
3607
0
      } else {
3608
0
    xmlSwitchEncoding(ctxt, enc);
3609
0
      }
3610
0
      ctxt->charset = XML_CHAR_ENCODING_UTF8;
3611
0
  } else {
3612
      /*
3613
       * fallback for unknown encodings
3614
       */
3615
0
      handler = xmlFindCharEncodingHandler((const char *) encoding);
3616
0
      if (handler != NULL) {
3617
0
    xmlSwitchToEncoding(ctxt, handler);
3618
0
    ctxt->charset = XML_CHAR_ENCODING_UTF8;
3619
0
      } else {
3620
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
3621
0
                 "htmlCheckEncoding: unknown encoding %s\n",
3622
0
           encoding, NULL);
3623
0
      }
3624
0
  }
3625
3626
0
  if ((ctxt->input->buf != NULL) &&
3627
0
      (ctxt->input->buf->encoder != NULL) &&
3628
0
      (ctxt->input->buf->raw != NULL) &&
3629
0
      (ctxt->input->buf->buffer != NULL)) {
3630
0
      int nbchars;
3631
0
      int processed;
3632
3633
      /*
3634
       * convert as much as possible to the parser reading buffer.
3635
       */
3636
0
      processed = ctxt->input->cur - ctxt->input->base;
3637
0
      xmlBufShrink(ctxt->input->buf->buffer, processed);
3638
0
      nbchars = xmlCharEncInput(ctxt->input->buf, 1);
3639
0
      if (nbchars < 0) {
3640
0
    htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
3641
0
                 "htmlCheckEncoding: encoder error\n",
3642
0
           NULL, NULL);
3643
0
      }
3644
0
            xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input);
3645
0
  }
3646
0
    }
3647
0
}
3648
3649
/**
3650
 * htmlCheckEncoding:
3651
 * @ctxt:  an HTML parser context
3652
 * @attvalue: the attribute value
3653
 *
3654
 * Checks an http-equiv attribute from a Meta tag to detect
3655
 * the encoding
3656
 * If a new encoding is detected the parser is switched to decode
3657
 * it and pass UTF8
3658
 */
3659
static void
3660
0
htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
3661
0
    const xmlChar *encoding;
3662
3663
0
    if (!attvalue)
3664
0
  return;
3665
3666
0
    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
3667
0
    if (encoding != NULL) {
3668
0
  encoding += 7;
3669
0
    }
3670
    /*
3671
     * skip blank
3672
     */
3673
0
    if (encoding && IS_BLANK_CH(*encoding))
3674
0
  encoding = xmlStrcasestr(attvalue, BAD_CAST"=");
3675
0
    if (encoding && *encoding == '=') {
3676
0
  encoding ++;
3677
0
  htmlCheckEncodingDirect(ctxt, encoding);
3678
0
    }
3679
0
}
3680
3681
/**
3682
 * htmlCheckMeta:
3683
 * @ctxt:  an HTML parser context
3684
 * @atts:  the attributes values
3685
 *
3686
 * Checks an attributes from a Meta tag
3687
 */
3688
static void
3689
0
htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
3690
0
    int i;
3691
0
    const xmlChar *att, *value;
3692
0
    int http = 0;
3693
0
    const xmlChar *content = NULL;
3694
3695
0
    if ((ctxt == NULL) || (atts == NULL))
3696
0
  return;
3697
3698
0
    i = 0;
3699
0
    att = atts[i++];
3700
0
    while (att != NULL) {
3701
0
  value = atts[i++];
3702
0
  if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
3703
0
   && (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
3704
0
      http = 1;
3705
0
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
3706
0
      htmlCheckEncodingDirect(ctxt, value);
3707
0
  else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
3708
0
      content = value;
3709
0
  att = atts[i++];
3710
0
    }
3711
0
    if ((http) && (content != NULL))
3712
0
  htmlCheckEncoding(ctxt, content);
3713
3714
0
}
3715
3716
/**
3717
 * htmlParseStartTag:
3718
 * @ctxt:  an HTML parser context
3719
 *
3720
 * parse a start of tag either for rule element or
3721
 * EmptyElement. In both case we don't parse the tag closing chars.
3722
 *
3723
 * [40] STag ::= '<' Name (S Attribute)* S? '>'
3724
 *
3725
 * [44] EmptyElemTag ::= '<' Name (S Attribute)* S? '/>'
3726
 *
3727
 * With namespace:
3728
 *
3729
 * [NS 8] STag ::= '<' QName (S Attribute)* S? '>'
3730
 *
3731
 * [NS 10] EmptyElement ::= '<' QName (S Attribute)* S? '/>'
3732
 *
3733
 * Returns 0 in case of success, -1 in case of error and 1 if discarded
3734
 */
3735
3736
static int
3737
0
htmlParseStartTag(htmlParserCtxtPtr ctxt) {
3738
0
    const xmlChar *name;
3739
0
    const xmlChar *attname;
3740
0
    xmlChar *attvalue;
3741
0
    const xmlChar **atts;
3742
0
    int nbatts = 0;
3743
0
    int maxatts;
3744
0
    int meta = 0;
3745
0
    int i;
3746
0
    int discardtag = 0;
3747
3748
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
3749
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3750
0
         "htmlParseStartTag: context error\n", NULL, NULL);
3751
0
  return -1;
3752
0
    }
3753
0
    if (ctxt->instate == XML_PARSER_EOF)
3754
0
        return(-1);
3755
0
    if (CUR != '<') return -1;
3756
0
    NEXT;
3757
3758
0
    atts = ctxt->atts;
3759
0
    maxatts = ctxt->maxatts;
3760
3761
0
    GROW;
3762
0
    name = htmlParseHTMLName(ctxt);
3763
0
    if (name == NULL) {
3764
0
  htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
3765
0
               "htmlParseStartTag: invalid element name\n",
3766
0
         NULL, NULL);
3767
  /* if recover preserve text on classic misconstructs */
3768
0
  if ((ctxt->recovery) && ((IS_BLANK_CH(CUR)) || (CUR == '<') ||
3769
0
      (CUR == '=') || (CUR == '>') || (((CUR >= '0') && (CUR <= '9'))))) {
3770
0
      htmlParseCharDataInternal(ctxt, '<');
3771
0
      return(-1);
3772
0
  }
3773
3774
3775
  /* Dump the bogus tag like browsers do */
3776
0
  while ((IS_CHAR_CH(CUR)) && (CUR != '>') &&
3777
0
               (ctxt->instate != XML_PARSER_EOF))
3778
0
      NEXT;
3779
0
        return -1;
3780
0
    }
3781
0
    if (xmlStrEqual(name, BAD_CAST"meta"))
3782
0
  meta = 1;
3783
3784
    /*
3785
     * Check for auto-closure of HTML elements.
3786
     */
3787
0
    htmlAutoClose(ctxt, name);
3788
3789
    /*
3790
     * Check for implied HTML elements.
3791
     */
3792
0
    htmlCheckImplied(ctxt, name);
3793
3794
    /*
3795
     * Avoid html at any level > 0, head at any level != 1
3796
     * or any attempt to recurse body
3797
     */
3798
0
    if ((ctxt->nameNr > 0) && (xmlStrEqual(name, BAD_CAST"html"))) {
3799
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3800
0
               "htmlParseStartTag: misplaced <html> tag\n",
3801
0
         name, NULL);
3802
0
  discardtag = 1;
3803
0
  ctxt->depth++;
3804
0
    }
3805
0
    if ((ctxt->nameNr != 1) &&
3806
0
  (xmlStrEqual(name, BAD_CAST"head"))) {
3807
0
  htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3808
0
               "htmlParseStartTag: misplaced <head> tag\n",
3809
0
         name, NULL);
3810
0
  discardtag = 1;
3811
0
  ctxt->depth++;
3812
0
    }
3813
0
    if (xmlStrEqual(name, BAD_CAST"body")) {
3814
0
  int indx;
3815
0
  for (indx = 0;indx < ctxt->nameNr;indx++) {
3816
0
      if (xmlStrEqual(ctxt->nameTab[indx], BAD_CAST"body")) {
3817
0
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
3818
0
                 "htmlParseStartTag: misplaced <body> tag\n",
3819
0
           name, NULL);
3820
0
    discardtag = 1;
3821
0
    ctxt->depth++;
3822
0
      }
3823
0
  }
3824
0
    }
3825
3826
    /*
3827
     * Now parse the attributes, it ends up with the ending
3828
     *
3829
     * (S Attribute)* S?
3830
     */
3831
0
    SKIP_BLANKS;
3832
0
    while ((IS_CHAR_CH(CUR)) &&
3833
0
           (CUR != '>') &&
3834
0
     ((CUR != '/') || (NXT(1) != '>'))) {
3835
0
  long cons = ctxt->nbChars;
3836
3837
0
  GROW;
3838
0
  attname = htmlParseAttribute(ctxt, &attvalue);
3839
0
        if (attname != NULL) {
3840
3841
      /*
3842
       * Well formedness requires at most one declaration of an attribute
3843
       */
3844
0
      for (i = 0; i < nbatts;i += 2) {
3845
0
          if (xmlStrEqual(atts[i], attname)) {
3846
0
        htmlParseErr(ctxt, XML_ERR_ATTRIBUTE_REDEFINED,
3847
0
                     "Attribute %s redefined\n", attname, NULL);
3848
0
        if (attvalue != NULL)
3849
0
      xmlFree(attvalue);
3850
0
        goto failed;
3851
0
    }
3852
0
      }
3853
3854
      /*
3855
       * Add the pair to atts
3856
       */
3857
0
      if (atts == NULL) {
3858
0
          maxatts = 22; /* allow for 10 attrs by default */
3859
0
          atts = (const xmlChar **)
3860
0
           xmlMalloc(maxatts * sizeof(xmlChar *));
3861
0
    if (atts == NULL) {
3862
0
        htmlErrMemory(ctxt, NULL);
3863
0
        if (attvalue != NULL)
3864
0
      xmlFree(attvalue);
3865
0
        goto failed;
3866
0
    }
3867
0
    ctxt->atts = atts;
3868
0
    ctxt->maxatts = maxatts;
3869
0
      } else if (nbatts + 4 > maxatts) {
3870
0
          const xmlChar **n;
3871
3872
0
          maxatts *= 2;
3873
0
          n = (const xmlChar **) xmlRealloc((void *) atts,
3874
0
               maxatts * sizeof(const xmlChar *));
3875
0
    if (n == NULL) {
3876
0
        htmlErrMemory(ctxt, NULL);
3877
0
        if (attvalue != NULL)
3878
0
      xmlFree(attvalue);
3879
0
        goto failed;
3880
0
    }
3881
0
    atts = n;
3882
0
    ctxt->atts = atts;
3883
0
    ctxt->maxatts = maxatts;
3884
0
      }
3885
0
      atts[nbatts++] = attname;
3886
0
      atts[nbatts++] = attvalue;
3887
0
      atts[nbatts] = NULL;
3888
0
      atts[nbatts + 1] = NULL;
3889
0
  }
3890
0
  else {
3891
0
      if (attvalue != NULL)
3892
0
          xmlFree(attvalue);
3893
      /* Dump the bogus attribute string up to the next blank or
3894
       * the end of the tag. */
3895
0
      while ((IS_CHAR_CH(CUR)) &&
3896
0
             !(IS_BLANK_CH(CUR)) && (CUR != '>') &&
3897
0
       ((CUR != '/') || (NXT(1) != '>')))
3898
0
    NEXT;
3899
0
  }
3900
3901
0
failed:
3902
0
  SKIP_BLANKS;
3903
0
        if (cons == ctxt->nbChars) {
3904
0
      htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
3905
0
                   "htmlParseStartTag: problem parsing attributes\n",
3906
0
       NULL, NULL);
3907
0
      break;
3908
0
  }
3909
0
    }
3910
3911
    /*
3912
     * Handle specific association to the META tag
3913
     */
3914
0
    if (meta && (nbatts != 0))
3915
0
  htmlCheckMeta(ctxt, atts);
3916
3917
    /*
3918
     * SAX: Start of Element !
3919
     */
3920
0
    if (!discardtag) {
3921
0
  htmlnamePush(ctxt, name);
3922
0
  if ((ctxt->sax != NULL) && (ctxt->sax->startElement != NULL)) {
3923
0
      if (nbatts != 0)
3924
0
    ctxt->sax->startElement(ctxt->userData, name, atts);
3925
0
      else
3926
0
    ctxt->sax->startElement(ctxt->userData, name, NULL);
3927
0
  }
3928
0
    }
3929
3930
0
    if (atts != NULL) {
3931
0
        for (i = 1;i < nbatts;i += 2) {
3932
0
      if (atts[i] != NULL)
3933
0
    xmlFree((xmlChar *) atts[i]);
3934
0
  }
3935
0
    }
3936
3937
0
    return(discardtag);
3938
0
}
3939
3940
/**
3941
 * htmlParseEndTag:
3942
 * @ctxt:  an HTML parser context
3943
 *
3944
 * parse an end of tag
3945
 *
3946
 * [42] ETag ::= '</' Name S? '>'
3947
 *
3948
 * With namespace
3949
 *
3950
 * [NS 9] ETag ::= '</' QName S? '>'
3951
 *
3952
 * Returns 1 if the current level should be closed.
3953
 */
3954
3955
static int
3956
htmlParseEndTag(htmlParserCtxtPtr ctxt)
3957
0
{
3958
0
    const xmlChar *name;
3959
0
    const xmlChar *oldname;
3960
0
    int i, ret;
3961
3962
0
    if ((CUR != '<') || (NXT(1) != '/')) {
3963
0
        htmlParseErr(ctxt, XML_ERR_LTSLASH_REQUIRED,
3964
0
               "htmlParseEndTag: '</' not found\n", NULL, NULL);
3965
0
        return (0);
3966
0
    }
3967
0
    SKIP(2);
3968
3969
0
    name = htmlParseHTMLName(ctxt);
3970
0
    if (name == NULL)
3971
0
        return (0);
3972
    /*
3973
     * We should definitely be at the ending "S? '>'" part
3974
     */
3975
0
    SKIP_BLANKS;
3976
0
    if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
3977
0
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
3978
0
               "End tag : expected '>'\n", NULL, NULL);
3979
0
  if (ctxt->recovery) {
3980
      /*
3981
       * We're not at the ending > !!
3982
       * Error, unless in recover mode where we search forwards
3983
       * until we find a >
3984
       */
3985
0
      while (CUR != '\0' && CUR != '>') NEXT;
3986
0
      NEXT;
3987
0
  }
3988
0
    } else
3989
0
        NEXT;
3990
3991
    /*
3992
     * if we ignored misplaced tags in htmlParseStartTag don't pop them
3993
     * out now.
3994
     */
3995
0
    if ((ctxt->depth > 0) &&
3996
0
        (xmlStrEqual(name, BAD_CAST "html") ||
3997
0
         xmlStrEqual(name, BAD_CAST "body") ||
3998
0
   xmlStrEqual(name, BAD_CAST "head"))) {
3999
0
  ctxt->depth--;
4000
0
  return (0);
4001
0
    }
4002
4003
    /*
4004
     * If the name read is not one of the element in the parsing stack
4005
     * then return, it's just an error.
4006
     */
4007
0
    for (i = (ctxt->nameNr - 1); i >= 0; i--) {
4008
0
        if (xmlStrEqual(name, ctxt->nameTab[i]))
4009
0
            break;
4010
0
    }
4011
0
    if (i < 0) {
4012
0
        htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4013
0
               "Unexpected end tag : %s\n", name, NULL);
4014
0
        return (0);
4015
0
    }
4016
4017
4018
    /*
4019
     * Check for auto-closure of HTML elements.
4020
     */
4021
4022
0
    htmlAutoCloseOnClose(ctxt, name);
4023
4024
    /*
4025
     * Well formedness constraints, opening and closing must match.
4026
     * With the exception that the autoclose may have popped stuff out
4027
     * of the stack.
4028
     */
4029
0
    if (!xmlStrEqual(name, ctxt->name)) {
4030
0
        if ((ctxt->name != NULL) && (!xmlStrEqual(ctxt->name, name))) {
4031
0
            htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
4032
0
                   "Opening and ending tag mismatch: %s and %s\n",
4033
0
       name, ctxt->name);
4034
0
        }
4035
0
    }
4036
4037
    /*
4038
     * SAX: End of Tag
4039
     */
4040
0
    oldname = ctxt->name;
4041
0
    if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
4042
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4043
0
            ctxt->sax->endElement(ctxt->userData, name);
4044
0
  htmlNodeInfoPop(ctxt);
4045
0
        htmlnamePop(ctxt);
4046
0
        ret = 1;
4047
0
    } else {
4048
0
        ret = 0;
4049
0
    }
4050
4051
0
    return (ret);
4052
0
}
4053
4054
4055
/**
4056
 * htmlParseReference:
4057
 * @ctxt:  an HTML parser context
4058
 *
4059
 * parse and handle entity references in content,
4060
 * this will end-up in a call to character() since this is either a
4061
 * CharRef, or a predefined entity.
4062
 */
4063
static void
4064
0
htmlParseReference(htmlParserCtxtPtr ctxt) {
4065
0
    const htmlEntityDesc * ent;
4066
0
    xmlChar out[6];
4067
0
    const xmlChar *name;
4068
0
    if (CUR != '&') return;
4069
4070
0
    if (NXT(1) == '#') {
4071
0
  unsigned int c;
4072
0
  int bits, i = 0;
4073
4074
0
  c = htmlParseCharRef(ctxt);
4075
0
  if (c == 0)
4076
0
      return;
4077
4078
0
        if      (c <    0x80) { out[i++]= c;                bits= -6; }
4079
0
        else if (c <   0x800) { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4080
0
        else if (c < 0x10000) { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4081
0
        else                  { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4082
4083
0
        for ( ; bits >= 0; bits-= 6) {
4084
0
            out[i++]= ((c >> bits) & 0x3F) | 0x80;
4085
0
        }
4086
0
  out[i] = 0;
4087
4088
0
  htmlCheckParagraph(ctxt);
4089
0
  if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4090
0
      ctxt->sax->characters(ctxt->userData, out, i);
4091
0
    } else {
4092
0
  ent = htmlParseEntityRef(ctxt, &name);
4093
0
  if (name == NULL) {
4094
0
      htmlCheckParagraph(ctxt);
4095
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4096
0
          ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4097
0
      return;
4098
0
  }
4099
0
  if ((ent == NULL) || !(ent->value > 0)) {
4100
0
      htmlCheckParagraph(ctxt);
4101
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL)) {
4102
0
    ctxt->sax->characters(ctxt->userData, BAD_CAST "&", 1);
4103
0
    ctxt->sax->characters(ctxt->userData, name, xmlStrlen(name));
4104
    /* ctxt->sax->characters(ctxt->userData, BAD_CAST ";", 1); */
4105
0
      }
4106
0
  } else {
4107
0
      unsigned int c;
4108
0
      int bits, i = 0;
4109
4110
0
      c = ent->value;
4111
0
      if      (c <    0x80)
4112
0
              { out[i++]= c;                bits= -6; }
4113
0
      else if (c <   0x800)
4114
0
              { out[i++]=((c >>  6) & 0x1F) | 0xC0;  bits=  0; }
4115
0
      else if (c < 0x10000)
4116
0
              { out[i++]=((c >> 12) & 0x0F) | 0xE0;  bits=  6; }
4117
0
      else
4118
0
              { out[i++]=((c >> 18) & 0x07) | 0xF0;  bits= 12; }
4119
4120
0
      for ( ; bits >= 0; bits-= 6) {
4121
0
    out[i++]= ((c >> bits) & 0x3F) | 0x80;
4122
0
      }
4123
0
      out[i] = 0;
4124
4125
0
      htmlCheckParagraph(ctxt);
4126
0
      if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
4127
0
    ctxt->sax->characters(ctxt->userData, out, i);
4128
0
  }
4129
0
    }
4130
0
}
4131
4132
/**
4133
 * htmlParseContent:
4134
 * @ctxt:  an HTML parser context
4135
 *
4136
 * Parse a content: comment, sub-element, reference or text.
4137
 * Kept for compatibility with old code
4138
 */
4139
4140
static void
4141
0
htmlParseContent(htmlParserCtxtPtr ctxt) {
4142
0
    xmlChar *currentNode;
4143
0
    int depth;
4144
0
    const xmlChar *name;
4145
4146
0
    currentNode = xmlStrdup(ctxt->name);
4147
0
    depth = ctxt->nameNr;
4148
0
    while (1) {
4149
0
  long cons = ctxt->nbChars;
4150
4151
0
        GROW;
4152
4153
0
        if (ctxt->instate == XML_PARSER_EOF)
4154
0
            break;
4155
4156
  /*
4157
   * Our tag or one of it's parent or children is ending.
4158
   */
4159
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4160
0
      if (htmlParseEndTag(ctxt) &&
4161
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4162
0
    if (currentNode != NULL)
4163
0
        xmlFree(currentNode);
4164
0
    return;
4165
0
      }
4166
0
      continue; /* while */
4167
0
        }
4168
4169
0
  else if ((CUR == '<') &&
4170
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4171
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4172
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4173
0
      if (name == NULL) {
4174
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4175
0
       "htmlParseStartTag: invalid element name\n",
4176
0
       NULL, NULL);
4177
          /* Dump the bogus tag like browsers do */
4178
0
        while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4179
0
              NEXT;
4180
4181
0
          if (currentNode != NULL)
4182
0
              xmlFree(currentNode);
4183
0
          return;
4184
0
      }
4185
4186
0
      if (ctxt->name != NULL) {
4187
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4188
0
              htmlAutoClose(ctxt, name);
4189
0
              continue;
4190
0
          }
4191
0
      }
4192
0
  }
4193
4194
  /*
4195
   * Has this node been popped out during parsing of
4196
   * the next element
4197
   */
4198
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4199
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4200
0
       {
4201
0
      if (currentNode != NULL) xmlFree(currentNode);
4202
0
      return;
4203
0
  }
4204
4205
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4206
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4207
      /*
4208
       * Handle SCRIPT/STYLE separately
4209
       */
4210
0
      htmlParseScript(ctxt);
4211
0
  } else {
4212
      /*
4213
       * Sometimes DOCTYPE arrives in the middle of the document
4214
       */
4215
0
      if ((CUR == '<') && (NXT(1) == '!') &&
4216
0
    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4217
0
    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4218
0
    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4219
0
    (UPP(8) == 'E')) {
4220
0
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4221
0
                 "Misplaced DOCTYPE declaration\n",
4222
0
           BAD_CAST "DOCTYPE" , NULL);
4223
0
    htmlParseDocTypeDecl(ctxt);
4224
0
      }
4225
4226
      /*
4227
       * First case :  a comment
4228
       */
4229
0
      if ((CUR == '<') && (NXT(1) == '!') &&
4230
0
    (NXT(2) == '-') && (NXT(3) == '-')) {
4231
0
    htmlParseComment(ctxt);
4232
0
      }
4233
4234
      /*
4235
       * Second case : a Processing Instruction.
4236
       */
4237
0
      else if ((CUR == '<') && (NXT(1) == '?')) {
4238
0
    htmlParsePI(ctxt);
4239
0
      }
4240
4241
      /*
4242
       * Third case :  a sub-element.
4243
       */
4244
0
      else if (CUR == '<') {
4245
0
    htmlParseElement(ctxt);
4246
0
      }
4247
4248
      /*
4249
       * Fourth case : a reference. If if has not been resolved,
4250
       *    parsing returns it's Name, create the node
4251
       */
4252
0
      else if (CUR == '&') {
4253
0
    htmlParseReference(ctxt);
4254
0
      }
4255
4256
      /*
4257
       * Fifth case : end of the resource
4258
       */
4259
0
      else if (CUR == 0) {
4260
0
    htmlAutoCloseOnEnd(ctxt);
4261
0
    break;
4262
0
      }
4263
4264
      /*
4265
       * Last case, text. Note that References are handled directly.
4266
       */
4267
0
      else {
4268
0
    htmlParseCharData(ctxt);
4269
0
      }
4270
4271
0
      if (cons == ctxt->nbChars) {
4272
0
    if (ctxt->node != NULL) {
4273
0
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4274
0
                     "detected an error in element content\n",
4275
0
         NULL, NULL);
4276
0
    }
4277
0
    break;
4278
0
      }
4279
0
  }
4280
0
        GROW;
4281
0
    }
4282
0
    if (currentNode != NULL) xmlFree(currentNode);
4283
0
}
4284
4285
/**
4286
 * htmlParseElement:
4287
 * @ctxt:  an HTML parser context
4288
 *
4289
 * parse an HTML element, this is highly recursive
4290
 * this is kept for compatibility with previous code versions
4291
 *
4292
 * [39] element ::= EmptyElemTag | STag content ETag
4293
 *
4294
 * [41] Attribute ::= Name Eq AttValue
4295
 */
4296
4297
void
4298
0
htmlParseElement(htmlParserCtxtPtr ctxt) {
4299
0
    const xmlChar *name;
4300
0
    xmlChar *currentNode = NULL;
4301
0
    const htmlElemDesc * info;
4302
0
    htmlParserNodeInfo node_info;
4303
0
    int failed;
4304
0
    int depth;
4305
0
    const xmlChar *oldptr;
4306
4307
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4308
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4309
0
         "htmlParseElement: context error\n", NULL, NULL);
4310
0
  return;
4311
0
    }
4312
4313
0
    if (ctxt->instate == XML_PARSER_EOF)
4314
0
        return;
4315
4316
    /* Capture start position */
4317
0
    if (ctxt->record_info) {
4318
0
        node_info.begin_pos = ctxt->input->consumed +
4319
0
                          (CUR_PTR - ctxt->input->base);
4320
0
  node_info.begin_line = ctxt->input->line;
4321
0
    }
4322
4323
0
    failed = htmlParseStartTag(ctxt);
4324
0
    name = ctxt->name;
4325
0
    if ((failed == -1) || (name == NULL)) {
4326
0
  if (CUR == '>')
4327
0
      NEXT;
4328
0
        return;
4329
0
    }
4330
4331
    /*
4332
     * Lookup the info for that element.
4333
     */
4334
0
    info = htmlTagLookup(name);
4335
0
    if (info == NULL) {
4336
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4337
0
               "Tag %s invalid\n", name, NULL);
4338
0
    }
4339
4340
    /*
4341
     * Check for an Empty Element labeled the XML/SGML way
4342
     */
4343
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4344
0
        SKIP(2);
4345
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4346
0
      ctxt->sax->endElement(ctxt->userData, name);
4347
0
  htmlnamePop(ctxt);
4348
0
  return;
4349
0
    }
4350
4351
0
    if (CUR == '>') {
4352
0
        NEXT;
4353
0
    } else {
4354
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4355
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4356
4357
  /*
4358
   * end of parsing of this node.
4359
   */
4360
0
  if (xmlStrEqual(name, ctxt->name)) {
4361
0
      nodePop(ctxt);
4362
0
      htmlnamePop(ctxt);
4363
0
  }
4364
4365
  /*
4366
   * Capture end position and add node
4367
   */
4368
0
  if (ctxt->record_info) {
4369
0
     node_info.end_pos = ctxt->input->consumed +
4370
0
            (CUR_PTR - ctxt->input->base);
4371
0
     node_info.end_line = ctxt->input->line;
4372
0
     node_info.node = ctxt->node;
4373
0
     xmlParserAddNodeInfo(ctxt, &node_info);
4374
0
  }
4375
0
  return;
4376
0
    }
4377
4378
    /*
4379
     * Check for an Empty Element from DTD definition
4380
     */
4381
0
    if ((info != NULL) && (info->empty)) {
4382
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4383
0
      ctxt->sax->endElement(ctxt->userData, name);
4384
0
  htmlnamePop(ctxt);
4385
0
  return;
4386
0
    }
4387
4388
    /*
4389
     * Parse the content of the element:
4390
     */
4391
0
    currentNode = xmlStrdup(ctxt->name);
4392
0
    depth = ctxt->nameNr;
4393
0
    while (IS_CHAR_CH(CUR)) {
4394
0
  oldptr = ctxt->input->cur;
4395
0
  htmlParseContent(ctxt);
4396
0
  if (oldptr==ctxt->input->cur) break;
4397
0
  if (ctxt->nameNr < depth) break;
4398
0
    }
4399
4400
    /*
4401
     * Capture end position and add node
4402
     */
4403
0
    if ( currentNode != NULL && ctxt->record_info ) {
4404
0
       node_info.end_pos = ctxt->input->consumed +
4405
0
                          (CUR_PTR - ctxt->input->base);
4406
0
       node_info.end_line = ctxt->input->line;
4407
0
       node_info.node = ctxt->node;
4408
0
       xmlParserAddNodeInfo(ctxt, &node_info);
4409
0
    }
4410
0
    if (!IS_CHAR_CH(CUR)) {
4411
0
  htmlAutoCloseOnEnd(ctxt);
4412
0
    }
4413
4414
0
    if (currentNode != NULL)
4415
0
  xmlFree(currentNode);
4416
0
}
4417
4418
static void
4419
0
htmlParserFinishElementParsing(htmlParserCtxtPtr ctxt) {
4420
    /*
4421
     * Capture end position and add node
4422
     */
4423
0
    if ( ctxt->node != NULL && ctxt->record_info ) {
4424
0
       ctxt->nodeInfo->end_pos = ctxt->input->consumed +
4425
0
                                (CUR_PTR - ctxt->input->base);
4426
0
       ctxt->nodeInfo->end_line = ctxt->input->line;
4427
0
       ctxt->nodeInfo->node = ctxt->node;
4428
0
       xmlParserAddNodeInfo(ctxt, ctxt->nodeInfo);
4429
0
       htmlNodeInfoPop(ctxt);
4430
0
    }
4431
0
    if (!IS_CHAR_CH(CUR)) {
4432
0
       htmlAutoCloseOnEnd(ctxt);
4433
0
    }
4434
0
}
4435
4436
/**
4437
 * htmlParseElementInternal:
4438
 * @ctxt:  an HTML parser context
4439
 *
4440
 * parse an HTML element, new version, non recursive
4441
 *
4442
 * [39] element ::= EmptyElemTag | STag content ETag
4443
 *
4444
 * [41] Attribute ::= Name Eq AttValue
4445
 */
4446
4447
static void
4448
0
htmlParseElementInternal(htmlParserCtxtPtr ctxt) {
4449
0
    const xmlChar *name;
4450
0
    const htmlElemDesc * info;
4451
0
    htmlParserNodeInfo node_info = { NULL, 0, 0, 0, 0 };
4452
0
    int failed;
4453
4454
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4455
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4456
0
         "htmlParseElementInternal: context error\n", NULL, NULL);
4457
0
  return;
4458
0
    }
4459
4460
0
    if (ctxt->instate == XML_PARSER_EOF)
4461
0
        return;
4462
4463
    /* Capture start position */
4464
0
    if (ctxt->record_info) {
4465
0
        node_info.begin_pos = ctxt->input->consumed +
4466
0
                          (CUR_PTR - ctxt->input->base);
4467
0
  node_info.begin_line = ctxt->input->line;
4468
0
    }
4469
4470
0
    failed = htmlParseStartTag(ctxt);
4471
0
    name = ctxt->name;
4472
0
    if ((failed == -1) || (name == NULL)) {
4473
0
  if (CUR == '>')
4474
0
      NEXT;
4475
0
        return;
4476
0
    }
4477
4478
    /*
4479
     * Lookup the info for that element.
4480
     */
4481
0
    info = htmlTagLookup(name);
4482
0
    if (info == NULL) {
4483
0
  htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
4484
0
               "Tag %s invalid\n", name, NULL);
4485
0
    }
4486
4487
    /*
4488
     * Check for an Empty Element labeled the XML/SGML way
4489
     */
4490
0
    if ((CUR == '/') && (NXT(1) == '>')) {
4491
0
        SKIP(2);
4492
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4493
0
      ctxt->sax->endElement(ctxt->userData, name);
4494
0
  htmlnamePop(ctxt);
4495
0
  return;
4496
0
    }
4497
4498
0
    if (CUR == '>') {
4499
0
        NEXT;
4500
0
    } else {
4501
0
  htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
4502
0
               "Couldn't find end of Start Tag %s\n", name, NULL);
4503
4504
  /*
4505
   * end of parsing of this node.
4506
   */
4507
0
  if (xmlStrEqual(name, ctxt->name)) {
4508
0
      nodePop(ctxt);
4509
0
      htmlnamePop(ctxt);
4510
0
  }
4511
4512
0
        if (ctxt->record_info)
4513
0
            htmlNodeInfoPush(ctxt, &node_info);
4514
0
        htmlParserFinishElementParsing(ctxt);
4515
0
  return;
4516
0
    }
4517
4518
    /*
4519
     * Check for an Empty Element from DTD definition
4520
     */
4521
0
    if ((info != NULL) && (info->empty)) {
4522
0
  if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
4523
0
      ctxt->sax->endElement(ctxt->userData, name);
4524
0
  htmlnamePop(ctxt);
4525
0
  return;
4526
0
    }
4527
4528
0
    if (ctxt->record_info)
4529
0
        htmlNodeInfoPush(ctxt, &node_info);
4530
0
}
4531
4532
/**
4533
 * htmlParseContentInternal:
4534
 * @ctxt:  an HTML parser context
4535
 *
4536
 * Parse a content: comment, sub-element, reference or text.
4537
 * New version for non recursive htmlParseElementInternal
4538
 */
4539
4540
static void
4541
0
htmlParseContentInternal(htmlParserCtxtPtr ctxt) {
4542
0
    xmlChar *currentNode;
4543
0
    int depth;
4544
0
    const xmlChar *name;
4545
4546
0
    currentNode = xmlStrdup(ctxt->name);
4547
0
    depth = ctxt->nameNr;
4548
0
    while (1) {
4549
0
  long cons = ctxt->nbChars;
4550
4551
0
        GROW;
4552
4553
0
        if (ctxt->instate == XML_PARSER_EOF)
4554
0
            break;
4555
4556
  /*
4557
   * Our tag or one of it's parent or children is ending.
4558
   */
4559
0
        if ((CUR == '<') && (NXT(1) == '/')) {
4560
0
      if (htmlParseEndTag(ctxt) &&
4561
0
    ((currentNode != NULL) || (ctxt->nameNr == 0))) {
4562
0
    if (currentNode != NULL)
4563
0
        xmlFree(currentNode);
4564
4565
0
          currentNode = xmlStrdup(ctxt->name);
4566
0
          depth = ctxt->nameNr;
4567
0
      }
4568
0
      continue; /* while */
4569
0
        }
4570
4571
0
  else if ((CUR == '<') &&
4572
0
           ((IS_ASCII_LETTER(NXT(1))) ||
4573
0
      (NXT(1) == '_') || (NXT(1) == ':'))) {
4574
0
      name = htmlParseHTMLName_nonInvasive(ctxt);
4575
0
      if (name == NULL) {
4576
0
          htmlParseErr(ctxt, XML_ERR_NAME_REQUIRED,
4577
0
       "htmlParseStartTag: invalid element name\n",
4578
0
       NULL, NULL);
4579
          /* Dump the bogus tag like browsers do */
4580
0
          while ((IS_CHAR_CH(CUR)) && (CUR != '>'))
4581
0
              NEXT;
4582
4583
0
          htmlParserFinishElementParsing(ctxt);
4584
0
          if (currentNode != NULL)
4585
0
              xmlFree(currentNode);
4586
4587
0
          currentNode = xmlStrdup(ctxt->name);
4588
0
          depth = ctxt->nameNr;
4589
0
          continue;
4590
0
      }
4591
4592
0
      if (ctxt->name != NULL) {
4593
0
          if (htmlCheckAutoClose(name, ctxt->name) == 1) {
4594
0
              htmlAutoClose(ctxt, name);
4595
0
              continue;
4596
0
          }
4597
0
      }
4598
0
  }
4599
4600
  /*
4601
   * Has this node been popped out during parsing of
4602
   * the next element
4603
   */
4604
0
        if ((ctxt->nameNr > 0) && (depth >= ctxt->nameNr) &&
4605
0
      (!xmlStrEqual(currentNode, ctxt->name)))
4606
0
       {
4607
0
      htmlParserFinishElementParsing(ctxt);
4608
0
      if (currentNode != NULL) xmlFree(currentNode);
4609
4610
0
      currentNode = xmlStrdup(ctxt->name);
4611
0
      depth = ctxt->nameNr;
4612
0
      continue;
4613
0
  }
4614
4615
0
  if ((CUR != 0) && ((xmlStrEqual(currentNode, BAD_CAST"script")) ||
4616
0
      (xmlStrEqual(currentNode, BAD_CAST"style")))) {
4617
      /*
4618
       * Handle SCRIPT/STYLE separately
4619
       */
4620
0
      htmlParseScript(ctxt);
4621
0
  } else {
4622
      /*
4623
       * Sometimes DOCTYPE arrives in the middle of the document
4624
       */
4625
0
      if ((CUR == '<') && (NXT(1) == '!') &&
4626
0
    (UPP(2) == 'D') && (UPP(3) == 'O') &&
4627
0
    (UPP(4) == 'C') && (UPP(5) == 'T') &&
4628
0
    (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4629
0
    (UPP(8) == 'E')) {
4630
0
    htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
4631
0
                 "Misplaced DOCTYPE declaration\n",
4632
0
           BAD_CAST "DOCTYPE" , NULL);
4633
0
    htmlParseDocTypeDecl(ctxt);
4634
0
      }
4635
4636
      /*
4637
       * First case :  a comment
4638
       */
4639
0
      if ((CUR == '<') && (NXT(1) == '!') &&
4640
0
    (NXT(2) == '-') && (NXT(3) == '-')) {
4641
0
    htmlParseComment(ctxt);
4642
0
      }
4643
4644
      /*
4645
       * Second case : a Processing Instruction.
4646
       */
4647
0
      else if ((CUR == '<') && (NXT(1) == '?')) {
4648
0
    htmlParsePI(ctxt);
4649
0
      }
4650
4651
      /*
4652
       * Third case :  a sub-element.
4653
       */
4654
0
      else if (CUR == '<') {
4655
0
    htmlParseElementInternal(ctxt);
4656
0
    if (currentNode != NULL) xmlFree(currentNode);
4657
4658
0
    currentNode = xmlStrdup(ctxt->name);
4659
0
    depth = ctxt->nameNr;
4660
0
      }
4661
4662
      /*
4663
       * Fourth case : a reference. If if has not been resolved,
4664
       *    parsing returns it's Name, create the node
4665
       */
4666
0
      else if (CUR == '&') {
4667
0
    htmlParseReference(ctxt);
4668
0
      }
4669
4670
      /*
4671
       * Fifth case : end of the resource
4672
       */
4673
0
      else if (CUR == 0) {
4674
0
    htmlAutoCloseOnEnd(ctxt);
4675
0
    break;
4676
0
      }
4677
4678
      /*
4679
       * Last case, text. Note that References are handled directly.
4680
       */
4681
0
      else {
4682
0
    htmlParseCharData(ctxt);
4683
0
      }
4684
4685
0
      if (cons == ctxt->nbChars) {
4686
0
    if (ctxt->node != NULL) {
4687
0
        htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4688
0
                     "detected an error in element content\n",
4689
0
         NULL, NULL);
4690
0
    }
4691
0
    break;
4692
0
      }
4693
0
  }
4694
0
        GROW;
4695
0
    }
4696
0
    if (currentNode != NULL) xmlFree(currentNode);
4697
0
}
4698
4699
/**
4700
 * htmlParseContent:
4701
 * @ctxt:  an HTML parser context
4702
 *
4703
 * Parse a content: comment, sub-element, reference or text.
4704
 * This is the entry point when called from parser.c
4705
 */
4706
4707
void
4708
0
__htmlParseContent(void *ctxt) {
4709
0
    if (ctxt != NULL)
4710
0
  htmlParseContentInternal((htmlParserCtxtPtr) ctxt);
4711
0
}
4712
4713
/**
4714
 * htmlParseDocument:
4715
 * @ctxt:  an HTML parser context
4716
 *
4717
 * parse an HTML document (and build a tree if using the standard SAX
4718
 * interface).
4719
 *
4720
 * Returns 0, -1 in case of error. the parser context is augmented
4721
 *                as a result of the parsing.
4722
 */
4723
4724
int
4725
0
htmlParseDocument(htmlParserCtxtPtr ctxt) {
4726
0
    xmlChar start[4];
4727
0
    xmlCharEncoding enc;
4728
0
    xmlDtdPtr dtd;
4729
4730
0
    xmlInitParser();
4731
4732
0
    htmlDefaultSAXHandlerInit();
4733
4734
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
4735
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
4736
0
         "htmlParseDocument: context error\n", NULL, NULL);
4737
0
  return(XML_ERR_INTERNAL_ERROR);
4738
0
    }
4739
0
    ctxt->html = 1;
4740
0
    ctxt->linenumbers = 1;
4741
0
    GROW;
4742
    /*
4743
     * SAX: beginning of the document processing.
4744
     */
4745
0
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
4746
0
        ctxt->sax->setDocumentLocator(ctxt->userData, &xmlDefaultSAXLocator);
4747
4748
0
    if ((ctxt->encoding == (const xmlChar *)XML_CHAR_ENCODING_NONE) &&
4749
0
        ((ctxt->input->end - ctxt->input->cur) >= 4)) {
4750
  /*
4751
   * Get the 4 first bytes and decode the charset
4752
   * if enc != XML_CHAR_ENCODING_NONE
4753
   * plug some encoding conversion routines.
4754
   */
4755
0
  start[0] = RAW;
4756
0
  start[1] = NXT(1);
4757
0
  start[2] = NXT(2);
4758
0
  start[3] = NXT(3);
4759
0
  enc = xmlDetectCharEncoding(&start[0], 4);
4760
0
  if (enc != XML_CHAR_ENCODING_NONE) {
4761
0
      xmlSwitchEncoding(ctxt, enc);
4762
0
  }
4763
0
    }
4764
4765
    /*
4766
     * Wipe out everything which is before the first '<'
4767
     */
4768
0
    SKIP_BLANKS;
4769
0
    if (CUR == 0) {
4770
0
  htmlParseErr(ctxt, XML_ERR_DOCUMENT_EMPTY,
4771
0
               "Document is empty\n", NULL, NULL);
4772
0
    }
4773
4774
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) && (!ctxt->disableSAX))
4775
0
  ctxt->sax->startDocument(ctxt->userData);
4776
4777
4778
    /*
4779
     * Parse possible comments and PIs before any content
4780
     */
4781
0
    while (((CUR == '<') && (NXT(1) == '!') &&
4782
0
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4783
0
     ((CUR == '<') && (NXT(1) == '?'))) {
4784
0
        htmlParseComment(ctxt);
4785
0
        htmlParsePI(ctxt);
4786
0
  SKIP_BLANKS;
4787
0
    }
4788
4789
4790
    /*
4791
     * Then possibly doc type declaration(s) and more Misc
4792
     * (doctypedecl Misc*)?
4793
     */
4794
0
    if ((CUR == '<') && (NXT(1) == '!') &&
4795
0
  (UPP(2) == 'D') && (UPP(3) == 'O') &&
4796
0
  (UPP(4) == 'C') && (UPP(5) == 'T') &&
4797
0
  (UPP(6) == 'Y') && (UPP(7) == 'P') &&
4798
0
  (UPP(8) == 'E')) {
4799
0
  htmlParseDocTypeDecl(ctxt);
4800
0
    }
4801
0
    SKIP_BLANKS;
4802
4803
    /*
4804
     * Parse possible comments and PIs before any content
4805
     */
4806
0
    while (((CUR == '<') && (NXT(1) == '!') &&
4807
0
            (NXT(2) == '-') && (NXT(3) == '-')) ||
4808
0
     ((CUR == '<') && (NXT(1) == '?'))) {
4809
0
        htmlParseComment(ctxt);
4810
0
        htmlParsePI(ctxt);
4811
0
  SKIP_BLANKS;
4812
0
    }
4813
4814
    /*
4815
     * Time to start parsing the tree itself
4816
     */
4817
0
    htmlParseContentInternal(ctxt);
4818
4819
    /*
4820
     * autoclose
4821
     */
4822
0
    if (CUR == 0)
4823
0
  htmlAutoCloseOnEnd(ctxt);
4824
4825
4826
    /*
4827
     * SAX: end of the document processing.
4828
     */
4829
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
4830
0
        ctxt->sax->endDocument(ctxt->userData);
4831
4832
0
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL)) {
4833
0
  dtd = xmlGetIntSubset(ctxt->myDoc);
4834
0
  if (dtd == NULL)
4835
0
      ctxt->myDoc->intSubset =
4836
0
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
4837
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
4838
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
4839
0
    }
4840
0
    if (! ctxt->wellFormed) return(-1);
4841
0
    return(0);
4842
0
}
4843
4844
4845
/************************************************************************
4846
 *                  *
4847
 *      Parser contexts handling      *
4848
 *                  *
4849
 ************************************************************************/
4850
4851
/**
4852
 * htmlInitParserCtxt:
4853
 * @ctxt:  an HTML parser context
4854
 *
4855
 * Initialize a parser context
4856
 *
4857
 * Returns 0 in case of success and -1 in case of error
4858
 */
4859
4860
static int
4861
htmlInitParserCtxt(htmlParserCtxtPtr ctxt)
4862
0
{
4863
0
    htmlSAXHandler *sax;
4864
4865
0
    if (ctxt == NULL) return(-1);
4866
0
    memset(ctxt, 0, sizeof(htmlParserCtxt));
4867
4868
0
    ctxt->dict = xmlDictCreate();
4869
0
    if (ctxt->dict == NULL) {
4870
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4871
0
  return(-1);
4872
0
    }
4873
0
    sax = (htmlSAXHandler *) xmlMalloc(sizeof(htmlSAXHandler));
4874
0
    if (sax == NULL) {
4875
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4876
0
  return(-1);
4877
0
    }
4878
0
    else
4879
0
        memset(sax, 0, sizeof(htmlSAXHandler));
4880
4881
    /* Allocate the Input stack */
4882
0
    ctxt->inputTab = (htmlParserInputPtr *)
4883
0
                      xmlMalloc(5 * sizeof(htmlParserInputPtr));
4884
0
    if (ctxt->inputTab == NULL) {
4885
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4886
0
  ctxt->inputNr = 0;
4887
0
  ctxt->inputMax = 0;
4888
0
  ctxt->input = NULL;
4889
0
  return(-1);
4890
0
    }
4891
0
    ctxt->inputNr = 0;
4892
0
    ctxt->inputMax = 5;
4893
0
    ctxt->input = NULL;
4894
0
    ctxt->version = NULL;
4895
0
    ctxt->encoding = NULL;
4896
0
    ctxt->standalone = -1;
4897
0
    ctxt->instate = XML_PARSER_START;
4898
4899
    /* Allocate the Node stack */
4900
0
    ctxt->nodeTab = (htmlNodePtr *) xmlMalloc(10 * sizeof(htmlNodePtr));
4901
0
    if (ctxt->nodeTab == NULL) {
4902
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4903
0
  ctxt->nodeNr = 0;
4904
0
  ctxt->nodeMax = 0;
4905
0
  ctxt->node = NULL;
4906
0
  ctxt->inputNr = 0;
4907
0
  ctxt->inputMax = 0;
4908
0
  ctxt->input = NULL;
4909
0
  return(-1);
4910
0
    }
4911
0
    ctxt->nodeNr = 0;
4912
0
    ctxt->nodeMax = 10;
4913
0
    ctxt->node = NULL;
4914
4915
    /* Allocate the Name stack */
4916
0
    ctxt->nameTab = (const xmlChar **) xmlMalloc(10 * sizeof(xmlChar *));
4917
0
    if (ctxt->nameTab == NULL) {
4918
0
        htmlErrMemory(NULL, "htmlInitParserCtxt: out of memory\n");
4919
0
  ctxt->nameNr = 0;
4920
0
  ctxt->nameMax = 0;
4921
0
  ctxt->name = NULL;
4922
0
  ctxt->nodeNr = 0;
4923
0
  ctxt->nodeMax = 0;
4924
0
  ctxt->node = NULL;
4925
0
  ctxt->inputNr = 0;
4926
0
  ctxt->inputMax = 0;
4927
0
  ctxt->input = NULL;
4928
0
  return(-1);
4929
0
    }
4930
0
    ctxt->nameNr = 0;
4931
0
    ctxt->nameMax = 10;
4932
0
    ctxt->name = NULL;
4933
4934
0
    ctxt->nodeInfoTab = NULL;
4935
0
    ctxt->nodeInfoNr  = 0;
4936
0
    ctxt->nodeInfoMax = 0;
4937
4938
0
    if (sax == NULL) ctxt->sax = (xmlSAXHandlerPtr) &htmlDefaultSAXHandler;
4939
0
    else {
4940
0
        ctxt->sax = sax;
4941
0
  memcpy(sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
4942
0
    }
4943
0
    ctxt->userData = ctxt;
4944
0
    ctxt->myDoc = NULL;
4945
0
    ctxt->wellFormed = 1;
4946
0
    ctxt->replaceEntities = 0;
4947
0
    ctxt->linenumbers = xmlLineNumbersDefaultValue;
4948
0
    ctxt->keepBlanks = xmlKeepBlanksDefaultValue;
4949
0
    ctxt->html = 1;
4950
0
    ctxt->vctxt.finishDtd = XML_CTXT_FINISH_DTD_0;
4951
0
    ctxt->vctxt.userData = ctxt;
4952
0
    ctxt->vctxt.error = xmlParserValidityError;
4953
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
4954
0
    ctxt->record_info = 0;
4955
0
    ctxt->validate = 0;
4956
0
    ctxt->nbChars = 0;
4957
0
    ctxt->checkIndex = 0;
4958
0
    ctxt->catalogs = NULL;
4959
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
4960
0
    return(0);
4961
0
}
4962
4963
/**
4964
 * htmlFreeParserCtxt:
4965
 * @ctxt:  an HTML parser context
4966
 *
4967
 * Free all the memory used by a parser context. However the parsed
4968
 * document in ctxt->myDoc is not freed.
4969
 */
4970
4971
void
4972
htmlFreeParserCtxt(htmlParserCtxtPtr ctxt)
4973
0
{
4974
0
    xmlFreeParserCtxt(ctxt);
4975
0
}
4976
4977
/**
4978
 * htmlNewParserCtxt:
4979
 *
4980
 * Allocate and initialize a new parser context.
4981
 *
4982
 * Returns the htmlParserCtxtPtr or NULL in case of allocation error
4983
 */
4984
4985
htmlParserCtxtPtr
4986
htmlNewParserCtxt(void)
4987
0
{
4988
0
    xmlParserCtxtPtr ctxt;
4989
4990
0
    ctxt = (xmlParserCtxtPtr) xmlMalloc(sizeof(xmlParserCtxt));
4991
0
    if (ctxt == NULL) {
4992
0
        htmlErrMemory(NULL, "NewParserCtxt: out of memory\n");
4993
0
  return(NULL);
4994
0
    }
4995
0
    memset(ctxt, 0, sizeof(xmlParserCtxt));
4996
0
    if (htmlInitParserCtxt(ctxt) < 0) {
4997
0
        htmlFreeParserCtxt(ctxt);
4998
0
  return(NULL);
4999
0
    }
5000
0
    return(ctxt);
5001
0
}
5002
5003
/**
5004
 * htmlCreateMemoryParserCtxt:
5005
 * @buffer:  a pointer to a char array
5006
 * @size:  the size of the array
5007
 *
5008
 * Create a parser context for an HTML in-memory document.
5009
 *
5010
 * Returns the new parser context or NULL
5011
 */
5012
htmlParserCtxtPtr
5013
0
htmlCreateMemoryParserCtxt(const char *buffer, int size) {
5014
0
    xmlParserCtxtPtr ctxt;
5015
0
    xmlParserInputPtr input;
5016
0
    xmlParserInputBufferPtr buf;
5017
5018
0
    if (buffer == NULL)
5019
0
  return(NULL);
5020
0
    if (size <= 0)
5021
0
  return(NULL);
5022
5023
0
    ctxt = htmlNewParserCtxt();
5024
0
    if (ctxt == NULL)
5025
0
  return(NULL);
5026
5027
0
    buf = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
5028
0
    if (buf == NULL) return(NULL);
5029
5030
0
    input = xmlNewInputStream(ctxt);
5031
0
    if (input == NULL) {
5032
0
  xmlFreeParserCtxt(ctxt);
5033
0
  return(NULL);
5034
0
    }
5035
5036
0
    input->filename = NULL;
5037
0
    input->buf = buf;
5038
0
    xmlBufResetInput(buf->buffer, input);
5039
5040
0
    inputPush(ctxt, input);
5041
0
    return(ctxt);
5042
0
}
5043
5044
/**
5045
 * htmlCreateDocParserCtxt:
5046
 * @cur:  a pointer to an array of xmlChar
5047
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
5048
 *
5049
 * Create a parser context for an HTML document.
5050
 *
5051
 * TODO: check the need to add encoding handling there
5052
 *
5053
 * Returns the new parser context or NULL
5054
 */
5055
static htmlParserCtxtPtr
5056
0
htmlCreateDocParserCtxt(const xmlChar *cur, const char *encoding) {
5057
0
    int len;
5058
0
    htmlParserCtxtPtr ctxt;
5059
5060
0
    if (cur == NULL)
5061
0
  return(NULL);
5062
0
    len = xmlStrlen(cur);
5063
0
    ctxt = htmlCreateMemoryParserCtxt((char *)cur, len);
5064
0
    if (ctxt == NULL)
5065
0
  return(NULL);
5066
5067
0
    if (encoding != NULL) {
5068
0
  xmlCharEncoding enc;
5069
0
  xmlCharEncodingHandlerPtr handler;
5070
5071
0
  if (ctxt->input->encoding != NULL)
5072
0
      xmlFree((xmlChar *) ctxt->input->encoding);
5073
0
  ctxt->input->encoding = xmlStrdup((const xmlChar *) encoding);
5074
5075
0
  enc = xmlParseCharEncoding(encoding);
5076
  /*
5077
   * registered set of known encodings
5078
   */
5079
0
  if (enc != XML_CHAR_ENCODING_ERROR) {
5080
0
      xmlSwitchEncoding(ctxt, enc);
5081
0
      if (ctxt->errNo == XML_ERR_UNSUPPORTED_ENCODING) {
5082
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5083
0
                 "Unsupported encoding %s\n",
5084
0
           (const xmlChar *) encoding, NULL);
5085
0
      }
5086
0
  } else {
5087
      /*
5088
       * fallback for unknown encodings
5089
       */
5090
0
      handler = xmlFindCharEncodingHandler((const char *) encoding);
5091
0
      if (handler != NULL) {
5092
0
    xmlSwitchToEncoding(ctxt, handler);
5093
0
      } else {
5094
0
    htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
5095
0
                 "Unsupported encoding %s\n",
5096
0
           (const xmlChar *) encoding, NULL);
5097
0
      }
5098
0
  }
5099
0
    }
5100
0
    return(ctxt);
5101
0
}
5102
5103
#ifdef LIBXML_PUSH_ENABLED
5104
/************************************************************************
5105
 *                  *
5106
 *  Progressive parsing interfaces        *
5107
 *                  *
5108
 ************************************************************************/
5109
5110
/**
5111
 * htmlParseLookupSequence:
5112
 * @ctxt:  an HTML parser context
5113
 * @first:  the first char to lookup
5114
 * @next:  the next char to lookup or zero
5115
 * @third:  the next char to lookup or zero
5116
 * @comment: flag to force checking inside comments
5117
 *
5118
 * Try to find if a sequence (first, next, third) or  just (first next) or
5119
 * (first) is available in the input stream.
5120
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5121
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5122
 * parser, do not use liberally.
5123
 * This is basically similar to xmlParseLookupSequence()
5124
 *
5125
 * Returns the index to the current parsing point if the full sequence
5126
 *      is available, -1 otherwise.
5127
 */
5128
static int
5129
htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first,
5130
                        xmlChar next, xmlChar third, int iscomment,
5131
                        int ignoreattrval)
5132
0
{
5133
0
    int base, len;
5134
0
    htmlParserInputPtr in;
5135
0
    const xmlChar *buf;
5136
0
    int incomment = 0;
5137
0
    int invalue = 0;
5138
0
    char valdellim = 0x0;
5139
5140
0
    in = ctxt->input;
5141
0
    if (in == NULL)
5142
0
        return (-1);
5143
5144
0
    base = in->cur - in->base;
5145
0
    if (base < 0)
5146
0
        return (-1);
5147
5148
0
    if (ctxt->checkIndex > base)
5149
0
        base = ctxt->checkIndex;
5150
5151
0
    if (in->buf == NULL) {
5152
0
        buf = in->base;
5153
0
        len = in->length;
5154
0
    } else {
5155
0
        buf = xmlBufContent(in->buf->buffer);
5156
0
        len = xmlBufUse(in->buf->buffer);
5157
0
    }
5158
5159
    /* take into account the sequence length */
5160
0
    if (third)
5161
0
        len -= 2;
5162
0
    else if (next)
5163
0
        len--;
5164
0
    for (; base < len; base++) {
5165
0
        if ((!incomment) && (base + 4 < len) && (!iscomment)) {
5166
0
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5167
0
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5168
0
                incomment = 1;
5169
                /* do not increment past <! - some people use <!--> */
5170
0
                base += 2;
5171
0
            }
5172
0
        }
5173
0
        if (ignoreattrval) {
5174
0
            if (buf[base] == '"' || buf[base] == '\'') {
5175
0
                if (invalue) {
5176
0
                    if (buf[base] == valdellim) {
5177
0
                        invalue = 0;
5178
0
                        continue;
5179
0
                    }
5180
0
                } else {
5181
0
                    valdellim = buf[base];
5182
0
                    invalue = 1;
5183
0
                    continue;
5184
0
                }
5185
0
            } else if (invalue) {
5186
0
                continue;
5187
0
            }
5188
0
        }
5189
0
        if (incomment) {
5190
0
            if (base + 3 > len)
5191
0
                return (-1);
5192
0
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5193
0
                (buf[base + 2] == '>')) {
5194
0
                incomment = 0;
5195
0
                base += 2;
5196
0
            }
5197
0
            continue;
5198
0
        }
5199
0
        if (buf[base] == first) {
5200
0
            if (third != 0) {
5201
0
                if ((buf[base + 1] != next) || (buf[base + 2] != third))
5202
0
                    continue;
5203
0
            } else if (next != 0) {
5204
0
                if (buf[base + 1] != next)
5205
0
                    continue;
5206
0
            }
5207
0
            ctxt->checkIndex = 0;
5208
#ifdef DEBUG_PUSH
5209
            if (next == 0)
5210
                xmlGenericError(xmlGenericErrorContext,
5211
                                "HPP: lookup '%c' found at %d\n",
5212
                                first, base);
5213
            else if (third == 0)
5214
                xmlGenericError(xmlGenericErrorContext,
5215
                                "HPP: lookup '%c%c' found at %d\n",
5216
                                first, next, base);
5217
            else
5218
                xmlGenericError(xmlGenericErrorContext,
5219
                                "HPP: lookup '%c%c%c' found at %d\n",
5220
                                first, next, third, base);
5221
#endif
5222
0
            return (base - (in->cur - in->base));
5223
0
        }
5224
0
    }
5225
0
    if ((!incomment) && (!invalue))
5226
0
        ctxt->checkIndex = base;
5227
#ifdef DEBUG_PUSH
5228
    if (next == 0)
5229
        xmlGenericError(xmlGenericErrorContext,
5230
                        "HPP: lookup '%c' failed\n", first);
5231
    else if (third == 0)
5232
        xmlGenericError(xmlGenericErrorContext,
5233
                        "HPP: lookup '%c%c' failed\n", first, next);
5234
    else
5235
        xmlGenericError(xmlGenericErrorContext,
5236
                        "HPP: lookup '%c%c%c' failed\n", first, next,
5237
                        third);
5238
#endif
5239
0
    return (-1);
5240
0
}
5241
5242
/**
5243
 * htmlParseLookupChars:
5244
 * @ctxt: an HTML parser context
5245
 * @stop: Array of chars, which stop the lookup.
5246
 * @stopLen: Length of stop-Array
5247
 *
5248
 * Try to find if any char of the stop-Array is available in the input
5249
 * stream.
5250
 * This function has a side effect of (possibly) incrementing ctxt->checkIndex
5251
 * to avoid rescanning sequences of bytes, it DOES change the state of the
5252
 * parser, do not use liberally.
5253
 *
5254
 * Returns the index to the current parsing point if a stopChar
5255
 *      is available, -1 otherwise.
5256
 */
5257
static int
5258
htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop,
5259
                     int stopLen)
5260
0
{
5261
0
    int base, len;
5262
0
    htmlParserInputPtr in;
5263
0
    const xmlChar *buf;
5264
0
    int incomment = 0;
5265
0
    int i;
5266
5267
0
    in = ctxt->input;
5268
0
    if (in == NULL)
5269
0
        return (-1);
5270
5271
0
    base = in->cur - in->base;
5272
0
    if (base < 0)
5273
0
        return (-1);
5274
5275
0
    if (ctxt->checkIndex > base)
5276
0
        base = ctxt->checkIndex;
5277
5278
0
    if (in->buf == NULL) {
5279
0
        buf = in->base;
5280
0
        len = in->length;
5281
0
    } else {
5282
0
        buf = xmlBufContent(in->buf->buffer);
5283
0
        len = xmlBufUse(in->buf->buffer);
5284
0
    }
5285
5286
0
    for (; base < len; base++) {
5287
0
        if (!incomment && (base + 4 < len)) {
5288
0
            if ((buf[base] == '<') && (buf[base + 1] == '!') &&
5289
0
                (buf[base + 2] == '-') && (buf[base + 3] == '-')) {
5290
0
                incomment = 1;
5291
                /* do not increment past <! - some people use <!--> */
5292
0
                base += 2;
5293
0
            }
5294
0
        }
5295
0
        if (incomment) {
5296
0
            if (base + 3 > len)
5297
0
                return (-1);
5298
0
            if ((buf[base] == '-') && (buf[base + 1] == '-') &&
5299
0
                (buf[base + 2] == '>')) {
5300
0
                incomment = 0;
5301
0
                base += 2;
5302
0
            }
5303
0
            continue;
5304
0
        }
5305
0
        for (i = 0; i < stopLen; ++i) {
5306
0
            if (buf[base] == stop[i]) {
5307
0
                ctxt->checkIndex = 0;
5308
0
                return (base - (in->cur - in->base));
5309
0
            }
5310
0
        }
5311
0
    }
5312
0
    ctxt->checkIndex = base;
5313
0
    return (-1);
5314
0
}
5315
5316
/**
5317
 * htmlParseTryOrFinish:
5318
 * @ctxt:  an HTML parser context
5319
 * @terminate:  last chunk indicator
5320
 *
5321
 * Try to progress on parsing
5322
 *
5323
 * Returns zero if no parsing was possible
5324
 */
5325
static int
5326
0
htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
5327
0
    int ret = 0;
5328
0
    htmlParserInputPtr in;
5329
0
    int avail = 0;
5330
0
    xmlChar cur, next;
5331
5332
0
    htmlParserNodeInfo node_info;
5333
5334
#ifdef DEBUG_PUSH
5335
    switch (ctxt->instate) {
5336
  case XML_PARSER_EOF:
5337
      xmlGenericError(xmlGenericErrorContext,
5338
        "HPP: try EOF\n"); break;
5339
  case XML_PARSER_START:
5340
      xmlGenericError(xmlGenericErrorContext,
5341
        "HPP: try START\n"); break;
5342
  case XML_PARSER_MISC:
5343
      xmlGenericError(xmlGenericErrorContext,
5344
        "HPP: try MISC\n");break;
5345
  case XML_PARSER_COMMENT:
5346
      xmlGenericError(xmlGenericErrorContext,
5347
        "HPP: try COMMENT\n");break;
5348
  case XML_PARSER_PROLOG:
5349
      xmlGenericError(xmlGenericErrorContext,
5350
        "HPP: try PROLOG\n");break;
5351
  case XML_PARSER_START_TAG:
5352
      xmlGenericError(xmlGenericErrorContext,
5353
        "HPP: try START_TAG\n");break;
5354
  case XML_PARSER_CONTENT:
5355
      xmlGenericError(xmlGenericErrorContext,
5356
        "HPP: try CONTENT\n");break;
5357
  case XML_PARSER_CDATA_SECTION:
5358
      xmlGenericError(xmlGenericErrorContext,
5359
        "HPP: try CDATA_SECTION\n");break;
5360
  case XML_PARSER_END_TAG:
5361
      xmlGenericError(xmlGenericErrorContext,
5362
        "HPP: try END_TAG\n");break;
5363
  case XML_PARSER_ENTITY_DECL:
5364
      xmlGenericError(xmlGenericErrorContext,
5365
        "HPP: try ENTITY_DECL\n");break;
5366
  case XML_PARSER_ENTITY_VALUE:
5367
      xmlGenericError(xmlGenericErrorContext,
5368
        "HPP: try ENTITY_VALUE\n");break;
5369
  case XML_PARSER_ATTRIBUTE_VALUE:
5370
      xmlGenericError(xmlGenericErrorContext,
5371
        "HPP: try ATTRIBUTE_VALUE\n");break;
5372
  case XML_PARSER_DTD:
5373
      xmlGenericError(xmlGenericErrorContext,
5374
        "HPP: try DTD\n");break;
5375
  case XML_PARSER_EPILOG:
5376
      xmlGenericError(xmlGenericErrorContext,
5377
        "HPP: try EPILOG\n");break;
5378
  case XML_PARSER_PI:
5379
      xmlGenericError(xmlGenericErrorContext,
5380
        "HPP: try PI\n");break;
5381
  case XML_PARSER_SYSTEM_LITERAL:
5382
      xmlGenericError(xmlGenericErrorContext,
5383
        "HPP: try SYSTEM_LITERAL\n");break;
5384
    }
5385
#endif
5386
5387
0
    while (1) {
5388
5389
0
  in = ctxt->input;
5390
0
  if (in == NULL) break;
5391
0
  if (in->buf == NULL)
5392
0
      avail = in->length - (in->cur - in->base);
5393
0
  else
5394
0
      avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5395
0
  if ((avail == 0) && (terminate)) {
5396
0
      htmlAutoCloseOnEnd(ctxt);
5397
0
      if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
5398
    /*
5399
     * SAX: end of the document processing.
5400
     */
5401
0
    ctxt->instate = XML_PARSER_EOF;
5402
0
    if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5403
0
        ctxt->sax->endDocument(ctxt->userData);
5404
0
      }
5405
0
  }
5406
0
        if (avail < 1)
5407
0
      goto done;
5408
0
  cur = in->cur[0];
5409
0
  if (cur == 0) {
5410
0
      SKIP(1);
5411
0
      continue;
5412
0
  }
5413
5414
0
        switch (ctxt->instate) {
5415
0
            case XML_PARSER_EOF:
5416
          /*
5417
     * Document parsing is done !
5418
     */
5419
0
          goto done;
5420
0
            case XML_PARSER_START:
5421
          /*
5422
     * Very first chars read from the document flow.
5423
     */
5424
0
    cur = in->cur[0];
5425
0
    if (IS_BLANK_CH(cur)) {
5426
0
        SKIP_BLANKS;
5427
0
        if (in->buf == NULL)
5428
0
      avail = in->length - (in->cur - in->base);
5429
0
        else
5430
0
      avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5431
0
    }
5432
0
    if ((ctxt->sax) && (ctxt->sax->setDocumentLocator))
5433
0
        ctxt->sax->setDocumentLocator(ctxt->userData,
5434
0
              &xmlDefaultSAXLocator);
5435
0
    if ((ctxt->sax) && (ctxt->sax->startDocument) &&
5436
0
              (!ctxt->disableSAX))
5437
0
        ctxt->sax->startDocument(ctxt->userData);
5438
5439
0
    cur = in->cur[0];
5440
0
    next = in->cur[1];
5441
0
    if ((cur == '<') && (next == '!') &&
5442
0
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5443
0
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5444
0
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5445
0
        (UPP(8) == 'E')) {
5446
0
        if ((!terminate) &&
5447
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5448
0
      goto done;
5449
#ifdef DEBUG_PUSH
5450
        xmlGenericError(xmlGenericErrorContext,
5451
          "HPP: Parsing internal subset\n");
5452
#endif
5453
0
        htmlParseDocTypeDecl(ctxt);
5454
0
        ctxt->instate = XML_PARSER_PROLOG;
5455
#ifdef DEBUG_PUSH
5456
        xmlGenericError(xmlGenericErrorContext,
5457
          "HPP: entering PROLOG\n");
5458
#endif
5459
0
                } else {
5460
0
        ctxt->instate = XML_PARSER_MISC;
5461
#ifdef DEBUG_PUSH
5462
        xmlGenericError(xmlGenericErrorContext,
5463
          "HPP: entering MISC\n");
5464
#endif
5465
0
    }
5466
0
    break;
5467
0
            case XML_PARSER_MISC:
5468
0
    SKIP_BLANKS;
5469
0
    if (in->buf == NULL)
5470
0
        avail = in->length - (in->cur - in->base);
5471
0
    else
5472
0
        avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5473
    /*
5474
     * no chars in buffer
5475
     */
5476
0
    if (avail < 1)
5477
0
        goto done;
5478
    /*
5479
     * not enough chars in buffer
5480
     */
5481
0
    if (avail < 2) {
5482
0
        if (!terminate)
5483
0
      goto done;
5484
0
        else
5485
0
      next = ' ';
5486
0
    } else {
5487
0
        next = in->cur[1];
5488
0
    }
5489
0
    cur = in->cur[0];
5490
0
          if ((cur == '<') && (next == '!') &&
5491
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5492
0
        if ((!terminate) &&
5493
0
            (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5494
0
      goto done;
5495
#ifdef DEBUG_PUSH
5496
        xmlGenericError(xmlGenericErrorContext,
5497
          "HPP: Parsing Comment\n");
5498
#endif
5499
0
        htmlParseComment(ctxt);
5500
0
        ctxt->instate = XML_PARSER_MISC;
5501
0
          } else if ((cur == '<') && (next == '?')) {
5502
0
        if ((!terminate) &&
5503
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5504
0
      goto done;
5505
#ifdef DEBUG_PUSH
5506
        xmlGenericError(xmlGenericErrorContext,
5507
          "HPP: Parsing PI\n");
5508
#endif
5509
0
        htmlParsePI(ctxt);
5510
0
        ctxt->instate = XML_PARSER_MISC;
5511
0
    } else if ((cur == '<') && (next == '!') &&
5512
0
        (UPP(2) == 'D') && (UPP(3) == 'O') &&
5513
0
        (UPP(4) == 'C') && (UPP(5) == 'T') &&
5514
0
        (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5515
0
        (UPP(8) == 'E')) {
5516
0
        if ((!terminate) &&
5517
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5518
0
      goto done;
5519
#ifdef DEBUG_PUSH
5520
        xmlGenericError(xmlGenericErrorContext,
5521
          "HPP: Parsing internal subset\n");
5522
#endif
5523
0
        htmlParseDocTypeDecl(ctxt);
5524
0
        ctxt->instate = XML_PARSER_PROLOG;
5525
#ifdef DEBUG_PUSH
5526
        xmlGenericError(xmlGenericErrorContext,
5527
          "HPP: entering PROLOG\n");
5528
#endif
5529
0
    } else if ((cur == '<') && (next == '!') &&
5530
0
               (avail < 9)) {
5531
0
        goto done;
5532
0
    } else {
5533
0
        ctxt->instate = XML_PARSER_START_TAG;
5534
#ifdef DEBUG_PUSH
5535
        xmlGenericError(xmlGenericErrorContext,
5536
          "HPP: entering START_TAG\n");
5537
#endif
5538
0
    }
5539
0
    break;
5540
0
            case XML_PARSER_PROLOG:
5541
0
    SKIP_BLANKS;
5542
0
    if (in->buf == NULL)
5543
0
        avail = in->length - (in->cur - in->base);
5544
0
    else
5545
0
        avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5546
0
    if (avail < 2)
5547
0
        goto done;
5548
0
    cur = in->cur[0];
5549
0
    next = in->cur[1];
5550
0
    if ((cur == '<') && (next == '!') &&
5551
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5552
0
        if ((!terminate) &&
5553
0
            (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5554
0
      goto done;
5555
#ifdef DEBUG_PUSH
5556
        xmlGenericError(xmlGenericErrorContext,
5557
          "HPP: Parsing Comment\n");
5558
#endif
5559
0
        htmlParseComment(ctxt);
5560
0
        ctxt->instate = XML_PARSER_PROLOG;
5561
0
          } else if ((cur == '<') && (next == '?')) {
5562
0
        if ((!terminate) &&
5563
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5564
0
      goto done;
5565
#ifdef DEBUG_PUSH
5566
        xmlGenericError(xmlGenericErrorContext,
5567
          "HPP: Parsing PI\n");
5568
#endif
5569
0
        htmlParsePI(ctxt);
5570
0
        ctxt->instate = XML_PARSER_PROLOG;
5571
0
    } else if ((cur == '<') && (next == '!') &&
5572
0
               (avail < 4)) {
5573
0
        goto done;
5574
0
    } else {
5575
0
        ctxt->instate = XML_PARSER_START_TAG;
5576
#ifdef DEBUG_PUSH
5577
        xmlGenericError(xmlGenericErrorContext,
5578
          "HPP: entering START_TAG\n");
5579
#endif
5580
0
    }
5581
0
    break;
5582
0
            case XML_PARSER_EPILOG:
5583
0
    if (in->buf == NULL)
5584
0
        avail = in->length - (in->cur - in->base);
5585
0
    else
5586
0
        avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base);
5587
0
    if (avail < 1)
5588
0
        goto done;
5589
0
    cur = in->cur[0];
5590
0
    if (IS_BLANK_CH(cur)) {
5591
0
        htmlParseCharData(ctxt);
5592
0
        goto done;
5593
0
    }
5594
0
    if (avail < 2)
5595
0
        goto done;
5596
0
    next = in->cur[1];
5597
0
          if ((cur == '<') && (next == '!') &&
5598
0
        (in->cur[2] == '-') && (in->cur[3] == '-')) {
5599
0
        if ((!terminate) &&
5600
0
            (htmlParseLookupSequence(ctxt, '-', '-', '>', 1, 1) < 0))
5601
0
      goto done;
5602
#ifdef DEBUG_PUSH
5603
        xmlGenericError(xmlGenericErrorContext,
5604
          "HPP: Parsing Comment\n");
5605
#endif
5606
0
        htmlParseComment(ctxt);
5607
0
        ctxt->instate = XML_PARSER_EPILOG;
5608
0
          } else if ((cur == '<') && (next == '?')) {
5609
0
        if ((!terminate) &&
5610
0
            (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5611
0
      goto done;
5612
#ifdef DEBUG_PUSH
5613
        xmlGenericError(xmlGenericErrorContext,
5614
          "HPP: Parsing PI\n");
5615
#endif
5616
0
        htmlParsePI(ctxt);
5617
0
        ctxt->instate = XML_PARSER_EPILOG;
5618
0
    } else if ((cur == '<') && (next == '!') &&
5619
0
               (avail < 4)) {
5620
0
        goto done;
5621
0
    } else {
5622
0
        ctxt->errNo = XML_ERR_DOCUMENT_END;
5623
0
        ctxt->wellFormed = 0;
5624
0
        ctxt->instate = XML_PARSER_EOF;
5625
#ifdef DEBUG_PUSH
5626
        xmlGenericError(xmlGenericErrorContext,
5627
          "HPP: entering EOF\n");
5628
#endif
5629
0
        if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
5630
0
      ctxt->sax->endDocument(ctxt->userData);
5631
0
        goto done;
5632
0
    }
5633
0
    break;
5634
0
            case XML_PARSER_START_TAG: {
5635
0
          const xmlChar *name;
5636
0
    int failed;
5637
0
    const htmlElemDesc * info;
5638
5639
    /*
5640
     * no chars in buffer
5641
     */
5642
0
    if (avail < 1)
5643
0
        goto done;
5644
    /*
5645
     * not enough chars in buffer
5646
     */
5647
0
    if (avail < 2) {
5648
0
        if (!terminate)
5649
0
      goto done;
5650
0
        else
5651
0
      next = ' ';
5652
0
    } else {
5653
0
        next = in->cur[1];
5654
0
    }
5655
0
    cur = in->cur[0];
5656
0
          if (cur != '<') {
5657
0
        ctxt->instate = XML_PARSER_CONTENT;
5658
#ifdef DEBUG_PUSH
5659
        xmlGenericError(xmlGenericErrorContext,
5660
          "HPP: entering CONTENT\n");
5661
#endif
5662
0
        break;
5663
0
    }
5664
0
    if (next == '/') {
5665
0
        ctxt->instate = XML_PARSER_END_TAG;
5666
0
        ctxt->checkIndex = 0;
5667
#ifdef DEBUG_PUSH
5668
        xmlGenericError(xmlGenericErrorContext,
5669
          "HPP: entering END_TAG\n");
5670
#endif
5671
0
        break;
5672
0
    }
5673
0
    if ((!terminate) &&
5674
0
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5675
0
        goto done;
5676
5677
                /* Capture start position */
5678
0
          if (ctxt->record_info) {
5679
0
               node_info.begin_pos = ctxt->input->consumed +
5680
0
                                  (CUR_PTR - ctxt->input->base);
5681
0
               node_info.begin_line = ctxt->input->line;
5682
0
          }
5683
5684
5685
0
    failed = htmlParseStartTag(ctxt);
5686
0
    name = ctxt->name;
5687
0
    if ((failed == -1) ||
5688
0
        (name == NULL)) {
5689
0
        if (CUR == '>')
5690
0
      NEXT;
5691
0
        break;
5692
0
    }
5693
5694
    /*
5695
     * Lookup the info for that element.
5696
     */
5697
0
    info = htmlTagLookup(name);
5698
0
    if (info == NULL) {
5699
0
        htmlParseErr(ctxt, XML_HTML_UNKNOWN_TAG,
5700
0
                     "Tag %s invalid\n", name, NULL);
5701
0
    }
5702
5703
    /*
5704
     * Check for an Empty Element labeled the XML/SGML way
5705
     */
5706
0
    if ((CUR == '/') && (NXT(1) == '>')) {
5707
0
        SKIP(2);
5708
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5709
0
      ctxt->sax->endElement(ctxt->userData, name);
5710
0
        htmlnamePop(ctxt);
5711
0
        ctxt->instate = XML_PARSER_CONTENT;
5712
#ifdef DEBUG_PUSH
5713
        xmlGenericError(xmlGenericErrorContext,
5714
          "HPP: entering CONTENT\n");
5715
#endif
5716
0
        break;
5717
0
    }
5718
5719
0
    if (CUR == '>') {
5720
0
        NEXT;
5721
0
    } else {
5722
0
        htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
5723
0
                     "Couldn't find end of Start Tag %s\n",
5724
0
         name, NULL);
5725
5726
        /*
5727
         * end of parsing of this node.
5728
         */
5729
0
        if (xmlStrEqual(name, ctxt->name)) {
5730
0
      nodePop(ctxt);
5731
0
      htmlnamePop(ctxt);
5732
0
        }
5733
5734
0
        if (ctxt->record_info)
5735
0
            htmlNodeInfoPush(ctxt, &node_info);
5736
5737
0
        ctxt->instate = XML_PARSER_CONTENT;
5738
#ifdef DEBUG_PUSH
5739
        xmlGenericError(xmlGenericErrorContext,
5740
          "HPP: entering CONTENT\n");
5741
#endif
5742
0
        break;
5743
0
    }
5744
5745
    /*
5746
     * Check for an Empty Element from DTD definition
5747
     */
5748
0
    if ((info != NULL) && (info->empty)) {
5749
0
        if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
5750
0
      ctxt->sax->endElement(ctxt->userData, name);
5751
0
        htmlnamePop(ctxt);
5752
0
    }
5753
5754
0
                if (ctxt->record_info)
5755
0
              htmlNodeInfoPush(ctxt, &node_info);
5756
5757
0
    ctxt->instate = XML_PARSER_CONTENT;
5758
#ifdef DEBUG_PUSH
5759
    xmlGenericError(xmlGenericErrorContext,
5760
      "HPP: entering CONTENT\n");
5761
#endif
5762
0
                break;
5763
0
      }
5764
0
            case XML_PARSER_CONTENT: {
5765
0
    long cons;
5766
                /*
5767
     * Handle preparsed entities and charRef
5768
     */
5769
0
    if (ctxt->token != 0) {
5770
0
        xmlChar chr[2] = { 0 , 0 } ;
5771
5772
0
        chr[0] = (xmlChar) ctxt->token;
5773
0
        htmlCheckParagraph(ctxt);
5774
0
        if ((ctxt->sax != NULL) && (ctxt->sax->characters != NULL))
5775
0
      ctxt->sax->characters(ctxt->userData, chr, 1);
5776
0
        ctxt->token = 0;
5777
0
        ctxt->checkIndex = 0;
5778
0
    }
5779
0
    if ((avail == 1) && (terminate)) {
5780
0
        cur = in->cur[0];
5781
0
        if ((cur != '<') && (cur != '&')) {
5782
0
      if (ctxt->sax != NULL) {
5783
0
          if (IS_BLANK_CH(cur)) {
5784
0
        if (ctxt->keepBlanks) {
5785
0
            if (ctxt->sax->characters != NULL)
5786
0
          ctxt->sax->characters(
5787
0
            ctxt->userData, &in->cur[0], 1);
5788
0
        } else {
5789
0
            if (ctxt->sax->ignorableWhitespace != NULL)
5790
0
          ctxt->sax->ignorableWhitespace(
5791
0
            ctxt->userData, &in->cur[0], 1);
5792
0
        }
5793
0
          } else {
5794
0
        htmlCheckParagraph(ctxt);
5795
0
        if (ctxt->sax->characters != NULL)
5796
0
            ctxt->sax->characters(
5797
0
              ctxt->userData, &in->cur[0], 1);
5798
0
          }
5799
0
      }
5800
0
      ctxt->token = 0;
5801
0
      ctxt->checkIndex = 0;
5802
0
      in->cur++;
5803
0
      break;
5804
0
        }
5805
0
    }
5806
0
    if (avail < 2)
5807
0
        goto done;
5808
0
    cur = in->cur[0];
5809
0
    next = in->cur[1];
5810
0
    cons = ctxt->nbChars;
5811
0
    if ((xmlStrEqual(ctxt->name, BAD_CAST"script")) ||
5812
0
        (xmlStrEqual(ctxt->name, BAD_CAST"style"))) {
5813
        /*
5814
         * Handle SCRIPT/STYLE separately
5815
         */
5816
0
        if (!terminate) {
5817
0
            int idx;
5818
0
      xmlChar val;
5819
5820
0
      idx = htmlParseLookupSequence(ctxt, '<', '/', 0, 0, 0);
5821
0
      if (idx < 0)
5822
0
          goto done;
5823
0
            val = in->cur[idx + 2];
5824
0
      if (val == 0) /* bad cut of input */
5825
0
          goto done;
5826
0
        }
5827
0
        htmlParseScript(ctxt);
5828
0
        if ((cur == '<') && (next == '/')) {
5829
0
      ctxt->instate = XML_PARSER_END_TAG;
5830
0
      ctxt->checkIndex = 0;
5831
#ifdef DEBUG_PUSH
5832
      xmlGenericError(xmlGenericErrorContext,
5833
        "HPP: entering END_TAG\n");
5834
#endif
5835
0
      break;
5836
0
        }
5837
0
    } else {
5838
        /*
5839
         * Sometimes DOCTYPE arrives in the middle of the document
5840
         */
5841
0
        if ((cur == '<') && (next == '!') &&
5842
0
      (UPP(2) == 'D') && (UPP(3) == 'O') &&
5843
0
      (UPP(4) == 'C') && (UPP(5) == 'T') &&
5844
0
      (UPP(6) == 'Y') && (UPP(7) == 'P') &&
5845
0
      (UPP(8) == 'E')) {
5846
0
      if ((!terminate) &&
5847
0
          (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5848
0
          goto done;
5849
0
      htmlParseErr(ctxt, XML_HTML_STRUCURE_ERROR,
5850
0
                   "Misplaced DOCTYPE declaration\n",
5851
0
             BAD_CAST "DOCTYPE" , NULL);
5852
0
      htmlParseDocTypeDecl(ctxt);
5853
0
        } else if ((cur == '<') && (next == '!') &&
5854
0
      (in->cur[2] == '-') && (in->cur[3] == '-')) {
5855
0
      if ((!terminate) &&
5856
0
          (htmlParseLookupSequence(
5857
0
        ctxt, '-', '-', '>', 1, 1) < 0))
5858
0
          goto done;
5859
#ifdef DEBUG_PUSH
5860
      xmlGenericError(xmlGenericErrorContext,
5861
        "HPP: Parsing Comment\n");
5862
#endif
5863
0
      htmlParseComment(ctxt);
5864
0
      ctxt->instate = XML_PARSER_CONTENT;
5865
0
        } else if ((cur == '<') && (next == '?')) {
5866
0
      if ((!terminate) &&
5867
0
          (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5868
0
          goto done;
5869
#ifdef DEBUG_PUSH
5870
      xmlGenericError(xmlGenericErrorContext,
5871
        "HPP: Parsing PI\n");
5872
#endif
5873
0
      htmlParsePI(ctxt);
5874
0
      ctxt->instate = XML_PARSER_CONTENT;
5875
0
        } else if ((cur == '<') && (next == '!') && (avail < 4)) {
5876
0
      goto done;
5877
0
        } else if ((cur == '<') && (next == '/')) {
5878
0
      ctxt->instate = XML_PARSER_END_TAG;
5879
0
      ctxt->checkIndex = 0;
5880
#ifdef DEBUG_PUSH
5881
      xmlGenericError(xmlGenericErrorContext,
5882
        "HPP: entering END_TAG\n");
5883
#endif
5884
0
      break;
5885
0
        } else if (cur == '<') {
5886
0
      ctxt->instate = XML_PARSER_START_TAG;
5887
0
      ctxt->checkIndex = 0;
5888
#ifdef DEBUG_PUSH
5889
      xmlGenericError(xmlGenericErrorContext,
5890
        "HPP: entering START_TAG\n");
5891
#endif
5892
0
      break;
5893
0
        } else if (cur == '&') {
5894
0
      if ((!terminate) &&
5895
0
          (htmlParseLookupChars(ctxt,
5896
0
                                                  BAD_CAST "; >/", 4) < 0))
5897
0
          goto done;
5898
#ifdef DEBUG_PUSH
5899
      xmlGenericError(xmlGenericErrorContext,
5900
        "HPP: Parsing Reference\n");
5901
#endif
5902
      /* TODO: check generation of subtrees if noent !!! */
5903
0
      htmlParseReference(ctxt);
5904
0
        } else {
5905
            /*
5906
       * check that the text sequence is complete
5907
       * before handing out the data to the parser
5908
       * to avoid problems with erroneous end of
5909
       * data detection.
5910
       */
5911
0
      if ((!terminate) &&
5912
0
                            (htmlParseLookupChars(ctxt, BAD_CAST "<&", 2) < 0))
5913
0
          goto done;
5914
0
      ctxt->checkIndex = 0;
5915
#ifdef DEBUG_PUSH
5916
      xmlGenericError(xmlGenericErrorContext,
5917
        "HPP: Parsing char data\n");
5918
#endif
5919
0
      htmlParseCharData(ctxt);
5920
0
        }
5921
0
    }
5922
0
    if (cons == ctxt->nbChars) {
5923
0
        if (ctxt->node != NULL) {
5924
0
      htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5925
0
                   "detected an error in element content\n",
5926
0
             NULL, NULL);
5927
0
        }
5928
0
        NEXT;
5929
0
        break;
5930
0
    }
5931
5932
0
    break;
5933
0
      }
5934
0
            case XML_PARSER_END_TAG:
5935
0
    if (avail < 2)
5936
0
        goto done;
5937
0
    if ((!terminate) &&
5938
0
        (htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
5939
0
        goto done;
5940
0
    htmlParseEndTag(ctxt);
5941
0
    if (ctxt->nameNr == 0) {
5942
0
        ctxt->instate = XML_PARSER_EPILOG;
5943
0
    } else {
5944
0
        ctxt->instate = XML_PARSER_CONTENT;
5945
0
    }
5946
0
    ctxt->checkIndex = 0;
5947
#ifdef DEBUG_PUSH
5948
    xmlGenericError(xmlGenericErrorContext,
5949
      "HPP: entering CONTENT\n");
5950
#endif
5951
0
          break;
5952
0
            case XML_PARSER_CDATA_SECTION:
5953
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5954
0
      "HPP: internal error, state == CDATA\n",
5955
0
           NULL, NULL);
5956
0
    ctxt->instate = XML_PARSER_CONTENT;
5957
0
    ctxt->checkIndex = 0;
5958
#ifdef DEBUG_PUSH
5959
    xmlGenericError(xmlGenericErrorContext,
5960
      "HPP: entering CONTENT\n");
5961
#endif
5962
0
    break;
5963
0
            case XML_PARSER_DTD:
5964
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5965
0
      "HPP: internal error, state == DTD\n",
5966
0
           NULL, NULL);
5967
0
    ctxt->instate = XML_PARSER_CONTENT;
5968
0
    ctxt->checkIndex = 0;
5969
#ifdef DEBUG_PUSH
5970
    xmlGenericError(xmlGenericErrorContext,
5971
      "HPP: entering CONTENT\n");
5972
#endif
5973
0
    break;
5974
0
            case XML_PARSER_COMMENT:
5975
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5976
0
      "HPP: internal error, state == COMMENT\n",
5977
0
           NULL, NULL);
5978
0
    ctxt->instate = XML_PARSER_CONTENT;
5979
0
    ctxt->checkIndex = 0;
5980
#ifdef DEBUG_PUSH
5981
    xmlGenericError(xmlGenericErrorContext,
5982
      "HPP: entering CONTENT\n");
5983
#endif
5984
0
    break;
5985
0
            case XML_PARSER_PI:
5986
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5987
0
      "HPP: internal error, state == PI\n",
5988
0
           NULL, NULL);
5989
0
    ctxt->instate = XML_PARSER_CONTENT;
5990
0
    ctxt->checkIndex = 0;
5991
#ifdef DEBUG_PUSH
5992
    xmlGenericError(xmlGenericErrorContext,
5993
      "HPP: entering CONTENT\n");
5994
#endif
5995
0
    break;
5996
0
            case XML_PARSER_ENTITY_DECL:
5997
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
5998
0
      "HPP: internal error, state == ENTITY_DECL\n",
5999
0
           NULL, NULL);
6000
0
    ctxt->instate = XML_PARSER_CONTENT;
6001
0
    ctxt->checkIndex = 0;
6002
#ifdef DEBUG_PUSH
6003
    xmlGenericError(xmlGenericErrorContext,
6004
      "HPP: entering CONTENT\n");
6005
#endif
6006
0
    break;
6007
0
            case XML_PARSER_ENTITY_VALUE:
6008
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6009
0
      "HPP: internal error, state == ENTITY_VALUE\n",
6010
0
           NULL, NULL);
6011
0
    ctxt->instate = XML_PARSER_CONTENT;
6012
0
    ctxt->checkIndex = 0;
6013
#ifdef DEBUG_PUSH
6014
    xmlGenericError(xmlGenericErrorContext,
6015
      "HPP: entering DTD\n");
6016
#endif
6017
0
    break;
6018
0
            case XML_PARSER_ATTRIBUTE_VALUE:
6019
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6020
0
      "HPP: internal error, state == ATTRIBUTE_VALUE\n",
6021
0
           NULL, NULL);
6022
0
    ctxt->instate = XML_PARSER_START_TAG;
6023
0
    ctxt->checkIndex = 0;
6024
#ifdef DEBUG_PUSH
6025
    xmlGenericError(xmlGenericErrorContext,
6026
      "HPP: entering START_TAG\n");
6027
#endif
6028
0
    break;
6029
0
      case XML_PARSER_SYSTEM_LITERAL:
6030
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6031
0
        "HPP: internal error, state == XML_PARSER_SYSTEM_LITERAL\n",
6032
0
           NULL, NULL);
6033
0
    ctxt->instate = XML_PARSER_CONTENT;
6034
0
    ctxt->checkIndex = 0;
6035
#ifdef DEBUG_PUSH
6036
    xmlGenericError(xmlGenericErrorContext,
6037
      "HPP: entering CONTENT\n");
6038
#endif
6039
0
    break;
6040
0
      case XML_PARSER_IGNORE:
6041
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6042
0
      "HPP: internal error, state == XML_PARSER_IGNORE\n",
6043
0
           NULL, NULL);
6044
0
    ctxt->instate = XML_PARSER_CONTENT;
6045
0
    ctxt->checkIndex = 0;
6046
#ifdef DEBUG_PUSH
6047
    xmlGenericError(xmlGenericErrorContext,
6048
      "HPP: entering CONTENT\n");
6049
#endif
6050
0
    break;
6051
0
      case XML_PARSER_PUBLIC_LITERAL:
6052
0
    htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6053
0
      "HPP: internal error, state == XML_PARSER_LITERAL\n",
6054
0
           NULL, NULL);
6055
0
    ctxt->instate = XML_PARSER_CONTENT;
6056
0
    ctxt->checkIndex = 0;
6057
#ifdef DEBUG_PUSH
6058
    xmlGenericError(xmlGenericErrorContext,
6059
      "HPP: entering CONTENT\n");
6060
#endif
6061
0
    break;
6062
6063
0
  }
6064
0
    }
6065
0
done:
6066
0
    if ((avail == 0) && (terminate)) {
6067
0
  htmlAutoCloseOnEnd(ctxt);
6068
0
  if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) {
6069
      /*
6070
       * SAX: end of the document processing.
6071
       */
6072
0
      ctxt->instate = XML_PARSER_EOF;
6073
0
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6074
0
    ctxt->sax->endDocument(ctxt->userData);
6075
0
  }
6076
0
    }
6077
0
    if ((!(ctxt->options & HTML_PARSE_NODEFDTD)) && (ctxt->myDoc != NULL) &&
6078
0
  ((terminate) || (ctxt->instate == XML_PARSER_EOF) ||
6079
0
   (ctxt->instate == XML_PARSER_EPILOG))) {
6080
0
  xmlDtdPtr dtd;
6081
0
  dtd = xmlGetIntSubset(ctxt->myDoc);
6082
0
  if (dtd == NULL)
6083
0
      ctxt->myDoc->intSubset =
6084
0
    xmlCreateIntSubset(ctxt->myDoc, BAD_CAST "html",
6085
0
        BAD_CAST "-//W3C//DTD HTML 4.0 Transitional//EN",
6086
0
        BAD_CAST "http://www.w3.org/TR/REC-html40/loose.dtd");
6087
0
    }
6088
#ifdef DEBUG_PUSH
6089
    xmlGenericError(xmlGenericErrorContext, "HPP: done %d\n", ret);
6090
#endif
6091
0
    return(ret);
6092
0
}
6093
6094
/**
6095
 * htmlParseChunk:
6096
 * @ctxt:  an HTML parser context
6097
 * @chunk:  an char array
6098
 * @size:  the size in byte of the chunk
6099
 * @terminate:  last chunk indicator
6100
 *
6101
 * Parse a Chunk of memory
6102
 *
6103
 * Returns zero if no error, the xmlParserErrors otherwise.
6104
 */
6105
int
6106
htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
6107
0
              int terminate) {
6108
0
    if ((ctxt == NULL) || (ctxt->input == NULL)) {
6109
0
  htmlParseErr(ctxt, XML_ERR_INTERNAL_ERROR,
6110
0
         "htmlParseChunk: context error\n", NULL, NULL);
6111
0
  return(XML_ERR_INTERNAL_ERROR);
6112
0
    }
6113
0
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6114
0
        (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF))  {
6115
0
  size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6116
0
  size_t cur = ctxt->input->cur - ctxt->input->base;
6117
0
  int res;
6118
6119
0
  res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6120
0
  if (res < 0) {
6121
0
      ctxt->errNo = XML_PARSER_EOF;
6122
0
      ctxt->disableSAX = 1;
6123
0
      return (XML_PARSER_EOF);
6124
0
  }
6125
0
        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6126
#ifdef DEBUG_PUSH
6127
  xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6128
#endif
6129
6130
#if 0
6131
  if ((terminate) || (ctxt->input->buf->buffer->use > 80))
6132
      htmlParseTryOrFinish(ctxt, terminate);
6133
#endif
6134
0
    } else if (ctxt->instate != XML_PARSER_EOF) {
6135
0
  if ((ctxt->input != NULL) && ctxt->input->buf != NULL) {
6136
0
      xmlParserInputBufferPtr in = ctxt->input->buf;
6137
0
      if ((in->encoder != NULL) && (in->buffer != NULL) &&
6138
0
        (in->raw != NULL)) {
6139
0
    int nbchars;
6140
0
    size_t base = xmlBufGetInputBase(in->buffer, ctxt->input);
6141
0
    size_t current = ctxt->input->cur - ctxt->input->base;
6142
6143
0
    nbchars = xmlCharEncInput(in, terminate);
6144
0
    if (nbchars < 0) {
6145
0
        htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING,
6146
0
               "encoder error\n", NULL, NULL);
6147
0
        return(XML_ERR_INVALID_ENCODING);
6148
0
    }
6149
0
    xmlBufSetInputBaseCur(in->buffer, ctxt->input, base, current);
6150
0
      }
6151
0
  }
6152
0
    }
6153
0
    htmlParseTryOrFinish(ctxt, terminate);
6154
0
    if (terminate) {
6155
0
  if ((ctxt->instate != XML_PARSER_EOF) &&
6156
0
      (ctxt->instate != XML_PARSER_EPILOG) &&
6157
0
      (ctxt->instate != XML_PARSER_MISC)) {
6158
0
      ctxt->errNo = XML_ERR_DOCUMENT_END;
6159
0
      ctxt->wellFormed = 0;
6160
0
  }
6161
0
  if (ctxt->instate != XML_PARSER_EOF) {
6162
0
      if ((ctxt->sax) && (ctxt->sax->endDocument != NULL))
6163
0
    ctxt->sax->endDocument(ctxt->userData);
6164
0
  }
6165
0
  ctxt->instate = XML_PARSER_EOF;
6166
0
    }
6167
0
    return((xmlParserErrors) ctxt->errNo);
6168
0
}
6169
6170
/************************************************************************
6171
 *                  *
6172
 *      User entry points       *
6173
 *                  *
6174
 ************************************************************************/
6175
6176
/**
6177
 * htmlCreatePushParserCtxt:
6178
 * @sax:  a SAX handler
6179
 * @user_data:  The user data returned on SAX callbacks
6180
 * @chunk:  a pointer to an array of chars
6181
 * @size:  number of chars in the array
6182
 * @filename:  an optional file name or URI
6183
 * @enc:  an optional encoding
6184
 *
6185
 * Create a parser context for using the HTML parser in push mode
6186
 * The value of @filename is used for fetching external entities
6187
 * and error/warning reports.
6188
 *
6189
 * Returns the new parser context or NULL
6190
 */
6191
htmlParserCtxtPtr
6192
htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data,
6193
                         const char *chunk, int size, const char *filename,
6194
0
       xmlCharEncoding enc) {
6195
0
    htmlParserCtxtPtr ctxt;
6196
0
    htmlParserInputPtr inputStream;
6197
0
    xmlParserInputBufferPtr buf;
6198
6199
0
    xmlInitParser();
6200
6201
0
    buf = xmlAllocParserInputBuffer(enc);
6202
0
    if (buf == NULL) return(NULL);
6203
6204
0
    ctxt = htmlNewParserCtxt();
6205
0
    if (ctxt == NULL) {
6206
0
  xmlFreeParserInputBuffer(buf);
6207
0
  return(NULL);
6208
0
    }
6209
0
    if(enc==XML_CHAR_ENCODING_UTF8 || buf->encoder)
6210
0
  ctxt->charset=XML_CHAR_ENCODING_UTF8;
6211
0
    if (sax != NULL) {
6212
0
  if (ctxt->sax != (xmlSAXHandlerPtr) &htmlDefaultSAXHandler)
6213
0
      xmlFree(ctxt->sax);
6214
0
  ctxt->sax = (htmlSAXHandlerPtr) xmlMalloc(sizeof(htmlSAXHandler));
6215
0
  if (ctxt->sax == NULL) {
6216
0
      xmlFree(buf);
6217
0
      xmlFree(ctxt);
6218
0
      return(NULL);
6219
0
  }
6220
0
  memcpy(ctxt->sax, sax, sizeof(htmlSAXHandler));
6221
0
  if (user_data != NULL)
6222
0
      ctxt->userData = user_data;
6223
0
    }
6224
0
    if (filename == NULL) {
6225
0
  ctxt->directory = NULL;
6226
0
    } else {
6227
0
        ctxt->directory = xmlParserGetDirectory(filename);
6228
0
    }
6229
6230
0
    inputStream = htmlNewInputStream(ctxt);
6231
0
    if (inputStream == NULL) {
6232
0
  xmlFreeParserCtxt(ctxt);
6233
0
  xmlFree(buf);
6234
0
  return(NULL);
6235
0
    }
6236
6237
0
    if (filename == NULL)
6238
0
  inputStream->filename = NULL;
6239
0
    else
6240
0
  inputStream->filename = (char *)
6241
0
      xmlCanonicPath((const xmlChar *) filename);
6242
0
    inputStream->buf = buf;
6243
0
    xmlBufResetInput(buf->buffer, inputStream);
6244
6245
0
    inputPush(ctxt, inputStream);
6246
6247
0
    if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) &&
6248
0
        (ctxt->input->buf != NULL))  {
6249
0
  size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input);
6250
0
  size_t cur = ctxt->input->cur - ctxt->input->base;
6251
6252
0
  xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
6253
6254
0
        xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur);
6255
#ifdef DEBUG_PUSH
6256
  xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
6257
#endif
6258
0
    }
6259
0
    ctxt->progressive = 1;
6260
6261
0
    return(ctxt);
6262
0
}
6263
#endif /* LIBXML_PUSH_ENABLED */
6264
6265
/**
6266
 * htmlSAXParseDoc:
6267
 * @cur:  a pointer to an array of xmlChar
6268
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6269
 * @sax:  the SAX handler block
6270
 * @userData: if using SAX, this pointer will be provided on callbacks.
6271
 *
6272
 * Parse an HTML in-memory document. If sax is not NULL, use the SAX callbacks
6273
 * to handle parse events. If sax is NULL, fallback to the default DOM
6274
 * behavior and return a tree.
6275
 *
6276
 * Returns the resulting document tree unless SAX is NULL or the document is
6277
 *     not well formed.
6278
 */
6279
6280
htmlDocPtr
6281
htmlSAXParseDoc(const xmlChar *cur, const char *encoding,
6282
0
                htmlSAXHandlerPtr sax, void *userData) {
6283
0
    htmlDocPtr ret;
6284
0
    htmlParserCtxtPtr ctxt;
6285
6286
0
    xmlInitParser();
6287
6288
0
    if (cur == NULL) return(NULL);
6289
6290
6291
0
    ctxt = htmlCreateDocParserCtxt(cur, encoding);
6292
0
    if (ctxt == NULL) return(NULL);
6293
0
    if (sax != NULL) {
6294
0
        if (ctxt->sax != NULL) xmlFree (ctxt->sax);
6295
0
        ctxt->sax = sax;
6296
0
        ctxt->userData = userData;
6297
0
    }
6298
6299
0
    htmlParseDocument(ctxt);
6300
0
    ret = ctxt->myDoc;
6301
0
    if (sax != NULL) {
6302
0
  ctxt->sax = NULL;
6303
0
  ctxt->userData = NULL;
6304
0
    }
6305
0
    htmlFreeParserCtxt(ctxt);
6306
6307
0
    return(ret);
6308
0
}
6309
6310
/**
6311
 * htmlParseDoc:
6312
 * @cur:  a pointer to an array of xmlChar
6313
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6314
 *
6315
 * parse an HTML in-memory document and build a tree.
6316
 *
6317
 * Returns the resulting document tree
6318
 */
6319
6320
htmlDocPtr
6321
0
htmlParseDoc(const xmlChar *cur, const char *encoding) {
6322
0
    return(htmlSAXParseDoc(cur, encoding, NULL, NULL));
6323
0
}
6324
6325
6326
/**
6327
 * htmlCreateFileParserCtxt:
6328
 * @filename:  the filename
6329
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6330
 *
6331
 * Create a parser context for a file content.
6332
 * Automatic support for ZLIB/Compress compressed document is provided
6333
 * by default if found at compile-time.
6334
 *
6335
 * Returns the new parser context or NULL
6336
 */
6337
htmlParserCtxtPtr
6338
htmlCreateFileParserCtxt(const char *filename, const char *encoding)
6339
0
{
6340
0
    htmlParserCtxtPtr ctxt;
6341
0
    htmlParserInputPtr inputStream;
6342
0
    char *canonicFilename;
6343
    /* htmlCharEncoding enc; */
6344
0
    xmlChar *content, *content_line = (xmlChar *) "charset=";
6345
6346
0
    if (filename == NULL)
6347
0
        return(NULL);
6348
6349
0
    ctxt = htmlNewParserCtxt();
6350
0
    if (ctxt == NULL) {
6351
0
  return(NULL);
6352
0
    }
6353
0
    canonicFilename = (char *) xmlCanonicPath((const xmlChar *) filename);
6354
0
    if (canonicFilename == NULL) {
6355
0
#ifdef LIBXML_SAX1_ENABLED
6356
0
  if (xmlDefaultSAXHandler.error != NULL) {
6357
0
      xmlDefaultSAXHandler.error(NULL, "out of memory\n");
6358
0
  }
6359
0
#endif
6360
0
  xmlFreeParserCtxt(ctxt);
6361
0
  return(NULL);
6362
0
    }
6363
6364
0
    inputStream = xmlLoadExternalEntity(canonicFilename, NULL, ctxt);
6365
0
    xmlFree(canonicFilename);
6366
0
    if (inputStream == NULL) {
6367
0
  xmlFreeParserCtxt(ctxt);
6368
0
  return(NULL);
6369
0
    }
6370
6371
0
    inputPush(ctxt, inputStream);
6372
6373
    /* set encoding */
6374
0
    if (encoding) {
6375
0
        size_t l = strlen(encoding);
6376
6377
0
  if (l < 1000) {
6378
0
      content = xmlMallocAtomic (xmlStrlen(content_line) + l + 1);
6379
0
      if (content) {
6380
0
    strcpy ((char *)content, (char *)content_line);
6381
0
    strcat ((char *)content, (char *)encoding);
6382
0
    htmlCheckEncoding (ctxt, content);
6383
0
    xmlFree (content);
6384
0
      }
6385
0
  }
6386
0
    }
6387
6388
0
    return(ctxt);
6389
0
}
6390
6391
/**
6392
 * htmlSAXParseFile:
6393
 * @filename:  the filename
6394
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6395
 * @sax:  the SAX handler block
6396
 * @userData: if using SAX, this pointer will be provided on callbacks.
6397
 *
6398
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6399
 * compressed document is provided by default if found at compile-time.
6400
 * It use the given SAX function block to handle the parsing callback.
6401
 * If sax is NULL, fallback to the default DOM tree building routines.
6402
 *
6403
 * Returns the resulting document tree unless SAX is NULL or the document is
6404
 *     not well formed.
6405
 */
6406
6407
htmlDocPtr
6408
htmlSAXParseFile(const char *filename, const char *encoding, htmlSAXHandlerPtr sax,
6409
0
                 void *userData) {
6410
0
    htmlDocPtr ret;
6411
0
    htmlParserCtxtPtr ctxt;
6412
0
    htmlSAXHandlerPtr oldsax = NULL;
6413
6414
0
    xmlInitParser();
6415
6416
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6417
0
    if (ctxt == NULL) return(NULL);
6418
0
    if (sax != NULL) {
6419
0
  oldsax = ctxt->sax;
6420
0
        ctxt->sax = sax;
6421
0
        ctxt->userData = userData;
6422
0
    }
6423
6424
0
    htmlParseDocument(ctxt);
6425
6426
0
    ret = ctxt->myDoc;
6427
0
    if (sax != NULL) {
6428
0
        ctxt->sax = oldsax;
6429
0
        ctxt->userData = NULL;
6430
0
    }
6431
0
    htmlFreeParserCtxt(ctxt);
6432
6433
0
    return(ret);
6434
0
}
6435
6436
/**
6437
 * htmlParseFile:
6438
 * @filename:  the filename
6439
 * @encoding:  a free form C string describing the HTML document encoding, or NULL
6440
 *
6441
 * parse an HTML file and build a tree. Automatic support for ZLIB/Compress
6442
 * compressed document is provided by default if found at compile-time.
6443
 *
6444
 * Returns the resulting document tree
6445
 */
6446
6447
htmlDocPtr
6448
0
htmlParseFile(const char *filename, const char *encoding) {
6449
0
    return(htmlSAXParseFile(filename, encoding, NULL, NULL));
6450
0
}
6451
6452
/**
6453
 * htmlHandleOmittedElem:
6454
 * @val:  int 0 or 1
6455
 *
6456
 * Set and return the previous value for handling HTML omitted tags.
6457
 *
6458
 * Returns the last value for 0 for no handling, 1 for auto insertion.
6459
 */
6460
6461
int
6462
0
htmlHandleOmittedElem(int val) {
6463
0
    int old = htmlOmittedDefaultValue;
6464
6465
0
    htmlOmittedDefaultValue = val;
6466
0
    return(old);
6467
0
}
6468
6469
/**
6470
 * htmlElementAllowedHere:
6471
 * @parent: HTML parent element
6472
 * @elt: HTML element
6473
 *
6474
 * Checks whether an HTML element may be a direct child of a parent element.
6475
 * Note - doesn't check for deprecated elements
6476
 *
6477
 * Returns 1 if allowed; 0 otherwise.
6478
 */
6479
int
6480
0
htmlElementAllowedHere(const htmlElemDesc* parent, const xmlChar* elt) {
6481
0
  const char** p ;
6482
6483
0
  if ( ! elt || ! parent || ! parent->subelts )
6484
0
  return 0 ;
6485
6486
0
  for ( p = parent->subelts; *p; ++p )
6487
0
    if ( !xmlStrcmp((const xmlChar *)*p, elt) )
6488
0
      return 1 ;
6489
6490
0
  return 0 ;
6491
0
}
6492
/**
6493
 * htmlElementStatusHere:
6494
 * @parent: HTML parent element
6495
 * @elt: HTML element
6496
 *
6497
 * Checks whether an HTML element may be a direct child of a parent element.
6498
 * and if so whether it is valid or deprecated.
6499
 *
6500
 * Returns one of HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6501
 */
6502
htmlStatus
6503
0
htmlElementStatusHere(const htmlElemDesc* parent, const htmlElemDesc* elt) {
6504
0
  if ( ! parent || ! elt )
6505
0
    return HTML_INVALID ;
6506
0
  if ( ! htmlElementAllowedHere(parent, (const xmlChar*) elt->name ) )
6507
0
    return HTML_INVALID ;
6508
6509
0
  return ( elt->dtd == 0 ) ? HTML_VALID : HTML_DEPRECATED ;
6510
0
}
6511
/**
6512
 * htmlAttrAllowed:
6513
 * @elt: HTML element
6514
 * @attr: HTML attribute
6515
 * @legacy: whether to allow deprecated attributes
6516
 *
6517
 * Checks whether an attribute is valid for an element
6518
 * Has full knowledge of Required and Deprecated attributes
6519
 *
6520
 * Returns one of HTML_REQUIRED, HTML_VALID, HTML_DEPRECATED, HTML_INVALID
6521
 */
6522
htmlStatus
6523
0
htmlAttrAllowed(const htmlElemDesc* elt, const xmlChar* attr, int legacy) {
6524
0
  const char** p ;
6525
6526
0
  if ( !elt || ! attr )
6527
0
  return HTML_INVALID ;
6528
6529
0
  if ( elt->attrs_req )
6530
0
    for ( p = elt->attrs_req; *p; ++p)
6531
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6532
0
        return HTML_REQUIRED ;
6533
6534
0
  if ( elt->attrs_opt )
6535
0
    for ( p = elt->attrs_opt; *p; ++p)
6536
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6537
0
        return HTML_VALID ;
6538
6539
0
  if ( legacy && elt->attrs_depr )
6540
0
    for ( p = elt->attrs_depr; *p; ++p)
6541
0
      if ( !xmlStrcmp((const xmlChar*)*p, attr) )
6542
0
        return HTML_DEPRECATED ;
6543
6544
0
  return HTML_INVALID ;
6545
0
}
6546
/**
6547
 * htmlNodeStatus:
6548
 * @node: an htmlNodePtr in a tree
6549
 * @legacy: whether to allow deprecated elements (YES is faster here
6550
 *  for Element nodes)
6551
 *
6552
 * Checks whether the tree node is valid.  Experimental (the author
6553
 *     only uses the HTML enhancements in a SAX parser)
6554
 *
6555
 * Return: for Element nodes, a return from htmlElementAllowedHere (if
6556
 *  legacy allowed) or htmlElementStatusHere (otherwise).
6557
 *  for Attribute nodes, a return from htmlAttrAllowed
6558
 *  for other nodes, HTML_NA (no checks performed)
6559
 */
6560
htmlStatus
6561
0
htmlNodeStatus(const htmlNodePtr node, int legacy) {
6562
0
  if ( ! node )
6563
0
    return HTML_INVALID ;
6564
6565
0
  switch ( node->type ) {
6566
0
    case XML_ELEMENT_NODE:
6567
0
      return legacy
6568
0
  ? ( htmlElementAllowedHere (
6569
0
    htmlTagLookup(node->parent->name) , node->name
6570
0
    ) ? HTML_VALID : HTML_INVALID )
6571
0
  : htmlElementStatusHere(
6572
0
    htmlTagLookup(node->parent->name) ,
6573
0
    htmlTagLookup(node->name) )
6574
0
  ;
6575
0
    case XML_ATTRIBUTE_NODE:
6576
0
      return htmlAttrAllowed(
6577
0
  htmlTagLookup(node->parent->name) , node->name, legacy) ;
6578
0
    default: return HTML_NA ;
6579
0
  }
6580
0
}
6581
/************************************************************************
6582
 *                  *
6583
 *  New set (2.6.0) of simpler and more flexible APIs   *
6584
 *                  *
6585
 ************************************************************************/
6586
/**
 * DICT_FREE:
 * @str:  a string
 *
 * Free a string if it is not owned by the "dict" dictionary in the
 * current scope. A variable named dict must be visible where the macro
 * is used; a NULL dict means the string is always freed.
 *
 * The body is wrapped in do { } while (0) so the macro behaves as a
 * single statement, including as the body of an if with an else branch.
 * Invoke it followed by a semicolon.
 */
#define DICT_FREE(str)						\
	do {							\
	    if ((str) && ((!dict) ||				\
	        (xmlDictOwns(dict, (const xmlChar *)(str)) == 0)))	\
	        xmlFree((char *)(str));				\
	} while (0)
6597
6598
/**
6599
 * htmlCtxtReset:
6600
 * @ctxt: an HTML parser context
6601
 *
6602
 * Reset a parser context
6603
 */
6604
void
6605
htmlCtxtReset(htmlParserCtxtPtr ctxt)
6606
0
{
6607
0
    xmlParserInputPtr input;
6608
0
    xmlDictPtr dict;
6609
6610
0
    if (ctxt == NULL)
6611
0
        return;
6612
6613
0
    xmlInitParser();
6614
0
    dict = ctxt->dict;
6615
6616
0
    while ((input = inputPop(ctxt)) != NULL) { /* Non consuming */
6617
0
        xmlFreeInputStream(input);
6618
0
    }
6619
0
    ctxt->inputNr = 0;
6620
0
    ctxt->input = NULL;
6621
6622
0
    ctxt->spaceNr = 0;
6623
0
    if (ctxt->spaceTab != NULL) {
6624
0
  ctxt->spaceTab[0] = -1;
6625
0
  ctxt->space = &ctxt->spaceTab[0];
6626
0
    } else {
6627
0
  ctxt->space = NULL;
6628
0
    }
6629
6630
6631
0
    ctxt->nodeNr = 0;
6632
0
    ctxt->node = NULL;
6633
6634
0
    ctxt->nameNr = 0;
6635
0
    ctxt->name = NULL;
6636
6637
0
    DICT_FREE(ctxt->version);
6638
0
    ctxt->version = NULL;
6639
0
    DICT_FREE(ctxt->encoding);
6640
0
    ctxt->encoding = NULL;
6641
0
    DICT_FREE(ctxt->directory);
6642
0
    ctxt->directory = NULL;
6643
0
    DICT_FREE(ctxt->extSubURI);
6644
0
    ctxt->extSubURI = NULL;
6645
0
    DICT_FREE(ctxt->extSubSystem);
6646
0
    ctxt->extSubSystem = NULL;
6647
0
    if (ctxt->myDoc != NULL)
6648
0
        xmlFreeDoc(ctxt->myDoc);
6649
0
    ctxt->myDoc = NULL;
6650
6651
0
    ctxt->standalone = -1;
6652
0
    ctxt->hasExternalSubset = 0;
6653
0
    ctxt->hasPErefs = 0;
6654
0
    ctxt->html = 1;
6655
0
    ctxt->external = 0;
6656
0
    ctxt->instate = XML_PARSER_START;
6657
0
    ctxt->token = 0;
6658
6659
0
    ctxt->wellFormed = 1;
6660
0
    ctxt->nsWellFormed = 1;
6661
0
    ctxt->disableSAX = 0;
6662
0
    ctxt->valid = 1;
6663
0
    ctxt->vctxt.userData = ctxt;
6664
0
    ctxt->vctxt.error = xmlParserValidityError;
6665
0
    ctxt->vctxt.warning = xmlParserValidityWarning;
6666
0
    ctxt->record_info = 0;
6667
0
    ctxt->nbChars = 0;
6668
0
    ctxt->checkIndex = 0;
6669
0
    ctxt->inSubset = 0;
6670
0
    ctxt->errNo = XML_ERR_OK;
6671
0
    ctxt->depth = 0;
6672
0
    ctxt->charset = XML_CHAR_ENCODING_NONE;
6673
0
    ctxt->catalogs = NULL;
6674
0
    xmlInitNodeInfoSeq(&ctxt->node_seq);
6675
6676
0
    if (ctxt->attsDefault != NULL) {
6677
0
        xmlHashFree(ctxt->attsDefault, (xmlHashDeallocator) xmlFree);
6678
0
        ctxt->attsDefault = NULL;
6679
0
    }
6680
0
    if (ctxt->attsSpecial != NULL) {
6681
0
        xmlHashFree(ctxt->attsSpecial, NULL);
6682
0
        ctxt->attsSpecial = NULL;
6683
0
    }
6684
0
}
6685
6686
/**
6687
 * htmlCtxtUseOptions:
6688
 * @ctxt: an HTML parser context
6689
 * @options:  a combination of htmlParserOption(s)
6690
 *
6691
 * Applies the options to the parser context
6692
 *
6693
 * Returns 0 in case of success, the set of unknown or unimplemented options
6694
 *         in case of error.
6695
 */
6696
int
6697
htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
6698
0
{
6699
0
    if (ctxt == NULL)
6700
0
        return(-1);
6701
6702
0
    if (options & HTML_PARSE_NOWARNING) {
6703
0
        ctxt->sax->warning = NULL;
6704
0
        ctxt->vctxt.warning = NULL;
6705
0
        options -= XML_PARSE_NOWARNING;
6706
0
  ctxt->options |= XML_PARSE_NOWARNING;
6707
0
    }
6708
0
    if (options & HTML_PARSE_NOERROR) {
6709
0
        ctxt->sax->error = NULL;
6710
0
        ctxt->vctxt.error = NULL;
6711
0
        ctxt->sax->fatalError = NULL;
6712
0
        options -= XML_PARSE_NOERROR;
6713
0
  ctxt->options |= XML_PARSE_NOERROR;
6714
0
    }
6715
0
    if (options & HTML_PARSE_PEDANTIC) {
6716
0
        ctxt->pedantic = 1;
6717
0
        options -= XML_PARSE_PEDANTIC;
6718
0
  ctxt->options |= XML_PARSE_PEDANTIC;
6719
0
    } else
6720
0
        ctxt->pedantic = 0;
6721
0
    if (options & XML_PARSE_NOBLANKS) {
6722
0
        ctxt->keepBlanks = 0;
6723
0
        ctxt->sax->ignorableWhitespace = xmlSAX2IgnorableWhitespace;
6724
0
        options -= XML_PARSE_NOBLANKS;
6725
0
  ctxt->options |= XML_PARSE_NOBLANKS;
6726
0
    } else
6727
0
        ctxt->keepBlanks = 1;
6728
0
    if (options & HTML_PARSE_RECOVER) {
6729
0
        ctxt->recovery = 1;
6730
0
  options -= HTML_PARSE_RECOVER;
6731
0
    } else
6732
0
        ctxt->recovery = 0;
6733
0
    if (options & HTML_PARSE_COMPACT) {
6734
0
  ctxt->options |= HTML_PARSE_COMPACT;
6735
0
        options -= HTML_PARSE_COMPACT;
6736
0
    }
6737
0
    if (options & XML_PARSE_HUGE) {
6738
0
  ctxt->options |= XML_PARSE_HUGE;
6739
0
        options -= XML_PARSE_HUGE;
6740
0
    }
6741
0
    if (options & HTML_PARSE_NODEFDTD) {
6742
0
  ctxt->options |= HTML_PARSE_NODEFDTD;
6743
0
        options -= HTML_PARSE_NODEFDTD;
6744
0
    }
6745
0
    if (options & HTML_PARSE_IGNORE_ENC) {
6746
0
  ctxt->options |= HTML_PARSE_IGNORE_ENC;
6747
0
        options -= HTML_PARSE_IGNORE_ENC;
6748
0
    }
6749
0
    if (options & HTML_PARSE_NOIMPLIED) {
6750
0
        ctxt->options |= HTML_PARSE_NOIMPLIED;
6751
0
        options -= HTML_PARSE_NOIMPLIED;
6752
0
    }
6753
0
    ctxt->dictNames = 0;
6754
0
    return (options);
6755
0
}
6756
6757
/**
6758
 * htmlDoRead:
6759
 * @ctxt:  an HTML parser context
6760
 * @URL:  the base URL to use for the document
6761
 * @encoding:  the document encoding, or NULL
6762
 * @options:  a combination of htmlParserOption(s)
6763
 * @reuse:  keep the context for reuse
6764
 *
6765
 * Common front-end for the htmlRead functions
6766
 *
6767
 * Returns the resulting document tree or NULL
6768
 */
6769
static htmlDocPtr
6770
htmlDoRead(htmlParserCtxtPtr ctxt, const char *URL, const char *encoding,
6771
          int options, int reuse)
6772
0
{
6773
0
    htmlDocPtr ret;
6774
6775
0
    htmlCtxtUseOptions(ctxt, options);
6776
0
    ctxt->html = 1;
6777
0
    if (encoding != NULL) {
6778
0
        xmlCharEncodingHandlerPtr hdlr;
6779
6780
0
  hdlr = xmlFindCharEncodingHandler(encoding);
6781
0
  if (hdlr != NULL) {
6782
0
      xmlSwitchToEncoding(ctxt, hdlr);
6783
0
      if (ctxt->input->encoding != NULL)
6784
0
        xmlFree((xmlChar *) ctxt->input->encoding);
6785
0
            ctxt->input->encoding = xmlStrdup((xmlChar *)encoding);
6786
0
        }
6787
0
    }
6788
0
    if ((URL != NULL) && (ctxt->input != NULL) &&
6789
0
        (ctxt->input->filename == NULL))
6790
0
        ctxt->input->filename = (char *) xmlStrdup((const xmlChar *) URL);
6791
0
    htmlParseDocument(ctxt);
6792
0
    ret = ctxt->myDoc;
6793
0
    ctxt->myDoc = NULL;
6794
0
    if (!reuse) {
6795
0
        if ((ctxt->dictNames) &&
6796
0
      (ret != NULL) &&
6797
0
      (ret->dict == ctxt->dict))
6798
0
      ctxt->dict = NULL;
6799
0
  xmlFreeParserCtxt(ctxt);
6800
0
    }
6801
0
    return (ret);
6802
0
}
6803
6804
/**
6805
 * htmlReadDoc:
6806
 * @cur:  a pointer to a zero terminated string
6807
 * @URL:  the base URL to use for the document
6808
 * @encoding:  the document encoding, or NULL
6809
 * @options:  a combination of htmlParserOption(s)
6810
 *
6811
 * parse an XML in-memory document and build a tree.
6812
 *
6813
 * Returns the resulting document tree
6814
 */
6815
htmlDocPtr
6816
htmlReadDoc(const xmlChar * cur, const char *URL, const char *encoding, int options)
6817
0
{
6818
0
    htmlParserCtxtPtr ctxt;
6819
6820
0
    if (cur == NULL)
6821
0
        return (NULL);
6822
6823
0
    xmlInitParser();
6824
0
    ctxt = htmlCreateDocParserCtxt(cur, NULL);
6825
0
    if (ctxt == NULL)
6826
0
        return (NULL);
6827
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6828
0
}
6829
6830
/**
6831
 * htmlReadFile:
6832
 * @filename:  a file or URL
6833
 * @encoding:  the document encoding, or NULL
6834
 * @options:  a combination of htmlParserOption(s)
6835
 *
6836
 * parse an XML file from the filesystem or the network.
6837
 *
6838
 * Returns the resulting document tree
6839
 */
6840
htmlDocPtr
6841
htmlReadFile(const char *filename, const char *encoding, int options)
6842
0
{
6843
0
    htmlParserCtxtPtr ctxt;
6844
6845
0
    xmlInitParser();
6846
0
    ctxt = htmlCreateFileParserCtxt(filename, encoding);
6847
0
    if (ctxt == NULL)
6848
0
        return (NULL);
6849
0
    return (htmlDoRead(ctxt, NULL, NULL, options, 0));
6850
0
}
6851
6852
/**
6853
 * htmlReadMemory:
6854
 * @buffer:  a pointer to a char array
6855
 * @size:  the size of the array
6856
 * @URL:  the base URL to use for the document
6857
 * @encoding:  the document encoding, or NULL
6858
 * @options:  a combination of htmlParserOption(s)
6859
 *
6860
 * parse an XML in-memory document and build a tree.
6861
 *
6862
 * Returns the resulting document tree
6863
 */
6864
htmlDocPtr
6865
htmlReadMemory(const char *buffer, int size, const char *URL, const char *encoding, int options)
6866
0
{
6867
0
    htmlParserCtxtPtr ctxt;
6868
6869
0
    xmlInitParser();
6870
0
    ctxt = xmlCreateMemoryParserCtxt(buffer, size);
6871
0
    if (ctxt == NULL)
6872
0
        return (NULL);
6873
0
    htmlDefaultSAXHandlerInit();
6874
0
    if (ctxt->sax != NULL)
6875
0
        memcpy(ctxt->sax, &htmlDefaultSAXHandler, sizeof(xmlSAXHandlerV1));
6876
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6877
0
}
6878
6879
/**
6880
 * htmlReadFd:
6881
 * @fd:  an open file descriptor
6882
 * @URL:  the base URL to use for the document
6883
 * @encoding:  the document encoding, or NULL
6884
 * @options:  a combination of htmlParserOption(s)
6885
 *
6886
 * parse an XML from a file descriptor and build a tree.
6887
 *
6888
 * Returns the resulting document tree
6889
 */
6890
htmlDocPtr
6891
htmlReadFd(int fd, const char *URL, const char *encoding, int options)
6892
0
{
6893
0
    htmlParserCtxtPtr ctxt;
6894
0
    xmlParserInputBufferPtr input;
6895
0
    xmlParserInputPtr stream;
6896
6897
0
    if (fd < 0)
6898
0
        return (NULL);
6899
0
    xmlInitParser();
6900
6901
0
    xmlInitParser();
6902
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
6903
0
    if (input == NULL)
6904
0
        return (NULL);
6905
0
    ctxt = xmlNewParserCtxt();
6906
0
    if (ctxt == NULL) {
6907
0
        xmlFreeParserInputBuffer(input);
6908
0
        return (NULL);
6909
0
    }
6910
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6911
0
    if (stream == NULL) {
6912
0
        xmlFreeParserInputBuffer(input);
6913
0
  xmlFreeParserCtxt(ctxt);
6914
0
        return (NULL);
6915
0
    }
6916
0
    inputPush(ctxt, stream);
6917
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6918
0
}
6919
6920
/**
6921
 * htmlReadIO:
6922
 * @ioread:  an I/O read function
6923
 * @ioclose:  an I/O close function
6924
 * @ioctx:  an I/O handler
6925
 * @URL:  the base URL to use for the document
6926
 * @encoding:  the document encoding, or NULL
6927
 * @options:  a combination of htmlParserOption(s)
6928
 *
6929
 * parse an HTML document from I/O functions and source and build a tree.
6930
 *
6931
 * Returns the resulting document tree
6932
 */
6933
htmlDocPtr
6934
htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
6935
          void *ioctx, const char *URL, const char *encoding, int options)
6936
0
{
6937
0
    htmlParserCtxtPtr ctxt;
6938
0
    xmlParserInputBufferPtr input;
6939
0
    xmlParserInputPtr stream;
6940
6941
0
    if (ioread == NULL)
6942
0
        return (NULL);
6943
0
    xmlInitParser();
6944
6945
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
6946
0
                                         XML_CHAR_ENCODING_NONE);
6947
0
    if (input == NULL) {
6948
0
        if (ioclose != NULL)
6949
0
            ioclose(ioctx);
6950
0
        return (NULL);
6951
0
    }
6952
0
    ctxt = htmlNewParserCtxt();
6953
0
    if (ctxt == NULL) {
6954
0
        xmlFreeParserInputBuffer(input);
6955
0
        return (NULL);
6956
0
    }
6957
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
6958
0
    if (stream == NULL) {
6959
0
        xmlFreeParserInputBuffer(input);
6960
0
  xmlFreeParserCtxt(ctxt);
6961
0
        return (NULL);
6962
0
    }
6963
0
    inputPush(ctxt, stream);
6964
0
    return (htmlDoRead(ctxt, URL, encoding, options, 0));
6965
0
}
6966
6967
/**
6968
 * htmlCtxtReadDoc:
6969
 * @ctxt:  an HTML parser context
6970
 * @cur:  a pointer to a zero terminated string
6971
 * @URL:  the base URL to use for the document
6972
 * @encoding:  the document encoding, or NULL
6973
 * @options:  a combination of htmlParserOption(s)
6974
 *
6975
 * parse an XML in-memory document and build a tree.
6976
 * This reuses the existing @ctxt parser context
6977
 *
6978
 * Returns the resulting document tree
6979
 */
6980
htmlDocPtr
6981
htmlCtxtReadDoc(htmlParserCtxtPtr ctxt, const xmlChar * cur,
6982
               const char *URL, const char *encoding, int options)
6983
0
{
6984
0
    xmlParserInputPtr stream;
6985
6986
0
    if (cur == NULL)
6987
0
        return (NULL);
6988
0
    if (ctxt == NULL)
6989
0
        return (NULL);
6990
0
    xmlInitParser();
6991
6992
0
    htmlCtxtReset(ctxt);
6993
6994
0
    stream = xmlNewStringInputStream(ctxt, cur);
6995
0
    if (stream == NULL) {
6996
0
        return (NULL);
6997
0
    }
6998
0
    inputPush(ctxt, stream);
6999
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7000
0
}
7001
7002
/**
7003
 * htmlCtxtReadFile:
7004
 * @ctxt:  an HTML parser context
7005
 * @filename:  a file or URL
7006
 * @encoding:  the document encoding, or NULL
7007
 * @options:  a combination of htmlParserOption(s)
7008
 *
7009
 * parse an XML file from the filesystem or the network.
7010
 * This reuses the existing @ctxt parser context
7011
 *
7012
 * Returns the resulting document tree
7013
 */
7014
htmlDocPtr
7015
htmlCtxtReadFile(htmlParserCtxtPtr ctxt, const char *filename,
7016
                const char *encoding, int options)
7017
0
{
7018
0
    xmlParserInputPtr stream;
7019
7020
0
    if (filename == NULL)
7021
0
        return (NULL);
7022
0
    if (ctxt == NULL)
7023
0
        return (NULL);
7024
0
    xmlInitParser();
7025
7026
0
    htmlCtxtReset(ctxt);
7027
7028
0
    stream = xmlLoadExternalEntity(filename, NULL, ctxt);
7029
0
    if (stream == NULL) {
7030
0
        return (NULL);
7031
0
    }
7032
0
    inputPush(ctxt, stream);
7033
0
    return (htmlDoRead(ctxt, NULL, encoding, options, 1));
7034
0
}
7035
7036
/**
7037
 * htmlCtxtReadMemory:
7038
 * @ctxt:  an HTML parser context
7039
 * @buffer:  a pointer to a char array
7040
 * @size:  the size of the array
7041
 * @URL:  the base URL to use for the document
7042
 * @encoding:  the document encoding, or NULL
7043
 * @options:  a combination of htmlParserOption(s)
7044
 *
7045
 * parse an XML in-memory document and build a tree.
7046
 * This reuses the existing @ctxt parser context
7047
 *
7048
 * Returns the resulting document tree
7049
 */
7050
htmlDocPtr
7051
htmlCtxtReadMemory(htmlParserCtxtPtr ctxt, const char *buffer, int size,
7052
                  const char *URL, const char *encoding, int options)
7053
0
{
7054
0
    xmlParserInputBufferPtr input;
7055
0
    xmlParserInputPtr stream;
7056
7057
0
    if (ctxt == NULL)
7058
0
        return (NULL);
7059
0
    if (buffer == NULL)
7060
0
        return (NULL);
7061
0
    xmlInitParser();
7062
7063
0
    htmlCtxtReset(ctxt);
7064
7065
0
    input = xmlParserInputBufferCreateMem(buffer, size, XML_CHAR_ENCODING_NONE);
7066
0
    if (input == NULL) {
7067
0
  return(NULL);
7068
0
    }
7069
7070
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7071
0
    if (stream == NULL) {
7072
0
  xmlFreeParserInputBuffer(input);
7073
0
  return(NULL);
7074
0
    }
7075
7076
0
    inputPush(ctxt, stream);
7077
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7078
0
}
7079
7080
/**
7081
 * htmlCtxtReadFd:
7082
 * @ctxt:  an HTML parser context
7083
 * @fd:  an open file descriptor
7084
 * @URL:  the base URL to use for the document
7085
 * @encoding:  the document encoding, or NULL
7086
 * @options:  a combination of htmlParserOption(s)
7087
 *
7088
 * parse an XML from a file descriptor and build a tree.
7089
 * This reuses the existing @ctxt parser context
7090
 *
7091
 * Returns the resulting document tree
7092
 */
7093
htmlDocPtr
7094
htmlCtxtReadFd(htmlParserCtxtPtr ctxt, int fd,
7095
              const char *URL, const char *encoding, int options)
7096
0
{
7097
0
    xmlParserInputBufferPtr input;
7098
0
    xmlParserInputPtr stream;
7099
7100
0
    if (fd < 0)
7101
0
        return (NULL);
7102
0
    if (ctxt == NULL)
7103
0
        return (NULL);
7104
0
    xmlInitParser();
7105
7106
0
    htmlCtxtReset(ctxt);
7107
7108
7109
0
    input = xmlParserInputBufferCreateFd(fd, XML_CHAR_ENCODING_NONE);
7110
0
    if (input == NULL)
7111
0
        return (NULL);
7112
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7113
0
    if (stream == NULL) {
7114
0
        xmlFreeParserInputBuffer(input);
7115
0
        return (NULL);
7116
0
    }
7117
0
    inputPush(ctxt, stream);
7118
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7119
0
}
7120
7121
/**
7122
 * htmlCtxtReadIO:
7123
 * @ctxt:  an HTML parser context
7124
 * @ioread:  an I/O read function
7125
 * @ioclose:  an I/O close function
7126
 * @ioctx:  an I/O handler
7127
 * @URL:  the base URL to use for the document
7128
 * @encoding:  the document encoding, or NULL
7129
 * @options:  a combination of htmlParserOption(s)
7130
 *
7131
 * parse an HTML document from I/O functions and source and build a tree.
7132
 * This reuses the existing @ctxt parser context
7133
 *
7134
 * Returns the resulting document tree
7135
 */
7136
htmlDocPtr
7137
htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
7138
              xmlInputCloseCallback ioclose, void *ioctx,
7139
        const char *URL,
7140
              const char *encoding, int options)
7141
0
{
7142
0
    xmlParserInputBufferPtr input;
7143
0
    xmlParserInputPtr stream;
7144
7145
0
    if (ioread == NULL)
7146
0
        return (NULL);
7147
0
    if (ctxt == NULL)
7148
0
        return (NULL);
7149
0
    xmlInitParser();
7150
7151
0
    htmlCtxtReset(ctxt);
7152
7153
0
    input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
7154
0
                                         XML_CHAR_ENCODING_NONE);
7155
0
    if (input == NULL) {
7156
0
        if (ioclose != NULL)
7157
0
            ioclose(ioctx);
7158
0
        return (NULL);
7159
0
    }
7160
0
    stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
7161
0
    if (stream == NULL) {
7162
0
        xmlFreeParserInputBuffer(input);
7163
0
        return (NULL);
7164
0
    }
7165
0
    inputPush(ctxt, stream);
7166
0
    return (htmlDoRead(ctxt, URL, encoding, options, 1));
7167
0
}
7168
7169
#define bottom_HTMLparser
7170
#include "elfgcchack.h"
7171
#endif /* LIBXML_HTML_ENABLED */