Coverage Report

Created: 2026-04-12 08:01

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tidy-html5/src/lexer.c
Line
Count
Source
1
/* lexer.c -- Lexer for html parser
2
  
3
  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
  See tidy.h for the copyright notice.
5
6
*/
7
8
/*
9
  Given a file stream fp it returns a sequence of tokens.
10
11
     GetToken(fp) gets the next token
12
     UngetToken(fp) provides one level undo
13
14
  The tags include an attribute list:
15
16
    - linked list of attribute/value nodes
17
    - each node has 2 NULL-terminated strings.
18
    - entities are replaced in attribute values
19
20
  white space is compacted if not in preformatted mode
21
  If not in preformatted mode then leading white space
22
  is discarded and subsequent white space sequences
23
  compacted to single space characters.
24
25
  If XmlTags is no then Tag names are folded to upper
26
  case and attribute names to lower case.
27
28
 Not yet done:
29
    -   Doctype subset and marked sections
30
*/
31
32
#include "tidy-int.h"
33
#include "lexer.h"
34
#include "parser.h"
35
#include "entities.h"
36
#include "streamio.h"
37
#include "message.h"
38
#include "tmbstr.h"
39
#include "clean.h"
40
#include "utf8.h"
41
#include "streamio.h"
42
#include "sprtf.h"
43
44
#if defined(ENABLE_DEBUG_LOG)
45
/* #define DEBUG_ALLOCATION   special EXTRA allocation debug information - VERY NOISY */
46
static void check_me(char *name);
47
static Bool show_attrs = yes;
48
#define MX_TXT 8
49
static char buffer[(MX_TXT*4)+8]; /* NOTE extra for '...'\0 tail */
50
static tmbstr get_text_string(Lexer* lexer, Node *node)
51
{
52
    uint len = node->end - node->start;
53
    tmbstr cp = lexer->lexbuf + node->start;
54
    tmbstr end = lexer->lexbuf + node->end;
55
    unsigned char c;
56
    uint i = 0;
57
    Bool insp = no;
58
    if (len <= ((MX_TXT * 2) + 3)) {
59
        buffer[0] = 0;
60
        while (cp < end) {
61
            c = *cp;
62
            cp++;
63
            if (c == '\n') {
64
                buffer[i++] = '\\';
65
                buffer[i++] = 'n';
66
            } else if (c == '\t') {
67
                buffer[i++] = '\\';
68
                buffer[i++] = 't';
69
            } else if ( c == ' ' ) {
70
                if (!insp)
71
                    buffer[i++] = c;
72
                insp = yes;
73
            } else {
74
                buffer[i++] = c;
75
                insp = no;
76
            }
77
        }
78
    } else {
79
        char *end1 = cp + MX_TXT;
80
        char *bgn = cp + (len - MX_TXT);
81
        buffer[0] = 0;
82
        if (bgn < end1)
83
            bgn = end1;
84
        while (cp < end1) {
85
            c = *cp;
86
            cp++;
87
            if (c == '\n') {
88
                buffer[i++] = '\\';
89
                buffer[i++] = 'n';
90
            } else if (c == '\t') {
91
                buffer[i++] = '\\';
92
                buffer[i++] = 't';
93
            } else if ( c == ' ' ) {
94
                if (!insp)
95
                    buffer[i++] = c;
96
                insp = yes;
97
            } else {
98
                buffer[i++] = c;
99
                insp = no;
100
            }
101
            if (i >= MX_TXT)
102
                break;
103
        }
104
        c = '.';
105
        if ((i < len)&&(cp < bgn)) {
106
            buffer[i++] = c;
107
            cp++;
108
            if ((i < len)&&(cp < bgn)) {
109
                buffer[i++] = c;
110
                cp++;
111
                if ((i < len)&&(cp < bgn)) {
112
                    buffer[i++] = c;
113
                    cp++;
114
                }
115
            }
116
        }
117
        cp = bgn;
118
        insp = no;
119
        while (cp < end) {
120
            c = *cp;
121
            cp++;
122
            if (c == '\n') {
123
                buffer[i++] = '\\';
124
                buffer[i++] = 'n';
125
            } else if (c == '\t') {
126
                buffer[i++] = '\\';
127
                buffer[i++] = 't';
128
            } else if ( c == ' ' ) {
129
                if (!insp)
130
                    buffer[i++] = c;
131
                insp = yes;
132
            } else {
133
                buffer[i++] = c;
134
                insp = no;
135
            }
136
        }
137
    }
138
    buffer[i] = 0;
139
    return buffer;
140
}
141
/* Debug log helper: print the current lexer row/column followed by a
** one line description of the node being returned.
**
** msg  - label from the caller; a label starting with "le" marks the
**        source as "lexer", anything else as "stream".
** node - node being returned, may be NULL.
**
** When 'show_attrs' is set, text nodes are shown with a preview of their
** text and element nodes with their attribute list; otherwise only the
** node pointer is printed.
*/
static void Show_Node( TidyDocImpl* doc, const char *msg, Node *node )
{
    Lexer* lexer = doc->lexer;
    int line = 0;
    int col = 0;
    tmbstr src = "stream";
    Bool isText = no;

    if ( lexer ) {
        line = lexer->lines;
        col = lexer->columns;
    }
    if ( (msg[0] == 'l') && (msg[1] == 'e') ) {
        src = "lexer";
    }

    SPRTF("R=%d C=%d: ", line, col );
    /* DEBUG: Be able to set a TRAP on a SPECIFIC row,col */
    if ((line == 3) && (col == 1)) {
        check_me("Show_Node"); /* just a debug trap */
    }

    /* A text node is reported when either the lexer's current token or
       the supplied node is a TextNode. */
    if ( lexer && lexer->token ) {
        if ( lexer->token->type == TextNode ) {
            isText = yes;
        } else if ( node && (node->type == TextNode) ) {
            isText = yes;
        }
    }

    if ( isText ) {
        if ( !show_attrs ) {
            SPRTF("Returning %s TextNode %p... %s\n", msg, node, src );
            return;
        }
        {
            uint len = 0;
            tmbstr cp = "NULL";
            if ( node ) {
                len = node->end - node->start;
                cp = get_text_string( lexer, node );
            }
            SPRTF("Returning %s TextNode [%s]%u %s\n", msg, cp, len, src );
        }
        return;
    }

    {
        tmbstr name = "NULL";
        AttVal* av;

        if ( node ) {
            name = node->element ? node->element : "blank";
        }

        if ( !show_attrs ) {
            SPRTF("Returning %s node %p <%s>... %s\n", msg, node,
                name, src );
            return;
        }

        SPRTF("Returning %s node <%s", msg, name);
        if ( node ) {
            for (av = node->attributes; av; av = av->next) {
                name = av->attribute;
                if ( !name ) {
                    continue;   /* unnamed attribute: nothing to print */
                }
                SPRTF(" %s",name);
                if (av->value) {
                    SPRTF("=\"%s\"", av->value);
                }
            }
        }
        SPRTF("> %s\n", src);
    }
}
185
#define GTDBG(a,b,c) Show_Node(a,b,c)
186
#else /* ENABLE_DEBUG_LOG */
187
#define GTDBG(a,b,c)
188
#endif /* defined(ENABLE_DEBUG_LOG) */
189
190
/* Forward references
191
*/
192
/* swallows closing '>' */
193
static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
194
195
static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty, 
196
                             Node **asp, Node **php );
197
198
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
199
                         Bool *isempty, int *pdelim );
200
201
static Node *ParseDocTypeDecl(TidyDocImpl* doc);
202
203
static void AddAttrToList( AttVal** list, AttVal* av );
204
205
/* used to classify characters for lexical purposes */
206
1.38G
/* Look up the lexmap class bits for c; anything outside 0..127 has none.
   The argument is parenthesized so that a compound expression is cast
   and compared as a whole.  Note that c is evaluated twice. */
#define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
207
static uint lexmap[128];
208
209
39.2k
#define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name)
210
72.7k
#define IsValidXMLElemName(name) TY_(IsValidXMLID)(name)
211
212
/* Table of the document types known to this file, terminated by an
   entry whose name is NULL.  The lookup helpers in this file scan it
   linearly until they reach that terminator. */
static struct _doctypes
213
{
214
    /* ranking used by HTMLVersion(): the candidate with the lowest
       non-zero score wins */
    uint score;
215
    /* version constant identifying this doctype (HT20, H41S, ...) */
    uint vers;
216
    /* numeric form of the version, e.g. 401 for HTML 4.01 -- not used
       in the code visible here; TODO confirm against callers */
    uint vers_out;
217
    /* yes for the XHTML flavours */
    Bool xhtml;
218
    /* human readable name; NULL only in the terminating entry */
    ctmbstr name;
219
    /* doctype public identifier string, NULL for HTML5/XHTML5 */
    ctmbstr fpi;
220
    /* DTD URL (system identifier), NULL where none is listed */
    ctmbstr si;
221
} const W3C_Doctypes[] =
222
{
223
  {  2, HT20, 200, no,  "HTML 2.0",               "-//IETF//DTD HTML 2.0//EN",              NULL,                                                       },
224
  {  2, HT20, 200, no,  "HTML 2.0",               "-//IETF//DTD HTML//EN",                  NULL,                                                       },
225
  {  2, HT20, 200, no,  "HTML 2.0",               "-//W3C//DTD HTML 2.0//EN",               NULL,                                                       },
226
  {  1, HT32, 320, no,  "HTML 3.2",               "-//W3C//DTD HTML 3.2//EN",               NULL,                                                       },
227
  {  1, HT32, 320, no,  "HTML 3.2",               "-//W3C//DTD HTML 3.2 Final//EN",         NULL,                                                       },
228
  {  1, HT32, 320, no,  "HTML 3.2",               "-//W3C//DTD HTML 3.2 Draft//EN",         NULL,                                                       },
229
  {  6, H40S, 400, no,  "HTML 4.0 Strict",        "-//W3C//DTD HTML 4.0//EN",               "http://www.w3.org/TR/REC-html40/strict.dtd"                },
230
  {  8, H40T, 400, no,  "HTML 4.0 Transitional",  "-//W3C//DTD HTML 4.0 Transitional//EN",  "http://www.w3.org/TR/REC-html40/loose.dtd"                 },
231
  {  7, H40F, 400, no,  "HTML 4.0 Frameset",      "-//W3C//DTD HTML 4.0 Frameset//EN",      "http://www.w3.org/TR/REC-html40/frameset.dtd"              },
232
  {  3, H41S, 401, no,  "HTML 4.01 Strict",       "-//W3C//DTD HTML 4.01//EN",              "http://www.w3.org/TR/html4/strict.dtd"                     },
233
  {  5, H41T, 401, no,  "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd"                      },
234
  {  4, H41F, 401, no,  "HTML 4.01 Frameset",     "-//W3C//DTD HTML 4.01 Frameset//EN",     "http://www.w3.org/TR/html4/frameset.dtd"                   },
235
  {  9, X10S, 100, yes, "XHTML 1.0 Strict",       "-//W3C//DTD XHTML 1.0 Strict//EN",       "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"         },
236
  { 11, X10T, 100, yes, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"   },
237
  { 10, X10F, 100, yes, "XHTML 1.0 Frameset",     "-//W3C//DTD XHTML 1.0 Frameset//EN",     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"       },
238
  { 12, XH11, 110, yes, "XHTML 1.1",              "-//W3C//DTD XHTML 1.1//EN",              "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"              },
239
  { 13, XB10, 100, yes, "XHTML Basic 1.0",        "-//W3C//DTD XHTML Basic 1.0//EN",        "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd"        },
240
241
  { 20, HT50, 500, no,  "HTML5",                  NULL,                                     NULL                                                        },
242
  { 21, XH50, 500, yes, "XHTML5",                 NULL,                                     NULL                                                        },
243
244
  /* final entry */
245
  {  0,    0, 0,  no,  NULL,                     NULL,                                     NULL                                                        }
246
};
247
248
/* 
249
 * Issue #643 - Since VERS_FROM40 was extended to include VERS_HTML5
250
 * to be used in the expanded entity table some 155 times,
251
 * need a special macro here to denote just HTML 4 plus XHTML,
252
 * which is actually the former define of VERS_FROM40
253
 */
254
8.39M
#define VERS_HMTL40PX        (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
255
256
/* Choose the document version to report for doc.
**
** Inputs are the version bits still allowed by the lexer, the doctype
** the lexer recorded, and the TidyDoctypeMode / TidyXmlOut / TidyHtmlOut
** options.  Returns one of the version constants from W3C_Doctypes, or
** VERS_UNKNOWN when no table entry is compatible.
*/
int TY_(HTMLVersion)(TidyDocImpl* doc)
{
    const struct _doctypes *entry;
    const struct _doctypes *best = W3C_Doctypes;
    uint best_score = 0;
    uint allowed = doc->lexer->versions;
    uint declared = doc->lexer->doctype;
    TidyDoctypeModes mode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
    Bool want_xhtml = ((cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
                       !cfgBool(doc, TidyHtmlOut)) ? yes : no;
    Bool want_html4 = no;
    Bool want_html5 = no;

    /* HTML4 is requested either by the doctype mode or by the doctype
       itself; HTML5 is only considered when HTML4 is not. */
    if ( (mode == TidyDoctypeStrict) || (mode == TidyDoctypeLoose) ||
         (VERS_HMTL40PX & declared) )
    {
        want_html4 = yes;
    }
    else if ( (mode == TidyDoctypeAuto) || (mode == TidyDoctypeHtml5) )
    {
        want_html5 = yes;
    }

    /* No doctype recorded: default to the HTML5 flavour of the output. */
    if ( declared == VERS_UNKNOWN )
        return want_xhtml ? XH50 : HT50;

    /* Issue #167 - if NOT XHTML, and doctype is default VERS_HTML5, then return HT50 */
    if ( !want_xhtml && (declared == VERS_HTML5) )
        return HT50;

    /* Issue #377 - If xhtml and (doctype == html5) and constrained vers contains XH50 return that,
       and really if tidy defaults to 'html5', then maybe 'auto' should also apply! */
    if ( want_xhtml && want_html5 && ((allowed & VERS_HTML5) == XH50) )
        return XH50;

    /* Among the table entries compatible with the request and still
       allowed, keep the one with the lowest score. */
    for ( entry = W3C_Doctypes; entry->name; ++entry )
    {
        if ( want_xhtml && !(VERS_XHTML & entry->vers) )
            continue;
        if ( want_html4 && !(VERS_HMTL40PX & entry->vers) )
            continue;
        if ( !(allowed & entry->vers) )
            continue;

        if ( !best_score || entry->score < best_score )
        {
            best_score = entry->score;
            best = entry;
        }
    }

    if ( !best_score )
        return VERS_UNKNOWN;

    return best->vers;
}
298
299
/* Return the fpi string of the first W3C_Doctypes entry whose vers
   equals the argument, or NULL when there is no such entry. */
static ctmbstr GetFPIFromVers(uint vers)
{
    const struct _doctypes *dt = W3C_Doctypes;

    while ( dt->name != NULL )
    {
        if ( dt->vers == vers )
            return dt->fpi;
        ++dt;
    }

    return NULL;
}
309
310
/* Return the si string of the first W3C_Doctypes entry whose vers
   equals the argument, or NULL when there is no such entry. */
static ctmbstr GetSIFromVers(uint vers)
{
    const struct _doctypes *dt = W3C_Doctypes;

    while ( dt->name != NULL )
    {
        if ( dt->vers == vers )
            return dt->si;
        ++dt;
    }

    return NULL;
}
320
321
/* Return the display name of the first W3C_Doctypes entry whose vers
   equals the argument, or NULL when there is no such entry. */
static ctmbstr GetNameFromVers(uint vers)
{
    const struct _doctypes *dt = W3C_Doctypes;

    while ( dt->name != NULL )
    {
        if ( dt->vers == vers )
            return dt->name;
        ++dt;
    }

    return NULL;
}
331
332
/* Return the vers of the first W3C_Doctypes entry whose fpi matches the
   argument, compared with tmbstrcasecmp; entries without an fpi are
   skipped.  Returns 0 when nothing matches. */
static uint GetVersFromFPI(ctmbstr fpi)
{
    const struct _doctypes *dt;

    for ( dt = W3C_Doctypes; dt->name; ++dt )
    {
        if ( dt->fpi == NULL )
            continue;
        if ( TY_(tmbstrcasecmp)(dt->fpi, fpi) == 0 )
            return dt->vers;
    }

    return 0;
}
342
343
#ifdef ENABLE_DEBUG_LOG
344
#  ifndef EndBuf
345
#    define EndBuf(a)   ( a + strlen(a) )
346
#  endif
347
348
/* Issue #377 - Output diminishing version bits */
349
/* One row of the debug table below: a version value and the short
   label printed for it. */
typedef struct tagV2S {
350
    uint bit;
351
    ctmbstr val;
352
}V2S, *PV2S;
353
354
/* Labels used by add_vers_string(), in the order they are printed.
   The row with a NULL label ends the table. */
static V2S v2s[] = {
355
    { HT20, "HT20" },
356
    { HT32, "HT32" },
357
    { H40S, "H40S" },
358
    { H40T, "H40T" },
359
    { H40F, "H40F" },
360
    { H41S, "H41S" },
361
    { H41T, "H41T" },
362
    { H41F, "H41F" },
363
    { X10S, "X10S" },
364
    { X10T, "X10T" },
365
    { X10F, "X10F" },
366
    { XH11, "XH11" },
367
    { XB10, "XB10" }, /* 4096u */
368
    /* { VERS_SUN, "VSUN" }, */
369
    /* { VERS_NETSCAPE, "VNET" }, */
370
    /* { VERS_MICROSOFT, "VMIC" }, 32768u */
371
    { VERS_XML, "VXML" }, /* 65536u */
372
        /* HTML5 */
373
    { HT50, "HT50" }, /* 131072u */
374
    { XH50, "XH50" }, /* 262144u */
375
    { 0,     0  }
376
};
377
378
/* Process the above table, adding a bit name,
379
   or '----' when not present   */
380
/* Append a '|' separated description of 'vers' to the string in buf.
**
** The v2s table is walked in order: a label is appended for each entry
** whose bit is set in vers and "----" for each entry whose bit is not.
** The walk stops as soon as every set bit has been named.  Bits left
** over after the whole table has been walked are appended as one
** decimal number.  The caller must supply a buffer that is large
** enough; no bounds are checked here.  Returns buf.
*/
static char *add_vers_string( tmbstr buf, uint vers )
{
    PV2S entry;
    int len = (int)strlen(buf);

    for ( entry = v2s; entry->val; entry++ ) {
        Bool present = (vers & entry->bit) ? yes : no;
        ctmbstr label = present ? entry->val : "----";

        /* separator before everything except the very first item */
        if (len) {
            strcat(buf,"|");
            len++;
        }
        strcat(buf,label);
        len += (int)strlen(label);

        if (present) {
            vers &= ~(entry->bit);
            if (!vers)
                break;
        }
    }

    if (vers) { /* Should not have any here! */
        if (len)
            strcat(buf,"|");
        sprintf(EndBuf(buf),"%u",vers);
    }
    return buf;
}
414
415
/* Issue #377 - Show first Before: list, and then on any change
416
   Note that the VERS_PROPRIETARY bits are excluded since they always remain */
417
/* Debug build: restrict the lexer's allowed versions to 'vers' (the
** VERS_PROPRIETARY bits are always kept) and log the change.
**
** Nothing is logged when the mask does not change.  The first change
** seen also logs a "Before:" line; every change logs an "After :" line.
** The static state is shared by all documents.
*/
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
{
    static char vcur[256];
    static Bool dnfirst = no;
    uint before = doc->lexer->versions; /* get current */
    uint after;

    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
    after = doc->lexer->versions;

    if (before == after) /* only if different */
        return;

    if (!dnfirst) {
        dnfirst = yes;
        vcur[0] = 0;
        add_vers_string( vcur, before & ~(VERS_PROPRIETARY) );
        SPRTF("Before: %s\n", vcur);
    }

    vcur[0] = 0;
    add_vers_string( vcur, after & ~(VERS_PROPRIETARY) );
    SPRTF("After : %s\n", vcur);
}
438
#else /* !#if defined(ENABLE_DEBUG_LOG) */
439
/* everything is allowed in proprietary version of HTML */
440
/* this is handled here rather than in the tag/attr dicts */
441
/* Restrict the lexer's allowed versions to those in 'vers'.  The
   VERS_PROPRIETARY bits are never cleared. */
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
{
    uint keep = vers | VERS_PROPRIETARY;

    doc->lexer->versions = doc->lexer->versions & keep;
}
445
#endif /* #if defined(ENABLE_DEBUG_LOG) y/n */
446
447
/* True when c is below 128 and its lexmap entry carries the 'white' flag. */
Bool TY_(IsWhite)(uint c)
{
    return (MAP(c) & white) != 0;
}
453
454
/* True when c is below 128 and its lexmap entry carries the 'newline' flag. */
Bool TY_(IsNewline)(uint c)
{
    return (MAP(c) & newline) != 0;
}
459
460
/* True when c is below 128 and its lexmap entry carries the 'digit' flag. */
Bool TY_(IsDigit)(uint c)
{
    return (MAP(c) & digit) != 0;
}
468
469
/* True when c is below 128 and its lexmap entry carries the 'digithex' flag. */
static Bool IsDigitHex(uint c)
{
    return (MAP(c) & digithex) != 0;
}
477
478
/* True when c is below 128 and its lexmap entry carries the 'letter' flag. */
Bool TY_(IsLetter)(uint c)
{
    return (MAP(c) & letter) != 0;
}
486
487
/* True for space, tab, line feed, form feed and carriage return. */
Bool TY_(IsHTMLSpace)(uint c)
{
    switch ( c )
    {
    case 0x020: /* space */
    case 0x009: /* tab */
    case 0x00a: /* line feed */
    case 0x00c: /* form feed */
    case 0x00d: /* carriage return */
        return yes;
    default:
        return no;
    }
}
491
492
/* True when c is below 128 and its lexmap entry carries the 'namechar' flag. */
Bool TY_(IsNamechar)(uint c)
{
    return (MAP(c) & namechar) != 0;
}
497
498
/* True when c falls inside one of the inclusive code point ranges
** listed below.  A single code point is written as a range whose two
** ends are equal.  The ranges are tested in the order given.
*/
Bool TY_(IsXMLLetter)(uint c)
{
    static const uint ranges[][2] =
    {
        {0x41,0x5a}, {0x61,0x7a}, {0xc0,0xd6}, {0xd8,0xf6}, {0xf8,0xff},
        {0x100,0x131}, {0x134,0x13e}, {0x141,0x148}, {0x14a,0x17e},
        {0x180,0x1c3}, {0x1cd,0x1f0}, {0x1f4,0x1f5}, {0x1fa,0x217},
        {0x250,0x2a8}, {0x2bb,0x2c1}, {0x386,0x386}, {0x388,0x38a},
        {0x38c,0x38c}, {0x38e,0x3a1}, {0x3a3,0x3ce}, {0x3d0,0x3d6},
        {0x3da,0x3da}, {0x3dc,0x3dc}, {0x3de,0x3de}, {0x3e0,0x3e0},
        {0x3e2,0x3f3}, {0x401,0x40c}, {0x40e,0x44f}, {0x451,0x45c},
        {0x45e,0x481}, {0x490,0x4c4}, {0x4c7,0x4c8}, {0x4cb,0x4cc},
        {0x4d0,0x4eb}, {0x4ee,0x4f5}, {0x4f8,0x4f9}, {0x531,0x556},
        {0x559,0x559}, {0x561,0x586}, {0x5d0,0x5ea}, {0x5f0,0x5f2},
        {0x621,0x63a}, {0x641,0x64a}, {0x671,0x6b7}, {0x6ba,0x6be},
        {0x6c0,0x6ce}, {0x6d0,0x6d3}, {0x6d5,0x6d5}, {0x6e5,0x6e6},
        {0x905,0x939}, {0x93d,0x93d}, {0x958,0x961}, {0x985,0x98c},
        {0x98f,0x990}, {0x993,0x9a8}, {0x9aa,0x9b0}, {0x9b2,0x9b2},
        {0x9b6,0x9b9}, {0x9dc,0x9dd}, {0x9df,0x9e1}, {0x9f0,0x9f1},
        {0xa05,0xa0a}, {0xa0f,0xa10}, {0xa13,0xa28}, {0xa2a,0xa30},
        {0xa32,0xa33}, {0xa35,0xa36}, {0xa38,0xa39}, {0xa59,0xa5c},
        {0xa5e,0xa5e}, {0xa72,0xa74}, {0xa85,0xa8b}, {0xa8d,0xa8d},
        {0xa8f,0xa91}, {0xa93,0xaa8}, {0xaaa,0xab0}, {0xab2,0xab3},
        {0xab5,0xab9}, {0xabd,0xabd}, {0xae0,0xae0}, {0xb05,0xb0c},
        {0xb0f,0xb10}, {0xb13,0xb28}, {0xb2a,0xb30}, {0xb32,0xb33},
        {0xb36,0xb39}, {0xb3d,0xb3d}, {0xb5c,0xb5d}, {0xb5f,0xb61},
        {0xb85,0xb8a}, {0xb8e,0xb90}, {0xb92,0xb95}, {0xb99,0xb9a},
        {0xb9c,0xb9c}, {0xb9e,0xb9f}, {0xba3,0xba4}, {0xba8,0xbaa},
        {0xbae,0xbb5}, {0xbb7,0xbb9}, {0xc05,0xc0c}, {0xc0e,0xc10},
        {0xc12,0xc28}, {0xc2a,0xc33}, {0xc35,0xc39}, {0xc60,0xc61},
        {0xc85,0xc8c}, {0xc8e,0xc90}, {0xc92,0xca8}, {0xcaa,0xcb3},
        {0xcb5,0xcb9}, {0xcde,0xcde}, {0xce0,0xce1}, {0xd05,0xd0c},
        {0xd0e,0xd10}, {0xd12,0xd28}, {0xd2a,0xd39}, {0xd60,0xd61},
        {0xe01,0xe2e}, {0xe30,0xe30}, {0xe32,0xe33}, {0xe40,0xe45},
        {0xe81,0xe82}, {0xe84,0xe84}, {0xe87,0xe88}, {0xe8a,0xe8a},
        {0xe8d,0xe8d}, {0xe94,0xe97}, {0xe99,0xe9f}, {0xea1,0xea3},
        {0xea5,0xea5}, {0xea7,0xea7}, {0xeaa,0xeab}, {0xead,0xeae},
        {0xeb0,0xeb0}, {0xeb2,0xeb3}, {0xebd,0xebd}, {0xec0,0xec4},
        {0xf40,0xf47}, {0xf49,0xf69}, {0x10a0,0x10c5}, {0x10d0,0x10f6},
        {0x1100,0x1100}, {0x1102,0x1103}, {0x1105,0x1107}, {0x1109,0x1109},
        {0x110b,0x110c}, {0x110e,0x1112}, {0x113c,0x113c}, {0x113e,0x113e},
        {0x1140,0x1140}, {0x114c,0x114c}, {0x114e,0x114e}, {0x1150,0x1150},
        {0x1154,0x1155}, {0x1159,0x1159}, {0x115f,0x1161}, {0x1163,0x1163},
        {0x1165,0x1165}, {0x1167,0x1167}, {0x1169,0x1169}, {0x116d,0x116e},
        {0x1172,0x1173}, {0x1175,0x1175}, {0x119e,0x119e}, {0x11a8,0x11a8},
        {0x11ab,0x11ab}, {0x11ae,0x11af}, {0x11b7,0x11b8}, {0x11ba,0x11ba},
        {0x11bc,0x11c2}, {0x11eb,0x11eb}, {0x11f0,0x11f0}, {0x11f9,0x11f9},
        {0x1e00,0x1e9b}, {0x1ea0,0x1ef9}, {0x1f00,0x1f15}, {0x1f18,0x1f1d},
        {0x1f20,0x1f45}, {0x1f48,0x1f4d}, {0x1f50,0x1f57}, {0x1f59,0x1f59},
        {0x1f5b,0x1f5b}, {0x1f5d,0x1f5d}, {0x1f5f,0x1f7d}, {0x1f80,0x1fb4},
        {0x1fb6,0x1fbc}, {0x1fbe,0x1fbe}, {0x1fc2,0x1fc4}, {0x1fc6,0x1fcc},
        {0x1fd0,0x1fd3}, {0x1fd6,0x1fdb}, {0x1fe0,0x1fec}, {0x1ff2,0x1ff4},
        {0x1ff6,0x1ffc}, {0x2126,0x2126}, {0x212a,0x212b}, {0x212e,0x212e},
        {0x2180,0x2182}, {0x3041,0x3094}, {0x30a1,0x30fa}, {0x3105,0x312c},
        {0xac00,0xd7a3}, {0x4e00,0x9fa5}, {0x3007,0x3007}, {0x3021,0x3029}
    };
    uint ix;

    for ( ix = 0; ix < sizeof(ranges) / sizeof(ranges[0]); ++ix )
    {
        if ( c >= ranges[ix][0] && c <= ranges[ix][1] )
            return yes;
    }
    return no;
}
709
710
/* True when c is accepted by IsXMLLetter, is one of '.', '_', ':' or
** '-', or falls inside one of the inclusive code point ranges listed
** below.  A single code point is written as a range whose two ends are
** equal.  The checks run in that order.
*/
Bool TY_(IsXMLNamechar)(uint c)
{
    static const uint ranges[][2] =
    {
        {0x300,0x345}, {0x360,0x361}, {0x483,0x486}, {0x591,0x5a1},
        {0x5a3,0x5b9}, {0x5bb,0x5bd}, {0x5bf,0x5bf}, {0x5c1,0x5c2},
        {0x5c4,0x5c4}, {0x64b,0x652}, {0x670,0x670}, {0x6d6,0x6dc},
        {0x6dd,0x6df}, {0x6e0,0x6e4}, {0x6e7,0x6e8}, {0x6ea,0x6ed},
        {0x901,0x903}, {0x93c,0x93c}, {0x93e,0x94c}, {0x94d,0x94d},
        {0x951,0x954}, {0x962,0x963}, {0x981,0x983}, {0x9bc,0x9bc},
        {0x9be,0x9be}, {0x9bf,0x9bf}, {0x9c0,0x9c4}, {0x9c7,0x9c8},
        {0x9cb,0x9cd}, {0x9d7,0x9d7}, {0x9e2,0x9e3}, {0xa02,0xa02},
        {0xa3c,0xa3c}, {0xa3e,0xa3e}, {0xa3f,0xa3f}, {0xa40,0xa42},
        {0xa47,0xa48}, {0xa4b,0xa4d}, {0xa70,0xa71}, {0xa81,0xa83},
        {0xabc,0xabc}, {0xabe,0xac5}, {0xac7,0xac9}, {0xacb,0xacd},
        {0xb01,0xb03}, {0xb3c,0xb3c}, {0xb3e,0xb43}, {0xb47,0xb48},
        {0xb4b,0xb4d}, {0xb56,0xb57}, {0xb82,0xb83}, {0xbbe,0xbc2},
        {0xbc6,0xbc8}, {0xbca,0xbcd}, {0xbd7,0xbd7}, {0xc01,0xc03},
        {0xc3e,0xc44}, {0xc46,0xc48}, {0xc4a,0xc4d}, {0xc55,0xc56},
        {0xc82,0xc83}, {0xcbe,0xcc4}, {0xcc6,0xcc8}, {0xcca,0xccd},
        {0xcd5,0xcd6}, {0xd02,0xd03}, {0xd3e,0xd43}, {0xd46,0xd48},
        {0xd4a,0xd4d}, {0xd57,0xd57}, {0xe31,0xe31}, {0xe34,0xe3a},
        {0xe47,0xe4e}, {0xeb1,0xeb1}, {0xeb4,0xeb9}, {0xebb,0xebc},
        {0xec8,0xecd}, {0xf18,0xf19}, {0xf35,0xf35}, {0xf37,0xf37},
        {0xf39,0xf39}, {0xf3e,0xf3e}, {0xf3f,0xf3f}, {0xf71,0xf84},
        {0xf86,0xf8b}, {0xf90,0xf95}, {0xf97,0xf97}, {0xf99,0xfad},
        {0xfb1,0xfb7}, {0xfb9,0xfb9}, {0x20d0,0x20dc}, {0x20e1,0x20e1},
        {0x302a,0x302f}, {0x3099,0x3099}, {0x309a,0x309a}, {0x30,0x39},
        {0x660,0x669}, {0x6f0,0x6f9}, {0x966,0x96f}, {0x9e6,0x9ef},
        {0xa66,0xa6f}, {0xae6,0xaef}, {0xb66,0xb6f}, {0xbe7,0xbef},
        {0xc66,0xc6f}, {0xce6,0xcef}, {0xd66,0xd6f}, {0xe50,0xe59},
        {0xed0,0xed9}, {0xf20,0xf29}, {0xb7,0xb7}, {0x2d0,0x2d0},
        {0x2d1,0x2d1}, {0x387,0x387}, {0x640,0x640}, {0xe46,0xe46},
        {0xec6,0xec6}, {0x3005,0x3005}, {0x3031,0x3035}, {0x309d,0x309e},
        {0x30fc,0x30fe}
    };
    uint ix;

    if ( TY_(IsXMLLetter)(c) )
        return yes;

    if ( c == '.' || c == '_' || c == ':' || c == '-' )
        return yes;

    for ( ix = 0; ix < sizeof(ranges) / sizeof(ranges[0]); ++ix )
    {
        if ( c >= ranges[ix][0] && c <= ranges[ix][1] )
            return yes;
    }
    return no;
}
837
838
/* True when c is below 128 and its lexmap entry carries the 'uppercase' flag. */
Bool TY_(IsUpper)(uint c)
{
    return (MAP(c) & uppercase) != 0;
}
844
845
/* Return c shifted by 'a' - 'A' when lexmap flags it as uppercase;
   every other value is returned unchanged. */
uint TY_(ToLower)(uint c)
{
    if ( !(MAP(c) & uppercase) )
        return c;

    return c + ('a' - 'A');
}
854
855
/* Return c shifted by 'A' - 'a' when lexmap flags it as lowercase;
   every other value is returned unchanged. */
uint TY_(ToUpper)(uint c)
{
    if ( !(MAP(c) & lowercase) )
        return c;

    return c - (uint) ('a' - 'A');
}
864
865
/*
866
 return last character in string
867
 this is useful when trailing quotemark
868
 is missing on an attribute
869
*/
870
static tmbchar LastChar( tmbstr str )
871
277k
{
872
277k
    if ( str && *str )
873
276k
    {
874
276k
        int n = TY_(tmbstrlen)(str);
875
276k
        return str[n-1];
876
276k
    }
877
332
    return 0;
878
277k
}
879
880
/* Allocate a zero-filled Lexer for doc and seed its starting state:
** line/column 1, content state, all versions allowed, doctype unknown.
** The result of the allocation is returned as-is, so a NULL from the
** allocator is passed straight back to the caller.
*/
Lexer* TY_(NewLexer)( TidyDocImpl* doc )
{
    Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );

    if ( lexer == NULL )
        return lexer;

    TidyClearMemory( lexer, sizeof(Lexer) );

    lexer->allocator = doc->allocator;
    lexer->root      = &doc->root;
    lexer->state     = LEX_CONTENT;
    lexer->lines     = 1;
    lexer->columns   = 1;
    lexer->versions  = (VERS_ALL|VERS_PROPRIETARY);
    lexer->doctype   = VERS_UNKNOWN;

    return lexer;
}
899
900
/* yes only when the input stream has no pushed-back state flagged
** and TY_(IsEOF) reports end of file for it.
*/
static Bool EndOfInput( TidyDocImpl* doc )
{
    assert( doc->docIn != NULL );

    if ( doc->docIn->pushed )
        return no;

    return TY_(IsEOF)(doc->docIn) ? yes : no;
}
905
906
/* Release the document's lexer, if it has one: styles, any token still
** held by the lexer, the inline stack, the text buffer and finally the
** Lexer itself. doc->lexer is reset to NULL afterwards.
*/
void TY_(FreeLexer)( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;

    if ( lexer == NULL )
        return;

    TY_(FreeStyles)( doc );

    /* See GetToken() */
    if ( lexer->pushed )
        TY_(FreeNode)( doc, lexer->itoken );
    if ( lexer->pushed || lexer->itoken )
        TY_(FreeNode)( doc, lexer->token );

    /* unwind whatever is left on the inline stack */
    for ( ; lexer->istacksize > 0; )
        TY_(PopInline)( doc, NULL );

    TidyDocFree( doc, lexer->istack );
    TidyDocFree( doc, lexer->lexbuf );
    TidyDocFree( doc, lexer );
    doc->lexer = NULL;
}
930
931
/* Lexer uses bigger memory chunks than pprint as
932
** it must hold the entire input document. not just
933
** the last line or three.
934
*/
935
static void AddByte( Lexer *lexer, tmbchar ch )
936
1.07G
{
937
1.07G
    if ( lexer->lexsize + 2 >= lexer->lexlength )
938
44.8k
    {
939
44.8k
        tmbstr buf = NULL;
940
44.8k
        uint allocAmt = lexer->lexlength;
941
44.8k
        uint prev = allocAmt; /* Is. #761 */
942
89.7k
        while ( lexer->lexsize + 2 >= allocAmt )
943
44.8k
        {
944
44.8k
            if ( allocAmt == 0 )
945
39.1k
                allocAmt = 8192;
946
5.70k
            else
947
5.70k
                allocAmt *= 2;
948
44.8k
            if (allocAmt < prev) /* Is. #761 - watch for wrap - and */
949
0
                TidyPanic(lexer->allocator, "\nPanic: out of internal memory!\nDocument input too big!\n");
950
44.8k
        }
951
44.8k
        buf = (tmbstr) TidyRealloc( lexer->allocator, lexer->lexbuf, allocAmt );
952
44.8k
        if ( buf )
953
44.8k
        {
954
44.8k
          TidyClearMemory( buf + lexer->lexlength, 
955
44.8k
                           allocAmt - lexer->lexlength );
956
44.8k
          lexer->lexbuf = buf;
957
44.8k
          lexer->lexlength = allocAmt;
958
44.8k
        }
959
44.8k
    }
960
961
1.07G
    lexer->lexbuf[ lexer->lexsize++ ] = ch;
962
1.07G
    lexer->lexbuf[ lexer->lexsize ]   = '\0';  /* debug */
963
1.07G
}
964
965
/* Overwrite the most recently stored byte of the lexer buffer with c.
** Does nothing while the buffer is still empty.
*/
static void ChangeChar( Lexer *lexer, tmbchar c )
{
    if ( lexer->lexsize < 1 )
        return;

    lexer->lexbuf[ lexer->lexsize - 1 ] = c;
}
972
973
/* store character c as UTF-8 encoded byte stream */
974
void TY_(AddCharToLexer)( Lexer *lexer, uint c )
975
1.06G
{
976
1.06G
    int i, err, count = 0;
977
1.06G
    tmbchar buf[10] = {0};
978
    
979
1.06G
    err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
980
1.06G
    if (err)
981
7.75k
    {
982
        /* replacement character 0xFFFD encoded as UTF-8 */
983
7.75k
        buf[0] = (byte) 0xEF;
984
7.75k
        buf[1] = (byte) 0xBF;
985
7.75k
        buf[2] = (byte) 0xBD;
986
7.75k
        count = 3;
987
7.75k
    }
988
    
989
2.13G
    for ( i = 0; i < count; ++i )
990
1.07G
        AddByte( lexer, buf[i] );
991
1.06G
}
992
993
/* Feed every byte of the NUL-terminated string str through
** TY_(AddCharToLexer).
*/
static void AddStringToLexer( Lexer *lexer, ctmbstr str )
{
    /*  Plain char may be signed, and a negative char widened to an
    **  unsigned integer would be sign-extended. Walking the string
    **  through an unsigned char pointer keeps each value in 0..255.
    */
    const unsigned char *p = (const unsigned char *) str;

    for ( ; *p != 0; ++p )
        TY_(AddCharToLexer)( lexer, (uint) *p );
}
1004
1005
1006
/* Copy the input stream's current line and column into the lexer. */
static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
{
    lexer->columns = doc->docIn->curcol;
    lexer->lines   = doc->docIn->curline;
}
1011
1012
/*
1013
    Issue #483
1014
    Have detected the first of a surrogate pair...
1015
    Try to find, decode the second...
1016
    Already have '&' start...
1017
*/
1018
1019
/* Outcome of trying to read the second half of a surrogate pair */
typedef enum
{
    SP_ok,      /* pair decoded and combined */
    SP_failed,  /* pair read, but the combination is out of range */
    SP_error    /* no usable second entity; input was put back */
} SPStatus;
1024
1025
/* Try to read the numeric character reference that completes a
** surrogate pair. The caller has already consumed the leading '&'.
**
**   doc    document whose input stream is read
**   isXml  when set, only a lower case 'x' introduces hex digits
**   pch    in:  value of the first entity of the pair
**          out: combined value (SP_ok) or 0xFFFD (SP_failed);
**               untouched on SP_error
**
** Returns SP_ok when a valid pair was combined, SP_failed when both
** halves were read but the combination is invalid (a warning is
** reported), and SP_error when no usable second entity follows. On
** SP_error every character read here is pushed back onto the input.
*/
static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
{
    Lexer* lexer = doc->lexer;
    uint bufSize = 32;
    uint c, ch = 0, offset = 0;
    uint lastStored = 0;    /* full-width copy of buf[offset-1] */
    tmbstr buf = NULL;
    SPStatus status = SP_error;  /* assume failed */
    int type = 0;   /* assume numeric */
    uint fch = *pch;
    int i;  /* has to be signed due to for i >= 0 */

    if (!lexer)
        return status;
    buf = (tmbstr)TidyRealloc(lexer->allocator, NULL, bufSize);
    if (!buf)
        return status;

    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
    {
        if (c == ';')
        {
            break;  /* reached end of entity */
        }
        if ((offset + 2) > bufSize)
        {
            tmbstr grown;
            bufSize *= 2;
            grown = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
            if (!grown)
            {
                /* Keep the old buffer so it can still be replayed and
                ** freed; c never reached it, so return c to the input
                ** first.
                */
                TY_(UngetChar)(c, doc->docIn);
                break;
            }
            buf = grown;
        }
        /* buf only holds single bytes, which is all sscanf needs; the
        ** untruncated value is remembered for the unget path below.
        */
        buf[offset++] = (tmbchar) c;
        lastStored = c;
        if (offset == 1)
        {
            if (c != '#')   /* is a numeric entity */
                break;
        }
        else if (offset == 2 && ((c == 'x') || (!isXml && c == 'X')))
        {
            type = 1;   /* set hex digits */
        }
        else
        {
            if (type)   /* if hex digits */
            {
                if (!IsDigitHex(c))
                    break;
            }
            else    /* if numeric */
            {
                if (!TY_(IsDigit)(c))
                    break;
            }
        }
    }

    /* offset == 0 means "&;": there is nothing to scan, and buf + 1
    ** would point at bytes that were never written.
    */
    if (c == ';' && offset > 0)
    {
        int scanned;

        buf[offset] = 0;
        if (type)
            scanned = sscanf(buf + 2, "%x", &ch);
        else
            scanned = sscanf(buf + 1, "%u", &ch);   /* ch is unsigned */

        if (scanned == 1 && TY_(IsHighSurrogate)(ch))
        {
            ch = TY_(CombineSurrogatePair)(ch, fch);
            if (TY_(IsValidCombinedChar)(ch))
            {
                *pch = ch;  /* return combined pair value */
                status = SP_ok; /* full success - pair used */
            }
            else
            {
                status = SP_failed; /* is one of the 32 out-of-range pairs */
                *pch = 0xFFFD;  /* return substitute character */
                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_PAIR, fch, ch); /* SP WARNING: -  */
            }
        }
    }

    if (status == SP_error)
    {
        /* Error condition - can only put back all the chars */
        if (c == ';') /* if last, not added to buffer */
            TY_(UngetChar)(c, doc->docIn);
        if (offset)
        {
            /* correct the order for unget - last first.
            ** Only the final stored character can be something other
            ** than '#', 'x'/'X' or a digit, so it is the only one that
            ** must be replayed from its full-width copy; the earlier
            ** entries are read back as unsigned bytes so they are not
            ** sign-extended.
            */
            TY_(UngetChar)(lastStored, doc->docIn);
            for (i = (int)offset - 2; i >= 0; i--)
            {
                c = (unsigned char) buf[i];
                TY_(UngetChar)(c, doc->docIn);
            }
        }
    }

    TidyFree(lexer->allocator, buf);

    return status;
}
1128
1129
/*
1130
  No longer attempts to insert missing ';' for unknown
1131
 entities unless one was present already, since this
1132
  gives unexpected results.
1133
1134
  For example:   <a href="something.htm?foo&bar&fred">
1135
  was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
1136
  rather than:   <a href="something.htm?foo&amp;bar&amp;fred">
1137
1138
  My thanks to Maurice Buxton for spotting this.
1139
1140
  Also Randy Waki pointed out the following case for the
1141
  04 Aug 00 version (bug #433012):
1142
  
1143
  For example:   <a href="something.htm?id=1&lang=en">
1144
  was tidied to: <a href="something.htm?id=1&lang;=en">
1145
  rather than:   <a href="something.htm?id=1&amp;lang=en">
1146
  
1147
  where "lang" is a known entity (#9001), but browsers would
1148
  misinterpret "&lang;" because it had a value > 256.
1149
  
1150
  So the case of an apparently known entity with a value > 256 and
1151
  missing a semicolon is handled specially.
1152
  
1153
  "ParseEntity" is also a bit of a misnomer - it handles entities and
1154
  numeric character references. Invalid NCR's are now reported.
1155
*/
1156
/* Called with the '&' already stored as the last byte of the lexer
** buffer. Reads the rest of the entity or numeric character reference
** from the input, then either replaces the buffered text with the
** decoded character, keeps the text as written, or reports it.
*/
static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
{
    typedef enum
    {
        ENT_default,
        ENT_numdec,
        ENT_numhex
    } ENTState;

    typedef Bool (*ENTfn)(uint);
    /* per-state test for "c may continue the entity" */
    const ENTfn accepts[] = {
        TY_(IsNamechar),
        TY_(IsDigit),
        IsDigitHex
    };
    Lexer* lexer = doc->lexer;
    ENTState phase = ENT_default;
    Bool isXml = cfgBool( doc, TidyXmlTags );
    Bool preserveEntities = cfgBool( doc, TidyPreserveEntities );
    Bool semicolon = no, found = no;
    uint nread = 0;
    uint entver = 0;
    uint start, startcol;
    uint c, ch;

    start = lexer->lexsize - 1;  /* to start at "&" */
    startcol = doc->docIn->curcol - 1;

    while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
    {
        if ( c == ';' )
        {
            semicolon = yes;
            break;
        }
        ++nread;

        if ( nread == 1 && c == '#' )
        {
            /* numeric references are left alone in these configurations */
            if ( !cfgBool(doc, TidyNCR) ||
                 cfg(doc, TidyInCharEncoding) == BIG5 ||
                 cfg(doc, TidyInCharEncoding) == SHIFTJIS )
            {
                TY_(UngetChar)('#', doc->docIn);
                return;
            }
            phase = ENT_numdec;
        }
        else if ( nread == 2 && phase == ENT_numdec
                  && (c == 'x' || (!isXml && c == 'X')) )
        {
            phase = ENT_numhex;
        }
        else if ( !accepts[phase](c) )
        {
            /* not part of the entity: put it back */
            TY_(UngetChar)( c, doc->docIn );
            break;
        }

        TY_(AddCharToLexer)( lexer, c );
    }

    /* make sure entity is NULL terminated */
    lexer->lexbuf[lexer->lexsize] = '\0';

    /* Should constrain version to XML/XHTML if &apos;
    ** is encountered.  But this is not possible with
    ** Tidy's content model bit mask.
    */
    if ( TY_(tmbstrcmp)(lexer->lexbuf+start, "&apos") == 0
         && !cfgBool(doc, TidyXmlOut)
         && !lexer->isvoyager
         && !cfgBool(doc, TidyXhtmlOut)
         && TY_(HTMLVersion)(doc) != HT50 ) /* Issue #239 - no warning if in HTML5++ mode */
    {
        TY_(ReportEntityError)( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );
    }

    if ( mode == OtherNamespace && c == ';' )
    {
        /* #130 MathML attr and entity fix! */
        found = yes;
        ch = 255;
        entver = XH50|HT50;
        preserveEntities = yes;
    }
    else
    {
        /* Lookup entity code and version */
        found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
    }

    /* Issue #483 - Deal with 'surrogate pairs' */
    /* TODO: Maybe warning/error, like found a leading surrogate
       but no following surrogate! Maybe should avoid outputting
       invalid utf-8 for this entity - maybe substitute?  */
    if ( !preserveEntities && found )
    {
        if ( TY_(IsLowSurrogate)(ch) )
        {
            uint next = TY_(ReadChar)(doc->docIn);

            if ( next == '&' )
            {
                /* Have a following entity,
                   so there is a chance of having a valid surrogate pair */
                uint first = ch;    /* keep first value, in case of error */

                if ( GetSurrogatePair(doc, isXml, &ch) == SP_error )
                {
                    TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, first, 0); /* SP WARNING: - using substitute character */
                    TY_(UngetChar)('&', doc->docIn);  /* otherwise put it back */
                }
            }
            else
            {
                /* put this non-entity lead char back */
                TY_(UngetChar)(next, doc->docIn);
                /* Have leading surrogate pair, with no tail */
                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, ch, 0); /* SP WARNING: - using substitute character */
                ch = 0xFFFD;
            }
        }
        else if ( TY_(IsHighSurrogate)(ch) )
        {
            /* Have trailing surrogate pair, with no lead */
            TY_(ReportSurrogateError)(doc, BAD_SURROGATE_LEAD, ch, 0); /* SP WARNING: - using substitute character */
            ch = 0xFFFD;
        }
    }

    /* deal with unrecognized or invalid entities */
    /* #433012 - fix by Randy Waki 17 Feb 01 */
    /* report invalid NCR's - Terry Teague 01 Sep 01 */
    if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )
    {
        /* set error position just before offending character */
        SetLexerLocus( doc, lexer );
        lexer->columns = startcol;

        if ( lexer->lexsize > start + 1 )
        {
            if ( ch >= 128 && ch <= 159 )
            {
                /* invalid numeric character reference:
                ** always assume Win1252 in this circumstance.
                */
                uint repl = TY_(DecodeWin1252)( ch );

                if ( c != ';' )  /* issue warning if not terminated by ';' */
                    TY_(ReportEntityError)( doc, MISSING_SEMICOLON_NCR,
                                            lexer->lexbuf+start, c );

                /* last argument is true when the character is discarded */
                TY_(ReportEncodingError)(doc, INVALID_NCR, ch, repl == 0);

                /* drop the reference text, then store the replacement
                ** if DecodeWin1252 supplied one
                */
                lexer->lexsize = start;
                if ( repl )
                    TY_(AddCharToLexer)( lexer, repl );
                semicolon = no;
            }
            else
            {
                TY_(ReportEntityError)( doc, UNKNOWN_ENTITY,
                                        lexer->lexbuf+start, ch );
            }

            if ( semicolon )
                TY_(AddCharToLexer)( lexer, ';' );
        }
        else if ( TY_(HTMLVersion)(doc) != HT50 )
        {
            /*\
             *  Issue #207 - A naked & is allowed in HTML5, as an unambiguous ampersand!
            \*/
            TY_(ReportEntityError)( doc, UNESCAPED_AMPERSAND,
                                    lexer->lexbuf+start, ch );
        }
    }
    else
    {
        if ( c != ';' )    /* issue warning if not terminated by ';' */
        {
            /* set error position just before offending character */
            SetLexerLocus( doc, lexer );
            lexer->columns = startcol;
            TY_(ReportEntityError)( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );
        }

        if ( preserveEntities )
        {
            /* keep the entity text as written, terminated by ';' */
            TY_(AddCharToLexer)( lexer, ';' );
        }
        else
        {
            /* replace the entity text with the decoded character */
            lexer->lexsize = start;
            if ( ch == 160 && (mode == Preformatted) )
                ch = ' ';
            TY_(AddCharToLexer)( lexer, ch );

            if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
                AddStringToLexer( lexer, "amp;" );
        }

        /* Detect extended vs. basic entities */
        TY_(ConstrainVersion)( doc, entver );
    }
}
1381
1382
/* Read the remainder of a tag name into the lexer buffer; its first
** character is already stored at lexer->txtstart. Unless TidyXmlTags
** is set, uppercase characters are folded to lower case. Sets
** lexer->txtend and returns the character that ended the name,
** narrowed to tmbchar.
*/
static tmbchar ParseTagName( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;
    uint c = lexer->lexbuf[ lexer->txtstart ];
    Bool xml = cfgBool(doc, TidyXmlTags);

    /* fold case of first character in buffer */
    if ( !xml && TY_(IsUpper)(c) )
        lexer->lexbuf[lexer->txtstart] = (tmbchar) TY_(ToLower)(c);

    for (;;)
    {
        c = TY_(ReadChar)(doc->docIn);
        if ( c == EndOfStream )
            break;

        if ( xml )
        {
            if ( !TY_(IsXMLNamechar)(c) )
                break;
        }
        else
        {
            if ( !TY_(IsNamechar)(c) )
                break;

            /* fold case of subsequent characters */
            if ( TY_(IsUpper)(c) )
                c = TY_(ToLower)(c);
        }

        TY_(AddCharToLexer)(lexer, c);
    }

    lexer->txtend = lexer->lexsize;
    return (tmbchar) c;
}
1408
1409
/*
1410
  Used for elements and text nodes
1411
  element name is NULL for text nodes
1412
  start and end are offsets into lexbuf
1413
  which contains the textual content of
1414
  all elements in the parse tree.
1415
1416
  parent and content allow traversal
1417
  of the parse tree in any direction.
1418
  attributes are represented as a linked
1419
  list of AttVal nodes which hold the
1420
  strings for attribute/value pairs.
1421
*/
1422
1423
1424
/* Allocate a zero-filled node of type TextNode. When a lexer is
** supplied, the node records the lexer's current line and column.
*/
Node *TY_(NewNode)(TidyAllocator* allocator, Lexer *lexer)
{
    Node* node = (Node*) TidyAlloc( allocator, sizeof(Node) );

    TidyClearMemory( node, sizeof(Node) );
    node->type = TextNode;

    if ( lexer != NULL )
    {
        node->line   = lexer->lines;
        node->column = lexer->columns;
    }
#if defined(ENABLE_DEBUG_LOG) && defined(DEBUG_ALLOCATION)
    SPRTF("Allocated node %p\n", node );
#endif
    return node;
}
1439
1440
/* used to clone heading nodes when split by an <HR> */
1441
Node *TY_(CloneNode)( TidyDocImpl* doc, Node *element )
1442
42.3k
{
1443
42.3k
    Lexer* lexer = doc->lexer;
1444
42.3k
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1445
1446
42.3k
    node->start = lexer->lexsize;
1447
42.3k
    node->end   = lexer->lexsize;
1448
1449
42.3k
    if ( element )
1450
42.3k
    {
1451
42.3k
        node->parent     = element->parent;
1452
42.3k
        node->type       = element->type;
1453
42.3k
        node->closed     = element->closed;
1454
42.3k
        node->implicit   = element->implicit;
1455
42.3k
        node->tag        = element->tag;
1456
42.3k
        node->element    = TY_(tmbstrdup)( doc->allocator, element->element );
1457
42.3k
        node->attributes = TY_(DupAttrs)( doc, element->attributes );
1458
42.3k
    }
1459
42.3k
    return node;
1460
42.3k
}
1461
1462
/* free node's attributes */
1463
void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node )
1464
8.51M
{
1465
9.46M
    while ( node->attributes )
1466
950k
    {
1467
950k
        AttVal *av = node->attributes;
1468
1469
950k
        if ( av->attribute )
1470
833k
        {
1471
833k
            if ( (attrIsID(av) || attrIsNAME(av)) &&
1472
193k
                 TY_(IsAnchorElement)(doc, node) )
1473
70.6k
            {
1474
70.6k
                TY_(RemoveAnchorByNode)( doc, av->value, node );
1475
70.6k
            }
1476
833k
        }
1477
1478
950k
        node->attributes = av->next;
1479
950k
        TY_(FreeAttribute)( doc, av );
1480
950k
    }
1481
8.51M
}
1482
1483
/* doesn't repair attribute list linkage */
1484
void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av )
1485
1.26M
{
1486
1.26M
    TY_(FreeNode)( doc, av->asp );
1487
1.26M
    TY_(FreeNode)( doc, av->php );
1488
1.26M
    TidyDocFree( doc, av->attribute );
1489
1.26M
    TidyDocFree( doc, av->value );
1490
1.26M
    TidyDocFree( doc, av );
1491
1.26M
}
1492
1493
/* detach attribute from node
1494
*/
1495
void TY_(DetachAttribute)( Node *node, AttVal *attr )
1496
16.0k
{
1497
16.0k
    AttVal *av, *prev = NULL;
1498
1499
1.01M
    for ( av = node->attributes; av; av = av->next )
1500
1.01M
    {
1501
1.01M
        if ( av == attr )
1502
16.0k
        {
1503
16.0k
            if ( prev )
1504
8.61k
                prev->next = attr->next;
1505
7.45k
            else
1506
7.45k
                node->attributes = attr->next;
1507
16.0k
            break;
1508
16.0k
        }
1509
1.00M
        prev = av;
1510
1.00M
    }
1511
16.0k
}
1512
1513
/* detach attribute from node then free it
1514
*/
1515
void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr )
1516
16.0k
{
1517
16.0k
    TY_(DetachAttribute)( node, attr );
1518
16.0k
    TY_(FreeAttribute)( doc, attr );
1519
16.0k
}
1520
1521
/*
1522
  Free document nodes by iterating through peers and recursing
1523
  through children. Set next to NULL before calling TY_(FreeNode)()
1524
  to avoid freeing peer nodes. Doesn't patch up prev/next links.
1525
 */
1526
void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
1527
13.4M
{
1528
#if defined(ENABLE_DEBUG_LOG) && defined(DEBUG_ALLOCATION)
1529
    /* avoid showing free of root node! */
1530
    if (node) {
1531
        if (RootNode != node->type) {
1532
            SPRTF("Free node %p\n", node);
1533
        }
1534
        else {
1535
            SPRTF("Root node %p\n", node);
1536
        }
1537
    }
1538
#endif
1539
1540
21.9M
    while ( node )
1541
8.51M
    {
1542
8.51M
        Node* next = node->next;
1543
1544
8.51M
        TY_(FreeAttrs)( doc, node );
1545
8.51M
        TY_(FreeNode)( doc, node->content );
1546
8.51M
        TidyDocFree( doc, node->element );
1547
8.51M
        if (RootNode != node->type)
1548
8.51M
            TidyDocFree( doc, node );
1549
78.6k
        else
1550
78.6k
            node->content = NULL;
1551
1552
8.51M
        node = next;
1553
8.51M
    }
1554
13.4M
}
1555
1556
/* Create a node spanning the lexer's current txtstart..txtend range. */
Node* TY_(TextToken)( Lexer *lexer )
{
    Node *text = TY_(NewNode)( lexer->allocator, lexer );

    text->start = lexer->txtstart;
    text->end   = lexer->txtend;
    return text;
}
1563
1564
/* used for creating preformatted text from Word2000 */
1565
Node *TY_(NewLineNode)( Lexer *lexer )
1566
0
{
1567
0
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1568
0
    node->start = lexer->lexsize;
1569
0
    TY_(AddCharToLexer)( lexer, (uint)'\n' );
1570
0
    node->end = lexer->lexsize;
1571
0
    return node;
1572
0
}
1573
1574
/* used for adding a &nbsp; for Word2000 */
1575
Node* TY_(NewLiteralTextNode)( Lexer *lexer, ctmbstr txt )
1576
0
{
1577
0
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1578
0
    node->start = lexer->lexsize;
1579
0
    AddStringToLexer( lexer, txt );
1580
0
    node->end = lexer->lexsize;
1581
0
    return node;
1582
0
}
1583
1584
/* Create a node of the given type whose element name is a copy of the
** lexer's txtstart..txtend text. The node's own text range is empty,
** positioned at txtstart. Start, start-end and end tags are passed to
** TY_(FindTag).
*/
static Node* TagToken( TidyDocImpl* doc, NodeType type )
{
    Lexer* lexer = doc->lexer;
    Node* tok = TY_(NewNode)( lexer->allocator, lexer );

    tok->type = type;
    tok->element = TY_(tmbstrndup)( doc->allocator,
                                    lexer->lexbuf + lexer->txtstart,
                                    lexer->txtend - lexer->txtstart );
    tok->start = lexer->txtstart;
    tok->end = lexer->txtstart;

    switch ( type )
    {
    case StartTag:
    case StartEndTag:
    case EndTag:
        TY_(FindTag)(doc, tok);
        break;
    default:
        break;
    }

    return tok;
}
1600
1601
/* Create a node of the given type spanning the lexer's current
** txtstart..txtend range.
*/
static Node* NewToken(TidyDocImpl* doc, NodeType type)
{
    Lexer* lexer = doc->lexer;
    Node* tok = TY_(NewNode)(lexer->allocator, lexer);

    tok->type  = type;
    tok->start = lexer->txtstart;
    tok->end   = lexer->txtend;
    return tok;
}
1610
1611
674k
/* Shorthands for NewToken(), one per non-element node type */
#define CommentToken(doc)   NewToken(doc, CommentTag)
#define DocTypeToken(doc)   NewToken(doc, DocTypeTag)
#define PIToken(doc)        NewToken(doc, ProcInsTag)
#define AspToken(doc)       NewToken(doc, AspTag)
#define JsteToken(doc)      NewToken(doc, JsteTag)
#define PhpToken(doc)       NewToken(doc, PhpTag)
#define XmlDeclToken(doc)   NewToken(doc, XmlDecl)
#define SectionToken(doc)   NewToken(doc, SectionTag)
#define CDATAToken(doc)     NewToken(doc, CDATATag)
1620
1621
/* Append the bytes of str to the lexer buffer unchanged. */
void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str )
{
    /*\
     *  Issue #286
     *  This used to go through TY_(AddCharToLexer)(), which encodes
     *  its argument as UTF-8. The data arriving here is already
     *  'translated' text from an internal location, so every byte is
     *  handed to AddByte() as it stands.
    \*/
    for ( ; *str != '\0'; ++str )
        AddByte( lexer, (byte) *str );
}
1635
1636
/*
1637
void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
1638
{
1639
    byte c;
1640
    int ix;
1641
1642
    for ( ix=0; ix < len && (c = *str++); ++ix )
1643
        TY_(AddCharToLexer)(lexer, c);
1644
}
1645
*/
1646
1647
/* find doctype element */
1648
Node *TY_(FindDocType)( TidyDocImpl* doc )
1649
123k
{
1650
123k
    Node* node;
1651
123k
    for ( node = (doc ? doc->root.content : NULL);
1652
424k
          node && node->type != DocTypeTag; 
1653
300k
          node = node->next )
1654
300k
        /**/;
1655
123k
    return node;
1656
123k
}
1657
1658
/* find parent container element */
1659
Node* TY_(FindContainer)( Node* node )
1660
0
{
1661
0
    for ( node = (node ? node->parent : NULL);
1662
0
          node && TY_(nodeHasCM)(node, CM_INLINE);
1663
0
          node = node->parent )
1664
0
        /**/;
1665
1666
0
    return node;
1667
0
}
1668
1669
1670
/* find html element */
1671
Node *TY_(FindHTML)( TidyDocImpl* doc )
1672
364k
{
1673
364k
    Node *node;
1674
364k
    for ( node = (doc ? doc->root.content : NULL);
1675
1.28M
          node && !nodeIsHTML(node); 
1676
916k
          node = node->next )
1677
916k
        /**/;
1678
1679
364k
    return node;
1680
364k
}
1681
1682
/* find XML Declaration */
1683
Node *TY_(FindXmlDecl)(TidyDocImpl* doc)
1684
1.96k
{
1685
1.96k
    Node *node;
1686
1.96k
    for ( node = (doc ? doc->root.content : NULL);
1687
3.90k
          node && !(node->type == XmlDecl);
1688
1.96k
          node = node->next )
1689
1.94k
        /**/;
1690
1691
1.96k
    return node;
1692
1.96k
}
1693
1694
1695
/* Find the head element among the direct children of the html
** element. Returns NULL when either element is missing.
*/
Node *TY_(FindHEAD)( TidyDocImpl* doc )
{
    Node *node = TY_(FindHTML)( doc );

    if ( node == NULL )
        return node;

    node = node->content;
    while ( node && !nodeIsHEAD(node) )
        node = node->next;

    return node;
}
1709
1710
/* Find the title element among the direct children of the head
** element. Returns NULL when either element is missing.
*/
Node *TY_(FindTITLE)(TidyDocImpl* doc)
{
    Node *node = TY_(FindHEAD)(doc);

    if ( node == NULL )
        return node;

    node = node->content;
    while ( node && !nodeIsTITLE(node) )
        node = node->next;

    return node;
}
1721
1722
/* Find the body element. The search looks at the html element's
** children for a body or frameset; for a frameset it continues into
** the frameset's noframes child and looks for a body there.
** Returns NULL when no body is found on that path.
*/
Node *TY_(FindBody)( TidyDocImpl* doc )
{
    Node *node;

    /* locate the html element */
    for ( node = ( doc ? doc->root.content : NULL );
          node && !nodeIsHTML(node);
          node = node->next )
        /**/;

    if ( node == NULL )
        return NULL;

    /* first body or frameset below html */
    for ( node = node->content;
          node && !nodeIsBODY(node) && !nodeIsFRAMESET(node);
          node = node->next )
        /**/;

    if ( !node || !nodeIsFRAMESET(node) )
        return node;

    /* frameset: descend to its noframes child ... */
    for ( node = node->content;
          node && !nodeIsNOFRAMES(node);
          node = node->next )
        /**/;

    if ( node == NULL )
        return node;

    /* ... and look for the body inside it */
    for ( node = node->content;
          node && !nodeIsBODY(node);
          node = node->next )
        /**/;

    return node;
}
1752
1753
/* add meta element for Tidy */
1754
Bool TY_(AddGenerator)( TidyDocImpl* doc )
1755
39.0k
{
1756
39.0k
    AttVal *attval;
1757
39.0k
    Node *node;
1758
39.0k
    Node *head = TY_(FindHEAD)( doc );
1759
39.0k
    tmbchar buf[256];
1760
    
1761
39.0k
    if (head)
1762
39.0k
    {
1763
39.0k
#ifdef PLATFORM_NAME
1764
39.0k
        TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 for "PLATFORM_NAME" version %s",
1765
39.0k
                         tidyLibraryVersion());
1766
#else
1767
        TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 version %s", tidyLibraryVersion());
1768
#endif
1769
1770
105k
        for ( node = head->content; node; node = node->next )
1771
66.1k
        {
1772
66.1k
            if ( nodeIsMETA(node) )
1773
3.25k
            {
1774
3.25k
                attval = TY_(AttrGetById)(node, TidyAttr_NAME);
1775
1776
3.25k
                if (AttrValueIs(attval, "generator"))
1777
640
                {
1778
640
                    attval = TY_(AttrGetById)(node, TidyAttr_CONTENT);
1779
1780
640
                    if (AttrHasValue(attval) &&
1781
210
                        TY_(tmbstrncasecmp)(attval->value, "HTML Tidy", 9) == 0)
1782
0
                    {
1783
                        /* update the existing content to reflect the */
1784
                        /* actual version of Tidy currently being used */
1785
                        
1786
0
                        TidyDocFree(doc, attval->value);
1787
0
                        attval->value = TY_(tmbstrdup)(doc->allocator, buf);
1788
0
                        return no;
1789
0
                    }
1790
640
                }
1791
3.25k
            }
1792
66.1k
        }
1793
1794
39.0k
        if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
1795
39.0k
        {
1796
39.0k
            node = TY_(InferredTag)(doc, TidyTag_META);
1797
39.0k
            TY_(AddAttribute)( doc, node, "name", "generator" );
1798
39.0k
            TY_(AddAttribute)( doc, node, "content", buf );
1799
39.0k
            TY_(InsertNodeAtStart)( head, node );
1800
39.0k
            return yes;
1801
39.0k
        }
1802
39.0k
    }
1803
1804
0
    return no;
1805
39.0k
}
1806
1807
/*\ examine <!DOCTYPE ...> to identify version 
1808
 *  Issue #167 and #169
1809
 *   If HTML5
1810
 *        <!DOCTYPE html>
1811
 *       <!DOCTYPE html SYSTEM "about:legacy-compat">
1812
 *   else others
1813
\*/
1814
/* Derive the version bits announced by a DOCTYPE node.
**
** Without a PUBLIC identifier value the result is VERS_HTML5 when the
** doctype name is "html" (compared case-insensitively, Is. #815) and
** VERS_UNKNOWN otherwise.  With a PUBLIC identifier the version comes
** from GetVersFromFPI(); an XHTML version switches on XML/XHTML output
** and marks the lexer as voyager.  The stored identifier is then
** replaced by the string GetFPIFromVers() gives for that version.
*/
static uint FindGivenVersion( TidyDocImpl* doc, Node* doctype )
{
    AttVal *publicId = TY_(GetAttrByName)(doctype, "PUBLIC");
    uint version;

    if ( publicId == NULL || publicId->value == NULL )
    {
        /* See REC: https://www.w3.org/TR/html5/syntax.html#the-doctype */
        if ( doctype->element != NULL &&
             TY_(tmbstrcasecmp)(doctype->element, "html") == 0 )
            return VERS_HTML5;  /* TODO: do we need to check MORE? */

        /* TODO: Consider warning, error message */
        return VERS_UNKNOWN;
    }

    version = GetVersFromFPI(publicId->value);

    if ( version & VERS_XHTML )
    {
        TY_(SetOptionBool)(doc, TidyXmlOut, yes);
        TY_(SetOptionBool)(doc, TidyXhtmlOut, yes);
        doc->lexer->isvoyager = yes;
    }

    /* todo: add a warning if case does not match? */
    TidyDocFree(doc, publicId->value);
    publicId->value = TY_(tmbstrdup)(doc->allocator, GetFPIFromVers(version));

    return version;
}
1847
1848
/* return guessed version */
1849
uint TY_(ApparentVersion)( TidyDocImpl* doc )
1850
78.2k
{
1851
78.2k
    if ((doc->lexer->doctype == XH11 ||
1852
78.1k
         doc->lexer->doctype == XB10) &&
1853
128
        (doc->lexer->versions & doc->lexer->doctype))
1854
102
        return doc->lexer->doctype;
1855
78.1k
    else
1856
78.1k
        return TY_(HTMLVersion)(doc);
1857
78.2k
}
1858
1859
/* Return the name GetNameFromVers() yields for the version code `vers`.
** The isXhtml argument is not used.
*/
ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool ARG_UNUSED(isXhtml) )
{
    return GetNameFromVers(vers);
}
1864
1865
/* Look up `vers` in the W3C_Doctypes table and return the vers_out
** field of the first entry whose vers field equals it.  Returns
** VERS_UNKNOWN when no entry matches.
*/
uint TY_(HTMLVersionNumberFromCode)( uint vers )
{
    uint idx = 0;

    /* the table ends at the first entry without a name */
    while (W3C_Doctypes[idx].name)
    {
        if (W3C_Doctypes[idx].vers == vers)
            return W3C_Doctypes[idx].vers_out;
        ++idx;
    }

    return VERS_UNKNOWN;
}
1875
1876
/* Tell whether a warning about a missing system identifier is due.
**
** Returns yes only when all of the following hold: the lexer is not in
** XHTML (voyager) mode, the emitted version has a name, a system
** identifier exists for the emitted version, and the document has a
** DOCTYPE node without a SYSTEM attribute.
*/
Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;
    Bool isXhtml = lexer->isvoyager;
    Node *doctype;

    /* Do not warn in XHTML mode */
    if ( isXhtml )
        return no;

    /* Do not warn if emitted doctype is proprietary */
    if ( TY_(HTMLVersionNameFromCode)(lexer->versionEmitted, isXhtml ) == NULL )
        return no;

    /* Do not warn if no SI is possible */
    if ( GetSIFromVers(lexer->versionEmitted) == NULL )
        return no;

    doctype = TY_(FindDocType)( doc );
    if ( doctype == NULL )
        return no;

    return ( TY_(GetAttrByName)(doctype, "SYSTEM") == NULL ) ? yes : no;
}
1899
1900
1901
/* Put DOCTYPE declaration between the
1902
** <?xml version "1.0" ... ?> declaration, if any,
1903
** and the <html> tag.  Should also work for any comments, 
1904
** etc. that may precede the <html> tag.
1905
*/
1906
1907
static Node* NewDocTypeNode( TidyDocImpl* doc )
1908
34.4k
{
1909
34.4k
    Node* doctype = NULL;
1910
34.4k
    Node* html = TY_(FindHTML)( doc );
1911
1912
34.4k
    if ( !html )
1913
0
        return NULL;
1914
1915
34.4k
    doctype = TY_(NewNode)( doc->allocator, NULL );
1916
34.4k
    doctype->type = DocTypeTag;
1917
34.4k
    TY_(InsertNodeBeforeElement)(html, doctype);
1918
34.4k
    return doctype;
1919
34.4k
}
1920
1921
/* Bring the DOCTYPE into line with the XHTML version being emitted.
**
** lexer->versionEmitted is first set from ApparentVersion() and then
** refined according to the configured TidyDoctypeMode:
**   - omit:   an existing DOCTYPE node is discarded; returns yes.
**   - user:   returns no at once if no TidyDoctype string is configured.
**   - html5 / strict / loose / user: the PUBLIC and SYSTEM attributes of
**             the DOCTYPE node are repaired; returns no.
**   - auto:   returns yes when the declared doctype is unknown/HTML5, or
**             is XH11/XB10 and still allowed by lexer->versions;
**             otherwise the identifiers are repaired for the best
**             remaining XHTML version (returns no), or the DOCTYPE node
**             is discarded when nothing fits (returns no).
**
** A DOCTYPE node is created when the document has none.  If it cannot
** be created because there is no <html> element, the function returns
** no without touching the tree.
*/
Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;
    Node *doctype = TY_(FindDocType)( doc );
    TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
    ctmbstr pub = "PUBLIC";
    ctmbstr sys = "SYSTEM";

    lexer->versionEmitted = TY_(ApparentVersion)( doc );

    if (dtmode == TidyDoctypeOmit)
    {
        if (doctype)
            TY_(DiscardElement)(doc, doctype);
        return yes;
    }

    if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype))
        return no;

    if (!doctype)
    {
        doctype = NewDocTypeNode(doc);

        /* NewDocTypeNode() returns NULL when there is no <html> element;
           without this guard the assignment below dereferences NULL */
        if (!doctype)
            return no;

        doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
    }
    else
    {
        doctype->element = TY_(tmbstrtolower)(doctype->element);
    }

    switch(dtmode)
    {
    case TidyDoctypeHtml5:
        /* HTML5 */
        TY_(RepairAttrValue)(doc, doctype, pub, NULL);
        TY_(RepairAttrValue)(doc, doctype, sys, NULL);
        lexer->versionEmitted = XH50;
        break;
    case TidyDoctypeStrict:
        /* XHTML 1.0 Strict */
        TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
        TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
        lexer->versionEmitted = X10S;
        break;
    case TidyDoctypeLoose:
        /* XHTML 1.0 Transitional */
        TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
        TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
        lexer->versionEmitted = X10T;
        break;
    case TidyDoctypeUser:
        /* user defined document type declaration */
        TY_(RepairAttrValue)(doc, doctype, pub, cfgStr(doc, TidyDoctype));
        TY_(RepairAttrValue)(doc, doctype, sys, "");
        break;
    case TidyDoctypeAuto:
        if (lexer->doctype == VERS_UNKNOWN || lexer->doctype == VERS_HTML5) {
          lexer->versionEmitted = XH50;
          return yes;
        }
        else if ((lexer->versions & XH11) && lexer->doctype == XH11)
        {
            if (!TY_(GetAttrByName)(doctype, sys))
                TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
            lexer->versionEmitted = XH11;
            return yes;
        }
        else if ((lexer->versions & XH11) && !(lexer->versions & VERS_HTML40))
        {
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(XH11));
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
            lexer->versionEmitted = XH11;
        }
        else if ((lexer->versions & XB10) && lexer->doctype == XB10)
        {
            if (!TY_(GetAttrByName)(doctype, sys))
                TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XB10));
            lexer->versionEmitted = XB10;
            return yes;
        }
        else if (lexer->versions & VERS_HTML40_STRICT)
        {
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
            lexer->versionEmitted = X10S;
        }
        else if (lexer->versions & VERS_FRAMESET)
        {
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10F));
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10F));
            lexer->versionEmitted = X10F;
        }
        else if (lexer->versions & VERS_LOOSE)
        {
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
            lexer->versionEmitted = X10T;
        }
        else if (lexer->versions & VERS_HTML5)
        {
            /*\
             *  Issue #273 - If still a html5/xhtml5 bit
             *  existing, that is the 'ConstrainVersion' has
             *  not eliminated all HTML5, then nothing to do here.
             *  Certainly do **not** delete the DocType node!
             *  see: http://www.w3.org/QA/Tips/Doctype
            \*/
        }
        else
        {
            /* no version fits: drop the DOCTYPE (known non-NULL here) */
            TY_(DiscardElement)(doc, doctype);
            return no;
        }
        break;
    case TidyDoctypeOmit:
        /* handled by the early return above */
        assert(0);
        break;
    }

    return no;
}
2043
2044
/* fixup doctype if missing */
2045
Bool TY_(FixDocType)( TidyDocImpl* doc )
2046
81
{
2047
81
    Lexer* lexer = doc->lexer;
2048
81
    Node* doctype = TY_(FindDocType)( doc );
2049
81
    uint dtmode = cfg( doc, TidyDoctypeMode );
2050
81
    uint guessed = VERS_UNKNOWN;
2051
81
    Bool hadSI = no;
2052
2053
    /* Issue #167 - found doctype, and doctype is default VERS_HTML5, set VERS_HTML5 and return yes */
2054
81
    if (doctype && (dtmode == TidyDoctypeAuto) &&
2055
3
        (lexer->doctype == VERS_HTML5) )
2056
1
    {
2057
        /* The version emitted cannot be a composite value! */
2058
1
        lexer->versionEmitted = HT50;
2059
1
        return yes;
2060
1
    }
2061
80
    if (dtmode == TidyDoctypeAuto &&
2062
80
        lexer->versions & lexer->doctype &&
2063
0
        !(VERS_XHTML & lexer->doctype && !lexer->isvoyager)
2064
0
        && TY_(FindDocType)(doc))
2065
0
    {
2066
0
        lexer->versionEmitted = lexer->doctype;
2067
0
        return yes;
2068
0
    }
2069
2070
80
    if (dtmode == TidyDoctypeOmit)
2071
0
    {
2072
0
        if (doctype)
2073
0
            TY_(DiscardElement)( doc, doctype );
2074
0
        lexer->versionEmitted = TY_(ApparentVersion)( doc );
2075
0
        return yes;
2076
0
    }
2077
2078
80
    if (cfgBool(doc, TidyXmlOut))
2079
19
        return yes;
2080
2081
61
    if (doctype)
2082
2
        hadSI = TY_(GetAttrByName)(doctype, "SYSTEM") != NULL;
2083
2084
61
    if ((dtmode == TidyDoctypeStrict ||
2085
61
         dtmode == TidyDoctypeLoose) && doctype)
2086
0
    {
2087
0
        TY_(DiscardElement)(doc, doctype);
2088
0
        doctype = NULL;
2089
0
    }
2090
2091
61
    switch (dtmode)
2092
61
    {
2093
0
    case TidyDoctypeHtml5:
2094
0
        guessed = HT50;
2095
0
        break;
2096
0
    case TidyDoctypeStrict:
2097
0
        guessed = H41S;
2098
0
        break;
2099
0
    case TidyDoctypeLoose:
2100
0
        guessed = H41T;
2101
0
        break;
2102
61
    case TidyDoctypeAuto:
2103
61
        guessed = TY_(HTMLVersion)(doc);
2104
61
        break;
2105
61
    }
2106
2107
61
    lexer->versionEmitted = guessed;
2108
61
    if (guessed == VERS_UNKNOWN)
2109
0
        return no;
2110
2111
61
    if (doctype)
2112
2
    {
2113
2
        doctype->element = TY_(tmbstrtolower)(doctype->element);
2114
2
    }
2115
59
    else
2116
59
    {
2117
59
        doctype = NewDocTypeNode(doc);
2118
59
        doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
2119
59
    }
2120
2121
61
    TY_(RepairAttrValue)(doc, doctype, "PUBLIC", GetFPIFromVers(guessed));
2122
2123
61
    if (hadSI)
2124
0
        TY_(RepairAttrValue)(doc, doctype, "SYSTEM", GetSIFromVers(guessed));
2125
2126
61
    return yes;
2127
61
}
2128
2129
/* ensure XML document starts with <?xml version="1.0"?> */
2130
/* add encoding attribute if not using ASCII or UTF-8 output */
2131
/* Make sure the document starts with an XML declaration.
**
** If the first child of the root is not an XmlDecl node, one is created
** and placed in front of all other root content.  A "version"
** attribute ("1.0") is added when missing.  An "encoding" attribute is
** added when missing, the output encoding is not UTF8, and a name is
** known for that encoding.  Always returns yes.
*/
Bool TY_(FixXmlDecl)( TidyDocImpl* doc )
{
    Node* xml;
    AttVal *version, *encoding;
    Lexer*lexer = doc->lexer;
    Node* root = &doc->root;

    if ( root->content && root->content->type == XmlDecl )
    {
        xml = root->content;
    }
    else
    {
        xml = TY_(NewNode)(lexer->allocator, lexer);
        xml->type = XmlDecl;
        if ( root->content )
            TY_(InsertNodeBeforeElement)(root->content, xml);
        else
            /* Empty root: link the node through the tree helper rather
               than assigning root->content directly, so the parent and
               last-child links are maintained as well. */
            TY_(InsertNodeAtStart)(root, xml);
    }

    version = TY_(GetAttrByName)(xml, "version");
    encoding = TY_(GetAttrByName)(xml, "encoding");

    /*
      We need to insert a check if declared encoding 
      and output encoding mismatch and fix the XML
      declaration accordingly!!!
    */

    if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 )
    {
        ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
        if ( enc )
            TY_(AddAttribute)( doc, xml, "encoding", enc );
    }

    if ( version == NULL )
        TY_(AddAttribute)( doc, xml, "version", "1.0" );
    return yes;
}
2172
2173
/* Build a start-tag node for the element identified by `id`.
**
** The node is flagged as implicit, takes its element name and tag
** definition from the dictionary entry for `id`, and its start/end
** offsets from the lexer's current txtstart/txtend.
*/
Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id)
{
    Lexer *lexer = doc->lexer;
    Node *inferred = TY_(NewNode)( lexer->allocator, lexer );
    const Dict* tagDef = TY_(LookupTagDef)(id);

    assert( tagDef != NULL );

    inferred->type     = StartTag;
    inferred->implicit = yes;
    inferred->tag      = tagDef;
    inferred->element  = TY_(tmbstrdup)(doc->allocator, tagDef->name);
    inferred->start    = lexer->txtstart;
    inferred->end      = lexer->txtend;

    return inferred;
}
2190
2191
/* Tell whether `node` is a start tag that may have content.
** Start tags of unknown elements (no tag definition) count as
** expecting content; elements whose model has CM_EMPTY do not.
*/
static Bool ExpectsContent(Node *node)
{
    if (node->type != StartTag)
        return no;

    /* a known element declared empty takes no content */
    if (node->tag != NULL && (node->tag->model & CM_EMPTY))
        return no;

    /* unknown element, or known element with a non-empty model */
    return yes;
}
2205
2206
/*
2207
  create a text node for the contents of
2208
  a CDATA element like style or script
2209
  which ends with </foo> for some foo.
2210
*/
2211
2212
/* Scanner states used by GetCDATA() */
typedef enum
{
    CDATA_INTERMEDIATE = 0, /* plain content, not inside a tag name */
    CDATA_STARTTAG,         /* '<' + Letter found */
    CDATA_ENDTAG            /* '<' + '/' + Letter found */
} CDATAState;
2218
2219
static Node *GetCDATA( TidyDocImpl* doc, Node *container )
2220
7.54k
{
2221
7.54k
    Lexer* lexer = doc->lexer;
2222
7.54k
    uint start = 0;
2223
7.54k
    int nested = 0;
2224
7.54k
    CDATAState state = CDATA_INTERMEDIATE;
2225
7.54k
    uint i;
2226
7.54k
    Bool isEmpty = yes;
2227
7.54k
    Bool matches = no;
2228
7.54k
    uint c;
2229
7.54k
    Bool hasSrc = (TY_(AttrGetById)(container, TidyAttr_SRC) != NULL) ? yes : no;
2230
    /*\ Issue #65 (1642186) and #280 - is script or style, and the option on
2231
     *  If yes, then avoid incrementing nested...
2232
    \*/
2233
7.54k
    Bool nonested = ((nodeIsSCRIPT(container) || (nodeIsSTYLE(container))) && 
2234
6.81k
        cfgBool(doc, TidySkipNested)) ? yes : no;
2235
2236
7.54k
    SetLexerLocus( doc, lexer );
2237
7.54k
    lexer->waswhite = no;
2238
7.54k
    lexer->txtstart = lexer->txtend = lexer->lexsize;
2239
2240
    /* seen start tag, look for matching end tag */
2241
51.3M
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
2242
51.3M
    {
2243
51.3M
        TY_(AddCharToLexer)(lexer, c);
2244
51.3M
        lexer->txtend = lexer->lexsize;
2245
2246
51.3M
        if (state == CDATA_INTERMEDIATE)
2247
51.0M
        {
2248
51.0M
            if (c != '<')
2249
50.9M
            {
2250
50.9M
                if (isEmpty && !TY_(IsWhite)(c))
2251
3.15k
                    isEmpty = no;
2252
50.9M
                continue;
2253
50.9M
            }
2254
2255
38.1k
            c = TY_(ReadChar)(doc->docIn);
2256
2257
38.1k
            if (TY_(IsLetter)(c))
2258
15.0k
            {
2259
                /* <head><script src=foo><meta name=foo content=bar>*/
2260
15.0k
                if (hasSrc && isEmpty && nodeIsSCRIPT(container))
2261
1.18k
                {
2262
                    /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
2263
1.18k
                    lexer->lexsize = lexer->txtstart;
2264
1.18k
                    TY_(UngetChar)(c, doc->docIn);
2265
1.18k
                    TY_(UngetChar)('<', doc->docIn);
2266
1.18k
                    return NULL;
2267
1.18k
                }
2268
13.9k
                TY_(AddCharToLexer)(lexer, c);
2269
13.9k
                start = lexer->lexsize - 1;
2270
13.9k
                state = CDATA_STARTTAG;
2271
13.9k
            }
2272
23.0k
            else if (c == '/')
2273
14.8k
            {
2274
14.8k
                TY_(AddCharToLexer)(lexer, c);
2275
2276
14.8k
                c = TY_(ReadChar)(doc->docIn);
2277
                
2278
14.8k
                if (!TY_(IsLetter)(c))
2279
1.25k
                {
2280
1.25k
                    TY_(UngetChar)(c, doc->docIn);
2281
1.25k
                    continue;
2282
1.25k
                }
2283
13.5k
                TY_(UngetChar)(c, doc->docIn);
2284
2285
13.5k
                start = lexer->lexsize;
2286
13.5k
                state = CDATA_ENDTAG;
2287
13.5k
            }
2288
8.25k
            else if (c == '\\')
2289
2.41k
            {
2290
                /* recognize document.write("<script><\/script>") */
2291
2.41k
                TY_(AddCharToLexer)(lexer, c);
2292
2293
2.41k
                c = TY_(ReadChar)(doc->docIn);
2294
2295
2.41k
                if (c != '/')
2296
570
                {
2297
570
                    TY_(UngetChar)(c, doc->docIn);
2298
570
                    continue;
2299
570
                }
2300
2301
1.84k
                TY_(AddCharToLexer)(lexer, c);
2302
2303
1.84k
                if (nonested) {
2304
                    /*\ 
2305
                     *  Issue #65 - for version 5.1.14.EXP2
2306
                     *  If the nonested option is ON then the <script> 
2307
                     *  tag did not bump nested, so no need to treat this as 
2308
                     *  an end tag just to decrease nested, just continue!
2309
                    \*/
2310
401
                    continue;
2311
401
                }
2312
2313
1.44k
                c = TY_(ReadChar)(doc->docIn);
2314
                
2315
1.44k
                if (!TY_(IsLetter)(c))
2316
1.00k
                {
2317
1.00k
                    TY_(UngetChar)(c, doc->docIn);
2318
1.00k
                    continue;
2319
1.00k
                }
2320
437
                TY_(UngetChar)(c, doc->docIn);
2321
2322
437
                start = lexer->lexsize;
2323
437
                state = CDATA_ENDTAG;
2324
437
            }
2325
5.84k
            else
2326
5.84k
            {
2327
5.84k
                TY_(UngetChar)(c, doc->docIn);
2328
5.84k
            }
2329
38.1k
        }
2330
        /* '<' + Letter found */
2331
374k
        else if (state == CDATA_STARTTAG)
2332
45.5k
        {
2333
45.5k
            if (TY_(IsLetter)(c))
2334
31.7k
                continue;
2335
2336
13.8k
            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
2337
13.8k
                                          TY_(tmbstrlen)(container->element)) == 0;
2338
13.8k
            if (matches && !nonested)
2339
1.16k
                nested++;
2340
2341
13.8k
            state = CDATA_INTERMEDIATE;
2342
13.8k
        }
2343
        /* '<' + '/' + Letter found */
2344
329k
        else if (state == CDATA_ENDTAG)
2345
329k
        {
2346
329k
            if (TY_(IsLetter)(c))
2347
315k
                continue;
2348
2349
14.0k
            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
2350
14.0k
                                          TY_(tmbstrlen)(container->element)) == 0;
2351
2352
14.0k
            if (isEmpty && !matches)
2353
2.20k
            {
2354
                /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
2355
2356
96.1k
                for (i = lexer->lexsize - 1; i >= start; --i)
2357
93.9k
                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
2358
2.20k
                TY_(UngetChar)('/', doc->docIn);
2359
2.20k
                TY_(UngetChar)('<', doc->docIn);
2360
2.20k
                break;
2361
2.20k
            }
2362
2363
11.7k
            if (matches && nested-- <= 0)
2364
3.20k
            {
2365
71.8k
                for (i = lexer->lexsize - 1; i >= start; --i)
2366
68.6k
                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
2367
3.20k
                TY_(UngetChar)('/', doc->docIn);
2368
3.20k
                TY_(UngetChar)('<', doc->docIn);
2369
3.20k
                lexer->lexsize -= (lexer->lexsize - start) + 2;
2370
3.20k
                break;
2371
3.20k
            }
2372
8.58k
            else if (lexer->lexbuf[start - 2] != '\\')
2373
8.32k
            {
2374
                /* if the end tag is not already escaped using backslash */
2375
8.32k
                SetLexerLocus( doc, lexer );
2376
8.32k
                lexer->columns -= 3;
2377
2378
                /*\ if javascript insert backslash before / 
2379
                 *  Issue #348 - Add option, escape-scripts, to skip
2380
                \*/
2381
8.32k
                if ((TY_(IsJavaScript)(container)) && cfgBool(doc, TidyEscapeScripts) &&
2382
5.84k
                    !TY_(IsHTML5Mode)(doc) )    /* Is #700 - This only applies to legacy html4 mode */
2383
3.90k
                {
2384
                    /* Issue #281 - only warn if adding the escape! */
2385
3.90k
                    TY_(Report)(doc, NULL, NULL, BAD_CDATA_CONTENT);
2386
2387
161k
                    for (i = lexer->lexsize; i > start-1; --i)
2388
158k
                        lexer->lexbuf[i] = lexer->lexbuf[i-1];
2389
2390
3.90k
                    lexer->lexbuf[start-1] = '\\';
2391
3.90k
                    lexer->lexsize++;
2392
3.90k
                }
2393
8.32k
            }
2394
8.58k
            state = CDATA_INTERMEDIATE;
2395
8.58k
        }
2396
51.3M
    }
2397
6.36k
    if (isEmpty)
2398
3.20k
        lexer->lexsize = lexer->txtstart = lexer->txtend;
2399
3.15k
    else
2400
3.15k
        lexer->txtend = lexer->lexsize;
2401
2402
6.36k
    if (c == EndOfStream)
2403
949
        TY_(Report)(doc, container, NULL, MISSING_ENDTAG_FOR );
2404
2405
6.36k
    return TY_(TextToken)(lexer);
2406
7.54k
}
2407
2408
/* Provide one level of undo: flag the lexer so that the next
** GetToken() call sees its `pushed` flag set.
*/
void TY_(UngetToken)( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;

    lexer->pushed = yes;
}
2412
2413
/* CondReturnTextNode: when the lexer holds pending text (txtend beyond
** txtstart), turn it into a text token, store it in lexer->token and
** return it from the enclosing function.  Expects a variable named
** `lexer` in scope at the point of use.  The `skip` argument is not
** used.  The debug build additionally logs the node through GTDBG.
*/
#if defined(ENABLE_DEBUG_LOG)
#  define CondReturnTextNode(doc, skip) \
            if (lexer->txtstart < lexer->txtend) { \
                Node *_node = TY_(TextToken)(lexer); \
                lexer->token = _node; \
                GTDBG(doc,"text_node",_node); \
                return _node; \
            }

#else
#  define CondReturnTextNode(doc, skip) \
            if (lexer->txtstart < lexer->txtend) \
            { \
                lexer->token = TY_(TextToken)(lexer); \
                return lexer->token; \
            }
#endif
2430
2431
/*
2432
  modes for GetToken()
2433
2434
  MixedContent   -- for elements which don't accept PCDATA
2435
  Preformatted   -- white space preserved as is
2436
  IgnoreMarkup   -- for CDATA elements such as script, style
2437
*/
2438
static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode );
2439
2440
/* Return the next token.
**
** Sources are tried in this order: a previously returned inline
** duplicate (itoken) or pushed-back token, an inline element to be
** re-inserted at the start of a block, raw CDATA content when `mode`
** is CdataContent, and finally the input stream.
*/
Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode )
{
    Lexer* lexer = doc->lexer;
    Node *node;

    if (lexer->pushed || lexer->itoken)
    {
        /* Deal with previously returned duplicate inline token */
        if (lexer->itoken)
        {
            if (lexer->pushed)
            {
                /* itoken rejected: hand the same one out again */
                lexer->pushed = no;
                node = lexer->itoken;
                GTDBG(doc,"lex-itoken", node);
                return node;
            }
            /* itoken has been accepted */
            lexer->itoken = NULL;
        }

        lexer->pushed = no;

        /* duplicate inlines in preference to pushed text nodes when appropriate */
        if (lexer->token->type == TextNode
            && (lexer->insert || lexer->inode))
        {
            lexer->itoken = TY_(InsertedToken)( doc );
            node = lexer->itoken;
            GTDBG(doc,"lex-inserted", node);
            return node;
        }

        node = lexer->token;
        GTDBG(doc,"lex-token", node);
        return node;
    }

    assert( !(lexer->pushed || lexer->itoken) );

    /* at start of block elements, unclosed inline
       elements are inserted into the token stream
       Issue #341 - Can NOT insert a token if NO istacksize
     */
    if ((lexer->insert || lexer->inode) && lexer->istacksize)
    {
        /*\ Issue #92: could fix by the following, but instead chose not to stack these 2
         *  if ( !(lexer->insert && (nodeIsINS(lexer->insert) || nodeIsDEL(lexer->insert))) ) {
        \*/
        lexer->token = TY_(InsertedToken)( doc );
        node = lexer->token;
        GTDBG(doc,"lex-inserted2", node);
        return node;
    }

    if (mode != CdataContent)
        return GetTokenFromStream( doc, mode );

    /* raw content of the current CDATA container */
    assert( lexer->parent != NULL );
    node = GetCDATA(doc, lexer->parent);
    GTDBG(doc,"lex-cdata", node);
    return node;
}
2503
2504
#if defined(ENABLE_DEBUG_LOG)
2505
/* Debug-build helper: write the given node name to the log via SPRTF. */
static void check_me(char *name)
{
    SPRTF( "Have node %s\n", name );
}
2509
#endif
2510
2511
static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
2512
7.72M
{
2513
7.72M
    Lexer* lexer = doc->lexer;
2514
7.72M
    uint c, lexdump, badcomment = 0;
2515
7.72M
    Bool isempty = no;
2516
7.72M
    AttVal *attributes = NULL;
2517
7.72M
    Node *node;
2518
7.72M
    Bool fixComments;
2519
    
2520
7.72M
    switch ( cfgAutoBool(doc, TidyFixComments) )
2521
7.72M
    {
2522
0
        case TidyYesState:
2523
0
            fixComments = yes;
2524
0
            break;
2525
2526
0
        case TidyNoState:
2527
0
            fixComments = no;
2528
0
            break;
2529
2530
7.72M
        default:
2531
7.72M
            fixComments = (TY_(HTMLVersion)(doc) & HT50) == 0;
2532
7.72M
            break;
2533
7.72M
    }
2534
2535
    /* Lexer->token must be set on return. Nullify it for safety. */
2536
7.72M
    lexer->token = NULL;
2537
2538
7.72M
    SetLexerLocus( doc, lexer );
2539
7.72M
    lexer->waswhite = no;
2540
2541
7.72M
    lexer->txtstart = lexer->txtend = lexer->lexsize;
2542
2543
719M
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
2544
716M
    {
2545
716M
        if (lexer->insertspace)
2546
3.23k
        {
2547
3.23k
            TY_(AddCharToLexer)(lexer, ' ');
2548
3.23k
            lexer->waswhite = yes;
2549
3.23k
            lexer->insertspace = no;
2550
3.23k
        }
2551
2552
716M
        if (c == 160 && (mode == Preformatted))
2553
857
            c = ' ';
2554
2555
716M
        TY_(AddCharToLexer)(lexer, c);
2556
2557
716M
        switch (lexer->state)
2558
716M
        {
2559
498M
            case LEX_CONTENT:  /* element content */
2560
2561
                /*
2562
                 Discard white space if appropriate. Its cheaper
2563
                 to do this here rather than in parser methods
2564
                 for elements that don't have mixed content.
2565
                */
2566
498M
                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace) 
2567
4.21M
                      && lexer->lexsize == lexer->txtstart + 1)
2568
3.73M
                {
2569
3.73M
                    --(lexer->lexsize);
2570
3.73M
                    lexer->waswhite = no;
2571
3.73M
                    SetLexerLocus( doc, lexer );
2572
3.73M
                    continue;
2573
3.73M
                }
2574
2575
494M
                if (c == '<')
2576
9.53M
                {
2577
9.53M
                    lexer->state = LEX_GT;
2578
9.53M
                    continue;
2579
9.53M
                }
2580
2581
485M
                if (TY_(IsWhite)(c))
2582
439M
                {
2583
                    /* was previous character white? */
2584
439M
                    if (lexer->waswhite)
2585
437M
                    {
2586
437M
                        if (mode != Preformatted && mode != IgnoreMarkup)
2587
8.35M
                        {
2588
8.35M
                            --(lexer->lexsize);
2589
8.35M
                            SetLexerLocus( doc, lexer );
2590
8.35M
                        }
2591
437M
                    }
2592
1.90M
                    else /* prev character wasn't white */
2593
1.90M
                    {
2594
1.90M
                        lexer->waswhite = yes;
2595
2596
1.90M
                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
2597
446k
                            ChangeChar(lexer, ' ');
2598
1.90M
                    }
2599
2600
439M
                    continue;
2601
439M
                }
2602
46.2M
                else if (c == '&' && mode != IgnoreMarkup)
2603
107k
                    ParseEntity( doc, mode );
2604
2605
                /* this is needed to avoid trimming trailing whitespace */
2606
46.2M
                if (mode == IgnoreWhitespace)
2607
100k
                    mode = MixedContent;
2608
2609
46.2M
                lexer->waswhite = no;
2610
46.2M
                continue;
2611
2612
9.53M
            case LEX_GT:  /* < */
2613
2614
                /* check for endtag */
2615
9.53M
                if (c == '/')
2616
250k
                {
2617
250k
                    if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2618
104
                    {
2619
104
                        TY_(UngetChar)(c, doc->docIn);
2620
104
                        continue;
2621
104
                    }
2622
2623
250k
                    TY_(AddCharToLexer)(lexer, c);
2624
2625
250k
                    if (TY_(IsLetter)(c) || (cfgBool(doc, TidyXmlTags) && TY_(IsXMLNamechar)(c)))
2626
202k
                    {
2627
202k
                        lexer->lexsize -= 3;
2628
202k
                        lexer->txtend = lexer->lexsize;
2629
202k
                        TY_(UngetChar)(c, doc->docIn);
2630
202k
                        lexer->state = LEX_ENDTAG;
2631
202k
                        lexer->lexbuf[lexer->lexsize] = '\0';  /* debug */
2632
202k
                        doc->docIn->curcol -= 2;
2633
2634
                        /* if some text before the </ return it now */
2635
202k
                        if (lexer->txtend > lexer->txtstart)
2636
22.2k
                        {
2637
                            /* trim space character before end tag */
2638
22.2k
                            if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ')
2639
1.07k
                            {
2640
1.07k
                                lexer->lexsize -= 1;
2641
1.07k
                                lexer->txtend = lexer->lexsize;
2642
1.07k
                            }
2643
22.2k
                            lexer->token = TY_(TextToken)(lexer);
2644
22.2k
                            node = lexer->token;
2645
22.2k
                            GTDBG(doc,"text", node);
2646
22.2k
                            return node;
2647
22.2k
                        }
2648
2649
180k
                        continue;       /* no text so keep going */
2650
202k
                    }
2651
2652
                    /* otherwise treat as CDATA */
2653
47.8k
                    lexer->waswhite = no;
2654
47.8k
                    lexer->state = LEX_CONTENT;
2655
47.8k
                    continue;
2656
250k
                }
2657
2658
9.28M
                if (mode == IgnoreMarkup)
2659
0
                {
2660
                    /* otherwise treat as CDATA */
2661
0
                    lexer->waswhite = no;
2662
0
                    lexer->state = LEX_CONTENT;
2663
0
                    continue;
2664
0
                }
2665
2666
                /*
2667
                   look out for comments, doctype or marked sections
2668
                   this isn't quite right, but its getting there ...
2669
                */
2670
9.28M
                if (c == '!')
2671
1.16M
                {
2672
1.16M
                    c = TY_(ReadChar)(doc->docIn);
2673
2674
1.16M
                    if (c == '-')
2675
676k
                    {
2676
676k
                        c = TY_(ReadChar)(doc->docIn);
2677
2678
676k
                        if (c == '-')
2679
674k
                        {
2680
674k
                            lexer->state = LEX_COMMENT;  /* comment */
2681
674k
                            lexer->lexsize -= 2;
2682
674k
                            lexer->txtend = lexer->lexsize;
2683
2684
674k
                            CondReturnTextNode(doc, 4)
2685
2686
662k
                            lexer->txtstart = lexer->lexsize;
2687
662k
                            continue;
2688
674k
                        }
2689
2690
                        /*
2691
                           TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_DROPPING );
2692
                           Warning now done later - see issue #487
2693
                         */
2694
676k
                    }
2695
483k
                    else if (c == 'd' || c == 'D')
2696
158k
                    {
2697
                        /* todo: check for complete "<!DOCTYPE" not just <!D */
2698
2699
158k
                        uint skip = 0;
2700
2701
158k
                        lexer->state = LEX_DOCTYPE; /* doctype */
2702
158k
                        lexer->lexsize -= 2;
2703
158k
                        lexer->txtend = lexer->lexsize;
2704
158k
                        mode = IgnoreWhitespace;
2705
2706
                        /* skip until white space or '>' */
2707
2708
158k
                        for (;;)
2709
312k
                        {
2710
312k
                            c = TY_(ReadChar)(doc->docIn);
2711
312k
                            ++skip;
2712
2713
312k
                            if (c == EndOfStream || c == '>')
2714
82.5k
                            {
2715
82.5k
                                TY_(UngetChar)(c, doc->docIn);
2716
82.5k
                                break;
2717
82.5k
                            }
2718
2719
2720
229k
                            if (!TY_(IsWhite)(c))
2721
154k
                                continue;
2722
2723
                            /* and skip to end of whitespace */
2724
2725
75.6k
                            for (;;)
2726
814k
                            {
2727
814k
                                c = TY_(ReadChar)(doc->docIn);
2728
814k
                                ++skip;
2729
2730
814k
                                if (c == EndOfStream || c == '>')
2731
2.33k
                                {
2732
2.33k
                                    TY_(UngetChar)(c, doc->docIn);
2733
2.33k
                                    break;
2734
2.33k
                                }
2735
2736
2737
811k
                                if (TY_(IsWhite)(c))
2738
738k
                                    continue;
2739
2740
73.2k
                                TY_(UngetChar)(c, doc->docIn);
2741
73.2k
                                break;
2742
811k
                            }
2743
2744
75.6k
                            break;
2745
229k
                        }
2746
2747
158k
                        CondReturnTextNode(doc, (skip + 3))
2748
2749
134k
                        lexer->txtstart = lexer->lexsize;
2750
134k
                        continue;
2751
158k
                    }
2752
325k
                    else if (c == '[')
2753
316k
                    {
2754
                        /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
2755
316k
                        lexer->lexsize -= 2;
2756
316k
                        lexer->state = LEX_SECTION;
2757
316k
                        lexer->txtend = lexer->lexsize;
2758
2759
316k
                        CondReturnTextNode(doc, 2)
2760
2761
16.5k
                        lexer->txtstart = lexer->lexsize;
2762
16.5k
                        continue;
2763
316k
                    }
2764
2765
2766
                    /*
2767
                       We only print this message if there's a missing
2768
                       starting hyphen; this comment will be dropped.
2769
                     */
2770
10.9k
                    TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_DROPPING ); /* Is. #487 */
2771
2772
                    /* else swallow characters up to and including next '>' */
2773
429k
                    while ((c = TY_(ReadChar)(doc->docIn)) != '>')
2774
418k
                    {
2775
418k
                        if (c == EndOfStream)
2776
273
                        {
2777
273
                            TY_(UngetChar)(c, doc->docIn);
2778
273
                            break;
2779
273
                        }
2780
418k
                    }
2781
2782
10.9k
                    lexer->lexsize -= 2;
2783
10.9k
                    lexer->lexbuf[lexer->lexsize] = '\0';
2784
10.9k
                    lexer->state = LEX_CONTENT;
2785
10.9k
                    continue;
2786
1.16M
                }
2787
2788
                /*
2789
                   processing instructions
2790
                */
2791
2792
8.12M
                if (c == '?')
2793
183k
                {
2794
183k
                    lexer->lexsize -= 2;
2795
183k
                    lexer->state = LEX_PROCINSTR;
2796
183k
                    lexer->txtend = lexer->lexsize;
2797
2798
183k
                    CondReturnTextNode(doc, 2)
2799
2800
62.1k
                    lexer->txtstart = lexer->lexsize;
2801
62.1k
                    continue;
2802
183k
                }
2803
2804
                /* Microsoft ASP's e.g. <% ... server-code ... %> */
2805
7.94M
                if (c == '%')
2806
22.8k
                {
2807
22.8k
                    lexer->lexsize -= 2;
2808
22.8k
                    lexer->state = LEX_ASP;
2809
22.8k
                    lexer->txtend = lexer->lexsize;
2810
2811
22.8k
                    CondReturnTextNode(doc, 2)
2812
2813
21.2k
                    lexer->txtstart = lexer->lexsize;
2814
21.2k
                    continue;
2815
22.8k
                }
2816
2817
                /* Netscapes JSTE e.g. <# ... server-code ... #> */
2818
7.91M
                if (c == '#')
2819
202k
                {
2820
202k
                    lexer->lexsize -= 2;
2821
202k
                    lexer->state = LEX_JSTE;
2822
202k
                    lexer->txtend = lexer->lexsize;
2823
2824
202k
                    CondReturnTextNode(doc, 2)
2825
2826
200k
                    lexer->txtstart = lexer->lexsize;
2827
200k
                    continue;
2828
202k
                }
2829
2830
                /* check for start tag */
2831
7.71M
                if (TY_(IsLetter)(c) || (cfgBool(doc, TidyXmlTags) && TY_(IsXMLNamechar)(c)))
2832
2.37M
                {
2833
2.37M
                    TY_(UngetChar)(c, doc->docIn);     /* push back letter */
2834
2.37M
                    TY_(UngetChar)('<', doc->docIn);
2835
2.37M
                    lexer->lexsize -= 2;      /* discard "<" + letter */
2836
2.37M
                    lexer->txtend = lexer->lexsize;
2837
2.37M
                    lexer->state = LEX_STARTTAG;         /* ready to read tag name */
2838
2839
2.37M
                    CondReturnTextNode(doc, 2)
2840
2841
                    /* lexer->txtstart = lexer->lexsize; missing here? */
2842
2.11M
                    continue;       /* no text so keep going */
2843
2.37M
                }
2844
2845
                /* otherwise treat as CDATA */
2846
                /* fix for bug 762102 (486) */
2847
                /* Issue #384 - Fix skipping parsing character, particularly '<<' */
2848
5.34M
                TY_(UngetChar)(c, doc->docIn);
2849
5.34M
                lexer->lexsize -= 1;
2850
5.34M
                lexer->state = LEX_CONTENT;
2851
5.34M
                lexer->waswhite = no;
2852
5.34M
                continue;
2853
2854
207k
            case LEX_ENDTAG:  /* </letter */
2855
207k
                lexer->txtstart = lexer->lexsize - 1;
2856
207k
                doc->docIn->curcol += 2;
2857
207k
                c = ParseTagName( doc );
2858
207k
                lexer->token = TagToken( doc, EndTag );  /* create endtag token */
2859
207k
                lexer->lexsize = lexer->txtend = lexer->txtstart;
2860
2861
                /* skip to '>' */
2862
710k
                while ( c != '>' && c != EndOfStream )
2863
502k
                {
2864
502k
                    c = TY_(ReadChar)(doc->docIn);
2865
502k
                }
2866
2867
207k
                if (c == EndOfStream)
2868
5.11k
                {
2869
5.11k
                    TY_(FreeNode)( doc, lexer->token );
2870
5.11k
                    continue;
2871
5.11k
                }
2872
2873
202k
                lexer->state = LEX_CONTENT;
2874
202k
                lexer->waswhite = no;
2875
202k
                node = lexer->token;
2876
202k
                GTDBG(doc,"endtag", node);
2877
202k
                return node;  /* the endtag token */
2878
2879
2.37M
            case LEX_STARTTAG: /* first letter of tagname */
2880
2.37M
                c = TY_(ReadChar)(doc->docIn);
2881
2.37M
                ChangeChar(lexer, (tmbchar)c);
2882
2.37M
                lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */
2883
2.37M
                c = ParseTagName( doc );
2884
2.37M
                isempty = no;
2885
2.37M
                attributes = NULL;
2886
2.37M
                lexer->token = TagToken( doc, StartTag ); /* [i_a]2 'isempty' is always false, thanks to code 2 lines above */
2887
2888
                /* parse attributes, consuming closing ">" */
2889
2.37M
                if (c != '>')
2890
914k
                {
2891
914k
                    if (c == '/')
2892
29.2k
                        TY_(UngetChar)(c, doc->docIn);
2893
2894
914k
                    attributes = ParseAttrs( doc, &isempty );
2895
914k
                }
2896
2897
2.37M
                if (isempty)
2898
55.3k
                    lexer->token->type = StartEndTag;
2899
2900
2.37M
                lexer->token->attributes = attributes;
2901
2.37M
                lexer->lexsize = lexer->txtend = lexer->txtstart;
2902
2903
                /* swallow newline following start tag */
2904
                /* special check needed for CRLF sequence */
2905
                /* this doesn't apply to empty elements */
2906
                /* nor to preformatted content that needs escaping */
2907
                /*\
2908
                 * Issue #230: Need to KEEP this user newline character in certain 
2909
                 * circumstances, certainly for <pre>, <script>, <style>...
2910
                 * Any others?
2911
                 * Issue #238: maybe **ONLY** for <pre>
2912
                \*/
2913
2.37M
                if ( nodeIsPRE(lexer->token) )
2914
65.4k
                {
2915
65.4k
                    mode = Preformatted;
2916
65.4k
                }
2917
2918
2.37M
                if ((mode != Preformatted && ExpectsContent(lexer->token))
2919
2.37M
                    || nodeIsBR(lexer->token) || nodeIsHR(lexer->token))
2920
2.11M
                {
2921
2.11M
                    c = TY_(ReadChar)(doc->docIn);
2922
2923
2.11M
                    if ((c == '\n') && (mode != IgnoreWhitespace)) /* Issue #329 - Can NOT afford to lose this newline */
2924
46.4k
                        TY_(UngetChar)(c, doc->docIn);  /* Issue #329 - make sure the newline is maintained for now */
2925
2.06M
                    else if (c != '\n' && c != '\f')
2926
2.06M
                        TY_(UngetChar)(c, doc->docIn);
2927
2928
2.11M
                    lexer->waswhite = yes;  /* to swallow leading whitespace */
2929
2.11M
                }
2930
260k
                else
2931
260k
                    lexer->waswhite = no;
2932
2933
2.37M
                lexer->state = LEX_CONTENT;
2934
2.37M
                if (lexer->token->tag == NULL) 
2935
314k
                {
2936
314k
                    if (mode != OtherNamespace) /* [i_a]2 only issue warning if NOT 'OtherNamespace', and tag null */
2937
283k
                    {
2938
                        /* Special case for HTML5 unknown tags: if it looks 
2939
                           like an autonomous custom tag, then emit a variation
2940
                           of the standard message. We don't want to do this
2941
                           for older HTML, because it's not truly supported
2942
                           by the standard, although Tidy will allow it. */
2943
283k
                        if ( (doc->lexer->doctype & VERS_HTML5) > 0 && TY_(elementIsAutonomousCustomFormat)( lexer->token->element ) )
2944
505
                            TY_(Report)( doc, NULL, lexer->token, UNKNOWN_ELEMENT_LOOKS_CUSTOM );
2945
283k
                        else
2946
283k
                            TY_(Report)( doc, NULL, lexer->token, UNKNOWN_ELEMENT );
2947
283k
                    }
2948
314k
                }
2949
2.05M
                else if ( !cfgBool(doc, TidyXmlTags) )
2950
2.02M
                {
2951
2.02M
                    TY_(ConstrainVersion)( doc, lexer->token->tag->versions );
2952
2.02M
                    TY_(RepairDuplicateAttributes)( doc, lexer->token, no );
2953
2.02M
                } else 
2954
32.0k
                    TY_(RepairDuplicateAttributes)( doc, lexer->token, yes );
2955
2.37M
                node = lexer->token;
2956
2.37M
                GTDBG(doc,"starttag", node);
2957
2.37M
                return node;  /* return start tag */
2958
2959
6.14M
            case LEX_COMMENT:  /* seen <!-- so look for --> */
2960
2961
6.14M
                if (c != '-')
2962
4.97M
                    continue;
2963
2964
1.16M
                c = TY_(ReadChar)(doc->docIn);
2965
2966
                /* Fix hyphens at beginning of tag */
2967
1.16M
                if ( c != '-' && fixComments && lexer->lexsize - lexer->txtstart == 1 )
2968
4.43k
                {
2969
4.43k
                    lexer->lexbuf[lexer->lexsize - 1] = '=';
2970
4.43k
                }
2971
2972
1.16M
                TY_(AddCharToLexer)(lexer, c);
2973
2974
1.16M
                if (c != '-')
2975
347k
                    continue;
2976
2977
2.50M
            end_comment:
2978
2.50M
                c = TY_(ReadChar)(doc->docIn);
2979
2980
2.50M
                if (c == '>')
2981
673k
                {
2982
673k
                    if (badcomment)
2983
20.3k
                    {
2984
                        /*
2985
                           We've got bad comments that we either fixed or
2986
                           ignored; provide proper user feedback based on
2987
                           doctype and whether or not we fixed them.
2988
                         */
2989
20.3k
                        if ( (TY_(HTMLVersion)(doc) & HT50) )
2990
0
                        {
2991
0
                            if ( fixComments )
2992
0
                                TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT );
2993
                            /* Otherwise for HTML5, it's safe to ignore. */
2994
0
                        }
2995
20.3k
                        else
2996
20.3k
                        {
2997
20.3k
                            if ( fixComments )
2998
20.3k
                                TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT );
2999
0
                            else
3000
0
                                TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_WARN );
3001
20.3k
                        }
3002
20.3k
                    }
3003
3004
                    /* do not store closing -- in lexbuf */
3005
673k
                    lexer->lexsize -= 2;
3006
673k
                    lexer->txtend = lexer->lexsize;
3007
673k
                    lexer->lexbuf[lexer->lexsize] = '\0';
3008
673k
                    lexer->state = LEX_CONTENT;
3009
673k
                    lexer->waswhite = no;
3010
673k
                    lexer->token = CommentToken(doc);
3011
3012
                    /* now look for a line break */
3013
3014
673k
                    c = TY_(ReadChar)(doc->docIn);
3015
3016
673k
                    if (c == '\n')
3017
972
                        lexer->token->linebreak = yes;
3018
673k
                    else
3019
673k
                        TY_(UngetChar)(c, doc->docIn);
3020
3021
673k
                    node = lexer->token;
3022
673k
                    GTDBG(doc,"comment", node);
3023
673k
                    return node;
3024
673k
                }
3025
3026
                /* note position of first such error in the comment */
3027
1.82M
                if (!badcomment)
3028
20.5k
                {
3029
20.5k
                    SetLexerLocus( doc, lexer );
3030
20.5k
                    lexer->columns -= 3;
3031
20.5k
                }
3032
3033
1.82M
                badcomment++;
3034
3035
                /* fix hyphens in the middle */
3036
1.82M
                if ( fixComments )
3037
1.82M
                    lexer->lexbuf[lexer->lexsize - 2] = '=';
3038
3039
                /* if '-' then look for '>' to end the comment */
3040
1.82M
                if (c == '-')
3041
1.68M
                {
3042
1.68M
                    TY_(AddCharToLexer)(lexer, c);
3043
1.68M
                    goto end_comment;
3044
1.68M
                }
3045
3046
                /* fix hyphens end, and continue to look for --> */
3047
146k
                if ( fixComments )
3048
146k
                    lexer->lexbuf[lexer->lexsize - 1] = '=';
3049
3050
                /* http://tidy.sf.net/bug/1266647 */
3051
146k
                TY_(AddCharToLexer)(lexer, c);
3052
3053
146k
                continue; 
3054
3055
157k
            case LEX_DOCTYPE:  /* seen <!d so look for '>' munging whitespace */
3056
3057
                /* use ParseDocTypeDecl() to tokenize doctype declaration */
3058
157k
                TY_(UngetChar)(c, doc->docIn);
3059
157k
                lexer->lexsize -= 1;
3060
157k
                lexer->token = ParseDocTypeDecl(doc);
3061
3062
157k
                lexer->txtend = lexer->lexsize;
3063
157k
                lexer->lexbuf[lexer->lexsize] = '\0';
3064
157k
                lexer->state = LEX_CONTENT;
3065
157k
                lexer->waswhite = no;
3066
3067
                /* make a note of the version named by the 1st doctype */
3068
157k
                if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags))
3069
28.5k
                {
3070
28.5k
                    lexer->doctype = FindGivenVersion(doc, lexer->token);
3071
28.5k
                    if (lexer->doctype != VERS_HTML5)
3072
28.0k
                    {
3073
                        /*\
3074
                         *  Back to legacy HTML4 mode for -
3075
                         *  Issue #167 & #169 - TidyTag_A
3076
                         *  Issue #196        - TidyTag_CAPTION
3077
                         *  others?
3078
                        \*/ 
3079
28.0k
                        TY_(AdjustTags)(doc); /* Dynamically modify the tags table  */
3080
28.0k
                    }
3081
28.5k
                }
3082
157k
                node = lexer->token;
3083
157k
                GTDBG(doc,"doctype", node);
3084
157k
                return node;
3085
3086
92.0M
            case LEX_PROCINSTR:  /* seen <? so look for '>' */
3087
                /* check for PHP preprocessor instructions <?php ... ?> */
3088
3089
92.0M
                if  (lexer->lexsize - lexer->txtstart == 3)
3090
126k
                {
3091
126k
                    if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "php", 3) == 0)
3092
574
                    {
3093
574
                        lexer->state = LEX_PHP;
3094
574
                        continue;
3095
574
                    }
3096
126k
                }
3097
3098
92.0M
                if  (lexer->lexsize - lexer->txtstart == 4)
3099
121k
                {
3100
121k
                    if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 &&
3101
23.0k
                        TY_(IsWhite)(lexer->lexbuf[lexer->txtstart + 3]))
3102
20.8k
                    {
3103
20.8k
                        lexer->state = LEX_XMLDECL;
3104
20.8k
                        attributes = NULL;
3105
20.8k
                        continue;
3106
20.8k
                    }
3107
121k
                }
3108
3109
92.0M
                if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */
3110
46.3M
                {
3111
46.3M
                    if (c != '?')
3112
46.3M
                        continue;
3113
3114
                    /* now look for '>' */
3115
4.74k
                    c = TY_(ReadChar)(doc->docIn);
3116
3117
4.74k
                    if (c == EndOfStream)
3118
5
                    {
3119
5
                        TY_(Report)(doc, NULL, NULL, UNEXPECTED_END_OF_FILE );
3120
5
                        TY_(UngetChar)(c, doc->docIn);
3121
5
                        continue;
3122
5
                    }
3123
3124
4.74k
                    TY_(AddCharToLexer)(lexer, c);
3125
4.74k
                }
3126
3127
3128
45.6M
                if (c != '>')
3129
45.5M
                    continue;
3130
3131
162k
                lexer->lexsize -= 1;
3132
3133
162k
                if (lexer->lexsize)
3134
152k
                {
3135
152k
                    uint i;
3136
152k
                    Bool closed;
3137
3138
1.27M
                    for (i = 0; i < lexer->lexsize - lexer->txtstart &&
3139
1.12M
                        !TY_(IsWhite)(lexer->lexbuf[i + lexer->txtstart]); ++i)
3140
1.11M
                        /**/;
3141
3142
152k
                    closed = lexer->lexbuf[lexer->lexsize - 1] == '?';
3143
3144
152k
                    if (closed)
3145
98.9k
                        lexer->lexsize -= 1;
3146
3147
152k
                    lexer->txtstart += i;
3148
152k
                    lexer->txtend = lexer->lexsize;
3149
152k
                    lexer->lexbuf[lexer->lexsize] = '\0';
3150
3151
152k
                    lexer->token = PIToken(doc);
3152
152k
                    lexer->token->closed = closed;
3153
152k
                    lexer->token->element = TY_(tmbstrndup)(doc->allocator,
3154
152k
                                                            lexer->lexbuf +
3155
152k
                                                            lexer->txtstart - i, i);
3156
152k
                }
3157
9.18k
                else
3158
9.18k
                {
3159
9.18k
                    lexer->txtend = lexer->lexsize;
3160
9.18k
                    lexer->lexbuf[lexer->lexsize] = '\0';
3161
9.18k
                    lexer->token = PIToken(doc);
3162
9.18k
                }
3163
3164
162k
                lexer->state = LEX_CONTENT;
3165
162k
                lexer->waswhite = no;
3166
162k
                node = lexer->token;
3167
162k
                GTDBG(doc,"procinstr", node);
3168
162k
                return node;
3169
3170
6.71M
            case LEX_ASP:  /* seen <% so look for "%>" */
3171
6.71M
                if (c != '%')
3172
6.68M
                    continue;
3173
3174
                /* now look for '>' */
3175
24.4k
                c = TY_(ReadChar)(doc->docIn);
3176
3177
3178
24.4k
                if (c != '>')
3179
1.77k
                {
3180
1.77k
                    TY_(UngetChar)(c, doc->docIn);
3181
1.77k
                    continue;
3182
1.77k
                }
3183
3184
22.6k
                lexer->lexsize -= 1;
3185
22.6k
                lexer->txtend = lexer->lexsize;
3186
22.6k
                lexer->lexbuf[lexer->lexsize] = '\0';
3187
22.6k
                lexer->state = LEX_CONTENT;
3188
22.6k
                lexer->waswhite = no;
3189
22.6k
                lexer->token = AspToken(doc);
3190
22.6k
                node = lexer->token;
3191
22.6k
                GTDBG(doc,"ASP", node);
3192
22.6k
                return node;  /* the endtag token */
3193
3194
3195
3196
778k
            case LEX_JSTE:  /* seen <# so look for "#>" */
3197
778k
                if (c != '#')
3198
514k
                    continue;
3199
3200
                /* now look for '>' */
3201
264k
                c = TY_(ReadChar)(doc->docIn);
3202
3203
3204
264k
                if (c != '>')
3205
61.4k
                {
3206
61.4k
                    TY_(UngetChar)(c, doc->docIn);
3207
61.4k
                    continue;
3208
61.4k
                }
3209
3210
202k
                lexer->lexsize -= 1;
3211
202k
                lexer->txtend = lexer->lexsize;
3212
202k
                lexer->lexbuf[lexer->lexsize] = '\0';
3213
202k
                lexer->state = LEX_CONTENT;
3214
202k
                lexer->waswhite = no;
3215
202k
                lexer->token = JsteToken(doc);
3216
202k
                node = lexer->token;
3217
202k
                GTDBG(doc,"JSTE", node);
3218
202k
                return node;  /* the JSTE token */
3219
3220
3221
716k
            case LEX_PHP: /* seen "<?php" so look for "?>" */
3222
716k
                if (c != '?')
3223
714k
                    continue;
3224
3225
                /* now look for '>' */
3226
1.37k
                c = TY_(ReadChar)(doc->docIn);
3227
3228
1.37k
                if (c != '>')
3229
867
                {
3230
867
                    TY_(UngetChar)(c, doc->docIn);
3231
867
                    continue;
3232
867
                }
3233
3234
505
                lexer->lexsize -= 1;
3235
505
                lexer->txtend = lexer->lexsize;
3236
505
                lexer->lexbuf[lexer->lexsize] = '\0';
3237
505
                lexer->state = LEX_CONTENT;
3238
505
                lexer->waswhite = no;
3239
505
                lexer->token = PhpToken(doc);
3240
505
                node = lexer->token;
3241
505
                GTDBG(doc,"PHP", node);
3242
505
                return node;  /* the PHP token */
3243
3244
52.6k
            case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */
3245
3246
52.6k
                if (TY_(IsWhite)(c) && c != '?')
3247
18.6k
                    continue;
3248
3249
                /* get pseudo-attribute */
3250
33.9k
                if (c != '?')
3251
33.0k
                {
3252
33.0k
                    tmbstr name;
3253
33.0k
                    Node *asp, *php;
3254
33.0k
                    AttVal *av = NULL;
3255
33.0k
                    int pdelim = 0;
3256
33.0k
                    isempty = no;
3257
3258
33.0k
                    TY_(UngetChar)(c, doc->docIn);
3259
3260
33.0k
                    name = ParseAttribute( doc, &isempty, &asp, &php );
3261
3262
33.0k
                    if (!name)
3263
17.9k
                    {
3264
                        /* check if attributes are created by ASP markup */
3265
17.9k
                        if (asp)
3266
11.0k
                        {
3267
11.0k
                            av = TY_(NewAttribute)(doc);
3268
11.0k
                            av->asp = asp;
3269
11.0k
                            AddAttrToList( &attributes, av ); 
3270
11.0k
                        }
3271
3272
                        /* check if attributes are created by PHP markup */
3273
17.9k
                        if (php)
3274
3.54k
                        {
3275
3.54k
                            av = TY_(NewAttribute)(doc);
3276
3.54k
                            av->php = php;
3277
3.54k
                            AddAttrToList( &attributes, av ); 
3278
3.54k
                        }
3279
                      
3280
                        /* fix for http://tidy.sf.net/bug/788031 */
3281
17.9k
                        lexer->lexsize -= 1;
3282
17.9k
                        lexer->txtend = lexer->txtstart;
3283
17.9k
                        lexer->lexbuf[lexer->txtend] = '\0';
3284
17.9k
                        lexer->state = LEX_CONTENT;
3285
17.9k
                        lexer->waswhite = no;
3286
17.9k
                        lexer->token = XmlDeclToken(doc);
3287
17.9k
                        lexer->token->attributes = attributes;
3288
17.9k
                        node = lexer->token;
3289
17.9k
                        GTDBG(doc,"xml", node);
3290
17.9k
                        return node;  /* the xml token */
3291
17.9k
                    }
3292
3293
15.0k
                    av = TY_(NewAttribute)(doc);
3294
15.0k
                    av->attribute = name;
3295
15.0k
                    av->value = ParseValue( doc, name, yes, &isempty, &pdelim );
3296
15.0k
                    av->delim = pdelim;
3297
15.0k
                    av->dict = TY_(FindAttribute)( doc, av );
3298
3299
15.0k
                    AddAttrToList( &attributes, av );
3300
                    /* continue; */
3301
15.0k
                }
3302
3303
                /* now look for '>' */
3304
15.9k
                c = TY_(ReadChar)(doc->docIn);
3305
3306
15.9k
                if (c != '>')
3307
13.2k
                {
3308
13.2k
                    TY_(UngetChar)(c, doc->docIn);
3309
13.2k
                    continue;
3310
13.2k
                }
3311
2.69k
                lexer->lexsize -= 1;
3312
2.69k
                lexer->txtend = lexer->txtstart;
3313
2.69k
                lexer->lexbuf[lexer->txtend] = '\0';
3314
2.69k
                lexer->state = LEX_CONTENT;
3315
2.69k
                lexer->waswhite = no;
3316
2.69k
                lexer->token = XmlDeclToken(doc);
3317
2.69k
                lexer->token->attributes = attributes;
3318
2.69k
                node = lexer->token;
3319
2.69k
                GTDBG(doc,"XML", node);
3320
2.69k
                return node;  /* the XML token */
3321
3322
98.6M
            case LEX_SECTION: /* seen "<![" so look for "]>" */
3323
98.6M
                if (c == '[')
3324
293k
                {
3325
293k
                    if (lexer->lexsize == (lexer->txtstart + 6) &&
3326
13.4k
                        TY_(tmbstrncmp)(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0)
3327
1.35k
                    {
3328
1.35k
                        lexer->state = LEX_CDATA;
3329
1.35k
                        lexer->lexsize -= 6;
3330
1.35k
                        continue;
3331
1.35k
                    }
3332
293k
                }
3333
3334
98.6M
                if (c == '>')
3335
307k
                {
3336
                    /* Is. #462 - reached '>' before ']' */
3337
307k
                    TY_(UngetChar)(c, doc->docIn);
3338
98.3M
                } else if (c != ']')
3339
98.3M
                    continue;
3340
3341
                /* now look for '>' */
3342
329k
                c = TY_(ReadChar)(doc->docIn);
3343
3344
329k
                lexdump = 1;
3345
329k
                if (c != '>')
3346
15.3k
                {
3347
                    /* Issue #153 - can also be ]'-->' */
3348
15.3k
                    if (c == '-') 
3349
2.14k
                    {
3350
2.14k
                        c = TY_(ReadChar)(doc->docIn);
3351
2.14k
                        if (c == '-')
3352
1.49k
                        {
3353
1.49k
                            c = TY_(ReadChar)(doc->docIn);
3354
1.49k
                            if (c != '>')
3355
637
                            {
3356
637
                                TY_(UngetChar)(c, doc->docIn);
3357
637
                                TY_(UngetChar)('-', doc->docIn);
3358
637
                                TY_(UngetChar)('-', doc->docIn);
3359
637
                                continue;
3360
637
                            }
3361
                            /* this failed!
3362
                               TY_(AddCharToLexer)(lexer, '-'); TY_(AddCharToLexer)(lexer, '-'); lexdump = 0; 
3363
                               got output <![endif]--]> - needs further fix in pprint section output
3364
                             */
3365
1.49k
                        }
3366
654
                        else
3367
654
                        {
3368
654
                            TY_(UngetChar)(c, doc->docIn);
3369
654
                            TY_(UngetChar)('-', doc->docIn);
3370
654
                            continue;
3371
654
                        }
3372
2.14k
                    } 
3373
13.1k
                    else 
3374
13.1k
                    {
3375
13.1k
                        TY_(UngetChar)(c, doc->docIn);
3376
13.1k
                        continue;
3377
13.1k
                    }
3378
15.3k
                }
3379
 
3380
315k
                lexer->lexsize -= lexdump;
3381
315k
                lexer->txtend = lexer->lexsize;
3382
315k
                lexer->lexbuf[lexer->lexsize] = '\0';
3383
315k
                lexer->state = LEX_CONTENT;
3384
315k
                lexer->waswhite = no;
3385
315k
                lexer->token = SectionToken(doc);
3386
315k
                node = lexer->token;
3387
315k
                GTDBG(doc,"SECTION", node);
3388
315k
                return node;  /* the SECTION token */
3389
3390
399k
            case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
3391
399k
                if (c != ']')
3392
397k
                    continue;
3393
3394
                /* now look for ']' */
3395
2.35k
                c = TY_(ReadChar)(doc->docIn);
3396
3397
2.35k
                if (c != ']')
3398
597
                {
3399
597
                    TY_(UngetChar)(c, doc->docIn);
3400
597
                    continue;
3401
597
                }
3402
3403
                /* now look for '>' */
3404
1.75k
                c = TY_(ReadChar)(doc->docIn);
3405
3406
1.75k
                if (c != '>')
3407
492
                {
3408
492
                    TY_(UngetChar)(c, doc->docIn);
3409
492
                    TY_(UngetChar)(']', doc->docIn);
3410
492
                    continue;
3411
492
                }
3412
3413
1.26k
                lexer->lexsize -= 1;
3414
1.26k
                lexer->txtend = lexer->lexsize;
3415
1.26k
                lexer->lexbuf[lexer->lexsize] = '\0';
3416
1.26k
                lexer->state = LEX_CONTENT;
3417
1.26k
                lexer->waswhite = no;
3418
1.26k
                lexer->token = CDATAToken(doc);
3419
1.26k
                node = lexer->token;
3420
1.26k
                GTDBG(doc,"CDATA", node);
3421
1.26k
                return node;  /* the CDATA token */
3422
716M
        }
3423
716M
    }
3424
3425
2.85M
    if (lexer->state == LEX_CONTENT)  /* text string */
3426
2.43M
    {
3427
2.43M
        lexer->txtend = lexer->lexsize;
3428
3429
2.43M
        if (lexer->txtend > lexer->txtstart)
3430
7.68k
        {
3431
7.68k
            TY_(UngetChar)(c, doc->docIn);
3432
3433
7.68k
            if (lexer->lexbuf[lexer->lexsize - 1] == ' ')
3434
542
            {
3435
542
                lexer->lexsize -= 1;
3436
542
                lexer->txtend = lexer->lexsize;
3437
542
            }
3438
7.68k
            lexer->token = TY_(TextToken)(lexer);
3439
7.68k
            node = lexer->token;
3440
7.68k
            GTDBG(doc,"textstring", node);
3441
7.68k
            return node;  /* the textstring token */
3442
7.68k
        }
3443
2.43M
    }
3444
412k
    else if (lexer->state == LEX_COMMENT) /* comment */
3445
301
    {
3446
301
        if (c == EndOfStream)
3447
301
        {
3448
            /* We print this if we reached end of the stream mid-comment. */
3449
301
            TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_EOS );
3450
301
        }
3451
3452
301
        lexer->txtend = lexer->lexsize;
3453
301
        lexer->lexbuf[lexer->lexsize] = '\0';
3454
301
        lexer->state = LEX_CONTENT;
3455
301
        lexer->waswhite = no;
3456
301
        lexer->token = CommentToken(doc);
3457
301
        node = lexer->token;
3458
301
        GTDBG(doc,"COMMENT", node);
3459
301
        return node;  /* the COMMENT token */
3460
301
    }
3461
3462
    /* check attributes before return NULL */
3463
2.84M
    if (attributes)
3464
122
        TY_(FreeAttribute)( doc, attributes );
3465
3466
2.84M
    DEBUG_LOG(SPRTF("Returning NULL...\n"));
3467
2.84M
    return NULL;
3468
2.85M
}
3469
3470
/*
 OR the character-class bits in `code` into the lexmap entry of
 every character that appears in the NUL-terminated string `str`.
*/
static void MapStr( ctmbstr str, uint code )
{
    ctmbstr cursor;

    for ( cursor = str; *cursor; ++cursor )
    {
        /* index the table by the character's value taken as a byte */
        uint slot = (byte) *cursor;
        lexmap[slot] |= code;
    }
}
3478
3479
/*
 Populate lexmap with the character classes used by the lexer.
 Each MapStr call ORs the given class bits into the entries of the
 listed characters, so a character may end up in several classes.
*/
void TY_(InitMap)(void)
3480
39.3k
{
3481
39.3k
    /* line terminators also count as white space */
    MapStr("\r\n\f", newline|white);
3482
39.3k
    MapStr(" \t", white);
3483
39.3k
    /* punctuation that is accepted inside names */
    MapStr("-.:_", namechar);
3484
39.3k
    MapStr("0123456789", digit|digithex|namechar);
3485
39.3k
    MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar);
3486
39.3k
    MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);
3487
39.3k
    /* a-f / A-F additionally act as hexadecimal digits */
    MapStr("abcdefABCDEF", digithex);
3488
39.3k
}
3489
3490
/*
3491
 parser for ASP within start tags
3492
3493
 Some people use ASP to customize attributes
3494
 Tidy isn't really well suited to dealing with ASP
3495
 This is a workaround for attributes, but won't
3496
 deal with the case where the ASP is used to tailor
3497
 the attribute value. Here is an example of a work
3498
 around for using ASP in attribute values:
3499
3500
  href='<%=rsSchool.Fields("ID").Value%>'
3501
3502
 where the ASP that generates the attribute value
3503
 is masked from Tidy by the quotemarks.
3504
3505
*/
3506
3507
/*
 Collects the raw text of an ASP block found inside a start tag.
 The caller has already consumed the opening "<%".  Characters are
 appended to the lexer buffer until "%>" or end of stream; the
 terminator itself is not kept.

 Returns an ASP token covering the collected text, or NULL when
 nothing was collected.  On return lexer->txtstart == lexer->txtend.
*/
static Node *ParseAsp( TidyDocImpl* doc )
{
    Lexer* lexer = doc->lexer;
    uint c;
    Node *asp = NULL;

    lexer->txtstart = lexer->lexsize;

    for (;;)
    {
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
            break;

        TY_(AddCharToLexer)(lexer, c);

        if (c != '%')
            continue;

        /* seen '%' so peek at the next character for '>' */
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
            break;

        if (c == '>')
        {
            /* only the '%' was buffered, drop it again */
            lexer->lexsize -= 1;
            break;
        }

        /*
         Not the terminator: push the character back so that the
         next pass examines it.  Otherwise a '%' directly followed
         by "%>" (i.e. "%%>") would swallow the second '%' and the
         end of the block would be missed.
        */
        TY_(UngetChar)(c, doc->docIn);
    }

    lexer->txtend = lexer->lexsize;
    if (lexer->txtend > lexer->txtstart)
        asp = AspToken(doc);

    lexer->txtstart = lexer->txtend;
    return asp;
}
3545
 
3546
3547
/*
3548
 PHP is like ASP but is based upon XML
3549
 processing instructions, e.g. <?php ... ?>
3550
*/
3551
static Node *ParsePhp( TidyDocImpl* doc )
3552
8.17k
{
3553
8.17k
    Lexer* lexer = doc->lexer;
3554
8.17k
    uint c;
3555
8.17k
    Node *php = NULL;
3556
3557
8.17k
    lexer->txtstart = lexer->lexsize;
3558
3559
8.17k
    for (;;)
3560
80.2M
    {
3561
80.2M
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3562
270
            break;
3563
3564
80.2M
        TY_(AddCharToLexer)(lexer, c);
3565
3566
3567
80.2M
        if (c != '?')
3568
80.2M
            continue;
3569
3570
12.3k
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3571
26
            break;
3572
3573
12.3k
        TY_(AddCharToLexer)(lexer, c);
3574
3575
12.3k
        if (c == '>')
3576
7.88k
        {
3577
7.88k
            lexer->lexsize -= 2;
3578
7.88k
            break;
3579
7.88k
        }
3580
12.3k
    }
3581
3582
8.17k
    lexer->txtend = lexer->lexsize;
3583
8.17k
    if (lexer->txtend > lexer->txtstart)
3584
7.73k
        php = PhpToken(doc);
3585
3586
8.17k
    lexer->txtstart = lexer->txtend;
3587
8.17k
    return php;
3588
8.17k
}   
3589
3590
/* consumes the '>' terminating start tags */
3591
/* @TODO: float the errors back to the calling method */
3592
static tmbstr  ParseAttribute( TidyDocImpl* doc, Bool *isempty,
3593
                              Node **asp, Node **php )
3594
1.58M
{
3595
1.58M
    Lexer* lexer = doc->lexer;
3596
1.58M
    int start, len = 0;
3597
1.58M
    tmbstr attr = NULL;
3598
1.58M
    uint c, lastc;
3599
3600
1.58M
    *asp = NULL;  /* clear asp pointer */
3601
1.58M
    *php = NULL;  /* clear php pointer */
3602
3603
 /* skip white space before the attribute */
3604
3605
1.58M
    for (;;)
3606
2.64M
    {
3607
2.64M
        c = TY_(ReadChar)( doc->docIn );
3608
3609
3610
2.64M
        if (c == '/')
3611
86.6k
        {
3612
86.6k
            c = TY_(ReadChar)( doc->docIn );
3613
3614
86.6k
            if (c == '>')
3615
53.0k
            {
3616
53.0k
                *isempty = yes;
3617
53.0k
                return NULL;
3618
53.0k
            }
3619
3620
33.6k
            TY_(UngetChar)(c, doc->docIn);
3621
33.6k
            c = '/';
3622
33.6k
            break;
3623
86.6k
        }
3624
3625
2.55M
        if (c == '>')
3626
237k
            return NULL;
3627
3628
2.31M
        if (c =='<')
3629
690k
        {
3630
690k
            c = TY_(ReadChar)(doc->docIn);
3631
3632
690k
            if (c == '%')
3633
74.2k
            {
3634
74.2k
                *asp = ParseAsp( doc );
3635
74.2k
                return NULL;
3636
74.2k
            }
3637
616k
            else if (c == '?')
3638
8.17k
            {
3639
8.17k
                *php = ParsePhp( doc );
3640
8.17k
                return NULL;
3641
8.17k
            }
3642
3643
608k
            TY_(UngetChar)(c, doc->docIn);
3644
608k
            TY_(UngetChar)('<', doc->docIn);
3645
608k
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3646
608k
            return NULL;
3647
690k
        }
3648
3649
1.62M
        if (c == '=')
3650
1.21k
        {
3651
1.21k
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN );
3652
1.21k
            continue;
3653
1.21k
        }
3654
3655
1.62M
        if (c == '"' || c == '\'')
3656
12.7k
        {
3657
12.7k
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3658
12.7k
            continue;
3659
12.7k
        }
3660
3661
1.61M
        if (c == EndOfStream)
3662
701
        {
3663
701
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3664
701
            TY_(UngetChar)(c, doc->docIn);
3665
701
            return NULL;
3666
701
        }
3667
3668
3669
1.61M
        if (!TY_(IsWhite)(c))
3670
571k
           break;
3671
1.61M
    }
3672
3673
604k
    start = lexer->lexsize;
3674
604k
    lastc = c;
3675
3676
604k
    for (;;)
3677
8.21M
    {
3678
     /* but push back '=' for parseValue() */
3679
8.21M
        if (c == '=' || c == '>')
3680
277k
        {
3681
277k
            TY_(UngetChar)(c, doc->docIn);
3682
277k
            break;
3683
277k
        }
3684
3685
7.93M
        if (c == '<' || c == EndOfStream)
3686
210k
        {
3687
210k
            TY_(UngetChar)(c, doc->docIn);
3688
210k
            break;
3689
210k
        }
3690
3691
7.72M
        if (lastc == '-' && (c == '"' || c == '\''))
3692
1.98k
        {
3693
1.98k
            lexer->lexsize--;
3694
1.98k
            --len;
3695
1.98k
            TY_(UngetChar)(c, doc->docIn);
3696
1.98k
            break;
3697
1.98k
        }
3698
3699
7.72M
        if (TY_(IsWhite)(c))
3700
111k
            break;
3701
3702
7.61M
        if (c == '/') /* Issue #395 - potential self closing tag */
3703
48.8k
        {
3704
48.8k
            c = TY_(ReadChar)(doc->docIn);  /* read next */
3705
48.8k
            if (c == '>')
3706
3.59k
            {
3707
                /* got a self closing tag - put is back and continue... */
3708
3.59k
                TY_(UngetChar)(c, doc->docIn);
3709
3.59k
                break;
3710
3.59k
            }
3711
45.2k
            else
3712
45.2k
            {
3713
                /* Not '/>' - put it back */
3714
45.2k
                TY_(UngetChar)(c, doc->docIn);
3715
45.2k
                c = '/';  /* restore original char */
3716
45.2k
            }
3717
48.8k
        }
3718
3719
        /* what should be done about non-namechar characters? */
3720
        /* currently these are incorporated into the attr name */
3721
3722
7.61M
        if ( cfg(doc, TidyUpperCaseAttrs) != TidyUppercasePreserve )
3723
7.61M
        {
3724
7.61M
            if ( !cfgBool(doc, TidyXmlTags) && TY_(IsUpper)(c) )
3725
243k
                c = TY_(ToLower)(c);
3726
7.61M
        }
3727
3728
7.61M
        TY_(AddCharToLexer)( lexer, c );
3729
7.61M
        lastc = c;
3730
7.61M
        c = TY_(ReadChar)(doc->docIn);
3731
7.61M
    }
3732
3733
    /* handle attribute names with multibyte chars */
3734
604k
    len = lexer->lexsize - start;
3735
604k
    attr = (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3736
604k
                                      lexer->lexbuf+start, len) : NULL);
3737
604k
    lexer->lexsize = start;
3738
604k
    return attr;
3739
1.58M
}
3740
3741
/*
3742
 invoked when < is seen in place of attribute value
3743
 but terminates on whitespace if not ASP, PHP or Tango
3744
 this routine recognizes ' and " quoted strings
3745
*/
3746
static int ParseServerInstruction( TidyDocImpl* doc )
3747
8.09k
{
3748
8.09k
    Lexer* lexer = doc->lexer;
3749
8.09k
    uint c;
3750
8.09k
    int delim = '"';
3751
8.09k
    Bool isrule = no;
3752
3753
8.09k
    c = TY_(ReadChar)(doc->docIn);
3754
8.09k
    TY_(AddCharToLexer)(lexer, c);
3755
3756
    /* check for ASP, PHP or Tango */
3757
8.09k
    if (c == '%' || c == '?' || c == '@')
3758
2.00k
        isrule = yes;
3759
3760
8.09k
    for (;;)
3761
10.2M
    {
3762
10.2M
        c = TY_(ReadChar)(doc->docIn);
3763
3764
10.2M
        if (c == EndOfStream)
3765
425
            break;
3766
3767
10.2M
        if (c == '>')
3768
2.99k
        {
3769
2.99k
            if (isrule)
3770
1.74k
                TY_(AddCharToLexer)(lexer, c);
3771
1.24k
            else
3772
1.24k
                TY_(UngetChar)(c, doc->docIn);
3773
3774
2.99k
            break;
3775
2.99k
        }
3776
3777
        /* if not recognized as ASP, PHP or Tango */
3778
        /* then also finish value on whitespace */
3779
10.2M
        if (!isrule)
3780
5.08M
        {
3781
5.08M
            if (TY_(IsWhite)(c))
3782
3.15k
                break;
3783
5.08M
        }
3784
3785
10.2M
        TY_(AddCharToLexer)(lexer, c);
3786
3787
10.2M
        if (c == '"')
3788
3.57k
        {
3789
3.57k
            do
3790
134k
            {
3791
134k
                c = TY_(ReadChar)(doc->docIn);
3792
134k
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3793
42
                {
3794
42
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3795
42
                    TY_(UngetChar)(c, doc->docIn);
3796
42
                    return 0;
3797
42
                }
3798
134k
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3799
437
                {
3800
437
                    TY_(UngetChar)(c, doc->docIn);
3801
437
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3802
437
                    return 0;
3803
437
                }
3804
134k
                TY_(AddCharToLexer)(lexer, c);
3805
134k
            }
3806
134k
            while (c != '"');
3807
3.09k
            delim = '\'';
3808
3.09k
            continue;
3809
3.57k
        }
3810
3811
10.2M
        if (c == '\'')
3812
724k
        {
3813
724k
            do
3814
4.58M
            {
3815
4.58M
                c = TY_(ReadChar)(doc->docIn);
3816
4.58M
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3817
56
                {
3818
56
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3819
56
                    TY_(UngetChar)(c, doc->docIn);
3820
56
                    return 0;
3821
56
                }
3822
4.58M
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3823
985
                {
3824
985
                    TY_(UngetChar)(c, doc->docIn);
3825
985
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3826
985
                    return 0;
3827
985
                }
3828
4.58M
                TY_(AddCharToLexer)(lexer, c);
3829
4.58M
            }
3830
4.58M
            while (c != '\'');
3831
724k
        }
3832
10.2M
    }
3833
3834
6.57k
    return delim;
3835
8.09k
}
3836
3837
/* values start with "=" or " = " etc. */
3838
/* doesn't consume the ">" at end of start tag */
3839
3840
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name,
3841
                          Bool foldCase, Bool *isempty, int *pdelim)
3842
603k
{
3843
603k
    Lexer* lexer = doc->lexer;
3844
603k
    int len = 0, start;
3845
603k
    Bool seen_gt = no;
3846
603k
    Bool munge = yes;
3847
603k
    uint c, lastc, delim, quotewarning;
3848
603k
    tmbstr value;
3849
3850
603k
    delim = (tmbchar) 0;
3851
603k
    *pdelim = '"';
3852
3853
    /*
3854
     Henry Zrepa reports that some folk are using the
3855
     embed element with script attributes where newlines
3856
     are significant and must be preserved
3857
    */
3858
603k
    if ( cfgBool(doc, TidyLiteralAttribs) )
3859
28.3k
        munge = no;
3860
3861
 /* skip white space before the '=' */
3862
3863
603k
    for (;;)
3864
3.91M
    {
3865
3.91M
        c = TY_(ReadChar)(doc->docIn);
3866
3867
3.91M
        if (c == EndOfStream)
3868
2.67k
        {
3869
2.67k
            TY_(UngetChar)(c, doc->docIn);
3870
2.67k
            break;
3871
2.67k
        }
3872
3873
3.90M
        if (!TY_(IsWhite)(c))
3874
600k
           break;
3875
3.90M
    }
3876
3877
/*
3878
  c should be '=' if there is a value
3879
  other legal possibilities are white
3880
  space, '/' and '>'
3881
*/
3882
3883
603k
    if (c != '=' && c != '"' && c != '\'')
3884
436k
    {
3885
436k
        TY_(UngetChar)(c, doc->docIn);
3886
436k
        return NULL;
3887
436k
    }
3888
3889
 /* skip white space after '=' */
3890
3891
167k
    for (;;)
3892
178k
    {
3893
178k
        c = TY_(ReadChar)(doc->docIn);
3894
3895
178k
        if (c == EndOfStream)
3896
128
        {
3897
128
            TY_(UngetChar)(c, doc->docIn);
3898
128
            break;
3899
128
        }
3900
3901
178k
        if (!TY_(IsWhite)(c))
3902
167k
           break;
3903
178k
    }
3904
3905
 /* check for quote marks */
3906
3907
167k
    if (c == '"' || c == '\'')
3908
31.6k
        delim = c;
3909
135k
    else if (c == '<')
3910
8.09k
    {
3911
8.09k
        start = lexer->lexsize;
3912
8.09k
        TY_(AddCharToLexer)(lexer, c);
3913
8.09k
        *pdelim = ParseServerInstruction( doc );
3914
8.09k
        len = lexer->lexsize - start;
3915
8.09k
        lexer->lexsize = start;
3916
8.09k
        return (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3917
8.09k
                                          lexer->lexbuf+start, len) : NULL);
3918
8.09k
    }
3919
127k
    else
3920
127k
        TY_(UngetChar)(c, doc->docIn);
3921
3922
 /*
3923
   and read the value string
3924
   check for quote mark if needed
3925
 */
3926
3927
159k
    quotewarning = 0;
3928
159k
    start = lexer->lexsize;
3929
159k
    c = '\0';
3930
3931
159k
    for (;;)
3932
26.7M
    {
3933
26.7M
        lastc = c;  /* track last character */
3934
26.7M
        c = TY_(ReadChar)(doc->docIn);
3935
3936
26.7M
        if (c == EndOfStream)
3937
2.72k
        {
3938
2.72k
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3939
2.72k
            TY_(UngetChar)(c, doc->docIn);
3940
2.72k
            break;
3941
2.72k
        }
3942
3943
26.7M
        if (delim == (tmbchar)0)
3944
1.00M
        {
3945
1.00M
            if (c == '>')
3946
6.85k
            {
3947
6.85k
                TY_(UngetChar)(c, doc->docIn);
3948
6.85k
                break;
3949
6.85k
            }
3950
3951
998k
            if (c == '"' || c == '\'')
3952
9.62k
            {
3953
9.62k
                uint q = c;
3954
3955
                /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */
3956
                /* this doesn't handle <a title=foo"/> which browsers treat as  */
3957
                /* 'foo"/' nor  <a title=foo" /> which browser treat as 'foo"'  */
3958
                
3959
9.62k
                c = TY_(ReadChar)(doc->docIn);
3960
9.62k
                if (c == '>')
3961
576
                {
3962
576
                    TY_(AddCharToLexer)(lexer, q);
3963
576
                    TY_(UngetChar)(c, doc->docIn);
3964
576
                    break;
3965
576
                }
3966
9.04k
                else
3967
9.04k
                {
3968
9.04k
                    TY_(UngetChar)(c, doc->docIn);
3969
9.04k
                    c = q;
3970
9.04k
                }
3971
9.62k
            }
3972
3973
998k
            if (c == '<')
3974
84.0k
            {
3975
84.0k
                TY_(UngetChar)(c, doc->docIn);
3976
84.0k
                c = '>';
3977
84.0k
                TY_(UngetChar)(c, doc->docIn);
3978
84.0k
                TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3979
84.0k
                break;
3980
84.0k
            }
3981
3982
            /*
3983
             For cases like <br clear=all/> need to avoid treating /> as
3984
             part of the attribute value, however care is needed to avoid
3985
             so treating <a href=http://www.acme.com/> in this way, which
3986
             would map the <a> tag to <a href="http://www.acme.com"/>
3987
            */
3988
914k
            if (c == '/')
3989
5.58k
            {
3990
                /* peek ahead in case of /> */
3991
5.58k
                c = TY_(ReadChar)(doc->docIn);
3992
3993
5.58k
                if ( c == '>' && !TY_(IsUrl)(doc, name) )
3994
2.27k
                {
3995
2.27k
                    *isempty = yes;
3996
2.27k
                    TY_(UngetChar)(c, doc->docIn);
3997
2.27k
                    break;
3998
2.27k
                }
3999
4000
                /* unget peeked character */
4001
3.30k
                TY_(UngetChar)(c, doc->docIn);
4002
3.30k
                c = '/';
4003
3.30k
            }
4004
914k
        }
4005
25.7M
        else  /* delim is '\'' or '"' */
4006
25.7M
        {
4007
25.7M
            if (c == delim)
4008
31.1k
                break;
4009
4010
25.7M
            if (c == '\n' || c == '<' || c == '>')
4011
138k
                ++quotewarning;
4012
4013
25.7M
            if (c == '>')
4014
44.4k
                seen_gt = yes;
4015
25.7M
        }
4016
4017
26.6M
        if (c == '&')
4018
72.7k
        {
4019
72.7k
            TY_(AddCharToLexer)(lexer, c);
4020
72.7k
            ParseEntity( doc, IgnoreWhitespace );
4021
72.7k
            if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge)
4022
40.6k
                ChangeChar(lexer, ' ');
4023
72.7k
            continue;
4024
72.7k
        }
4025
4026
        /*
4027
         kludge for JavaScript attribute values
4028
         with line continuations in string literals
4029
        */
4030
26.5M
        if (c == '\\')
4031
5.52k
        {
4032
5.52k
            c = TY_(ReadChar)(doc->docIn);
4033
4034
5.52k
            if (c != '\n')
4035
4.62k
            {
4036
4.62k
                TY_(UngetChar)(c, doc->docIn);
4037
4.62k
                c = '\\';
4038
4.62k
            }
4039
5.52k
        }
4040
4041
26.5M
        if (TY_(IsWhite)(c))
4042
17.8M
        {
4043
17.8M
            if ( delim == 0 )
4044
31.7k
                break;
4045
4046
17.8M
            if (munge)
4047
236k
            {
4048
                /* discard line breaks in quoted URLs */ 
4049
                /* #438650 - fix by Randy Waki */
4050
236k
                if ( c == '\n' && TY_(IsUrl)(doc, name) )
4051
2.29k
                {
4052
                    /* warn that we discard this newline */
4053
2.29k
                    TY_(ReportAttrError)( doc, lexer->token, NULL, NEWLINE_IN_URI);
4054
2.29k
                    continue;
4055
2.29k
                }
4056
                
4057
234k
                c = ' ';
4058
4059
234k
                if (lastc == ' ')
4060
199k
                {
4061
199k
                    if (TY_(IsUrl)(doc, name) )
4062
152k
                        TY_(ReportAttrError)( doc, lexer->token, NULL, WHITE_IN_URI);
4063
199k
                    continue;
4064
199k
                }
4065
234k
            }
4066
17.8M
        }
4067
8.65M
        else if (foldCase && TY_(IsUpper)(c))
4068
654
            c = TY_(ToLower)(c);
4069
4070
26.3M
        TY_(AddCharToLexer)(lexer, c);
4071
26.3M
    }
4072
4073
159k
    if (quotewarning > 10 && seen_gt && munge)
4074
2.74k
    {
4075
        /*
4076
           there is almost certainly a missing trailing quote mark
4077
           as we have see too many newlines, < or > characters.
4078
4079
           an exception is made for Javascript attributes and the
4080
           javascript URL scheme which may legitimately include < and >,
4081
           and for attributes starting with "<xml " as generated by
4082
           Microsoft Office.
4083
        */
4084
2.74k
        if ( !TY_(IsScript)(doc, name) &&
4085
2.48k
             !(TY_(IsUrl)(doc, name) && TY_(tmbstrncmp)(lexer->lexbuf+start, "javascript:", 11) == 0) &&
4086
2.11k
             !(TY_(tmbstrncmp)(lexer->lexbuf+start, "<xml ", 5) == 0)
4087
2.74k
           )
4088
1.68k
            TY_(Report)( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE ); 
4089
2.74k
    }
4090
4091
159k
    len = lexer->lexsize - start;
4092
159k
    lexer->lexsize = start;
4093
4094
4095
159k
    if (len > 0 || delim)
4096
157k
    {
4097
        /* ignore leading and trailing white space for all but title, alt, value */
4098
        /* and prompts attributes unless --literal-attributes is set to yes      */
4099
        /* #994841 - Whitespace is removed from value attributes                 */
4100
4101
        /* Issue #217 - Also only if/while (len > 0) - MUST NEVER GO NEGATIVE! */
4102
157k
        if ((len > 0) && munge &&
4103
150k
            TY_(tmbstrcasecmp)(name, "alt") &&
4104
149k
            TY_(tmbstrcasecmp)(name, "title") &&
4105
149k
            TY_(tmbstrcasecmp)(name, "value") &&
4106
148k
            TY_(tmbstrcasecmp)(name, "prompt"))
4107
148k
        {
4108
150k
            while (TY_(IsWhite)(lexer->lexbuf[start+len-1]) && (len > 0))
4109
2.34k
                --len;
4110
4111
            /* Issue #497 - Fix leading space trimming */
4112
150k
            while (TY_(IsWhite)(lexer->lexbuf[start]) && (len > 0))
4113
1.69k
            {
4114
1.69k
                ++start;
4115
1.69k
                --len;
4116
1.69k
            }
4117
148k
        }
4118
4119
157k
        value = TY_(tmbstrndup)(doc->allocator, lexer->lexbuf + start, len);
4120
157k
    }
4121
2.30k
    else
4122
2.30k
        value = NULL;
4123
4124
    /* note delimiter if given */
4125
159k
    *pdelim = delim;
4126
4127
159k
    return value;
4128
167k
}
4129
4130
/* attr must be non-NULL */
4131
static Bool IsValidAttrName( ctmbstr attr )
4132
588k
{
4133
588k
    uint i, c = attr[0];
4134
4135
    /* first character should be a letter */
4136
588k
    if (!TY_(IsLetter)(c))
4137
222k
        return no;
4138
4139
    /* remaining characters should be namechars */
4140
2.09M
    for( i = 1; i < TY_(tmbstrlen)(attr); i++)
4141
1.78M
    {
4142
1.78M
        c = attr[i];
4143
4144
1.78M
        if (TY_(IsNamechar)(c))
4145
1.73M
            continue;
4146
4147
56.2k
        return no;
4148
1.78M
    }
4149
4150
310k
    return yes;
4151
366k
}
4152
4153
/* create a new attribute */
4154
AttVal *TY_(NewAttribute)( TidyDocImpl* doc )
4155
1.27M
{
4156
1.27M
    AttVal *av = (AttVal*) TidyDocAlloc( doc, sizeof(AttVal) );
4157
1.27M
    TidyClearMemory( av, sizeof(AttVal) );
4158
1.27M
    return av;
4159
1.27M
}
4160
4161
/* create a new attribute with given name and value */
4162
AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
4163
                             int delim )
4164
38.5k
{
4165
38.5k
    AttVal *av = TY_(NewAttribute)(doc);
4166
38.5k
    av->attribute = TY_(tmbstrdup)(doc->allocator, name);
4167
38.5k
    av->value = TY_(tmbstrdup)(doc->allocator, value);
4168
38.5k
    av->delim = delim;
4169
38.5k
    av->dict = TY_(FindAttribute)( doc, av );
4170
38.5k
    return av;
4171
38.5k
}
4172
4173
/*
  Append av to the singly linked attribute list whose head pointer
  is *list. An empty list simply gets av as its head. av->next is
  left as the caller supplied it.
*/
static void AddAttrToList( AttVal** list, AttVal* av )
{
  AttVal** link = list;

  /* advance to the first NULL link: the head if the list is
     empty, otherwise the next field of the last element */
  while ( *link != NULL )
    link = &(*link)->next;

  *link = av;
}
4185
4186
/* Append av after the last attribute currently attached to node. */
void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av )
{
    AttVal **attrs = &node->attributes;

    AddAttrToList( attrs, av );
}
4190
4191
/*
  Make av the first attribute of node. Whatever av->next held
  before is overwritten with the node's previous first attribute.
*/
void TY_(InsertAttributeAtStart)( Node *node, AttVal *av )
{
    AttVal *oldFirst = node->attributes;

    av->next = oldFirst;
    node->attributes = av;
}
4196
4197
/* swallows closing '>' */
4198
4199
static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty )
4200
914k
{
4201
914k
    Lexer* lexer = doc->lexer;
4202
914k
    AttVal *av, *list;
4203
914k
    tmbstr value;
4204
914k
    int delim;
4205
914k
    Node *asp, *php;
4206
4207
914k
    list = NULL;
4208
4209
1.56M
    while ( !EndOfInput(doc) )
4210
1.55M
    {
4211
1.55M
        tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php );
4212
4213
1.55M
        if (attribute == NULL)
4214
964k
        {
4215
            /* check if attributes are created by ASP markup */
4216
964k
            if (asp)
4217
62.6k
            {
4218
62.6k
                av = TY_(NewAttribute)(doc);
4219
62.6k
                av->asp = asp;
4220
62.6k
                AddAttrToList( &list, av ); 
4221
62.6k
                continue;
4222
62.6k
            }
4223
4224
            /* check if attributes are created by PHP markup */
4225
902k
            if (php)
4226
4.18k
            {
4227
4.18k
                av = TY_(NewAttribute)(doc);
4228
4.18k
                av->php = php;
4229
4.18k
                AddAttrToList( &list, av ); 
4230
4.18k
                continue;
4231
4.18k
            }
4232
4233
898k
            break;
4234
902k
        }
4235
4236
588k
        value = ParseValue( doc, attribute, no, isempty, &delim );
4237
4238
588k
        if (attribute && (IsValidAttrName(attribute) ||
4239
278k
            (cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute))))
4240
311k
        {
4241
311k
            av = TY_(NewAttribute)(doc);
4242
311k
            av->delim = delim ? delim : '"';
4243
311k
            av->attribute = attribute;
4244
311k
            av->value = value;
4245
311k
            av->dict = TY_(FindAttribute)( doc, av );
4246
311k
            AddAttrToList( &list, av );
4247
311k
            if ( !delim && value )
4248
119k
                TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK_OPEN);
4249
311k
        }
4250
277k
        else
4251
277k
        {
4252
277k
            av = TY_(NewAttribute)(doc);
4253
277k
            av->attribute = attribute;
4254
277k
            av->value = value;
4255
4256
277k
            if (LastChar(attribute) == '"')
4257
3.00k
                TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK);
4258
274k
            else if (value == NULL)
4259
260k
                TY_(ReportAttrError)(doc, lexer->token, av, MISSING_ATTR_VALUE);
4260
13.2k
            else
4261
13.2k
                TY_(ReportAttrError)(doc, lexer->token, av, INVALID_ATTRIBUTE);
4262
4263
277k
            TY_(FreeAttribute)( doc, av );
4264
277k
        }
4265
588k
    }
4266
4267
914k
    return list;
4268
914k
}
4269
4270
/*
4271
  Returns document type declarations like
4272
4273
  <!DOCTYPE foo PUBLIC "fpi" "sysid">
4274
  <!DOCTYPE bar SYSTEM "sysid">
4275
  <!DOCTYPE baz [ <!ENTITY ouml "&#246"> ]>
4276
4277
  as
4278
4279
  <foo PUBLIC="fpi" SYSTEM="sysid" />
4280
  <bar SYSTEM="sysid" />
4281
  <baz> &lt;!ENTITY ouml &quot;&amp;#246&quot;&gt; </baz>
4282
*/
4283
static Node *ParseDocTypeDecl(TidyDocImpl* doc)
4284
157k
{
4285
157k
    Lexer *lexer = doc->lexer;
4286
157k
    int start = lexer->lexsize;
4287
157k
    ParseDocTypeDeclState state = DT_DOCTYPENAME;
4288
157k
    uint c;
4289
157k
    uint delim = 0;
4290
157k
    Bool hasfpi = yes;
4291
4292
157k
    Node* node = TY_(NewNode)(lexer->allocator, lexer);
4293
157k
    node->type = DocTypeTag;
4294
157k
    node->start = lexer->txtstart;
4295
157k
    node->end = lexer->txtend;
4296
4297
157k
    lexer->waswhite = no;
4298
4299
    /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */
4300
4301
57.6M
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
4302
57.6M
    {
4303
        /* convert newlines to spaces */
4304
57.6M
        if (state != DT_INTSUBSET)
4305
3.35M
            c = c == '\n' ? ' ' : c;
4306
4307
        /* convert white-space sequences to single space character */
4308
57.6M
        if (TY_(IsWhite)(c) && state != DT_INTSUBSET)
4309
410k
        {
4310
410k
            if (!lexer->waswhite)
4311
53.5k
            {
4312
53.5k
                TY_(AddCharToLexer)(lexer, c);
4313
53.5k
                lexer->waswhite = yes;
4314
53.5k
            }
4315
356k
            else
4316
356k
            {
4317
                /* discard space */
4318
356k
                continue;
4319
356k
            }
4320
410k
        }
4321
57.1M
        else
4322
57.1M
        {
4323
57.1M
            TY_(AddCharToLexer)(lexer, c);
4324
57.1M
            lexer->waswhite = no;
4325
57.1M
        }
4326
4327
57.2M
        switch(state)
4328
57.2M
        {
4329
254k
        case DT_INTERMEDIATE:
4330
            /* determine what's next */
4331
254k
            if (TY_(ToUpper)(c) == 'P' || TY_(ToUpper)(c) == 'S')
4332
3.28k
            {
4333
3.28k
                start = lexer->lexsize - 1;
4334
3.28k
                state = DT_PUBLICSYSTEM;
4335
3.28k
                continue;
4336
3.28k
            }
4337
251k
            else if (c == '[')
4338
31.3k
            {
4339
31.3k
                start = lexer->lexsize;
4340
31.3k
                state = DT_INTSUBSET;
4341
31.3k
                continue;
4342
31.3k
            }
4343
220k
            else if (c == '\'' || c == '"')
4344
24.2k
            {
4345
24.2k
                start = lexer->lexsize;
4346
24.2k
                delim = c;
4347
24.2k
                state = DT_QUOTEDSTRING;
4348
24.2k
                continue;
4349
24.2k
            }
4350
196k
            else if (c == '>')
4351
157k
            {
4352
157k
                AttVal* si;
4353
4354
157k
                node->end = --(lexer->lexsize);
4355
4356
157k
                si = TY_(GetAttrByName)(node, "SYSTEM");
4357
157k
                if (si)
4358
7.38k
                    TY_(CheckUrl)(doc, node, si);
4359
4360
157k
                if (!node->element || !IsValidXMLElemName(node->element))
4361
94.3k
                {
4362
94.3k
                    TY_(Report)(doc, NULL, NULL, MALFORMED_DOCTYPE);
4363
94.3k
                    TY_(FreeNode)(doc, node);
4364
94.3k
                    return NULL;
4365
94.3k
                }
4366
63.0k
                return node;
4367
157k
            }
4368
38.7k
            else
4369
38.7k
            {
4370
                /* error */
4371
38.7k
            }
4372
38.7k
            break;
4373
1.36M
        case DT_DOCTYPENAME:
4374
            /* read document type name */
4375
1.36M
            if (TY_(IsWhite)(c) || c == '>' || c == '[')
4376
157k
            {
4377
157k
                node->element = TY_(tmbstrndup)(doc->allocator,
4378
157k
                                                lexer->lexbuf + start,
4379
157k
                                                lexer->lexsize - start - 1);
4380
157k
                if (c == '>' || c == '[')
4381
143k
                {
4382
143k
                    --(lexer->lexsize);
4383
143k
                    TY_(UngetChar)(c, doc->docIn);
4384
143k
                }
4385
4386
157k
                state = DT_INTERMEDIATE;
4387
157k
                continue;
4388
157k
            }
4389
1.20M
            break;
4390
1.20M
        case DT_PUBLICSYSTEM:
4391
            /* read PUBLIC/SYSTEM */
4392
21.0k
            if (TY_(IsWhite)(c) || c == '>')
4393
3.21k
            {
4394
3.21k
                char *attname = TY_(tmbstrndup)(doc->allocator,
4395
3.21k
                                                lexer->lexbuf + start,
4396
3.21k
                                                lexer->lexsize - start - 1);
4397
3.21k
                hasfpi = !(TY_(tmbstrcasecmp)(attname, "SYSTEM") == 0);
4398
4399
3.21k
                TidyDocFree(doc, attname);
4400
4401
                /* todo: report an error if SYSTEM/PUBLIC not uppercase */
4402
4403
3.21k
                if (c == '>')
4404
1.64k
                {
4405
1.64k
                    --(lexer->lexsize);
4406
1.64k
                    TY_(UngetChar)(c, doc->docIn);
4407
1.64k
                }
4408
4409
3.21k
                state = DT_INTERMEDIATE;
4410
3.21k
                continue;
4411
3.21k
            }
4412
17.8k
            break;
4413
1.36M
        case DT_QUOTEDSTRING:
4414
            /* read quoted string */
4415
1.36M
            if (c == delim)
4416
24.1k
            {
4417
24.1k
                char *value = TY_(tmbstrndup)(doc->allocator,
4418
24.1k
                                              lexer->lexbuf + start,
4419
24.1k
                                              lexer->lexsize - start - 1);
4420
24.1k
                AttVal* att = TY_(AddAttribute)(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value);
4421
24.1k
                TidyDocFree(doc, value);
4422
24.1k
                att->delim = delim;
4423
24.1k
                hasfpi = no;
4424
24.1k
                state = DT_INTERMEDIATE;
4425
24.1k
                delim = 0;
4426
24.1k
                continue;
4427
24.1k
            }
4428
1.34M
            break;
4429
54.2M
        case DT_INTSUBSET:
4430
            /* read internal subset */
4431
54.2M
            if (c == ']')
4432
31.2k
            {
4433
31.2k
                Node* subset;
4434
31.2k
                lexer->txtstart = start;
4435
31.2k
                lexer->txtend = lexer->lexsize - 1;
4436
31.2k
                subset = TY_(TextToken)(lexer);
4437
31.2k
                TY_(InsertNodeAtEnd)(node, subset);
4438
31.2k
                state = DT_INTERMEDIATE;
4439
31.2k
            }
4440
54.2M
            break;
4441
57.2M
        }
4442
57.2M
    }
4443
4444
    /* document type declaration not finished */
4445
521
    TY_(Report)(doc, NULL, NULL, MALFORMED_DOCTYPE);
4446
521
    TY_(FreeNode)(doc, node);
4447
521
    return NULL;
4448
157k
}
4449
4450
4451
/****************************************************************************//*
4452
 ** MARK: - Node Stack
4453
 ***************************************************************************/
4454
4455
4456
/**
4457
 * Create a new stack with a given starting capacity. If memory allocation
4458
 * fails, then the allocator will panic the program automatically.
4459
 */
4460
Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
4461
273k
{
4462
273k
    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));
4463
273k
    stack->top = -1;
4464
273k
    stack->capacity = capacity;
4465
273k
    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node**));
4466
273k
    stack->allocator = doc->allocator;
4467
273k
    return stack;
4468
273k
}
4469
 
4470
4471
/**
4472
 *  Increase the stack size. This will be called automatically when the
4473
 *  current stack is full. If memory allocation fails, then the allocator
4474
 *  will panic the program automatically.
4475
 */
4476
void TY_(growStack)(Stack *stack)
4477
1.05k
{
4478
1.05k
    uint new_capacity = stack->capacity * 2;
4479
    
4480
1.05k
    Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node**));
4481
    
4482
1.05k
    memcpy( firstNode, stack->firstNode, sizeof(Node**) * (stack->top + 1) );
4483
1.05k
    TidyFree(stack->allocator, stack->firstNode);
4484
4485
1.05k
    stack->firstNode = firstNode;
4486
1.05k
    stack->capacity = new_capacity;
4487
1.05k
}
4488
4489
4490
/**
4491
 * Stack is full when top is equal to the last index.
4492
 */
4493
Bool TY_(stackFull)(Stack *stack)
4494
24.1M
{
4495
24.1M
    return stack->top == stack->capacity - 1;
4496
24.1M
}
4497
4498
4499
/**
4500
 * Stack is empty when top is equal to -1
4501
 */
4502
Bool TY_(stackEmpty)(Stack *stack)
4503
1.74M
{
4504
1.74M
    return stack->top == -1;
4505
1.74M
}
4506
 
4507
4508
/**
4509
 * Push an item to the stack.
4510
 */
4511
void TY_(push)(Stack *stack, Node *node)
4512
24.1M
{
4513
24.1M
    if (TY_(stackFull)(stack))
4514
1.05k
        TY_(growStack)(stack);
4515
    
4516
24.1M
    if (node)
4517
1.47M
        stack->firstNode[++stack->top] = node;
4518
24.1M
}
4519
4520
4521
/**
4522
 * Pop an item from the stack.
4523
 */
4524
Node* TY_(pop)(Stack *stack)
4525
1.74M
{
4526
1.74M
    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
4527
1.74M
}
4528
4529
4530
/**
4531
 * Peek at the stack.
4532
 */
4533
FUNC_UNUSED Node* TY_(peek)(Stack *stack)
4534
0
{
4535
0
    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
4536
0
}
4537
4538
/**
4539
 *  Frees the stack when done.
4540
 */
4541
void TY_(freeStack)(Stack *stack)
4542
273k
{
4543
273k
    TidyFree( stack->allocator, stack->firstNode );
4544
273k
    stack->top = -1;
4545
273k
    stack->capacity = 0;
4546
273k
    stack->firstNode = NULL;
4547
    stack->allocator = NULL;
4548
273k
}