Coverage Report

Created: 2025-08-29 06:26

/src/tidy-html5/src/lexer.c
Line
Count
Source (jump to first uncovered line)
1
/* lexer.c -- Lexer for html parser
2
  
3
  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
  See tidy.h for the copyright notice.
5
6
*/
7
8
/*
9
  Given a file stream fp it returns a sequence of tokens.
10
11
     GetToken(fp) gets the next token
12
     UngetToken(fp) provides one level undo
13
14
  The tags include an attribute list:
15
16
    - linked list of attribute/value nodes
17
    - each node has 2 NULL-terminated strings.
18
    - entities are replaced in attribute values
19
20
  white space is compacted if not in preformatted mode
21
  If not in preformatted mode then leading white space
22
  is discarded and subsequent white space sequences
23
  compacted to single space characters.
24
25
  If XmlTags is no then Tag names are folded to upper
26
  case and attribute names to lower case.
27
28
 Not yet done:
29
    -   Doctype subset and marked sections
30
*/
31
32
#include "tidy-int.h"
33
#include "lexer.h"
34
#include "parser.h"
35
#include "entities.h"
36
#include "streamio.h"
37
#include "message.h"
38
#include "tmbstr.h"
39
#include "clean.h"
40
#include "utf8.h"
41
#include "streamio.h"
42
#include "sprtf.h"
43
44
#if defined(ENABLE_DEBUG_LOG)
45
/* #define DEBUG_ALLOCATION   special EXTRA allocation debug information - VERY NOISY */
46
static void check_me(char *name);
47
static Bool show_attrs = yes;
48
#define MX_TXT 8
49
static char buffer[(MX_TXT*4)+8]; /* NOTE extra for '...'\0 tail */
50
static tmbstr get_text_string(Lexer* lexer, Node *node)
51
{
52
    uint len = node->end - node->start;
53
    tmbstr cp = lexer->lexbuf + node->start;
54
    tmbstr end = lexer->lexbuf + node->end;
55
    unsigned char c;
56
    uint i = 0;
57
    Bool insp = no;
58
    if (len <= ((MX_TXT * 2) + 3)) {
59
        buffer[0] = 0;
60
        while (cp < end) {
61
            c = *cp;
62
            cp++;
63
            if (c == '\n') {
64
                buffer[i++] = '\\';
65
                buffer[i++] = 'n';
66
            } else if (c == '\t') {
67
                buffer[i++] = '\\';
68
                buffer[i++] = 't';
69
            } else if ( c == ' ' ) {
70
                if (!insp)
71
                    buffer[i++] = c;
72
                insp = yes;
73
            } else {
74
                buffer[i++] = c;
75
                insp = no;
76
            }
77
        }
78
    } else {
79
        char *end1 = cp + MX_TXT;
80
        char *bgn = cp + (len - MX_TXT);
81
        buffer[0] = 0;
82
        if (bgn < end1)
83
            bgn = end1;
84
        while (cp < end1) {
85
            c = *cp;
86
            cp++;
87
            if (c == '\n') {
88
                buffer[i++] = '\\';
89
                buffer[i++] = 'n';
90
            } else if (c == '\t') {
91
                buffer[i++] = '\\';
92
                buffer[i++] = 't';
93
            } else if ( c == ' ' ) {
94
                if (!insp)
95
                    buffer[i++] = c;
96
                insp = yes;
97
            } else {
98
                buffer[i++] = c;
99
                insp = no;
100
            }
101
            if (i >= MX_TXT)
102
                break;
103
        }
104
        c = '.';
105
        if ((i < len)&&(cp < bgn)) {
106
            buffer[i++] = c;
107
            cp++;
108
            if ((i < len)&&(cp < bgn)) {
109
                buffer[i++] = c;
110
                cp++;
111
                if ((i < len)&&(cp < bgn)) {
112
                    buffer[i++] = c;
113
                    cp++;
114
                }
115
            }
116
        }
117
        cp = bgn;
118
        insp = no;
119
        while (cp < end) {
120
            c = *cp;
121
            cp++;
122
            if (c == '\n') {
123
                buffer[i++] = '\\';
124
                buffer[i++] = 'n';
125
            } else if (c == '\t') {
126
                buffer[i++] = '\\';
127
                buffer[i++] = 't';
128
            } else if ( c == ' ' ) {
129
                if (!insp)
130
                    buffer[i++] = c;
131
                insp = yes;
132
            } else {
133
                buffer[i++] = c;
134
                insp = no;
135
            }
136
        }
137
    }
138
    buffer[i] = 0;
139
    return buffer;
140
}
141
static void Show_Node( TidyDocImpl* doc, const char *msg, Node *node )
142
{
143
    Lexer* lexer = doc->lexer;
144
    Bool lex = ((msg[0] == 'l')&&(msg[1] == 'e')) ? yes : no;
145
    int line = ( doc->lexer ? doc->lexer->lines : 0 );
146
    int col  = ( doc->lexer ? doc->lexer->columns : 0 );
147
    tmbstr src = lex ? "lexer" : "stream";
148
    SPRTF("R=%d C=%d: ", line, col );
149
    /* DEBUG: Be able to set a TRAP on a SPECIFIC row,col */
150
    if ((line == 3) && (col == 1)) {
151
        check_me("Show_Node"); /* just a debug trap */
152
    }
153
    if (lexer && lexer->token && 
154
        ((lexer->token->type == TextNode)||(node && (node->type == TextNode)))) {
155
        if (show_attrs) {
156
            uint len = node ? node->end - node->start : 0;
157
            tmbstr cp = node ? get_text_string( lexer, node ) : "NULL";
158
            SPRTF("Returning %s TextNode [%s]%u %s\n", msg, cp, len, src );
159
        } else {
160
            SPRTF("Returning %s TextNode %p... %s\n", msg, node, src );
161
        }
162
    } else {
163
        tmbstr name = node ? node->element ? node->element : "blank" : "NULL";
164
        if (show_attrs) {
165
            AttVal* av;
166
            SPRTF("Returning %s node <%s", msg, name);
167
            if (node) {
168
                for (av = node->attributes; av; av = av->next) {
169
                    name = av->attribute;
170
                    if (name) {
171
                        SPRTF(" %s",name);
172
                        if (av->value) {
173
                            SPRTF("=\"%s\"", av->value);
174
                        }
175
                    }
176
                }
177
            }
178
            SPRTF("> %s\n", src);
179
        } else {
180
            SPRTF("Returning %s node %p <%s>... %s\n", msg, node,
181
                name, src );
182
        }
183
    }
184
}
185
#define GTDBG(a,b,c) Show_Node(a,b,c)
186
#else /* ENABLE_DEBUG_LOG */
187
#define GTDBG(a,b,c)
188
#endif /* defined(ENABLE_DEBUG_LOG) */
189
190
/* Forward references
191
*/
192
/* swallows closing '>' */
193
static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
194
195
static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty, 
196
                             Node **asp, Node **php );
197
198
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
199
                         Bool *isempty, int *pdelim );
200
201
static Node *ParseDocTypeDecl(TidyDocImpl* doc);
202
203
static void AddAttrToList( AttVal** list, AttVal* av );
204
205
/* used to classify characters for lexical purposes */
206
404M
#define MAP(c) ((unsigned)c < 128 ? lexmap[(unsigned)c] : 0)
207
static uint lexmap[128];
208
209
0
#define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name)
210
16.0k
#define IsValidXMLElemName(name) TY_(IsValidXMLID)(name)
211
212
static struct _doctypes
213
{
214
    uint score;
215
    uint vers;
216
    uint vers_out;
217
    Bool xhtml;
218
    ctmbstr name;
219
    ctmbstr fpi;
220
    ctmbstr si;
221
} const W3C_Doctypes[] =
222
{
223
  {  2, HT20, 200, no,  "HTML 2.0",               "-//IETF//DTD HTML 2.0//EN",              NULL,                                                       },
224
  {  2, HT20, 200, no,  "HTML 2.0",               "-//IETF//DTD HTML//EN",                  NULL,                                                       },
225
  {  2, HT20, 200, no,  "HTML 2.0",               "-//W3C//DTD HTML 2.0//EN",               NULL,                                                       },
226
  {  1, HT32, 320, no,  "HTML 3.2",               "-//W3C//DTD HTML 3.2//EN",               NULL,                                                       },
227
  {  1, HT32, 320, no,  "HTML 3.2",               "-//W3C//DTD HTML 3.2 Final//EN",         NULL,                                                       },
228
  {  1, HT32, 320, no,  "HTML 3.2",               "-//W3C//DTD HTML 3.2 Draft//EN",         NULL,                                                       },
229
  {  6, H40S, 400, no,  "HTML 4.0 Strict",        "-//W3C//DTD HTML 4.0//EN",               "http://www.w3.org/TR/REC-html40/strict.dtd"                },
230
  {  8, H40T, 400, no,  "HTML 4.0 Transitional",  "-//W3C//DTD HTML 4.0 Transitional//EN",  "http://www.w3.org/TR/REC-html40/loose.dtd"                 },
231
  {  7, H40F, 400, no,  "HTML 4.0 Frameset",      "-//W3C//DTD HTML 4.0 Frameset//EN",      "http://www.w3.org/TR/REC-html40/frameset.dtd"              },
232
  {  3, H41S, 401, no,  "HTML 4.01 Strict",       "-//W3C//DTD HTML 4.01//EN",              "http://www.w3.org/TR/html4/strict.dtd"                     },
233
  {  5, H41T, 401, no,  "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd"                      },
234
  {  4, H41F, 401, no,  "HTML 4.01 Frameset",     "-//W3C//DTD HTML 4.01 Frameset//EN",     "http://www.w3.org/TR/html4/frameset.dtd"                   },
235
  {  9, X10S, 100, yes, "XHTML 1.0 Strict",       "-//W3C//DTD XHTML 1.0 Strict//EN",       "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"         },
236
  { 11, X10T, 100, yes, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"   },
237
  { 10, X10F, 100, yes, "XHTML 1.0 Frameset",     "-//W3C//DTD XHTML 1.0 Frameset//EN",     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"       },
238
  { 12, XH11, 110, yes, "XHTML 1.1",              "-//W3C//DTD XHTML 1.1//EN",              "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"              },
239
  { 13, XB10, 100, yes, "XHTML Basic 1.0",        "-//W3C//DTD XHTML Basic 1.0//EN",        "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd"        },
240
241
  { 20, HT50, 500, no,  "HTML5",                  NULL,                                     NULL                                                        },
242
  { 21, XH50, 500, yes, "XHTML5",                 NULL,                                     NULL                                                        },
243
244
  /* final entry */
245
  {  0,    0, 0,  no,  NULL,                     NULL,                                     NULL                                                        }
246
};
247
248
/* 
249
 * Issue #643 - Since VERS_FROM40 was extended to include VERS_HTML5
250
 * to be used in the expanded entity table some 155 times,
251
 * need a special macro here to denote just HTML 4 plus XHTML,
252
 * which is actually the former define of VERS_FROM40
253
 */
254
3.15M
#define VERS_HMTL40PX        (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
255
256
int TY_(HTMLVersion)(TidyDocImpl* doc)
257
3.02M
{
258
3.02M
    uint i;
259
3.02M
    uint j = 0;
260
3.02M
    uint score = 0;
261
3.02M
    uint vers = doc->lexer->versions;
262
3.02M
    uint dtver = doc->lexer->doctype;
263
3.02M
    TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
264
3.02M
    Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
265
3.02M
                 !cfgBool(doc, TidyHtmlOut);
266
3.02M
    Bool html4 = ((dtmode == TidyDoctypeStrict) || (dtmode == TidyDoctypeLoose) ||
267
3.02M
                  (VERS_HMTL40PX & dtver) ? yes : no);
268
3.02M
    Bool html5 = (!html4 && ((dtmode == TidyDoctypeAuto) ||
269
3.00M
                  (dtmode == TidyDoctypeHtml5)) ? yes : no);
270
271
3.02M
    if (xhtml && dtver == VERS_UNKNOWN) return XH50;
272
27.6k
    if (dtver == VERS_UNKNOWN) return HT50;
273
    /* Issue #167 - if NOT XHTML, and doctype is default VERS_HTML5, then return HT50 */
274
27.6k
    if (!xhtml && (dtver == VERS_HTML5)) return HT50;
275
    /* Issue #377 - If xhtml and (doctype == html5) and constrained vers contains XH50 return that,
276
       and really if tidy defaults to 'html5', then maybe 'auto' should also apply! */
277
27.6k
    if (xhtml && html5 && ((vers & VERS_HTML5) == XH50)) return XH50;
278
279
537k
    for (i = 0; W3C_Doctypes[i].name; ++i)
280
510k
    {
281
510k
        if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
282
510k
            (html4 && !(VERS_HMTL40PX & W3C_Doctypes[i].vers)))
283
370k
            continue;
284
285
139k
        if (vers & W3C_Doctypes[i].vers &&
286
139k
            (W3C_Doctypes[i].score < score || !score))
287
19.9k
        {
288
19.9k
            score = W3C_Doctypes[i].score;
289
19.9k
            j = i;
290
19.9k
        }
291
139k
    }
292
293
26.8k
    if (score)
294
18.4k
        return W3C_Doctypes[j].vers;
295
296
8.37k
    return VERS_UNKNOWN;
297
26.8k
}
298
299
static ctmbstr GetFPIFromVers(uint vers)
300
2.20k
{
301
2.20k
    uint i;
302
303
43.5k
    for (i = 0; W3C_Doctypes[i].name; ++i)
304
41.4k
        if (W3C_Doctypes[i].vers == vers)
305
80
            return W3C_Doctypes[i].fpi;
306
307
2.12k
    return NULL;
308
2.20k
}
309
310
static ctmbstr GetSIFromVers(uint vers)
311
0
{
312
0
    uint i;
313
314
0
    for (i = 0; W3C_Doctypes[i].name; ++i)
315
0
        if (W3C_Doctypes[i].vers == vers)
316
0
            return W3C_Doctypes[i].si;
317
318
0
    return NULL;
319
0
}
320
321
static ctmbstr GetNameFromVers(uint vers)
322
0
{
323
0
    uint i;
324
325
0
    for (i = 0; W3C_Doctypes[i].name; ++i)
326
0
        if (W3C_Doctypes[i].vers == vers)
327
0
            return W3C_Doctypes[i].name;
328
329
0
    return NULL;
330
0
}
331
332
static uint GetVersFromFPI(ctmbstr fpi)
333
2.20k
{
334
2.20k
    uint i;
335
336
43.5k
    for (i = 0; W3C_Doctypes[i].name; ++i)
337
41.4k
        if (W3C_Doctypes[i].fpi != NULL && TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0)
338
80
            return W3C_Doctypes[i].vers;
339
340
2.12k
    return 0;
341
2.20k
}
342
343
#ifdef ENABLE_DEBUG_LOG
344
#  ifndef EndBuf
345
#    define EndBuf(a)   ( a + strlen(a) )
346
#  endif
347
348
/* Issue #377 - Output diminishing version bits */
349
typedef struct tagV2S {
350
    uint bit;
351
    ctmbstr val;
352
}V2S, *PV2S;
353
354
static V2S v2s[] = {
355
    { HT20, "HT20" },
356
    { HT32, "HT32" },
357
    { H40S, "H40S" },
358
    { H40T, "H40T" },
359
    { H40F, "H40F" },
360
    { H41S, "H41S" },
361
    { H41T, "H41T" },
362
    { H41F, "H41F" },
363
    { X10S, "X10S" },
364
    { X10T, "X10T" },
365
    { X10F, "X10F" },
366
    { XH11, "XH11" },
367
    { XB10, "XB10" }, /* 4096u */
368
    /* { VERS_SUN, "VSUN" }, */
369
    /* { VERS_NETSCAPE, "VNET" }, */
370
    /* { VERS_MICROSOFT, "VMIC" }, 32768u */
371
    { VERS_XML, "VXML" }, /* 65536u */
372
        /* HTML5 */
373
    { HT50, "HT50" }, /* 131072u */
374
    { XH50, "XH50" }, /* 262144u */
375
    { 0,     0  }
376
};
377
378
/* Process the above table, adding a bit name,
379
   or '----' when not present   */
380
static char *add_vers_string( tmbstr buf, uint vers )
381
{
382
    PV2S pv2s = v2s;
383
    int len = (int)strlen(buf);
384
    while (pv2s->val) {
385
        if (vers & pv2s->bit) {
386
            if (len) {
387
                strcat(buf,"|");
388
                len++;
389
            }
390
            strcat(buf,pv2s->val);
391
            len += (int)strlen(pv2s->val);
392
            vers &= ~(pv2s->bit);
393
            if (!vers)
394
                break;
395
        } else {
396
            if (len) {
397
                strcat(buf,"|");
398
                len++;
399
            }
400
            strcat(buf,"----");
401
            len += 4;
402
403
        }
404
        pv2s++;
405
    }
406
    if (vers) { /* Should not have any here! */
407
        if (len)
408
            strcat(buf,"|");
409
        sprintf(EndBuf(buf),"%u",vers);
410
    }
411
    return buf;
412
413
}
414
415
/* Issue #377 - Show first Before: list, and then on any change
416
   Note the VERS_PROPRIETARY are exclude since they always remain */
417
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
418
{
419
    static char vcur[256];
420
    static Bool dnfirst = no;
421
    uint curr = doc->lexer->versions; /* get current */
422
    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
423
    if (curr != doc->lexer->versions) { /* only if different */
424
        if (!dnfirst) {
425
            dnfirst = yes;
426
            vcur[0] = 0;
427
            curr &= ~(VERS_PROPRIETARY);
428
            add_vers_string( vcur, curr );
429
            SPRTF("Before: %s\n", vcur);
430
        }
431
        vcur[0] = 0;
432
        curr = doc->lexer->versions;
433
        curr &= ~(VERS_PROPRIETARY);
434
        add_vers_string( vcur, curr );
435
        SPRTF("After : %s\n", vcur);
436
    }
437
}
438
#else /* !#if defined(ENABLE_DEBUG_LOG) */
439
/* everything is allowed in proprietary version of HTML */
440
/* this is handled here rather than in the tag/attr dicts */
441
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
442
1.14M
{
443
1.14M
    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
444
1.14M
}
445
#endif /* #if defined(ENABLE_DEBUG_LOG) y/n */
446
447
Bool TY_(IsWhite)(uint c)
448
208M
{
449
208M
    uint map = MAP(c);
450
451
208M
    return (map & white)!=0;
452
208M
}
453
454
Bool TY_(IsNewline)(uint c)
455
0
{
456
0
    uint map = MAP(c);
457
0
    return (map & newline)!=0;
458
0
}
459
460
Bool TY_(IsDigit)(uint c)
461
53.1k
{
462
53.1k
    uint map;
463
464
53.1k
    map = MAP(c);
465
466
53.1k
    return (map & digit)!=0;
467
53.1k
}
468
469
static Bool IsDigitHex(uint c)
470
2.52M
{
471
2.52M
    uint map;
472
473
2.52M
    map = MAP(c);
474
475
2.52M
    return (map & digithex)!=0;
476
2.52M
}
477
478
Bool TY_(IsLetter)(uint c)
479
1.81M
{
480
1.81M
    uint map;
481
482
1.81M
    map = MAP(c);
483
484
1.81M
    return (map & letter)!=0;
485
1.81M
}
486
487
Bool TY_(IsHTMLSpace)(uint c)
488
1.12M
{
489
1.12M
    return c == 0x020 || c == 0x009 || c == 0x00a || c == 0x00c || c == 0x00d;
490
1.12M
}
491
492
Bool TY_(IsNamechar)(uint c)
493
10.7M
{
494
10.7M
    uint map = MAP(c);
495
10.7M
    return (map & namechar)!=0;
496
10.7M
}
497
498
Bool TY_(IsXMLLetter)(uint c)
499
387k
{
500
387k
    return ((c >= 0x41 && c <= 0x5a) ||
501
387k
        (c >= 0x61 && c <= 0x7a) ||
502
387k
        (c >= 0xc0 && c <= 0xd6) ||
503
387k
        (c >= 0xd8 && c <= 0xf6) ||
504
387k
        (c >= 0xf8 && c <= 0xff) ||
505
387k
        (c >= 0x100 && c <= 0x131) ||
506
387k
        (c >= 0x134 && c <= 0x13e) ||
507
387k
        (c >= 0x141 && c <= 0x148) ||
508
387k
        (c >= 0x14a && c <= 0x17e) ||
509
387k
        (c >= 0x180 && c <= 0x1c3) ||
510
387k
        (c >= 0x1cd && c <= 0x1f0) ||
511
387k
        (c >= 0x1f4 && c <= 0x1f5) ||
512
387k
        (c >= 0x1fa && c <= 0x217) ||
513
387k
        (c >= 0x250 && c <= 0x2a8) ||
514
387k
        (c >= 0x2bb && c <= 0x2c1) ||
515
387k
        c == 0x386 ||
516
387k
        (c >= 0x388 && c <= 0x38a) ||
517
387k
        c == 0x38c ||
518
387k
        (c >= 0x38e && c <= 0x3a1) ||
519
387k
        (c >= 0x3a3 && c <= 0x3ce) ||
520
387k
        (c >= 0x3d0 && c <= 0x3d6) ||
521
387k
        c == 0x3da ||
522
387k
        c == 0x3dc ||
523
387k
        c == 0x3de ||
524
387k
        c == 0x3e0 ||
525
387k
        (c >= 0x3e2 && c <= 0x3f3) ||
526
387k
        (c >= 0x401 && c <= 0x40c) ||
527
387k
        (c >= 0x40e && c <= 0x44f) ||
528
387k
        (c >= 0x451 && c <= 0x45c) ||
529
387k
        (c >= 0x45e && c <= 0x481) ||
530
387k
        (c >= 0x490 && c <= 0x4c4) ||
531
387k
        (c >= 0x4c7 && c <= 0x4c8) ||
532
387k
        (c >= 0x4cb && c <= 0x4cc) ||
533
387k
        (c >= 0x4d0 && c <= 0x4eb) ||
534
387k
        (c >= 0x4ee && c <= 0x4f5) ||
535
387k
        (c >= 0x4f8 && c <= 0x4f9) ||
536
387k
        (c >= 0x531 && c <= 0x556) ||
537
387k
        c == 0x559 ||
538
387k
        (c >= 0x561 && c <= 0x586) ||
539
387k
        (c >= 0x5d0 && c <= 0x5ea) ||
540
387k
        (c >= 0x5f0 && c <= 0x5f2) ||
541
387k
        (c >= 0x621 && c <= 0x63a) ||
542
387k
        (c >= 0x641 && c <= 0x64a) ||
543
387k
        (c >= 0x671 && c <= 0x6b7) ||
544
387k
        (c >= 0x6ba && c <= 0x6be) ||
545
387k
        (c >= 0x6c0 && c <= 0x6ce) ||
546
387k
        (c >= 0x6d0 && c <= 0x6d3) ||
547
387k
        c == 0x6d5 ||
548
387k
        (c >= 0x6e5 && c <= 0x6e6) ||
549
387k
        (c >= 0x905 && c <= 0x939) ||
550
387k
        c == 0x93d ||
551
387k
        (c >= 0x958 && c <= 0x961) ||
552
387k
        (c >= 0x985 && c <= 0x98c) ||
553
387k
        (c >= 0x98f && c <= 0x990) ||
554
387k
        (c >= 0x993 && c <= 0x9a8) ||
555
387k
        (c >= 0x9aa && c <= 0x9b0) ||
556
387k
        c == 0x9b2 ||
557
387k
        (c >= 0x9b6 && c <= 0x9b9) ||
558
387k
        (c >= 0x9dc && c <= 0x9dd) ||
559
387k
        (c >= 0x9df && c <= 0x9e1) ||
560
387k
        (c >= 0x9f0 && c <= 0x9f1) ||
561
387k
        (c >= 0xa05 && c <= 0xa0a) ||
562
387k
        (c >= 0xa0f && c <= 0xa10) ||
563
387k
        (c >= 0xa13 && c <= 0xa28) ||
564
387k
        (c >= 0xa2a && c <= 0xa30) ||
565
387k
        (c >= 0xa32 && c <= 0xa33) ||
566
387k
        (c >= 0xa35 && c <= 0xa36) ||
567
387k
        (c >= 0xa38 && c <= 0xa39) ||
568
387k
        (c >= 0xa59 && c <= 0xa5c) ||
569
387k
        c == 0xa5e ||
570
387k
        (c >= 0xa72 && c <= 0xa74) ||
571
387k
        (c >= 0xa85 && c <= 0xa8b) ||
572
387k
        c == 0xa8d ||
573
387k
        (c >= 0xa8f && c <= 0xa91) ||
574
387k
        (c >= 0xa93 && c <= 0xaa8) ||
575
387k
        (c >= 0xaaa && c <= 0xab0) ||
576
387k
        (c >= 0xab2 && c <= 0xab3) ||
577
387k
        (c >= 0xab5 && c <= 0xab9) ||
578
387k
        c == 0xabd ||
579
387k
        c == 0xae0 ||
580
387k
        (c >= 0xb05 && c <= 0xb0c) ||
581
387k
        (c >= 0xb0f && c <= 0xb10) ||
582
387k
        (c >= 0xb13 && c <= 0xb28) ||
583
387k
        (c >= 0xb2a && c <= 0xb30) ||
584
387k
        (c >= 0xb32 && c <= 0xb33) ||
585
387k
        (c >= 0xb36 && c <= 0xb39) ||
586
387k
        c == 0xb3d ||
587
387k
        (c >= 0xb5c && c <= 0xb5d) ||
588
387k
        (c >= 0xb5f && c <= 0xb61) ||
589
387k
        (c >= 0xb85 && c <= 0xb8a) ||
590
387k
        (c >= 0xb8e && c <= 0xb90) ||
591
387k
        (c >= 0xb92 && c <= 0xb95) ||
592
387k
        (c >= 0xb99 && c <= 0xb9a) ||
593
387k
        c == 0xb9c ||
594
387k
        (c >= 0xb9e && c <= 0xb9f) ||
595
387k
        (c >= 0xba3 && c <= 0xba4) ||
596
387k
        (c >= 0xba8 && c <= 0xbaa) ||
597
387k
        (c >= 0xbae && c <= 0xbb5) ||
598
387k
        (c >= 0xbb7 && c <= 0xbb9) ||
599
387k
        (c >= 0xc05 && c <= 0xc0c) ||
600
387k
        (c >= 0xc0e && c <= 0xc10) ||
601
387k
        (c >= 0xc12 && c <= 0xc28) ||
602
387k
        (c >= 0xc2a && c <= 0xc33) ||
603
387k
        (c >= 0xc35 && c <= 0xc39) ||
604
387k
        (c >= 0xc60 && c <= 0xc61) ||
605
387k
        (c >= 0xc85 && c <= 0xc8c) ||
606
387k
        (c >= 0xc8e && c <= 0xc90) ||
607
387k
        (c >= 0xc92 && c <= 0xca8) ||
608
387k
        (c >= 0xcaa && c <= 0xcb3) ||
609
387k
        (c >= 0xcb5 && c <= 0xcb9) ||
610
387k
        c == 0xcde ||
611
387k
        (c >= 0xce0 && c <= 0xce1) ||
612
387k
        (c >= 0xd05 && c <= 0xd0c) ||
613
387k
        (c >= 0xd0e && c <= 0xd10) ||
614
387k
        (c >= 0xd12 && c <= 0xd28) ||
615
387k
        (c >= 0xd2a && c <= 0xd39) ||
616
387k
        (c >= 0xd60 && c <= 0xd61) ||
617
387k
        (c >= 0xe01 && c <= 0xe2e) ||
618
387k
        c == 0xe30 ||
619
387k
        (c >= 0xe32 && c <= 0xe33) ||
620
387k
        (c >= 0xe40 && c <= 0xe45) ||
621
387k
        (c >= 0xe81 && c <= 0xe82) ||
622
387k
        c == 0xe84 ||
623
387k
        (c >= 0xe87 && c <= 0xe88) ||
624
387k
        c == 0xe8a ||
625
387k
        c == 0xe8d ||
626
387k
        (c >= 0xe94 && c <= 0xe97) ||
627
387k
        (c >= 0xe99 && c <= 0xe9f) ||
628
387k
        (c >= 0xea1 && c <= 0xea3) ||
629
387k
        c == 0xea5 ||
630
387k
        c == 0xea7 ||
631
387k
        (c >= 0xeaa && c <= 0xeab) ||
632
387k
        (c >= 0xead && c <= 0xeae) ||
633
387k
        c == 0xeb0 ||
634
387k
        (c >= 0xeb2 && c <= 0xeb3) ||
635
387k
        c == 0xebd ||
636
387k
        (c >= 0xec0 && c <= 0xec4) ||
637
387k
        (c >= 0xf40 && c <= 0xf47) ||
638
387k
        (c >= 0xf49 && c <= 0xf69) ||
639
387k
        (c >= 0x10a0 && c <= 0x10c5) ||
640
387k
        (c >= 0x10d0 && c <= 0x10f6) ||
641
387k
        c == 0x1100 ||
642
387k
        (c >= 0x1102 && c <= 0x1103) ||
643
387k
        (c >= 0x1105 && c <= 0x1107) ||
644
387k
        c == 0x1109 ||
645
387k
        (c >= 0x110b && c <= 0x110c) ||
646
387k
        (c >= 0x110e && c <= 0x1112) ||
647
387k
        c == 0x113c ||
648
387k
        c == 0x113e ||
649
387k
        c == 0x1140 ||
650
387k
        c == 0x114c ||
651
387k
        c == 0x114e ||
652
387k
        c == 0x1150 ||
653
387k
        (c >= 0x1154 && c <= 0x1155) ||
654
387k
        c == 0x1159 ||
655
387k
        (c >= 0x115f && c <= 0x1161) ||
656
387k
        c == 0x1163 ||
657
387k
        c == 0x1165 ||
658
387k
        c == 0x1167 ||
659
387k
        c == 0x1169 ||
660
387k
        (c >= 0x116d && c <= 0x116e) ||
661
387k
        (c >= 0x1172 && c <= 0x1173) ||
662
387k
        c == 0x1175 ||
663
387k
        c == 0x119e ||
664
387k
        c == 0x11a8 ||
665
387k
        c == 0x11ab ||
666
387k
        (c >= 0x11ae && c <= 0x11af) ||
667
387k
        (c >= 0x11b7 && c <= 0x11b8) ||
668
387k
        c == 0x11ba ||
669
387k
        (c >= 0x11bc && c <= 0x11c2) ||
670
387k
        c == 0x11eb ||
671
387k
        c == 0x11f0 ||
672
387k
        c == 0x11f9 ||
673
387k
        (c >= 0x1e00 && c <= 0x1e9b) ||
674
387k
        (c >= 0x1ea0 && c <= 0x1ef9) ||
675
387k
        (c >= 0x1f00 && c <= 0x1f15) ||
676
387k
        (c >= 0x1f18 && c <= 0x1f1d) ||
677
387k
        (c >= 0x1f20 && c <= 0x1f45) ||
678
387k
        (c >= 0x1f48 && c <= 0x1f4d) ||
679
387k
        (c >= 0x1f50 && c <= 0x1f57) ||
680
387k
        c == 0x1f59 ||
681
387k
        c == 0x1f5b ||
682
387k
        c == 0x1f5d ||
683
387k
        (c >= 0x1f5f && c <= 0x1f7d) ||
684
387k
        (c >= 0x1f80 && c <= 0x1fb4) ||
685
387k
        (c >= 0x1fb6 && c <= 0x1fbc) ||
686
387k
        c == 0x1fbe ||
687
387k
        (c >= 0x1fc2 && c <= 0x1fc4) ||
688
387k
        (c >= 0x1fc6 && c <= 0x1fcc) ||
689
387k
        (c >= 0x1fd0 && c <= 0x1fd3) ||
690
387k
        (c >= 0x1fd6 && c <= 0x1fdb) ||
691
387k
        (c >= 0x1fe0 && c <= 0x1fec) ||
692
387k
        (c >= 0x1ff2 && c <= 0x1ff4) ||
693
387k
        (c >= 0x1ff6 && c <= 0x1ffc) ||
694
387k
        c == 0x2126 ||
695
387k
        (c >= 0x212a && c <= 0x212b) ||
696
387k
        c == 0x212e ||
697
387k
        (c >= 0x2180 && c <= 0x2182) ||
698
387k
        (c >= 0x3041 && c <= 0x3094) ||
699
387k
        (c >= 0x30a1 && c <= 0x30fa) ||
700
387k
        (c >= 0x3105 && c <= 0x312c) ||
701
387k
        (c >= 0xac00 && c <= 0xd7a3) ||
702
387k
        (c >= 0x4e00 && c <= 0x9fa5) ||
703
387k
        c == 0x3007 ||
704
387k
        (c >= 0x3021 && c <= 0x3029) ||
705
387k
        (c >= 0x4e00 && c <= 0x9fa5) ||
706
387k
        c == 0x3007 ||
707
387k
        (c >= 0x3021 && c <= 0x3029));
708
387k
}
709
710
Bool TY_(IsXMLNamechar)(uint c)
711
331k
{
712
331k
    return (TY_(IsXMLLetter)(c) ||
713
331k
        c == '.' || c == '_' ||
714
331k
        c == ':' || c == '-' ||
715
331k
        (c >= 0x300 && c <= 0x345) ||
716
331k
        (c >= 0x360 && c <= 0x361) ||
717
331k
        (c >= 0x483 && c <= 0x486) ||
718
331k
        (c >= 0x591 && c <= 0x5a1) ||
719
331k
        (c >= 0x5a3 && c <= 0x5b9) ||
720
331k
        (c >= 0x5bb && c <= 0x5bd) ||
721
331k
        c == 0x5bf ||
722
331k
        (c >= 0x5c1 && c <= 0x5c2) ||
723
331k
        c == 0x5c4 ||
724
331k
        (c >= 0x64b && c <= 0x652) ||
725
331k
        c == 0x670 ||
726
331k
        (c >= 0x6d6 && c <= 0x6dc) ||
727
331k
        (c >= 0x6dd && c <= 0x6df) ||
728
331k
        (c >= 0x6e0 && c <= 0x6e4) ||
729
331k
        (c >= 0x6e7 && c <= 0x6e8) ||
730
331k
        (c >= 0x6ea && c <= 0x6ed) ||
731
331k
        (c >= 0x901 && c <= 0x903) ||
732
331k
        c == 0x93c ||
733
331k
        (c >= 0x93e && c <= 0x94c) ||
734
331k
        c == 0x94d ||
735
331k
        (c >= 0x951 && c <= 0x954) ||
736
331k
        (c >= 0x962 && c <= 0x963) ||
737
331k
        (c >= 0x981 && c <= 0x983) ||
738
331k
        c == 0x9bc ||
739
331k
        c == 0x9be ||
740
331k
        c == 0x9bf ||
741
331k
        (c >= 0x9c0 && c <= 0x9c4) ||
742
331k
        (c >= 0x9c7 && c <= 0x9c8) ||
743
331k
        (c >= 0x9cb && c <= 0x9cd) ||
744
331k
        c == 0x9d7 ||
745
331k
        (c >= 0x9e2 && c <= 0x9e3) ||
746
331k
        c == 0xa02 ||
747
331k
        c == 0xa3c ||
748
331k
        c == 0xa3e ||
749
331k
        c == 0xa3f ||
750
331k
        (c >= 0xa40 && c <= 0xa42) ||
751
331k
        (c >= 0xa47 && c <= 0xa48) ||
752
331k
        (c >= 0xa4b && c <= 0xa4d) ||
753
331k
        (c >= 0xa70 && c <= 0xa71) ||
754
331k
        (c >= 0xa81 && c <= 0xa83) ||
755
331k
        c == 0xabc ||
756
331k
        (c >= 0xabe && c <= 0xac5) ||
757
331k
        (c >= 0xac7 && c <= 0xac9) ||
758
331k
        (c >= 0xacb && c <= 0xacd) ||
759
331k
        (c >= 0xb01 && c <= 0xb03) ||
760
331k
        c == 0xb3c ||
761
331k
        (c >= 0xb3e && c <= 0xb43) ||
762
331k
        (c >= 0xb47 && c <= 0xb48) ||
763
331k
        (c >= 0xb4b && c <= 0xb4d) ||
764
331k
        (c >= 0xb56 && c <= 0xb57) ||
765
331k
        (c >= 0xb82 && c <= 0xb83) ||
766
331k
        (c >= 0xbbe && c <= 0xbc2) ||
767
331k
        (c >= 0xbc6 && c <= 0xbc8) ||
768
331k
        (c >= 0xbca && c <= 0xbcd) ||
769
331k
        c == 0xbd7 ||
770
331k
        (c >= 0xc01 && c <= 0xc03) ||
771
331k
        (c >= 0xc3e && c <= 0xc44) ||
772
331k
        (c >= 0xc46 && c <= 0xc48) ||
773
331k
        (c >= 0xc4a && c <= 0xc4d) ||
774
331k
        (c >= 0xc55 && c <= 0xc56) ||
775
331k
        (c >= 0xc82 && c <= 0xc83) ||
776
331k
        (c >= 0xcbe && c <= 0xcc4) ||
777
331k
        (c >= 0xcc6 && c <= 0xcc8) ||
778
331k
        (c >= 0xcca && c <= 0xccd) ||
779
331k
        (c >= 0xcd5 && c <= 0xcd6) ||
780
331k
        (c >= 0xd02 && c <= 0xd03) ||
781
331k
        (c >= 0xd3e && c <= 0xd43) ||
782
331k
        (c >= 0xd46 && c <= 0xd48) ||
783
331k
        (c >= 0xd4a && c <= 0xd4d) ||
784
331k
        c == 0xd57 ||
785
331k
        c == 0xe31 ||
786
331k
        (c >= 0xe34 && c <= 0xe3a) ||
787
331k
        (c >= 0xe47 && c <= 0xe4e) ||
788
331k
        c == 0xeb1 ||
789
331k
        (c >= 0xeb4 && c <= 0xeb9) ||
790
331k
        (c >= 0xebb && c <= 0xebc) ||
791
331k
        (c >= 0xec8 && c <= 0xecd) ||
792
331k
        (c >= 0xf18 && c <= 0xf19) ||
793
331k
        c == 0xf35 ||
794
331k
        c == 0xf37 ||
795
331k
        c == 0xf39 ||
796
331k
        c == 0xf3e ||
797
331k
        c == 0xf3f ||
798
331k
        (c >= 0xf71 && c <= 0xf84) ||
799
331k
        (c >= 0xf86 && c <= 0xf8b) ||
800
331k
        (c >= 0xf90 && c <= 0xf95) ||
801
331k
        c == 0xf97 ||
802
331k
        (c >= 0xf99 && c <= 0xfad) ||
803
331k
        (c >= 0xfb1 && c <= 0xfb7) ||
804
331k
        c == 0xfb9 ||
805
331k
        (c >= 0x20d0 && c <= 0x20dc) ||
806
331k
        c == 0x20e1 ||
807
331k
        (c >= 0x302a && c <= 0x302f) ||
808
331k
        c == 0x3099 ||
809
331k
        c == 0x309a ||
810
331k
        (c >= 0x30 && c <= 0x39) ||
811
331k
        (c >= 0x660 && c <= 0x669) ||
812
331k
        (c >= 0x6f0 && c <= 0x6f9) ||
813
331k
        (c >= 0x966 && c <= 0x96f) ||
814
331k
        (c >= 0x9e6 && c <= 0x9ef) ||
815
331k
        (c >= 0xa66 && c <= 0xa6f) ||
816
331k
        (c >= 0xae6 && c <= 0xaef) ||
817
331k
        (c >= 0xb66 && c <= 0xb6f) ||
818
331k
        (c >= 0xbe7 && c <= 0xbef) ||
819
331k
        (c >= 0xc66 && c <= 0xc6f) ||
820
331k
        (c >= 0xce6 && c <= 0xcef) ||
821
331k
        (c >= 0xd66 && c <= 0xd6f) ||
822
331k
        (c >= 0xe50 && c <= 0xe59) ||
823
331k
        (c >= 0xed0 && c <= 0xed9) ||
824
331k
        (c >= 0xf20 && c <= 0xf29) ||
825
331k
        c == 0xb7 ||
826
331k
        c == 0x2d0 ||
827
331k
        c == 0x2d1 ||
828
331k
        c == 0x387 ||
829
331k
        c == 0x640 ||
830
331k
        c == 0xe46 ||
831
331k
        c == 0xec6 ||
832
331k
        c == 0x3005 ||
833
331k
        (c >= 0x3031 && c <= 0x3035) ||
834
331k
        (c >= 0x309d && c <= 0x309e) ||
835
331k
        (c >= 0x30fc && c <= 0x30fe));
836
331k
}
837
838
Bool TY_(IsUpper)(uint c)
839
11.1M
{
840
11.1M
    uint map = MAP(c);
841
842
11.1M
    return (map & uppercase)!=0;
843
11.1M
}
844
845
uint TY_(ToLower)(uint c)
846
169M
{
847
169M
    uint map = MAP(c);
848
849
169M
    if (map & uppercase)
850
8.94M
        c += 'a' - 'A';
851
852
169M
    return c;
853
169M
}
854
855
uint TY_(ToUpper)(uint c)
856
186k
{
857
186k
    uint map = MAP(c);
858
859
186k
    if (map & lowercase)
860
11.6k
        c += (uint) ('A' - 'a' );
861
862
186k
    return c;
863
186k
}
864
865
/*
866
 return last character in string
867
 this is useful when trailing quotemark
868
 is missing on an attribute
869
*/
870
static tmbchar LastChar( tmbstr str )
871
153k
{
872
153k
    if ( str && *str )
873
153k
    {
874
153k
        int n = TY_(tmbstrlen)(str);
875
153k
        return str[n-1];
876
153k
    }
877
686
    return 0;
878
153k
}
879
880
Lexer* TY_(NewLexer)( TidyDocImpl* doc )
881
17.0k
{
882
17.0k
    Lexer* lexer = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );
883
884
17.0k
    if ( lexer != NULL )
885
17.0k
    {
886
17.0k
        TidyClearMemory( lexer, sizeof(Lexer) );
887
888
17.0k
        lexer->allocator = doc->allocator;
889
17.0k
        lexer->lines = 1;
890
17.0k
        lexer->columns = 1;
891
17.0k
        lexer->state = LEX_CONTENT;
892
893
17.0k
        lexer->versions = (VERS_ALL|VERS_PROPRIETARY);
894
17.0k
        lexer->doctype = VERS_UNKNOWN;
895
17.0k
        lexer->root = &doc->root;
896
17.0k
    }
897
17.0k
    return lexer;
898
17.0k
}
899
900
static Bool EndOfInput( TidyDocImpl* doc )
901
683k
{
902
683k
    assert( doc->docIn != NULL );
903
683k
    return ( !doc->docIn->pushed && TY_(IsEOF)(doc->docIn) );
904
683k
}
905
906
void TY_(FreeLexer)( TidyDocImpl* doc )
907
34.1k
{
908
34.1k
    Lexer *lexer = doc->lexer;
909
34.1k
    if ( lexer )
910
17.0k
    {
911
17.0k
        TY_(FreeStyles)( doc );
912
913
        /* See GetToken() */
914
17.0k
        if ( lexer->pushed || lexer->itoken )
915
0
        {
916
0
            if (lexer->pushed)
917
0
                TY_(FreeNode)( doc, lexer->itoken );
918
0
            TY_(FreeNode)( doc, lexer->token );
919
0
        }
920
921
49.6k
        while ( lexer->istacksize > 0 )
922
32.6k
            TY_(PopInline)( doc, NULL );
923
924
17.0k
        TidyDocFree( doc, lexer->istack );
925
17.0k
        TidyDocFree( doc, lexer->lexbuf );
926
17.0k
        TidyDocFree( doc, lexer );
927
17.0k
        doc->lexer = NULL;
928
17.0k
    }
929
34.1k
}
930
931
/* Lexer uses bigger memory chunks than pprint as
932
** it must hold the entire input document. not just
933
** the last line or three.
934
*/
935
static void AddByte( Lexer *lexer, tmbchar ch )
936
278M
{
937
278M
    if ( lexer->lexsize + 2 >= lexer->lexlength )
938
18.4k
    {
939
18.4k
        tmbstr buf = NULL;
940
18.4k
        uint allocAmt = lexer->lexlength;
941
18.4k
        uint prev = allocAmt; /* Is. #761 */
942
36.8k
        while ( lexer->lexsize + 2 >= allocAmt )
943
18.4k
        {
944
18.4k
            if ( allocAmt == 0 )
945
17.0k
                allocAmt = 8192;
946
1.40k
            else
947
1.40k
                allocAmt *= 2;
948
18.4k
            if (allocAmt < prev) /* Is. #761 - watch for wrap - and */
949
0
                TidyPanic(lexer->allocator, "\nPanic: out of internal memory!\nDocument input too big!\n");
950
18.4k
        }
951
18.4k
        buf = (tmbstr) TidyRealloc( lexer->allocator, lexer->lexbuf, allocAmt );
952
18.4k
        if ( buf )
953
18.4k
        {
954
18.4k
          TidyClearMemory( buf + lexer->lexlength, 
955
18.4k
                           allocAmt - lexer->lexlength );
956
18.4k
          lexer->lexbuf = buf;
957
18.4k
          lexer->lexlength = allocAmt;
958
18.4k
        }
959
18.4k
    }
960
961
278M
    lexer->lexbuf[ lexer->lexsize++ ] = ch;
962
278M
    lexer->lexbuf[ lexer->lexsize ]   = '\0';  /* debug */
963
278M
}
964
965
static void ChangeChar( Lexer *lexer, tmbchar c )
966
1.07M
{
967
1.07M
    if ( lexer->lexsize > 0 )
968
1.07M
    {
969
1.07M
        lexer->lexbuf[ lexer->lexsize-1 ] = c;
970
1.07M
    }
971
1.07M
}
972
973
/* store character c as UTF-8 encoded byte stream */
974
void TY_(AddCharToLexer)( Lexer *lexer, uint c )
975
274M
{
976
274M
    int i, err, count = 0;
977
274M
    tmbchar buf[10] = {0};
978
    
979
274M
    err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
980
274M
    if (err)
981
5.50k
    {
982
        /* replacement character 0xFFFD encoded as UTF-8 */
983
5.50k
        buf[0] = (byte) 0xEF;
984
5.50k
        buf[1] = (byte) 0xBF;
985
5.50k
        buf[2] = (byte) 0xBD;
986
5.50k
        count = 3;
987
5.50k
    }
988
    
989
552M
    for ( i = 0; i < count; ++i )
990
278M
        AddByte( lexer, buf[i] );
991
274M
}
992
993
static void AddStringToLexer( Lexer *lexer, ctmbstr str )
994
0
{
995
0
    uint c;
996
997
    /*  Many (all?) compilers will sign-extend signed chars (the default) when
998
    **  converting them to unsigned integer values.  We must cast our char to
999
    **  unsigned char before assigning it to prevent this from happening.
1000
    */
1001
0
    while( 0 != (c = (unsigned char) *str++ ))
1002
0
        TY_(AddCharToLexer)( lexer, c );
1003
0
}
1004
1005
1006
static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
1007
4.86M
{
1008
4.86M
    lexer->lines = doc->docIn->curline;
1009
4.86M
    lexer->columns = doc->docIn->curcol;
1010
4.86M
}
1011
1012
/*
1013
    Issue #483
1014
    Have detected the first of a surrogate pair...
1015
    Try to find, decode the second...
1016
    Already have '&' start...
1017
*/
1018
1019
typedef enum {
1020
    SP_ok,
1021
    SP_failed,
1022
    SP_error
1023
}SPStatus;
1024
1025
static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
1026
3.75k
{
1027
3.75k
    Lexer* lexer = doc->lexer;
1028
3.75k
    uint bufSize = 32;
1029
3.75k
    uint c, ch = 0, offset = 0;
1030
3.75k
    tmbstr buf = 0;
1031
3.75k
    SPStatus status = SP_error;  /* assume failed */
1032
3.75k
    int type = 0;   /* assume numeric */
1033
3.75k
    uint fch = *pch;
1034
3.75k
    int i;  /* has to be signed due to for i >= 0 */
1035
3.75k
    if (!lexer)
1036
0
        return status;
1037
3.75k
    buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
1038
3.75k
    if (!buf)
1039
0
        return status;
1040
64.6k
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
1041
64.5k
    {
1042
64.5k
        if (c == ';')
1043
979
        {
1044
979
            break;  /* reached end of entity */
1045
979
        }
1046
63.5k
        if ((offset + 2) > bufSize)
1047
510
        {
1048
510
            bufSize *= 2;
1049
510
            buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
1050
510
            if (!buf)
1051
0
            {
1052
0
                break;
1053
0
            }
1054
510
        }
1055
63.5k
        buf[offset++] = c;  /* add char to buffer */
1056
63.5k
        if (offset == 1)
1057
3.45k
        {
1058
3.45k
            if (c != '#')   /* is a numeric entity */
1059
511
                break;
1060
3.45k
        }
1061
60.1k
        else if (offset == 2 && ((c == 'x') || (!isXml && c == 'X')))
1062
2.23k
        {
1063
2.23k
            type = 1;   /* set hex digits */
1064
2.23k
        }
1065
57.8k
        else
1066
57.8k
        {
1067
57.8k
            if (type)   /* if hex digits */
1068
53.1k
            {
1069
53.1k
                if (!IsDigitHex(c))
1070
1.49k
                    break;
1071
53.1k
            }
1072
4.68k
            else    /* if numeric */
1073
4.68k
            {
1074
4.68k
                if (!TY_(IsDigit)(c))
1075
692
                    break;
1076
4.68k
            }
1077
57.8k
        }
1078
63.5k
    }
1079
1080
3.75k
    if (c == ';')
1081
979
    {
1082
979
        int scanned;
1083
1084
979
        buf[offset] = 0;
1085
979
        if (type)
1086
687
            scanned = sscanf(buf + 2, "%x", &ch);
1087
292
        else
1088
292
            scanned = sscanf(buf + 1, "%d", &ch);
1089
1090
979
        if (scanned == 1 && TY_(IsHighSurrogate)(ch))
1091
442
        {
1092
442
            ch = TY_(CombineSurrogatePair)(ch, fch);
1093
442
            if (TY_(IsValidCombinedChar)(ch))
1094
234
            {
1095
234
                *pch = ch;  /* return combined pair value */
1096
234
                status = SP_ok; /* full success - pair used */
1097
234
            }
1098
208
            else
1099
208
            {
1100
208
                status = SP_failed; /* is one of the 32 out-of-range pairs */
1101
208
                *pch = 0xFFFD;  /* return substitute character */
1102
208
                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_PAIR, fch, ch); /* SP WARNING: -  */
1103
208
            }
1104
442
        }
1105
979
    }
1106
1107
3.75k
    if (status == SP_error)
1108
3.31k
    {
1109
        /* Error condition - can only put back all the chars */
1110
3.31k
        if (c == ';') /* if last, not added to buffer */
1111
537
            TY_(UngetChar)(c, doc->docIn);
1112
3.31k
        if (buf && offset)
1113
3.01k
        {
1114
            /* correct the order for unget - last first */
1115
63.9k
            for (i = offset - 1; i >= 0; i--)
1116
60.9k
            {
1117
60.9k
                c = buf[i];
1118
60.9k
                TY_(UngetChar)(c, doc->docIn);
1119
60.9k
            }
1120
3.01k
        }
1121
3.31k
    }
1122
1123
3.75k
    if (buf)
1124
3.75k
        TidyFree(lexer->allocator, buf);
1125
1126
3.75k
    return status;
1127
3.75k
}
1128
1129
/*
1130
  No longer attempts to insert missing ';' for unknown
1131
 entities unless one was present already, since this
1132
  gives unexpected results.
1133
1134
  For example:   <a href="something.htm?foo&bar&fred">
1135
  was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
1136
  rather than:   <a href="something.htm?foo&amp;bar&amp;fred">
1137
1138
  My thanks for Maurice Buxton for spotting this.
1139
1140
  Also Randy Waki pointed out the following case for the
1141
  04 Aug 00 version (bug #433012):
1142
  
1143
  For example:   <a href="something.htm?id=1&lang=en">
1144
  was tidied to: <a href="something.htm?id=1&lang;=en">
1145
  rather than:   <a href="something.htm?id=1&amp;lang=en">
1146
  
1147
  where "lang" is a known entity (#9001), but browsers would
1148
  misinterpret "&lang;" because it had a value > 256.
1149
  
1150
  So the case of an apparently known entity with a value > 256 and
1151
  missing a semicolon is handled specially.
1152
  
1153
  "ParseEntity" is also a bit of a misnomer - it handles entities and
1154
  numeric character references. Invalid NCR's are now reported.
1155
*/
1156
static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
1157
87.5k
{
1158
87.5k
    typedef enum
1159
87.5k
    {
1160
87.5k
        ENT_default,
1161
87.5k
        ENT_numdec,
1162
87.5k
        ENT_numhex
1163
87.5k
    } ENTState;
1164
    
1165
87.5k
    typedef Bool (*ENTfn)(uint);
1166
87.5k
    const ENTfn entFn[] = {
1167
87.5k
        TY_(IsNamechar),
1168
87.5k
        TY_(IsDigit),
1169
87.5k
        IsDigitHex
1170
87.5k
    };
1171
87.5k
    uint start;
1172
87.5k
    ENTState entState = ENT_default;
1173
87.5k
    uint charRead = 0;
1174
87.5k
    Bool semicolon = no, found = no;
1175
87.5k
    Bool isXml = cfgBool( doc, TidyXmlTags );
1176
87.5k
    Bool preserveEntities = cfgBool( doc, TidyPreserveEntities );
1177
87.5k
    uint c, ch, startcol, entver = 0;
1178
87.5k
    Lexer* lexer = doc->lexer;
1179
1180
87.5k
    start = lexer->lexsize - 1;  /* to start at "&" */
1181
87.5k
    startcol = doc->docIn->curcol - 1;
1182
1183
2.84M
    while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
1184
2.84M
    {
1185
2.84M
        if ( c == ';' )
1186
56.4k
        {
1187
56.4k
            semicolon = yes;
1188
56.4k
            break;
1189
56.4k
        }
1190
2.78M
        ++charRead;
1191
1192
2.78M
        if (charRead == 1 && c == '#')
1193
17.3k
        {
1194
17.3k
            if ( !cfgBool(doc, TidyNCR) ||
1195
17.3k
                 cfg(doc, TidyInCharEncoding) == BIG5 ||
1196
17.3k
                 cfg(doc, TidyInCharEncoding) == SHIFTJIS )
1197
0
            {
1198
0
                TY_(UngetChar)('#', doc->docIn);
1199
0
                return;
1200
0
            }
1201
1202
17.3k
            TY_(AddCharToLexer)( lexer, c );
1203
17.3k
            entState = ENT_numdec;
1204
17.3k
            continue;
1205
17.3k
        }
1206
2.76M
        else if (charRead == 2 && entState == ENT_numdec
1207
2.76M
                 && (c == 'x' || (!isXml && c == 'X')) )
1208
12.4k
        {
1209
12.4k
            TY_(AddCharToLexer)( lexer, c );
1210
12.4k
            entState = ENT_numhex;
1211
12.4k
            continue;
1212
12.4k
        }
1213
1214
2.75M
        if ( entFn[entState](c) )
1215
2.72M
        {
1216
2.72M
            TY_(AddCharToLexer)( lexer, c );
1217
2.72M
            continue;
1218
2.72M
        }
1219
1220
        /* otherwise put it back */
1221
30.5k
        TY_(UngetChar)( c, doc->docIn );
1222
30.5k
        break;
1223
2.75M
    }
1224
1225
    /* make sure entity is NULL terminated */
1226
87.5k
    lexer->lexbuf[lexer->lexsize] = '\0';
1227
1228
    /* Should contrain version to XML/XHTML if &apos; 
1229
    ** is encountered.  But this is not possible with
1230
    ** Tidy's content model bit mask.
1231
    */
1232
87.5k
    if ( TY_(tmbstrcmp)(lexer->lexbuf+start, "&apos") == 0
1233
87.5k
         && !cfgBool(doc, TidyXmlOut)
1234
87.5k
         && !lexer->isvoyager
1235
87.5k
         && !cfgBool(doc, TidyXhtmlOut)
1236
87.5k
         && !(TY_(HTMLVersion)(doc) == HT50) ) /* Issue #239 - no warning if in HTML5++ mode */
1237
0
        TY_(ReportEntityError)( doc, APOS_UNDEFINED, lexer->lexbuf+start, 39 );
1238
1239
87.5k
    if (( mode == OtherNamespace ) && ( c == ';' ))
1240
282
    {
1241
        /* #130 MathML attr and entity fix! */
1242
282
        found = yes;
1243
282
        ch = 255;
1244
282
        entver = XH50|HT50;
1245
282
        preserveEntities = yes;
1246
282
    }
1247
87.2k
    else
1248
87.2k
    {
1249
        /* Lookup entity code and version
1250
        */
1251
87.2k
        found = TY_(EntityInfo)( lexer->lexbuf+start, isXml, &ch, &entver );
1252
87.2k
    }
1253
1254
    /* Issue #483 - Deal with 'surrogate pairs' */
1255
    /* TODO: Maybe warning/error, like found a leading surrogate
1256
       but no following surrogate! Maybe should avoid outputting
1257
       invalid utf-8 for this entity - maybe substitute?  */
1258
87.5k
    if (!preserveEntities && found && TY_(IsLowSurrogate)(ch))
1259
4.26k
    {
1260
4.26k
        uint c1;
1261
4.26k
        if ((c1 = TY_(ReadChar)(doc->docIn)) == '&')
1262
3.75k
        {
1263
3.75k
            SPStatus status;
1264
            /* Have a following entity, 
1265
               so there is a chance of having a valid surrogate pair */
1266
3.75k
            c1 = ch;    /* keep first value, in case of error */
1267
3.75k
            status = GetSurrogatePair(doc, isXml, &ch);
1268
3.75k
            if (status == SP_error)
1269
3.31k
            {
1270
3.31k
                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, c1, 0); /* SP WARNING: - using substitute character */
1271
3.31k
                TY_(UngetChar)('&', doc->docIn);  /* otherwise put it back */
1272
3.31k
            }
1273
3.75k
        }
1274
506
        else
1275
506
        {
1276
            /* put this non-entity lead char back */
1277
506
            TY_(UngetChar)(c1, doc->docIn);
1278
            /* Have leading surrogate pair, with no tail */
1279
506
            TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, ch, 0); /* SP WARNING: - using substitute character */
1280
506
            ch = 0xFFFD;
1281
506
        }
1282
4.26k
    } 
1283
83.2k
    else if (!preserveEntities && found && TY_(IsHighSurrogate)(ch))
1284
292
    {
1285
        /* Have trailing surrogate pair, with no lead */
1286
292
        TY_(ReportSurrogateError)(doc, BAD_SURROGATE_LEAD, ch, 0); /* SP WARNING: - using substitute character */
1287
292
        ch = 0xFFFD;
1288
292
    }
1289
1290
    /* deal with unrecognized or invalid entities */
1291
    /* #433012 - fix by Randy Waki 17 Feb 01 */
1292
    /* report invalid NCR's - Terry Teague 01 Sep 01 */
1293
87.5k
    if ( !found || (ch >= 128 && ch <= 159) || (ch >= 256 && c != ';') )
1294
78.0k
    {
1295
        /* set error position just before offending character */
1296
78.0k
        SetLexerLocus( doc, lexer );
1297
78.0k
        lexer->columns = startcol;
1298
1299
78.0k
        if (lexer->lexsize > start + 1)
1300
66.6k
        {
1301
66.6k
            if (ch >= 128 && ch <= 159)
1302
1.31k
            {
1303
                /* invalid numeric character reference */
1304
                
1305
1.31k
                uint c1 = 0;
1306
1.31k
                int replaceMode = DISCARDED_CHAR;
1307
            
1308
                /* Always assume Win1252 in this circumstance. */
1309
1.31k
                c1 = TY_(DecodeWin1252)( ch );
1310
1311
1.31k
                if ( c1 )
1312
528
                    replaceMode = REPLACED_CHAR;
1313
                
1314
1.31k
                if ( c != ';' )  /* issue warning if not terminated by ';' */
1315
1.08k
                    TY_(ReportEntityError)( doc, MISSING_SEMICOLON_NCR,
1316
1.08k
                                            lexer->lexbuf+start, c );
1317
 
1318
1.31k
                TY_(ReportEncodingError)(doc, INVALID_NCR, ch, replaceMode == DISCARDED_CHAR);
1319
                
1320
1.31k
                if ( c1 )
1321
528
                {
1322
                    /* make the replacement */
1323
528
                    lexer->lexsize = start;
1324
528
                    TY_(AddCharToLexer)( lexer, c1 );
1325
528
                    semicolon = no;
1326
528
                }
1327
785
                else
1328
785
                {
1329
                    /* discard */
1330
785
                    lexer->lexsize = start;
1331
785
                    semicolon = no;
1332
785
               }
1333
               
1334
1.31k
            }
1335
65.3k
            else
1336
65.3k
                TY_(ReportEntityError)( doc, UNKNOWN_ENTITY,
1337
65.3k
                                        lexer->lexbuf+start, ch );
1338
1339
66.6k
            if (semicolon)
1340
52.2k
                TY_(AddCharToLexer)( lexer, ';' );
1341
66.6k
        }
1342
11.4k
        else
1343
11.4k
        {
1344
            /*\ 
1345
             *  Issue #207 - A naked & is allowed in HTML5, as an unambiguous ampersand!
1346
            \*/
1347
11.4k
            if (TY_(HTMLVersion)(doc) != HT50) 
1348
11.4k
            {
1349
11.4k
                TY_(ReportEntityError)( doc, UNESCAPED_AMPERSAND,
1350
11.4k
                                    lexer->lexbuf+start, ch );
1351
11.4k
            }
1352
11.4k
        }
1353
78.0k
    }
1354
9.47k
    else
1355
9.47k
    {
1356
9.47k
        if ( c != ';' )    /* issue warning if not terminated by ';' */
1357
6.09k
        {
1358
            /* set error position just before offending character */
1359
6.09k
            SetLexerLocus( doc, lexer );
1360
6.09k
            lexer->columns = startcol;
1361
6.09k
            TY_(ReportEntityError)( doc, MISSING_SEMICOLON, lexer->lexbuf+start, c );
1362
6.09k
        }
1363
1364
9.47k
        if (preserveEntities)
1365
282
            TY_(AddCharToLexer)( lexer, ';' );
1366
9.19k
        else
1367
9.19k
        {
1368
9.19k
            lexer->lexsize = start;
1369
9.19k
            if ( ch == 160 && (mode == Preformatted) )
1370
239
                ch = ' ';
1371
9.19k
            TY_(AddCharToLexer)( lexer, ch );
1372
1373
9.19k
            if ( ch == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
1374
0
                AddStringToLexer( lexer, "amp;" );
1375
9.19k
        }
1376
1377
        /* Detect extended vs. basic entities */
1378
9.47k
        TY_(ConstrainVersion)( doc, entver );
1379
9.47k
    }
1380
87.5k
}
1381
1382
static tmbchar ParseTagName( TidyDocImpl* doc )
1383
1.11M
{
1384
1.11M
    Lexer *lexer = doc->lexer;
1385
1.11M
    uint c = lexer->lexbuf[ lexer->txtstart ];
1386
1.11M
    Bool xml = cfgBool(doc, TidyXmlTags);
1387
1388
    /* fold case of first character in buffer */
1389
1.11M
    if (!xml && TY_(IsUpper)(c))
1390
562k
        lexer->lexbuf[lexer->txtstart] = (tmbchar) TY_(ToLower)(c);
1391
1392
10.0M
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
1393
10.0M
    {
1394
10.0M
        if ((!xml && !TY_(IsNamechar)(c)) ||
1395
10.0M
            (xml && !TY_(IsXMLNamechar)(c)))
1396
1.10M
            break;
1397
1398
        /* fold case of subsequent characters */
1399
8.95M
        if (!xml && TY_(IsUpper)(c))
1400
5.45M
             c = TY_(ToLower)(c);
1401
1402
8.95M
        TY_(AddCharToLexer)(lexer, c);
1403
8.95M
    }
1404
1405
1.11M
    lexer->txtend = lexer->lexsize;
1406
1.11M
    return (tmbchar) c;
1407
1.11M
}
1408
1409
/*
1410
  Used for elements and text nodes
1411
  element name is NULL for text nodes
1412
  start and end are offsets into lexbuf
1413
  which contains the textual content of
1414
  all elements in the parse tree.
1415
1416
  parent and content allow traversal
1417
  of the parse tree in any direction.
1418
  attributes are represented as a linked
1419
  list of AttVal nodes which hold the
1420
  strings for attribute/value pairs.
1421
*/
1422
1423
1424
Node *TY_(NewNode)(TidyAllocator* allocator, Lexer *lexer)
1425
3.33M
{
1426
3.33M
    Node* node = (Node*) TidyAlloc( allocator, sizeof(Node) );
1427
3.33M
    TidyClearMemory( node, sizeof(Node) );
1428
3.33M
    if ( lexer )
1429
3.33M
    {
1430
3.33M
        node->line = lexer->lines;
1431
3.33M
        node->column = lexer->columns;
1432
3.33M
    }
1433
3.33M
    node->type = TextNode;
1434
#if defined(ENABLE_DEBUG_LOG) && defined(DEBUG_ALLOCATION)
1435
    SPRTF("Allocated node %p\n", node );
1436
#endif
1437
3.33M
    return node;
1438
3.33M
}
1439
1440
/* used to clone heading nodes when split by an <HR> */
1441
Node *TY_(CloneNode)( TidyDocImpl* doc, Node *element )
1442
3.50k
{
1443
3.50k
    Lexer* lexer = doc->lexer;
1444
3.50k
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1445
1446
3.50k
    node->start = lexer->lexsize;
1447
3.50k
    node->end   = lexer->lexsize;
1448
1449
3.50k
    if ( element )
1450
3.50k
    {
1451
3.50k
        node->parent     = element->parent;
1452
3.50k
        node->type       = element->type;
1453
3.50k
        node->closed     = element->closed;
1454
3.50k
        node->implicit   = element->implicit;
1455
3.50k
        node->tag        = element->tag;
1456
3.50k
        node->element    = TY_(tmbstrdup)( doc->allocator, element->element );
1457
3.50k
        node->attributes = TY_(DupAttrs)( doc, element->attributes );
1458
3.50k
    }
1459
3.50k
    return node;
1460
3.50k
}
1461
1462
/* free node's attributes */
1463
void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node )
1464
3.21M
{
1465
3.52M
    while ( node->attributes )
1466
319k
    {
1467
319k
        AttVal *av = node->attributes;
1468
1469
319k
        if ( av->attribute )
1470
313k
        {
1471
313k
            if ( (attrIsID(av) || attrIsNAME(av)) &&
1472
313k
                 TY_(IsAnchorElement)(doc, node) )
1473
7.91k
            {
1474
7.91k
                TY_(RemoveAnchorByNode)( doc, av->value, node );
1475
7.91k
            }
1476
313k
        }
1477
1478
319k
        node->attributes = av->next;
1479
319k
        TY_(FreeAttribute)( doc, av );
1480
319k
    }
1481
3.21M
}
1482
1483
/* doesn't repair attribute list linkage */
1484
void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av )
1485
491k
{
1486
491k
    TY_(FreeNode)( doc, av->asp );
1487
491k
    TY_(FreeNode)( doc, av->php );
1488
491k
    TidyDocFree( doc, av->attribute );
1489
491k
    TidyDocFree( doc, av->value );
1490
491k
    TidyDocFree( doc, av );
1491
491k
}
1492
1493
/* detach attribute from node
1494
*/
1495
void TY_(DetachAttribute)( Node *node, AttVal *attr )
1496
8.86k
{
1497
8.86k
    AttVal *av, *prev = NULL;
1498
1499
18.2k
    for ( av = node->attributes; av; av = av->next )
1500
18.2k
    {
1501
18.2k
        if ( av == attr )
1502
8.86k
        {
1503
8.86k
            if ( prev )
1504
4.56k
                prev->next = attr->next;
1505
4.29k
            else
1506
4.29k
                node->attributes = attr->next;
1507
8.86k
            break;
1508
8.86k
        }
1509
9.43k
        prev = av;
1510
9.43k
    }
1511
8.86k
}
1512
1513
/* detach attribute from node then free it
1514
*/
1515
void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr )
1516
8.86k
{
1517
8.86k
    TY_(DetachAttribute)( node, attr );
1518
8.86k
    TY_(FreeAttribute)( doc, attr );
1519
8.86k
}
1520
1521
/*
1522
  Free document nodes by iterating through peers and recursing
1523
  through children. Set next to NULL before calling TY_(FreeNode)()
1524
  to avoid freeing peer nodes. Doesn't patch up prev/next links.
1525
 */
1526
void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
1527
5.38M
{
1528
#if defined(ENABLE_DEBUG_LOG) && defined(DEBUG_ALLOCATION)
1529
    /* avoid showing free of root node! */
1530
    if (node) {
1531
        if (RootNode != node->type) {
1532
            SPRTF("Free node %p\n", node);
1533
        }
1534
        else {
1535
            SPRTF("Root node %p\n", node);
1536
        }
1537
    }
1538
#endif
1539
1540
8.59M
    while ( node )
1541
3.20M
    {
1542
3.20M
        Node* next = node->next;
1543
1544
3.20M
        TY_(FreeAttrs)( doc, node );
1545
3.20M
        TY_(FreeNode)( doc, node->content );
1546
3.20M
        TidyDocFree( doc, node->element );
1547
3.20M
        if (RootNode != node->type)
1548
3.20M
            TidyDocFree( doc, node );
1549
34.1k
        else
1550
34.1k
            node->content = NULL;
1551
1552
3.20M
        node = next;
1553
3.20M
    }
1554
5.38M
}
1555
1556
Node* TY_(TextToken)( Lexer *lexer )
1557
187k
{
1558
187k
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1559
187k
    node->start = lexer->txtstart;
1560
187k
    node->end = lexer->txtend;
1561
187k
    return node;
1562
187k
}
1563
1564
/* used for creating preformatted text from Word2000 */
1565
Node *TY_(NewLineNode)( Lexer *lexer )
1566
0
{
1567
0
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1568
0
    node->start = lexer->lexsize;
1569
0
    TY_(AddCharToLexer)( lexer, (uint)'\n' );
1570
0
    node->end = lexer->lexsize;
1571
0
    return node;
1572
0
}
1573
1574
/* used for adding a &nbsp; for Word2000 */
1575
Node* TY_(NewLiteralTextNode)( Lexer *lexer, ctmbstr txt )
1576
0
{
1577
0
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1578
0
    node->start = lexer->lexsize;
1579
0
    AddStringToLexer( lexer, txt );
1580
0
    node->end = lexer->lexsize;
1581
0
    return node;
1582
0
}
1583
1584
static Node* TagToken( TidyDocImpl* doc, NodeType type )
1585
1.11M
{
1586
1.11M
    Lexer* lexer = doc->lexer;
1587
1.11M
    Node* node = TY_(NewNode)( lexer->allocator, lexer );
1588
1.11M
    node->type = type;
1589
1.11M
    node->element = TY_(tmbstrndup)( doc->allocator,
1590
1.11M
                                     lexer->lexbuf + lexer->txtstart,
1591
1.11M
                                     lexer->txtend - lexer->txtstart );
1592
1.11M
    node->start = lexer->txtstart;
1593
1.11M
    node->end = lexer->txtstart;
1594
1595
1.11M
    if ( type == StartTag || type == StartEndTag || type == EndTag )
1596
1.11M
        TY_(FindTag)(doc, node);
1597
1598
1.11M
    return node;
1599
1.11M
}
1600
1601
static Node* NewToken(TidyDocImpl* doc, NodeType type)
1602
36.8k
{
1603
36.8k
    Lexer* lexer = doc->lexer;
1604
36.8k
    Node* node = TY_(NewNode)(lexer->allocator, lexer);
1605
36.8k
    node->type = type;
1606
36.8k
    node->start = lexer->txtstart;
1607
36.8k
    node->end = lexer->txtend;
1608
36.8k
    return node;
1609
36.8k
}
1610
1611
4.67k
#define CommentToken(doc) NewToken(doc, CommentTag)
1612
#define DocTypeToken(doc) NewToken(doc, DocTypeTag)
1613
15.9k
#define PIToken(doc)      NewToken(doc, ProcInsTag)
1614
2.96k
#define AspToken(doc)     NewToken(doc, AspTag)
1615
837
#define JsteToken(doc)    NewToken(doc, JsteTag)
1616
3.95k
#define PhpToken(doc)     NewToken(doc, PhpTag)
1617
4.88k
#define XmlDeclToken(doc) NewToken(doc, XmlDecl)
1618
2.56k
#define SectionToken(doc) NewToken(doc, SectionTag)
1619
1.07k
#define CDATAToken(doc)   NewToken(doc, CDATATag)
1620
1621
void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str )
1622
0
{
1623
0
    byte c;
1624
0
    while(0 != (c = *str++) ) {
1625
        /*\
1626
         *  Issue #286
1627
         *  Previously this used TY_(AddCharToLexer)( lexer, c );
1628
         *  which uses err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
1629
         *  But this is transferring already 'translated' data from an
1630
         *  internal location to the lexer, so should use AddByte()
1631
        \*/
1632
0
        AddByte( lexer, c );
1633
0
    }
1634
0
}
1635
1636
/*
1637
void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
1638
{
1639
    byte c;
1640
    int ix;
1641
1642
    for ( ix=0; ix < len && (c = *str++); ++ix )
1643
        TY_(AddCharToLexer)(lexer, c);
1644
}
1645
*/
1646
1647
/* find doctype element */
1648
Node *TY_(FindDocType)( TidyDocImpl* doc )
1649
18.0k
{
1650
18.0k
    Node* node;
1651
18.0k
    for ( node = (doc ? doc->root.content : NULL);
1652
20.0k
          node && node->type != DocTypeTag; 
1653
18.0k
          node = node->next )
1654
2.02k
        /**/;
1655
18.0k
    return node;
1656
18.0k
}
1657
1658
/* find parent container element */
1659
Node* TY_(FindContainer)( Node* node )
1660
0
{
1661
0
    for ( node = (node ? node->parent : NULL);
1662
0
          node && TY_(nodeHasCM)(node, CM_INLINE);
1663
0
          node = node->parent )
1664
0
        /**/;
1665
1666
0
    return node;
1667
0
}
1668
1669
1670
/* find html element */
1671
Node *TY_(FindHTML)( TidyDocImpl* doc )
1672
58.3k
{
1673
58.3k
    Node *node;
1674
58.3k
    for ( node = (doc ? doc->root.content : NULL);
1675
109k
          node && !nodeIsHTML(node); 
1676
58.3k
          node = node->next )
1677
51.4k
        /**/;
1678
1679
58.3k
    return node;
1680
58.3k
}
1681
1682
/* find XML Declaration */
1683
Node *TY_(FindXmlDecl)(TidyDocImpl* doc)
1684
1.14k
{
1685
1.14k
    Node *node;
1686
1.14k
    for ( node = (doc ? doc->root.content : NULL);
1687
1.41k
          node && !(node->type == XmlDecl);
1688
1.14k
          node = node->next )
1689
269
        /**/;
1690
1691
1.14k
    return node;
1692
1.14k
}
1693
1694
1695
Node *TY_(FindHEAD)( TidyDocImpl* doc )
1696
40.7k
{
1697
40.7k
    Node *node = TY_(FindHTML)( doc );
1698
1699
40.7k
    if ( node )
1700
40.7k
    {
1701
40.7k
        for ( node = node->content;
1702
41.2k
              node && !nodeIsHEAD(node); 
1703
40.7k
              node = node->next )
1704
538
            /**/;
1705
40.7k
    }
1706
1707
40.7k
    return node;
1708
40.7k
}
1709
1710
Node *TY_(FindTITLE)(TidyDocImpl* doc)
1711
17.0k
{
1712
17.0k
    Node *node = TY_(FindHEAD)(doc);
1713
1714
17.0k
    if (node)
1715
17.0k
        for (node = node->content;
1716
49.3k
             node && !nodeIsTITLE(node);
1717
32.3k
             node = node->next) {}
1718
1719
17.0k
    return node;
1720
17.0k
}
1721
1722
Node *TY_(FindBody)( TidyDocImpl* doc )
1723
16.2k
{
1724
16.2k
    Node *node = ( doc ? doc->root.content : NULL );
1725
1726
20.5k
    while ( node && !nodeIsHTML(node) )
1727
4.32k
        node = node->next;
1728
1729
16.2k
    if (node == NULL)
1730
0
        return NULL;
1731
1732
16.2k
    node = node->content;
1733
38.7k
    while ( node && !nodeIsBODY(node) && !nodeIsFRAMESET(node) )
1734
22.5k
        node = node->next;
1735
1736
16.2k
    if ( node && nodeIsFRAMESET(node) )
1737
7.42k
    {
1738
7.42k
        node = node->content;
1739
9.98k
        while ( node && !nodeIsNOFRAMES(node) )
1740
2.56k
            node = node->next;
1741
1742
7.42k
        if ( node )
1743
6.53k
        {
1744
6.53k
            node = node->content;
1745
7.03k
            while ( node && !nodeIsBODY(node) )
1746
494
                node = node->next;
1747
6.53k
        }
1748
7.42k
    }
1749
1750
16.2k
    return node;
1751
16.2k
}
1752
1753
/* add meta element for Tidy */
1754
Bool TY_(AddGenerator)( TidyDocImpl* doc )
1755
0
{
1756
0
    AttVal *attval;
1757
0
    Node *node;
1758
0
    Node *head = TY_(FindHEAD)( doc );
1759
0
    tmbchar buf[256];
1760
    
1761
0
    if (head)
1762
0
    {
1763
0
#ifdef PLATFORM_NAME
1764
0
        TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 for "PLATFORM_NAME" version %s",
1765
0
                         tidyLibraryVersion());
1766
#else
1767
        TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 version %s", tidyLibraryVersion());
1768
#endif
1769
1770
0
        for ( node = head->content; node; node = node->next )
1771
0
        {
1772
0
            if ( nodeIsMETA(node) )
1773
0
            {
1774
0
                attval = TY_(AttrGetById)(node, TidyAttr_NAME);
1775
1776
0
                if (AttrValueIs(attval, "generator"))
1777
0
                {
1778
0
                    attval = TY_(AttrGetById)(node, TidyAttr_CONTENT);
1779
1780
0
                    if (AttrHasValue(attval) &&
1781
0
                        TY_(tmbstrncasecmp)(attval->value, "HTML Tidy", 9) == 0)
1782
0
                    {
1783
                        /* update the existing content to reflect the */
1784
                        /* actual version of Tidy currently being used */
1785
                        
1786
0
                        TidyDocFree(doc, attval->value);
1787
0
                        attval->value = TY_(tmbstrdup)(doc->allocator, buf);
1788
0
                        return no;
1789
0
                    }
1790
0
                }
1791
0
            }
1792
0
        }
1793
1794
0
        if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
1795
0
        {
1796
0
            node = TY_(InferredTag)(doc, TidyTag_META);
1797
0
            TY_(AddAttribute)( doc, node, "name", "generator" );
1798
0
            TY_(AddAttribute)( doc, node, "content", buf );
1799
0
            TY_(InsertNodeAtStart)( head, node );
1800
0
            return yes;
1801
0
        }
1802
0
    }
1803
1804
0
    return no;
1805
0
}
1806
1807
/*\ examine <!DOCTYPE ...> to identify version 
1808
 *  Issue #167 and #169
1809
 *   If HTML5
1810
 *        <!DOCTYPE html>
1811
 *       <!DOCTYPE html SYSTEM "about:legacy-compat">
1812
 *   else others
1813
\*/
1814
static uint FindGivenVersion( TidyDocImpl* doc, Node* doctype )
1815
10.9k
{
1816
10.9k
    AttVal * fpi = TY_(GetAttrByName)(doctype, "PUBLIC");
1817
10.9k
    uint vers;
1818
1819
10.9k
    if (!fpi || !fpi->value) 
1820
8.74k
    {
1821
        /*\
1822
         * Is. #815 - change to case-insensitive test
1823
         * See REC: https://www.w3.org/TR/html5/syntax.html#the-doctype
1824
        \*/
1825
8.74k
        if (doctype->element && (TY_(tmbstrcasecmp)(doctype->element,"html") == 0))
1826
105
        {
1827
105
            return VERS_HTML5;  /* TODO: do we need to check MORE? */
1828
105
        }
1829
        /* TODO: Consider warning, error message */
1830
8.63k
        return VERS_UNKNOWN;
1831
8.74k
    }
1832
2.20k
    vers = GetVersFromFPI(fpi->value);
1833
1834
2.20k
    if (VERS_XHTML & vers)
1835
80
    {
1836
80
        TY_(SetOptionBool)(doc, TidyXmlOut, yes);
1837
80
        TY_(SetOptionBool)(doc, TidyXhtmlOut, yes);
1838
80
        doc->lexer->isvoyager = yes;
1839
80
    }
1840
1841
    /* todo: add a warning if case does not match? */
1842
2.20k
    TidyDocFree(doc, fpi->value);
1843
2.20k
    fpi->value = TY_(tmbstrdup)(doc->allocator, GetFPIFromVers(vers));
1844
1845
2.20k
    return vers;
1846
10.9k
}
1847
1848
/* return guessed version */
1849
uint TY_(ApparentVersion)( TidyDocImpl* doc )
1850
0
{
1851
0
    if ((doc->lexer->doctype == XH11 ||
1852
0
         doc->lexer->doctype == XB10) &&
1853
0
        (doc->lexer->versions & doc->lexer->doctype))
1854
0
        return doc->lexer->doctype;
1855
0
    else
1856
0
        return TY_(HTMLVersion)(doc);
1857
0
}
1858
1859
ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool ARG_UNUSED(isXhtml) )
1860
0
{
1861
0
    ctmbstr name = GetNameFromVers(vers);
1862
0
    return name;
1863
0
}
1864
1865
uint TY_(HTMLVersionNumberFromCode)( uint vers )
1866
0
{
1867
0
    uint i;
1868
1869
0
    for (i = 0; W3C_Doctypes[i].name; ++i)
1870
0
        if (W3C_Doctypes[i].vers == vers)
1871
0
            return W3C_Doctypes[i].vers_out;
1872
1873
0
    return VERS_UNKNOWN;
1874
0
}
1875
1876
Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc )
1877
0
{
1878
0
    Bool isXhtml = doc->lexer->isvoyager;
1879
0
    Node* doctype;
1880
    
1881
    /* Do not warn in XHTML mode */
1882
0
    if ( isXhtml )
1883
0
        return no;
1884
1885
    /* Do not warn if emitted doctype is proprietary */
1886
0
    if ( TY_(HTMLVersionNameFromCode)(doc->lexer->versionEmitted, isXhtml ) == NULL )
1887
0
        return no;
1888
1889
    /* Do not warn if no SI is possible */
1890
0
    if ( GetSIFromVers(doc->lexer->versionEmitted) == NULL )
1891
0
        return no;
1892
1893
0
    if ( (doctype = TY_(FindDocType)( doc )) != NULL
1894
0
         && TY_(GetAttrByName)(doctype, "SYSTEM") == NULL )
1895
0
        return yes;
1896
1897
0
    return no;
1898
0
}
1899
1900
1901
/* Put DOCTYPE declaration between the
1902
** <?xml version "1.0" ... ?> declaration, if any,
1903
** and the <html> tag.  Should also work for any comments, 
1904
** etc. that may precede the <html> tag.
1905
*/
1906
1907
static Node* NewDocTypeNode( TidyDocImpl* doc )
1908
0
{
1909
0
    Node* doctype = NULL;
1910
0
    Node* html = TY_(FindHTML)( doc );
1911
1912
0
    if ( !html )
1913
0
        return NULL;
1914
1915
0
    doctype = TY_(NewNode)( doc->allocator, NULL );
1916
0
    doctype->type = DocTypeTag;
1917
0
    TY_(InsertNodeBeforeElement)(html, doctype);
1918
0
    return doctype;
1919
0
}
1920
1921
Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc )
1922
0
{
1923
0
    Lexer *lexer = doc->lexer;
1924
0
    Node *doctype = TY_(FindDocType)( doc );
1925
0
    TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
1926
0
    ctmbstr pub = "PUBLIC";
1927
0
    ctmbstr sys = "SYSTEM";
1928
1929
0
    lexer->versionEmitted = TY_(ApparentVersion)( doc );
1930
1931
0
    if (dtmode == TidyDoctypeOmit)
1932
0
    {
1933
0
        if (doctype)
1934
0
            TY_(DiscardElement)(doc, doctype);
1935
0
        return yes;
1936
0
    }
1937
1938
0
    if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype))
1939
0
        return no;
1940
1941
0
    if (!doctype)
1942
0
    {
1943
0
        doctype = NewDocTypeNode(doc);
1944
0
        doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
1945
0
    }
1946
0
    else
1947
0
    {
1948
0
        doctype->element = TY_(tmbstrtolower)(doctype->element);
1949
0
    }
1950
1951
0
    switch(dtmode)
1952
0
    {
1953
0
    case TidyDoctypeHtml5:
1954
        /* HTML5 */
1955
0
        TY_(RepairAttrValue)(doc, doctype, pub, NULL);
1956
0
        TY_(RepairAttrValue)(doc, doctype, sys, NULL);
1957
0
        lexer->versionEmitted = XH50;
1958
0
        break;
1959
0
    case TidyDoctypeStrict:
1960
        /* XHTML 1.0 Strict */
1961
0
        TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
1962
0
        TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
1963
0
        lexer->versionEmitted = X10S;
1964
0
        break;
1965
0
    case TidyDoctypeLoose:
1966
        /* XHTML 1.0 Transitional */
1967
0
        TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
1968
0
        TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
1969
0
        lexer->versionEmitted = X10T;
1970
0
        break;
1971
0
    case TidyDoctypeUser:
1972
        /* user defined document type declaration */
1973
0
        TY_(RepairAttrValue)(doc, doctype, pub, cfgStr(doc, TidyDoctype));
1974
0
        TY_(RepairAttrValue)(doc, doctype, sys, "");
1975
0
        break;
1976
0
    case TidyDoctypeAuto:
1977
0
        if (lexer->doctype == VERS_UNKNOWN || lexer->doctype == VERS_HTML5) {
1978
0
          lexer->versionEmitted = XH50;
1979
0
          return yes;
1980
0
        }
1981
0
        else if (lexer->versions & XH11 && lexer->doctype == XH11)
1982
0
        {
1983
0
            if (!TY_(GetAttrByName)(doctype, sys))
1984
0
                TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
1985
0
            lexer->versionEmitted = XH11;
1986
0
            return yes;
1987
0
        }
1988
0
        else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40))
1989
0
        {
1990
0
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(XH11));
1991
0
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
1992
0
            lexer->versionEmitted = XH11;
1993
0
        }
1994
0
        else if (lexer->versions & XB10 && lexer->doctype == XB10)
1995
0
        {
1996
0
            if (!TY_(GetAttrByName)(doctype, sys))
1997
0
                TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XB10));
1998
0
            lexer->versionEmitted = XB10;
1999
0
            return yes;
2000
0
        }
2001
0
        else if (lexer->versions & VERS_HTML40_STRICT)
2002
0
        {
2003
0
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
2004
0
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
2005
0
            lexer->versionEmitted = X10S;
2006
0
        }
2007
0
        else if (lexer->versions & VERS_FRAMESET)
2008
0
        {
2009
0
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10F));
2010
0
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10F));
2011
0
            lexer->versionEmitted = X10F;
2012
0
        }
2013
0
        else if (lexer->versions & VERS_LOOSE)
2014
0
        {
2015
0
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
2016
0
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
2017
0
            lexer->versionEmitted = X10T;
2018
0
        }
2019
0
        else if (lexer->versions & VERS_HTML5)
2020
0
        {
2021
            /*\
2022
             *  Issue #273 - If still a html5/xhtml5 bit
2023
             *  existing, that is the 'ConstrainVersion' has
2024
             *  not eliminated all HTML5, then nothing to do here.
2025
             *  Certainly do **not** delete the DocType node!
2026
             *  see: http://www.w3.org/QA/Tips/Doctype
2027
            \*/
2028
0
        }
2029
0
        else
2030
0
        {
2031
0
            if (doctype)
2032
0
                TY_(DiscardElement)(doc, doctype);
2033
0
            return no;
2034
0
        }
2035
0
        break;
2036
0
    case TidyDoctypeOmit:
2037
0
        assert(0);
2038
0
        break;
2039
0
    }
2040
2041
0
    return no;
2042
0
}
2043
2044
/* fixup doctype if missing */
2045
Bool TY_(FixDocType)( TidyDocImpl* doc )
2046
0
{
2047
0
    Lexer* lexer = doc->lexer;
2048
0
    Node* doctype = TY_(FindDocType)( doc );
2049
0
    uint dtmode = cfg( doc, TidyDoctypeMode );
2050
0
    uint guessed = VERS_UNKNOWN;
2051
0
    Bool hadSI = no;
2052
2053
    /* Issue #167 - found doctype, and doctype is default VERS_HTML5, set VERS_HTML5 and return yes */
2054
0
    if (doctype && (dtmode == TidyDoctypeAuto) &&
2055
0
        (lexer->doctype == VERS_HTML5) )
2056
0
    {
2057
        /* The version emitted cannot be a composite value! */
2058
0
        lexer->versionEmitted = HT50;
2059
0
        return yes;
2060
0
    }
2061
0
    if (dtmode == TidyDoctypeAuto &&
2062
0
        lexer->versions & lexer->doctype &&
2063
0
        !(VERS_XHTML & lexer->doctype && !lexer->isvoyager)
2064
0
        && TY_(FindDocType)(doc))
2065
0
    {
2066
0
        lexer->versionEmitted = lexer->doctype;
2067
0
        return yes;
2068
0
    }
2069
2070
0
    if (dtmode == TidyDoctypeOmit)
2071
0
    {
2072
0
        if (doctype)
2073
0
            TY_(DiscardElement)( doc, doctype );
2074
0
        lexer->versionEmitted = TY_(ApparentVersion)( doc );
2075
0
        return yes;
2076
0
    }
2077
2078
0
    if (cfgBool(doc, TidyXmlOut))
2079
0
        return yes;
2080
2081
0
    if (doctype)
2082
0
        hadSI = TY_(GetAttrByName)(doctype, "SYSTEM") != NULL;
2083
2084
0
    if ((dtmode == TidyDoctypeStrict ||
2085
0
         dtmode == TidyDoctypeLoose) && doctype)
2086
0
    {
2087
0
        TY_(DiscardElement)(doc, doctype);
2088
0
        doctype = NULL;
2089
0
    }
2090
2091
0
    switch (dtmode)
2092
0
    {
2093
0
    case TidyDoctypeHtml5:
2094
0
        guessed = HT50;
2095
0
        break;
2096
0
    case TidyDoctypeStrict:
2097
0
        guessed = H41S;
2098
0
        break;
2099
0
    case TidyDoctypeLoose:
2100
0
        guessed = H41T;
2101
0
        break;
2102
0
    case TidyDoctypeAuto:
2103
0
        guessed = TY_(HTMLVersion)(doc);
2104
0
        break;
2105
0
    }
2106
2107
0
    lexer->versionEmitted = guessed;
2108
0
    if (guessed == VERS_UNKNOWN)
2109
0
        return no;
2110
2111
0
    if (doctype)
2112
0
    {
2113
0
        doctype->element = TY_(tmbstrtolower)(doctype->element);
2114
0
    }
2115
0
    else
2116
0
    {
2117
0
        doctype = NewDocTypeNode(doc);
2118
0
        doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
2119
0
    }
2120
2121
0
    TY_(RepairAttrValue)(doc, doctype, "PUBLIC", GetFPIFromVers(guessed));
2122
2123
0
    if (hadSI)
2124
0
        TY_(RepairAttrValue)(doc, doctype, "SYSTEM", GetSIFromVers(guessed));
2125
2126
0
    return yes;
2127
0
}
2128
2129
/* ensure XML document starts with <?xml version="1.0"?> */
2130
/* add encoding attribute if not using ASCII or UTF-8 output */
2131
Bool TY_(FixXmlDecl)( TidyDocImpl* doc )
2132
0
{
2133
0
    Node* xml;
2134
0
    AttVal *version, *encoding;
2135
0
    Lexer*lexer = doc->lexer;
2136
0
    Node* root = &doc->root;
2137
2138
0
    if ( root->content && root->content->type == XmlDecl )
2139
0
    {
2140
0
        xml = root->content;
2141
0
    }
2142
0
    else
2143
0
    {
2144
0
        xml = TY_(NewNode)(lexer->allocator, lexer);
2145
0
        xml->type = XmlDecl;
2146
0
        if ( root->content )
2147
0
            TY_(InsertNodeBeforeElement)(root->content, xml);
2148
0
        else
2149
0
            root->content = xml;
2150
0
    }
2151
2152
0
    version = TY_(GetAttrByName)(xml, "version");
2153
0
    encoding = TY_(GetAttrByName)(xml, "encoding");
2154
2155
    /*
2156
      We need to insert a check if declared encoding 
2157
      and output encoding mismatch and fix the XML
2158
      declaration accordingly!!!
2159
    */
2160
2161
0
    if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 )
2162
0
    {
2163
0
        ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
2164
0
        if ( enc )
2165
0
            TY_(AddAttribute)( doc, xml, "encoding", enc );
2166
0
    }
2167
2168
0
    if ( version == NULL )
2169
0
        TY_(AddAttribute)( doc, xml, "version", "1.0" );
2170
0
    return yes;
2171
0
}
2172
2173
Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id)
2174
238k
{
2175
238k
    Lexer *lexer = doc->lexer;
2176
238k
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
2177
238k
    const Dict* dict = TY_(LookupTagDef)(id);
2178
2179
238k
    assert( dict != NULL );
2180
2181
238k
    node->type = StartTag;
2182
238k
    node->implicit = yes;
2183
238k
    node->element = TY_(tmbstrdup)(doc->allocator, dict->name);
2184
238k
    node->tag = dict;
2185
238k
    node->start = lexer->txtstart;
2186
238k
    node->end = lexer->txtend;
2187
2188
238k
    return node;
2189
238k
}
2190
2191
static Bool ExpectsContent(Node *node)
2192
931k
{
2193
931k
    if (node->type != StartTag)
2194
13.7k
        return no;
2195
2196
    /* unknown element? */
2197
917k
    if (node->tag == NULL)
2198
162k
        return yes;
2199
2200
754k
    if (node->tag->model & CM_EMPTY)
2201
67.6k
        return no;
2202
2203
687k
    return yes;
2204
754k
}
2205
2206
/*
2207
  create a text node for the contents of
2208
  a CDATA element like style or script
2209
  which ends with </foo> for some foo.
2210
*/
2211
2212
typedef enum
2213
{
2214
    CDATA_INTERMEDIATE,
2215
    CDATA_STARTTAG,
2216
    CDATA_ENDTAG
2217
} CDATAState;
2218
2219
static Node *GetCDATA( TidyDocImpl* doc, Node *container )
2220
3.41k
{
2221
3.41k
    Lexer* lexer = doc->lexer;
2222
3.41k
    uint start = 0;
2223
3.41k
    int nested = 0;
2224
3.41k
    CDATAState state = CDATA_INTERMEDIATE;
2225
3.41k
    uint i;
2226
3.41k
    Bool isEmpty = yes;
2227
3.41k
    Bool matches = no;
2228
3.41k
    uint c;
2229
3.41k
    Bool hasSrc = (TY_(AttrGetById)(container, TidyAttr_SRC) != NULL) ? yes : no;
2230
    /*\ Issue #65 (1642186) and #280 - is script or style, and the option on
2231
     *  If yes, then avoid incrementing nested...
2232
    \*/
2233
3.41k
    Bool nonested = ((nodeIsSCRIPT(container) || (nodeIsSTYLE(container))) && 
2234
3.41k
        cfgBool(doc, TidySkipNested)) ? yes : no;
2235
2236
3.41k
    SetLexerLocus( doc, lexer );
2237
3.41k
    lexer->waswhite = no;
2238
3.41k
    lexer->txtstart = lexer->txtend = lexer->lexsize;
2239
2240
    /* seen start tag, look for matching end tag */
2241
13.2M
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
2242
13.2M
    {
2243
13.2M
        TY_(AddCharToLexer)(lexer, c);
2244
13.2M
        lexer->txtend = lexer->lexsize;
2245
2246
13.2M
        if (state == CDATA_INTERMEDIATE)
2247
13.0M
        {
2248
13.0M
            if (c != '<')
2249
13.0M
            {
2250
13.0M
                if (isEmpty && !TY_(IsWhite)(c))
2251
990
                    isEmpty = no;
2252
13.0M
                continue;
2253
13.0M
            }
2254
2255
22.6k
            c = TY_(ReadChar)(doc->docIn);
2256
2257
22.6k
            if (TY_(IsLetter)(c))
2258
10.7k
            {
2259
                /* <head><script src=foo><meta name=foo content=bar>*/
2260
10.7k
                if (hasSrc && isEmpty && nodeIsSCRIPT(container))
2261
1.00k
                {
2262
                    /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
2263
1.00k
                    lexer->lexsize = lexer->txtstart;
2264
1.00k
                    TY_(UngetChar)(c, doc->docIn);
2265
1.00k
                    TY_(UngetChar)('<', doc->docIn);
2266
1.00k
                    return NULL;
2267
1.00k
                }
2268
9.75k
                TY_(AddCharToLexer)(lexer, c);
2269
9.75k
                start = lexer->lexsize - 1;
2270
9.75k
                state = CDATA_STARTTAG;
2271
9.75k
            }
2272
11.9k
            else if (c == '/')
2273
7.19k
            {
2274
7.19k
                TY_(AddCharToLexer)(lexer, c);
2275
2276
7.19k
                c = TY_(ReadChar)(doc->docIn);
2277
                
2278
7.19k
                if (!TY_(IsLetter)(c))
2279
1.26k
                {
2280
1.26k
                    TY_(UngetChar)(c, doc->docIn);
2281
1.26k
                    continue;
2282
1.26k
                }
2283
5.93k
                TY_(UngetChar)(c, doc->docIn);
2284
2285
5.93k
                start = lexer->lexsize;
2286
5.93k
                state = CDATA_ENDTAG;
2287
5.93k
            }
2288
4.72k
            else if (c == '\\')
2289
1.72k
            {
2290
                /* recognize document.write("<script><\/script>") */
2291
1.72k
                TY_(AddCharToLexer)(lexer, c);
2292
2293
1.72k
                c = TY_(ReadChar)(doc->docIn);
2294
2295
1.72k
                if (c != '/')
2296
375
                {
2297
375
                    TY_(UngetChar)(c, doc->docIn);
2298
375
                    continue;
2299
375
                }
2300
2301
1.34k
                TY_(AddCharToLexer)(lexer, c);
2302
2303
1.34k
                if (nonested) {
2304
                    /*\ 
2305
                     *  Issue #65 - for version 5.1.14.EXP2
2306
                     *  If the nonested option is ON then the <script> 
2307
                     *  tag did not bump nested, so no need to treat this as 
2308
                     *  an end tag just to decrease nested, just continue!
2309
                    \*/
2310
204
                    continue;
2311
204
                }
2312
2313
1.14k
                c = TY_(ReadChar)(doc->docIn);
2314
                
2315
1.14k
                if (!TY_(IsLetter)(c))
2316
516
                {
2317
516
                    TY_(UngetChar)(c, doc->docIn);
2318
516
                    continue;
2319
516
                }
2320
628
                TY_(UngetChar)(c, doc->docIn);
2321
2322
628
                start = lexer->lexsize;
2323
628
                state = CDATA_ENDTAG;
2324
628
            }
2325
3.00k
            else
2326
3.00k
            {
2327
3.00k
                TY_(UngetChar)(c, doc->docIn);
2328
3.00k
            }
2329
22.6k
        }
2330
        /* '<' + Letter found */
2331
259k
        else if (state == CDATA_STARTTAG)
2332
26.8k
        {
2333
26.8k
            if (TY_(IsLetter)(c))
2334
17.1k
                continue;
2335
2336
9.70k
            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
2337
9.70k
                                          TY_(tmbstrlen)(container->element)) == 0;
2338
9.70k
            if (matches && !nonested)
2339
1.24k
                nested++;
2340
2341
9.70k
            state = CDATA_INTERMEDIATE;
2342
9.70k
        }
2343
        /* '<' + '/' + Letter found */
2344
232k
        else if (state == CDATA_ENDTAG)
2345
232k
        {
2346
232k
            if (TY_(IsLetter)(c))
2347
225k
                continue;
2348
2349
6.55k
            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
2350
6.55k
                                          TY_(tmbstrlen)(container->element)) == 0;
2351
2352
6.55k
            if (isEmpty && !matches)
2353
945
            {
2354
                /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
2355
2356
61.2k
                for (i = lexer->lexsize - 1; i >= start; --i)
2357
60.2k
                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
2358
945
                TY_(UngetChar)('/', doc->docIn);
2359
945
                TY_(UngetChar)('<', doc->docIn);
2360
945
                break;
2361
945
            }
2362
2363
5.60k
            if (matches && nested-- <= 0)
2364
1.05k
            {
2365
49.1k
                for (i = lexer->lexsize - 1; i >= start; --i)
2366
48.0k
                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
2367
1.05k
                TY_(UngetChar)('/', doc->docIn);
2368
1.05k
                TY_(UngetChar)('<', doc->docIn);
2369
1.05k
                lexer->lexsize -= (lexer->lexsize - start) + 2;
2370
1.05k
                break;
2371
1.05k
            }
2372
4.55k
            else if (lexer->lexbuf[start - 2] != '\\')
2373
3.92k
            {
2374
                /* if the end tag is not already escaped using backslash */
2375
3.92k
                SetLexerLocus( doc, lexer );
2376
3.92k
                lexer->columns -= 3;
2377
2378
                /*\ if javascript insert backslash before / 
2379
                 *  Issue #348 - Add option, escape-scripts, to skip
2380
                \*/
2381
3.92k
                if ((TY_(IsJavaScript)(container)) && cfgBool(doc, TidyEscapeScripts) &&
2382
3.92k
                    !TY_(IsHTML5Mode)(doc) )    /* Is #700 - This only applies to legacy html4 mode */
2383
413
                {
2384
                    /* Issue #281 - only warn if adding the escape! */
2385
413
                    TY_(Report)(doc, NULL, NULL, BAD_CDATA_CONTENT);
2386
2387
111k
                    for (i = lexer->lexsize; i > start-1; --i)
2388
111k
                        lexer->lexbuf[i] = lexer->lexbuf[i-1];
2389
2390
413
                    lexer->lexbuf[start-1] = '\\';
2391
413
                    lexer->lexsize++;
2392
413
                }
2393
3.92k
            }
2394
4.55k
            state = CDATA_INTERMEDIATE;
2395
4.55k
        }
2396
13.2M
    }
2397
2.41k
    if (isEmpty)
2398
1.42k
        lexer->lexsize = lexer->txtstart = lexer->txtend;
2399
990
    else
2400
990
        lexer->txtend = lexer->lexsize;
2401
2402
2.41k
    if (c == EndOfStream)
2403
416
        TY_(Report)(doc, container, NULL, MISSING_ENDTAG_FOR );
2404
2405
2.41k
    return TY_(TextToken)(lexer);
2406
3.41k
}
2407
2408
void TY_(UngetToken)( TidyDocImpl* doc )
2409
990k
{
2410
990k
    doc->lexer->pushed = yes;
2411
990k
}
2412
2413
#if defined(ENABLE_DEBUG_LOG)
2414
#  define CondReturnTextNode(doc, skip) \
2415
            if (lexer->txtend > lexer->txtstart) { \
2416
                Node *_node = TY_(TextToken)(lexer); \
2417
                lexer->token = _node; \
2418
                GTDBG(doc,"text_node",_node); \
2419
                return _node; \
2420
            }
2421
2422
#else
2423
#  define CondReturnTextNode(doc, skip) \
2424
1.09M
            if (lexer->txtend > lexer->txtstart) \
2425
1.09M
            { \
2426
167k
                lexer->token = TY_(TextToken)(lexer); \
2427
167k
                return lexer->token; \
2428
167k
            }
2429
#endif
2430
2431
/*
2432
  modes for GetToken()
2433
2434
  MixedContent   -- for elements which don't accept PCDATA
2435
  Preformatted   -- white space preserved as is
2436
  IgnoreMarkup   -- for CDATA elements such as script, style
2437
*/
2438
static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode );
2439
2440
Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode )
2441
5.55M
{
2442
5.55M
    Node *node;
2443
5.55M
    Lexer* lexer = doc->lexer;
2444
2445
5.55M
    if (lexer->pushed || lexer->itoken)
2446
1.00M
    {
2447
        /* Deal with previously returned duplicate inline token */
2448
1.00M
        if (lexer->itoken)
2449
14.4k
        {
2450
            /* itoken rejected */
2451
14.4k
            if (lexer->pushed)
2452
6
            {
2453
6
                lexer->pushed = no;
2454
6
                node = lexer->itoken;
2455
6
                GTDBG(doc,"lex-itoken", node);
2456
6
                return node;
2457
6
            }
2458
            /* itoken has been accepted */
2459
14.4k
            lexer->itoken = NULL;
2460
14.4k
        }
2461
            
2462
        /* duplicate inlines in preference to pushed text nodes when appropriate */
2463
1.00M
        lexer->pushed = no;
2464
1.00M
        if (lexer->token->type != TextNode
2465
1.00M
            || !(lexer->insert || lexer->inode)) {
2466
990k
            node = lexer->token;
2467
990k
            GTDBG(doc,"lex-token", node);
2468
990k
            return node;
2469
990k
        }
2470
14.4k
        lexer->itoken = TY_(InsertedToken)( doc );
2471
14.4k
        node = lexer->itoken;
2472
14.4k
        GTDBG(doc,"lex-inserted", node);
2473
14.4k
        return node;
2474
1.00M
    }
2475
2476
4.55M
    assert( !(lexer->pushed || lexer->itoken) );
2477
2478
    /* at start of block elements, unclosed inline
2479
       elements are inserted into the token stream 
2480
       Issue #341 - Can NOT insert a token if NO istacksize  
2481
     */
2482
4.55M
    if ((lexer->insert || lexer->inode) && lexer->istacksize)
2483
1.66M
    {
2484
        /*\ Issue #92: could fix by the following, but instead chose not to stack these 2
2485
         *  if ( !(lexer->insert && (nodeIsINS(lexer->insert) || nodeIsDEL(lexer->insert))) ) {
2486
        \*/
2487
1.66M
        lexer->token = TY_(InsertedToken)( doc );
2488
1.66M
        node = lexer->token;
2489
1.66M
        GTDBG(doc,"lex-inserted2", node);
2490
1.66M
        return node;
2491
1.66M
    }
2492
2493
2.88M
    if (mode == CdataContent)
2494
3.41k
    {
2495
3.41k
        assert( lexer->parent != NULL );
2496
3.41k
        node = GetCDATA(doc, lexer->parent);
2497
3.41k
        GTDBG(doc,"lex-cdata", node);
2498
3.41k
        return node;
2499
3.41k
    }
2500
2501
2.88M
    return GetTokenFromStream( doc, mode );
2502
2.88M
}
2503
2504
#if defined(ENABLE_DEBUG_LOG)
2505
static void check_me(char *name)
2506
{
2507
    SPRTF("Have node %s\n", name);
2508
}
2509
#endif
2510
2511
static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
2512
2.88M
{
2513
2.88M
    Lexer* lexer = doc->lexer;
2514
2.88M
    uint c, lexdump, badcomment = 0;
2515
2.88M
    Bool isempty = no;
2516
2.88M
    AttVal *attributes = NULL;
2517
2.88M
    Node *node;
2518
2.88M
    Bool fixComments;
2519
    
2520
2.88M
    switch ( cfgAutoBool(doc, TidyFixComments) )
2521
2.88M
    {
2522
0
        case TidyYesState:
2523
0
            fixComments = yes;
2524
0
            break;
2525
2526
0
        case TidyNoState:
2527
0
            fixComments = no;
2528
0
            break;
2529
2530
2.88M
        default:
2531
2.88M
            fixComments = (TY_(HTMLVersion)(doc) & HT50) == 0;
2532
2.88M
            break;
2533
2.88M
    }
2534
2535
    /* Lexer->token must be set on return. Nullify it for safety. */
2536
2.88M
    lexer->token = NULL;
2537
2538
2.88M
    SetLexerLocus( doc, lexer );
2539
2.88M
    lexer->waswhite = no;
2540
2541
2.88M
    lexer->txtstart = lexer->txtend = lexer->lexsize;
2542
2543
243M
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
2544
241M
    {
2545
241M
        if (lexer->insertspace)
2546
2.39k
        {
2547
2.39k
            TY_(AddCharToLexer)(lexer, ' ');
2548
2.39k
            lexer->waswhite = yes;
2549
2.39k
            lexer->insertspace = no;
2550
2.39k
        }
2551
2552
241M
        if (c == 160 && (mode == Preformatted))
2553
307
            c = ' ';
2554
2555
241M
        TY_(AddCharToLexer)(lexer, c);
2556
2557
241M
        switch (lexer->state)
2558
241M
        {
2559
102M
            case LEX_CONTENT:  /* element content */
2560
2561
                /*
2562
                 Discard white space if appropriate. Its cheaper
2563
                 to do this here rather than in parser methods
2564
                 for elements that don't have mixed content.
2565
                */
2566
102M
                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace) 
2567
102M
                      && lexer->lexsize == lexer->txtstart + 1)
2568
118k
                {
2569
118k
                    --(lexer->lexsize);
2570
118k
                    lexer->waswhite = no;
2571
118k
                    SetLexerLocus( doc, lexer );
2572
118k
                    continue;
2573
118k
                }
2574
2575
101M
                if (c == '<')
2576
1.32M
                {
2577
1.32M
                    lexer->state = LEX_GT;
2578
1.32M
                    continue;
2579
1.32M
                }
2580
2581
100M
                if (TY_(IsWhite)(c))
2582
89.4M
                {
2583
                    /* was previous character white? */
2584
89.4M
                    if (lexer->waswhite)
2585
89.2M
                    {
2586
89.2M
                        if (mode != Preformatted && mode != IgnoreMarkup)
2587
1.76M
                        {
2588
1.76M
                            --(lexer->lexsize);
2589
1.76M
                            SetLexerLocus( doc, lexer );
2590
1.76M
                        }
2591
89.2M
                    }
2592
190k
                    else /* prev character wasn't white */
2593
190k
                    {
2594
190k
                        lexer->waswhite = yes;
2595
2596
190k
                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
2597
67.9k
                            ChangeChar(lexer, ' ');
2598
190k
                    }
2599
2600
89.4M
                    continue;
2601
89.4M
                }
2602
11.1M
                else if (c == '&' && mode != IgnoreMarkup)
2603
77.4k
                    ParseEntity( doc, mode );
2604
2605
                /* this is needed to avoid trimming trailing whitespace */
2606
11.1M
                if (mode == IgnoreWhitespace)
2607
47.4k
                    mode = MixedContent;
2608
2609
11.1M
                lexer->waswhite = no;
2610
11.1M
                continue;
2611
2612
1.32M
            case LEX_GT:  /* < */
2613
2614
                /* check for endtag */
2615
1.32M
                if (c == '/')
2616
116k
                {
2617
116k
                    if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2618
46
                    {
2619
46
                        TY_(UngetChar)(c, doc->docIn);
2620
46
                        continue;
2621
46
                    }
2622
2623
116k
                    TY_(AddCharToLexer)(lexer, c);
2624
2625
116k
                    if (TY_(IsLetter)(c) || (cfgBool(doc, TidyXmlTags) && TY_(IsXMLNamechar)(c)))
2626
104k
                    {
2627
104k
                        lexer->lexsize -= 3;
2628
104k
                        lexer->txtend = lexer->lexsize;
2629
104k
                        TY_(UngetChar)(c, doc->docIn);
2630
104k
                        lexer->state = LEX_ENDTAG;
2631
104k
                        lexer->lexbuf[lexer->lexsize] = '\0';  /* debug */
2632
104k
                        doc->docIn->curcol -= 2;
2633
2634
                        /* if some text before the </ return it now */
2635
104k
                        if (lexer->txtend > lexer->txtstart)
2636
13.2k
                        {
2637
                            /* trim space character before end tag */
2638
13.2k
                            if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ')
2639
364
                            {
2640
364
                                lexer->lexsize -= 1;
2641
364
                                lexer->txtend = lexer->lexsize;
2642
364
                            }
2643
13.2k
                            lexer->token = TY_(TextToken)(lexer);
2644
13.2k
                            node = lexer->token;
2645
13.2k
                            GTDBG(doc,"text", node);
2646
13.2k
                            return node;
2647
13.2k
                        }
2648
2649
91.1k
                        continue;       /* no text so keep going */
2650
104k
                    }
2651
2652
                    /* otherwise treat as CDATA */
2653
11.6k
                    lexer->waswhite = no;
2654
11.6k
                    lexer->state = LEX_CONTENT;
2655
11.6k
                    continue;
2656
116k
                }
2657
2658
1.20M
                if (mode == IgnoreMarkup)
2659
0
                {
2660
                    /* otherwise treat as CDATA */
2661
0
                    lexer->waswhite = no;
2662
0
                    lexer->state = LEX_CONTENT;
2663
0
                    continue;
2664
0
                }
2665
2666
                /*
2667
                   look out for comments, doctype or marked sections
2668
                   this isn't quite right, but its getting there ...
2669
                */
2670
1.20M
                if (c == '!')
2671
67.4k
                {
2672
67.4k
                    c = TY_(ReadChar)(doc->docIn);
2673
2674
67.4k
                    if (c == '-')
2675
4.99k
                    {
2676
4.99k
                        c = TY_(ReadChar)(doc->docIn);
2677
2678
4.99k
                        if (c == '-')
2679
4.67k
                        {
2680
4.67k
                            lexer->state = LEX_COMMENT;  /* comment */
2681
4.67k
                            lexer->lexsize -= 2;
2682
4.67k
                            lexer->txtend = lexer->lexsize;
2683
2684
4.67k
                            CondReturnTextNode(doc, 4)
2685
2686
4.35k
                            lexer->txtstart = lexer->lexsize;
2687
4.35k
                            continue;
2688
4.67k
                        }
2689
2690
                        /*
2691
                           TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_DROPPING );
2692
                           Warning now done later - see issue #487
2693
                         */
2694
4.99k
                    }
2695
62.4k
                    else if (c == 'd' || c == 'D')
2696
55.1k
                    {
2697
                        /* todo: check for complete "<!DOCTYPE" not just <!D */
2698
2699
55.1k
                        uint skip = 0;
2700
2701
55.1k
                        lexer->state = LEX_DOCTYPE; /* doctype */
2702
55.1k
                        lexer->lexsize -= 2;
2703
55.1k
                        lexer->txtend = lexer->lexsize;
2704
55.1k
                        mode = IgnoreWhitespace;
2705
2706
                        /* skip until white space or '>' */
2707
2708
55.1k
                        for (;;)
2709
93.1k
                        {
2710
93.1k
                            c = TY_(ReadChar)(doc->docIn);
2711
93.1k
                            ++skip;
2712
2713
93.1k
                            if (c == EndOfStream || c == '>')
2714
37.4k
                            {
2715
37.4k
                                TY_(UngetChar)(c, doc->docIn);
2716
37.4k
                                break;
2717
37.4k
                            }
2718
2719
2720
55.7k
                            if (!TY_(IsWhite)(c))
2721
38.0k
                                continue;
2722
2723
                            /* and skip to end of whitespace */
2724
2725
17.6k
                            for (;;)
2726
44.1k
                            {
2727
44.1k
                                c = TY_(ReadChar)(doc->docIn);
2728
44.1k
                                ++skip;
2729
2730
44.1k
                                if (c == EndOfStream || c == '>')
2731
1.41k
                                {
2732
1.41k
                                    TY_(UngetChar)(c, doc->docIn);
2733
1.41k
                                    break;
2734
1.41k
                                }
2735
2736
2737
42.6k
                                if (TY_(IsWhite)(c))
2738
26.4k
                                    continue;
2739
2740
16.2k
                                TY_(UngetChar)(c, doc->docIn);
2741
16.2k
                                break;
2742
42.6k
                            }
2743
2744
17.6k
                            break;
2745
55.7k
                        }
2746
2747
55.1k
                        CondReturnTextNode(doc, (skip + 3))
2748
2749
42.6k
                        lexer->txtstart = lexer->lexsize;
2750
42.6k
                        continue;
2751
55.1k
                    }
2752
7.37k
                    else if (c == '[')
2753
3.78k
                    {
2754
                        /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
2755
3.78k
                        lexer->lexsize -= 2;
2756
3.78k
                        lexer->state = LEX_SECTION;
2757
3.78k
                        lexer->txtend = lexer->lexsize;
2758
2759
3.78k
                        CondReturnTextNode(doc, 2)
2760
2761
2.75k
                        lexer->txtstart = lexer->lexsize;
2762
2.75k
                        continue;
2763
3.78k
                    }
2764
2765
2766
                    /*
2767
                       We only print this message if there's a missing
2768
                       starting hyphen; this comment will be dropped.
2769
                     */
2770
3.90k
                    TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_DROPPING ); /* Is. #487 */
2771
2772
                    /* else swallow characters up to and including next '>' */
2773
104k
                    while ((c = TY_(ReadChar)(doc->docIn)) != '>')
2774
100k
                    {
2775
100k
                        if (c == EndOfStream)
2776
142
                        {
2777
142
                            TY_(UngetChar)(c, doc->docIn);
2778
142
                            break;
2779
142
                        }
2780
100k
                    }
2781
2782
3.90k
                    lexer->lexsize -= 2;
2783
3.90k
                    lexer->lexbuf[lexer->lexsize] = '\0';
2784
3.90k
                    lexer->state = LEX_CONTENT;
2785
3.90k
                    continue;
2786
67.4k
                }
2787
2788
                /*
2789
                   processing instructions
2790
                */
2791
2792
1.14M
                if (c == '?')
2793
21.2k
                {
2794
21.2k
                    lexer->lexsize -= 2;
2795
21.2k
                    lexer->state = LEX_PROCINSTR;
2796
21.2k
                    lexer->txtend = lexer->lexsize;
2797
2798
21.2k
                    CondReturnTextNode(doc, 2)
2799
2800
12.7k
                    lexer->txtstart = lexer->lexsize;
2801
12.7k
                    continue;
2802
21.2k
                }
2803
2804
                /* Microsoft ASP's e.g. <% ... server-code ... %> */
2805
1.12M
                if (c == '%')
2806
1.57k
                {
2807
1.57k
                    lexer->lexsize -= 2;
2808
1.57k
                    lexer->state = LEX_ASP;
2809
1.57k
                    lexer->txtend = lexer->lexsize;
2810
2811
1.57k
                    CondReturnTextNode(doc, 2)
2812
2813
619
                    lexer->txtstart = lexer->lexsize;
2814
619
                    continue;
2815
1.57k
                }
2816
2817
                /* Netscapes JSTE e.g. <# ... server-code ... #> */
2818
1.11M
                if (c == '#')
2819
898
                {
2820
898
                    lexer->lexsize -= 2;
2821
898
                    lexer->state = LEX_JSTE;
2822
898
                    lexer->txtend = lexer->lexsize;
2823
2824
898
                    CondReturnTextNode(doc, 2)
2825
2826
340
                    lexer->txtstart = lexer->lexsize;
2827
340
                    continue;
2828
898
                }
2829
2830
                /* check for start tag */
2831
1.11M
                if (TY_(IsLetter)(c) || (cfgBool(doc, TidyXmlTags) && TY_(IsXMLNamechar)(c)))
2832
1.00M
                {
2833
1.00M
                    TY_(UngetChar)(c, doc->docIn);     /* push back letter */
2834
1.00M
                    TY_(UngetChar)('<', doc->docIn);
2835
1.00M
                    lexer->lexsize -= 2;      /* discard "<" + letter */
2836
1.00M
                    lexer->txtend = lexer->lexsize;
2837
1.00M
                    lexer->state = LEX_STARTTAG;         /* ready to read tag name */
2838
2839
1.00M
                    CondReturnTextNode(doc, 2)
2840
2841
                    /* lexer->txtstart = lexer->lexsize; missing here? */
2842
862k
                    continue;       /* no text so keep going */
2843
1.00M
                }
2844
2845
                /* otherwise treat as CDATA */
2846
                /* fix for bug 762102 (486) */
2847
                /* Issue #384 - Fix skipping parsing character, particularly '<<' */
2848
112k
                TY_(UngetChar)(c, doc->docIn);
2849
112k
                lexer->lexsize -= 1;
2850
112k
                lexer->state = LEX_CONTENT;
2851
112k
                lexer->waswhite = no;
2852
112k
                continue;
2853
2854
105k
            case LEX_ENDTAG:  /* </letter */
2855
105k
                lexer->txtstart = lexer->lexsize - 1;
2856
105k
                doc->docIn->curcol += 2;
2857
105k
                c = ParseTagName( doc );
2858
105k
                lexer->token = TagToken( doc, EndTag );  /* create endtag token */
2859
105k
                lexer->lexsize = lexer->txtend = lexer->txtstart;
2860
2861
                /* skip to '>' */
2862
180k
                while ( c != '>' && c != EndOfStream )
2863
74.9k
                {
2864
74.9k
                    c = TY_(ReadChar)(doc->docIn);
2865
74.9k
                }
2866
2867
105k
                if (c == EndOfStream)
2868
969
                {
2869
969
                    TY_(FreeNode)( doc, lexer->token );
2870
969
                    continue;
2871
969
                }
2872
2873
104k
                lexer->state = LEX_CONTENT;
2874
104k
                lexer->waswhite = no;
2875
104k
                node = lexer->token;
2876
104k
                GTDBG(doc,"endtag", node);
2877
104k
                return node;  /* the endtag token */
2878
2879
1.00M
            case LEX_STARTTAG: /* first letter of tagname */
2880
1.00M
                c = TY_(ReadChar)(doc->docIn);
2881
1.00M
                ChangeChar(lexer, (tmbchar)c);
2882
1.00M
                lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */
2883
1.00M
                c = ParseTagName( doc );
2884
1.00M
                isempty = no;
2885
1.00M
                attributes = NULL;
2886
1.00M
                lexer->token = TagToken( doc, StartTag ); /* [i_a]2 'isempty' is always false, thanks to code 2 lines above */
2887
2888
                /* parse attributes, consuming closing ">" */
2889
1.00M
                if (c != '>')
2890
391k
                {
2891
391k
                    if (c == '/')
2892
54.9k
                        TY_(UngetChar)(c, doc->docIn);
2893
2894
391k
                    attributes = ParseAttrs( doc, &isempty );
2895
391k
                }
2896
2897
1.00M
                if (isempty)
2898
13.8k
                    lexer->token->type = StartEndTag;
2899
2900
1.00M
                lexer->token->attributes = attributes;
2901
1.00M
                lexer->lexsize = lexer->txtend = lexer->txtstart;
2902
2903
                /* swallow newline following start tag */
2904
                /* special check needed for CRLF sequence */
2905
                /* this doesn't apply to empty elements */
2906
                /* nor to preformatted content that needs escaping */
2907
                /*\
2908
                 * Issue #230: Need to KEEP this user newline character in certain 
2909
                 * circumstances, certainly for <pre>, <script>, <style>...
2910
                 * Any others?
2911
                 * Issue #238: maybe **ONLY** for <pre>
2912
                \*/
2913
1.00M
                if ( nodeIsPRE(lexer->token) )
2914
31.6k
                {
2915
31.6k
                    mode = Preformatted;
2916
31.6k
                }
2917
2918
1.00M
                if ((mode != Preformatted && ExpectsContent(lexer->token))
2919
1.00M
                    || nodeIsBR(lexer->token) || nodeIsHR(lexer->token))
2920
860k
                {
2921
860k
                    c = TY_(ReadChar)(doc->docIn);
2922
2923
860k
                    if ((c == '\n') && (mode != IgnoreWhitespace)) /* Issue #329 - Can NOT afford to lose this newline */
2924
29.9k
                        TY_(UngetChar)(c, doc->docIn);  /* Issue #329 - make sure the newline is maintained for now */
2925
830k
                    else if (c != '\n' && c != '\f')
2926
828k
                        TY_(UngetChar)(c, doc->docIn);
2927
2928
860k
                    lexer->waswhite = yes;  /* to swallow leading whitespace */
2929
860k
                }
2930
145k
                else
2931
145k
                    lexer->waswhite = no;
2932
2933
1.00M
                lexer->state = LEX_CONTENT;
2934
1.00M
                if (lexer->token->tag == NULL) 
2935
169k
                {
2936
169k
                    if (mode != OtherNamespace) /* [i_a]2 only issue warning if NOT 'OtherNamespace', and tag null */
2937
164k
                    {
2938
                        /* Special case for HTML5 unknown tags: if it looks 
2939
                           like an autonomous custom tag, then emit a variation
2940
                           of the standard message. We don't want to do this
2941
                           for older HTML, because it's not truly supported
2942
                           by the standard, although Tidy will allow it. */
2943
164k
                        if ( (doc->lexer->doctype & VERS_HTML5) > 0 && TY_(elementIsAutonomousCustomFormat)( lexer->token->element ) )
2944
393
                            TY_(Report)( doc, NULL, lexer->token, UNKNOWN_ELEMENT_LOOKS_CUSTOM );
2945
164k
                        else
2946
164k
                            TY_(Report)( doc, NULL, lexer->token, UNKNOWN_ELEMENT );
2947
164k
                    }
2948
169k
                }
2949
835k
                else if ( !cfgBool(doc, TidyXmlTags) )
2950
835k
                {
2951
835k
                    TY_(ConstrainVersion)( doc, lexer->token->tag->versions );
2952
835k
                    TY_(RepairDuplicateAttributes)( doc, lexer->token, no );
2953
835k
                } else 
2954
0
                    TY_(RepairDuplicateAttributes)( doc, lexer->token, yes );
2955
1.00M
                node = lexer->token;
2956
1.00M
                GTDBG(doc,"starttag", node);
2957
1.00M
                return node;  /* return start tag */
2958
2959
2.56M
            case LEX_COMMENT:  /* seen <!-- so look for --> */
2960
2961
2.56M
                if (c != '-')
2962
2.55M
                    continue;
2963
2964
10.4k
                c = TY_(ReadChar)(doc->docIn);
2965
2966
                /* Fix hyphens at beginning of tag */
2967
10.4k
                if ( c != '-' && fixComments && lexer->lexsize - lexer->txtstart == 1 )
2968
3.67k
                {
2969
3.67k
                    lexer->lexbuf[lexer->lexsize - 1] = '=';
2970
3.67k
                }
2971
2972
10.4k
                TY_(AddCharToLexer)(lexer, c);
2973
2974
10.4k
                if (c != '-')
2975
4.09k
                    continue;
2976
2977
8.05k
            end_comment:
2978
8.05k
                c = TY_(ReadChar)(doc->docIn);
2979
2980
8.05k
                if (c == '>')
2981
4.58k
                {
2982
4.58k
                    if (badcomment)
2983
1.19k
                    {
2984
                        /*
2985
                           We've got bad comments that we either fixed or
2986
                           ignored; provide proper user feedback based on
2987
                           doctype and whether or not we fixed them.
2988
                         */
2989
1.19k
                        if ( (TY_(HTMLVersion)(doc) & HT50) )
2990
0
                        {
2991
0
                            if ( fixComments )
2992
0
                                TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT );
2993
                            /* Otherwise for HTML5, it's safe to ignore. */
2994
0
                        }
2995
1.19k
                        else
2996
1.19k
                        {
2997
1.19k
                            if ( fixComments )
2998
1.19k
                                TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT );
2999
0
                            else
3000
0
                                TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_WARN );
3001
1.19k
                        }
3002
1.19k
                    }
3003
3004
                    /* do not store closing -- in lexbuf */
3005
4.58k
                    lexer->lexsize -= 2;
3006
4.58k
                    lexer->txtend = lexer->lexsize;
3007
4.58k
                    lexer->lexbuf[lexer->lexsize] = '\0';
3008
4.58k
                    lexer->state = LEX_CONTENT;
3009
4.58k
                    lexer->waswhite = no;
3010
4.58k
                    lexer->token = CommentToken(doc);
3011
3012
                    /* now look for a line break */
3013
3014
4.58k
                    c = TY_(ReadChar)(doc->docIn);
3015
3016
4.58k
                    if (c == '\n')
3017
334
                        lexer->token->linebreak = yes;
3018
4.25k
                    else
3019
4.25k
                        TY_(UngetChar)(c, doc->docIn);
3020
3021
4.58k
                    node = lexer->token;
3022
4.58k
                    GTDBG(doc,"comment", node);
3023
4.58k
                    return node;
3024
4.58k
                }
3025
3026
                /* note position of first such error in the comment */
3027
3.46k
                if (!badcomment)
3028
1.22k
                {
3029
1.22k
                    SetLexerLocus( doc, lexer );
3030
1.22k
                    lexer->columns -= 3;
3031
1.22k
                }
3032
3033
3.46k
                badcomment++;
3034
3035
                /* fix hyphens in the middle */
3036
3.46k
                if ( fixComments )
3037
3.46k
                    lexer->lexbuf[lexer->lexsize - 2] = '=';
3038
3039
                /* if '-' then look for '>' to end the comment */
3040
3.46k
                if (c == '-')
3041
1.75k
                {
3042
1.75k
                    TY_(AddCharToLexer)(lexer, c);
3043
1.75k
                    goto end_comment;
3044
1.75k
                }
3045
3046
                /* fix hyphens end, and continue to look for --> */
3047
1.71k
                if ( fixComments )
3048
1.71k
                    lexer->lexbuf[lexer->lexsize - 1] = '=';
3049
3050
                /* http://tidy.sf.net/bug/1266647 */
3051
1.71k
                TY_(AddCharToLexer)(lexer, c);
3052
3053
1.71k
                continue; 
3054
3055
55.0k
            case LEX_DOCTYPE:  /* seen <!d so look for '>' munging whitespace */
3056
3057
                /* use ParseDocTypeDecl() to tokenize doctype declaration */
3058
55.0k
                TY_(UngetChar)(c, doc->docIn);
3059
55.0k
                lexer->lexsize -= 1;
3060
55.0k
                lexer->token = ParseDocTypeDecl(doc);
3061
3062
55.0k
                lexer->txtend = lexer->lexsize;
3063
55.0k
                lexer->lexbuf[lexer->lexsize] = '\0';
3064
55.0k
                lexer->state = LEX_CONTENT;
3065
55.0k
                lexer->waswhite = no;
3066
3067
                /* make a note of the version named by the 1st doctype */
3068
55.0k
                if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags))
3069
10.9k
                {
3070
10.9k
                    lexer->doctype = FindGivenVersion(doc, lexer->token);
3071
10.9k
                    if (lexer->doctype != VERS_HTML5)
3072
10.8k
                    {
3073
                        /*\
3074
                         *  Back to legacy HTML4 mode for -
3075
                         *  Issue #167 & #169 - TidyTag_A
3076
                         *  Issue #196        - TidyTag_CAPTION
3077
                         *  others?
3078
                        \*/ 
3079
10.8k
                        TY_(AdjustTags)(doc); /* Dynamically modify the tags table  */
3080
10.8k
                    }
3081
10.9k
                }
3082
55.0k
                node = lexer->token;
3083
55.0k
                GTDBG(doc,"doctype", node);
3084
55.0k
                return node;
3085
3086
43.3M
            case LEX_PROCINSTR:  /* seen <? so look for '>' */
3087
                /* check for PHP preprocessor instructions <?php ... ?> */
3088
3089
43.3M
                if  (lexer->lexsize - lexer->txtstart == 3)
3090
9.56k
                {
3091
9.56k
                    if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "php", 3) == 0)
3092
234
                    {
3093
234
                        lexer->state = LEX_PHP;
3094
234
                        continue;
3095
234
                    }
3096
9.56k
                }
3097
3098
43.3M
                if  (lexer->lexsize - lexer->txtstart == 4)
3099
9.02k
                {
3100
9.02k
                    if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 &&
3101
9.02k
                        TY_(IsWhite)(lexer->lexbuf[lexer->txtstart + 3]))
3102
4.95k
                    {
3103
4.95k
                        lexer->state = LEX_XMLDECL;
3104
4.95k
                        attributes = NULL;
3105
4.95k
                        continue;
3106
4.95k
                    }
3107
9.02k
                }
3108
3109
43.3M
                if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */
3110
2.40M
                {
3111
2.40M
                    if (c != '?')
3112
2.40M
                        continue;
3113
3114
                    /* now look for '>' */
3115
632
                    c = TY_(ReadChar)(doc->docIn);
3116
3117
632
                    if (c == EndOfStream)
3118
3
                    {
3119
3
                        TY_(Report)(doc, NULL, NULL, UNEXPECTED_END_OF_FILE );
3120
3
                        TY_(UngetChar)(c, doc->docIn);
3121
3
                        continue;
3122
3
                    }
3123
3124
629
                    TY_(AddCharToLexer)(lexer, c);
3125
629
                }
3126
3127
3128
40.9M
                if (c != '>')
3129
40.8M
                    continue;
3130
3131
15.9k
                lexer->lexsize -= 1;
3132
3133
15.9k
                if (lexer->lexsize)
3134
14.2k
                {
3135
14.2k
                    uint i;
3136
14.2k
                    Bool closed;
3137
3138
65.1k
                    for (i = 0; i < lexer->lexsize - lexer->txtstart &&
3139
65.1k
                        !TY_(IsWhite)(lexer->lexbuf[i + lexer->txtstart]); ++i)
3140
50.8k
                        /**/;
3141
3142
14.2k
                    closed = lexer->lexbuf[lexer->lexsize - 1] == '?';
3143
3144
14.2k
                    if (closed)
3145
2.75k
                        lexer->lexsize -= 1;
3146
3147
14.2k
                    lexer->txtstart += i;
3148
14.2k
                    lexer->txtend = lexer->lexsize;
3149
14.2k
                    lexer->lexbuf[lexer->lexsize] = '\0';
3150
3151
14.2k
                    lexer->token = PIToken(doc);
3152
14.2k
                    lexer->token->closed = closed;
3153
14.2k
                    lexer->token->element = TY_(tmbstrndup)(doc->allocator,
3154
14.2k
                                                            lexer->lexbuf +
3155
14.2k
                                                            lexer->txtstart - i, i);
3156
14.2k
                }
3157
1.68k
                else
3158
1.68k
                {
3159
1.68k
                    lexer->txtend = lexer->lexsize;
3160
1.68k
                    lexer->lexbuf[lexer->lexsize] = '\0';
3161
1.68k
                    lexer->token = PIToken(doc);
3162
1.68k
                }
3163
3164
15.9k
                lexer->state = LEX_CONTENT;
3165
15.9k
                lexer->waswhite = no;
3166
15.9k
                node = lexer->token;
3167
15.9k
                GTDBG(doc,"procinstr", node);
3168
15.9k
                return node;
3169
3170
16.8k
            case LEX_ASP:  /* seen <% so look for "%>" */
3171
16.8k
                if (c != '%')
3172
10.1k
                    continue;
3173
3174
                /* now look for '>' */
3175
6.78k
                c = TY_(ReadChar)(doc->docIn);
3176
3177
3178
6.78k
                if (c != '>')
3179
5.29k
                {
3180
5.29k
                    TY_(UngetChar)(c, doc->docIn);
3181
5.29k
                    continue;
3182
5.29k
                }
3183
3184
1.49k
                lexer->lexsize -= 1;
3185
1.49k
                lexer->txtend = lexer->lexsize;
3186
1.49k
                lexer->lexbuf[lexer->lexsize] = '\0';
3187
1.49k
                lexer->state = LEX_CONTENT;
3188
1.49k
                lexer->waswhite = no;
3189
1.49k
                lexer->token = AspToken(doc);
3190
1.49k
                node = lexer->token;
3191
1.49k
                GTDBG(doc,"ASP", node);
3192
1.49k
                return node;  /* the endtag token */
3193
3194
3195
3196
976k
            case LEX_JSTE:  /* seen <# so look for "#>" */
3197
976k
                if (c != '#')
3198
974k
                    continue;
3199
3200
                /* now look for '>' */
3201
1.27k
                c = TY_(ReadChar)(doc->docIn);
3202
3203
3204
1.27k
                if (c != '>')
3205
438
                {
3206
438
                    TY_(UngetChar)(c, doc->docIn);
3207
438
                    continue;
3208
438
                }
3209
3210
837
                lexer->lexsize -= 1;
3211
837
                lexer->txtend = lexer->lexsize;
3212
837
                lexer->lexbuf[lexer->lexsize] = '\0';
3213
837
                lexer->state = LEX_CONTENT;
3214
837
                lexer->waswhite = no;
3215
837
                lexer->token = JsteToken(doc);
3216
837
                node = lexer->token;
3217
837
                GTDBG(doc,"JSTE", node);
3218
837
                return node;  /* the JSTE token */
3219
3220
3221
926
            case LEX_PHP: /* seen "<?php" so look for "?>" */
3222
926
                if (c != '?')
3223
374
                    continue;
3224
3225
                /* now look for '>' */
3226
552
                c = TY_(ReadChar)(doc->docIn);
3227
3228
552
                if (c != '>')
3229
347
                {
3230
347
                    TY_(UngetChar)(c, doc->docIn);
3231
347
                    continue;
3232
347
                }
3233
3234
205
                lexer->lexsize -= 1;
3235
205
                lexer->txtend = lexer->lexsize;
3236
205
                lexer->lexbuf[lexer->lexsize] = '\0';
3237
205
                lexer->state = LEX_CONTENT;
3238
205
                lexer->waswhite = no;
3239
205
                lexer->token = PhpToken(doc);
3240
205
                node = lexer->token;
3241
205
                GTDBG(doc,"PHP", node);
3242
205
                return node;  /* the PHP token */
3243
3244
13.2k
            case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */
3245
3246
13.2k
                if (TY_(IsWhite)(c) && c != '?')
3247
4.63k
                    continue;
3248
3249
                /* get pseudo-attribute */
3250
8.63k
                if (c != '?')
3251
6.63k
                {
3252
6.63k
                    tmbstr name;
3253
6.63k
                    Node *asp, *php;
3254
6.63k
                    AttVal *av = NULL;
3255
6.63k
                    int pdelim = 0;
3256
6.63k
                    isempty = no;
3257
3258
6.63k
                    TY_(UngetChar)(c, doc->docIn);
3259
3260
6.63k
                    name = ParseAttribute( doc, &isempty, &asp, &php );
3261
3262
6.63k
                    if (!name)
3263
3.64k
                    {
3264
                        /* check if attributes are created by ASP markup */
3265
3.64k
                        if (asp)
3266
661
                        {
3267
661
                            av = TY_(NewAttribute)(doc);
3268
661
                            av->asp = asp;
3269
661
                            AddAttrToList( &attributes, av ); 
3270
661
                        }
3271
3272
                        /* check if attributes are created by PHP markup */
3273
3.64k
                        if (php)
3274
1.71k
                        {
3275
1.71k
                            av = TY_(NewAttribute)(doc);
3276
1.71k
                            av->php = php;
3277
1.71k
                            AddAttrToList( &attributes, av ); 
3278
1.71k
                        }
3279
                      
3280
                        /* fix for http://tidy.sf.net/bug/788031 */
3281
3.64k
                        lexer->lexsize -= 1;
3282
3.64k
                        lexer->txtend = lexer->txtstart;
3283
3.64k
                        lexer->lexbuf[lexer->txtend] = '\0';
3284
3.64k
                        lexer->state = LEX_CONTENT;
3285
3.64k
                        lexer->waswhite = no;
3286
3.64k
                        lexer->token = XmlDeclToken(doc);
3287
3.64k
                        lexer->token->attributes = attributes;
3288
3.64k
                        node = lexer->token;
3289
3.64k
                        GTDBG(doc,"xml", node);
3290
3.64k
                        return node;  /* the xml token */
3291
3.64k
                    }
3292
3293
2.98k
                    av = TY_(NewAttribute)(doc);
3294
2.98k
                    av->attribute = name;
3295
2.98k
                    av->value = ParseValue( doc, name, yes, &isempty, &pdelim );
3296
2.98k
                    av->delim = pdelim;
3297
2.98k
                    av->dict = TY_(FindAttribute)( doc, av );
3298
3299
2.98k
                    AddAttrToList( &attributes, av );
3300
                    /* continue; */
3301
2.98k
                }
3302
3303
                /* now look for '>' */
3304
4.98k
                c = TY_(ReadChar)(doc->docIn);
3305
3306
4.98k
                if (c != '>')
3307
3.74k
                {
3308
3.74k
                    TY_(UngetChar)(c, doc->docIn);
3309
3.74k
                    continue;
3310
3.74k
                }
3311
1.23k
                lexer->lexsize -= 1;
3312
1.23k
                lexer->txtend = lexer->txtstart;
3313
1.23k
                lexer->lexbuf[lexer->txtend] = '\0';
3314
1.23k
                lexer->state = LEX_CONTENT;
3315
1.23k
                lexer->waswhite = no;
3316
1.23k
                lexer->token = XmlDeclToken(doc);
3317
1.23k
                lexer->token->attributes = attributes;
3318
1.23k
                node = lexer->token;
3319
1.23k
                GTDBG(doc,"XML", node);
3320
1.23k
                return node;  /* the XML token */
3321
3322
90.2M
            case LEX_SECTION: /* seen "<![" so look for "]>" */
3323
90.2M
                if (c == '[')
3324
2.77k
                {
3325
2.77k
                    if (lexer->lexsize == (lexer->txtstart + 6) &&
3326
2.77k
                        TY_(tmbstrncmp)(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0)
3327
1.10k
                    {
3328
1.10k
                        lexer->state = LEX_CDATA;
3329
1.10k
                        lexer->lexsize -= 6;
3330
1.10k
                        continue;
3331
1.10k
                    }
3332
2.77k
                }
3333
3334
90.2M
                if (c == '>')
3335
1.98k
                {
3336
                    /* Is. #462 - reached '>' before ']' */
3337
1.98k
                    TY_(UngetChar)(c, doc->docIn);
3338
90.2M
                } else if (c != ']')
3339
90.2M
                    continue;
3340
3341
                /* now look for '>' */
3342
8.48k
                c = TY_(ReadChar)(doc->docIn);
3343
3344
8.48k
                lexdump = 1;
3345
8.48k
                if (c != '>')
3346
6.24k
                {
3347
                    /* Issue #153 - can also be ]'-->' */
3348
6.24k
                    if (c == '-') 
3349
917
                    {
3350
917
                        c = TY_(ReadChar)(doc->docIn);
3351
917
                        if (c == '-')
3352
546
                        {
3353
546
                            c = TY_(ReadChar)(doc->docIn);
3354
546
                            if (c != '>')
3355
222
                            {
3356
222
                                TY_(UngetChar)(c, doc->docIn);
3357
222
                                TY_(UngetChar)('-', doc->docIn);
3358
222
                                TY_(UngetChar)('-', doc->docIn);
3359
222
                                continue;
3360
222
                            }
3361
                            /* this failed!
3362
                               TY_(AddCharToLexer)(lexer, '-'); TY_(AddCharToLexer)(lexer, '-'); lexdump = 0; 
3363
                               got output <![endif]--]> - needs further fix in pprint section output
3364
                             */
3365
546
                        }
3366
371
                        else
3367
371
                        {
3368
371
                            TY_(UngetChar)(c, doc->docIn);
3369
371
                            TY_(UngetChar)('-', doc->docIn);
3370
371
                            continue;
3371
371
                        }
3372
917
                    } 
3373
5.32k
                    else 
3374
5.32k
                    {
3375
5.32k
                        TY_(UngetChar)(c, doc->docIn);
3376
5.32k
                        continue;
3377
5.32k
                    }
3378
6.24k
                }
3379
 
3380
2.56k
                lexer->lexsize -= lexdump;
3381
2.56k
                lexer->txtend = lexer->lexsize;
3382
2.56k
                lexer->lexbuf[lexer->lexsize] = '\0';
3383
2.56k
                lexer->state = LEX_CONTENT;
3384
2.56k
                lexer->waswhite = no;
3385
2.56k
                lexer->token = SectionToken(doc);
3386
2.56k
                node = lexer->token;
3387
2.56k
                GTDBG(doc,"SECTION", node);
3388
2.56k
                return node;  /* the SECTION token */
3389
3390
35.1k
            case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
3391
35.1k
                if (c != ']')
3392
32.2k
                    continue;
3393
3394
                /* now look for ']' */
3395
2.91k
                c = TY_(ReadChar)(doc->docIn);
3396
3397
2.91k
                if (c != ']')
3398
1.56k
                {
3399
1.56k
                    TY_(UngetChar)(c, doc->docIn);
3400
1.56k
                    continue;
3401
1.56k
                }
3402
3403
                /* now look for '>' */
3404
1.35k
                c = TY_(ReadChar)(doc->docIn);
3405
3406
1.35k
                if (c != '>')
3407
278
                {
3408
278
                    TY_(UngetChar)(c, doc->docIn);
3409
278
                    TY_(UngetChar)(']', doc->docIn);
3410
278
                    continue;
3411
278
                }
3412
3413
1.07k
                lexer->lexsize -= 1;
3414
1.07k
                lexer->txtend = lexer->lexsize;
3415
1.07k
                lexer->lexbuf[lexer->lexsize] = '\0';
3416
1.07k
                lexer->state = LEX_CONTENT;
3417
1.07k
                lexer->waswhite = no;
3418
1.07k
                lexer->token = CDATAToken(doc);
3419
1.07k
                node = lexer->token;
3420
1.07k
                GTDBG(doc,"CDATA", node);
3421
1.07k
                return node;  /* the CDATA token */
3422
241M
        }
3423
241M
    }
3424
3425
1.50M
    if (lexer->state == LEX_CONTENT)  /* text string */
3426
1.43M
    {
3427
1.43M
        lexer->txtend = lexer->lexsize;
3428
3429
1.43M
        if (lexer->txtend > lexer->txtstart)
3430
3.22k
        {
3431
3.22k
            TY_(UngetChar)(c, doc->docIn);
3432
3433
3.22k
            if (lexer->lexbuf[lexer->lexsize - 1] == ' ')
3434
238
            {
3435
238
                lexer->lexsize -= 1;
3436
238
                lexer->txtend = lexer->lexsize;
3437
238
            }
3438
3.22k
            lexer->token = TY_(TextToken)(lexer);
3439
3.22k
            node = lexer->token;
3440
3.22k
            GTDBG(doc,"textstring", node);
3441
3.22k
            return node;  /* the textstring token */
3442
3.22k
        }
3443
1.43M
    }
3444
71.4k
    else if (lexer->state == LEX_COMMENT) /* comment */
3445
90
    {
3446
90
        if (c == EndOfStream)
3447
90
        {
3448
            /* We print this if we reached end of the stream mid-comment. */
3449
90
            TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_EOS );
3450
90
        }
3451
3452
90
        lexer->txtend = lexer->lexsize;
3453
90
        lexer->lexbuf[lexer->lexsize] = '\0';
3454
90
        lexer->state = LEX_CONTENT;
3455
90
        lexer->waswhite = no;
3456
90
        lexer->token = CommentToken(doc);
3457
90
        node = lexer->token;
3458
90
        GTDBG(doc,"COMMENT", node);
3459
90
        return node;  /* the COMMENT token */
3460
90
    }
3461
3462
    /* check attributes before return NULL */
3463
1.50M
    if (attributes)
3464
61
        TY_(FreeAttribute)( doc, attributes );
3465
3466
1.50M
    DEBUG_LOG(SPRTF("Returning NULL...\n"));
3467
1.50M
    return NULL;
3468
1.50M
}
3469
3470
static void MapStr( ctmbstr str, uint code )
3471
119k
{
3472
1.53M
    while ( *str )
3473
1.41M
    {
3474
1.41M
        uint i = (byte) *str++;
3475
1.41M
        lexmap[i] |= code;
3476
1.41M
    }
3477
119k
}
3478
3479
void TY_(InitMap)(void)
3480
17.0k
{
3481
17.0k
    MapStr("\r\n\f", newline|white);
3482
17.0k
    MapStr(" \t", white);
3483
17.0k
    MapStr("-.:_", namechar);
3484
17.0k
    MapStr("0123456789", digit|digithex|namechar);
3485
17.0k
    MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar);
3486
17.0k
    MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);
3487
17.0k
    MapStr("abcdefABCDEF", digithex);
3488
17.0k
}
3489
3490
/*
3491
 parser for ASP within start tags
3492
3493
 Some people use ASP for to customize attributes
3494
 Tidy isn't really well suited to dealing with ASP
3495
 This is a workaround for attributes, but won't
3496
 deal with the case where the ASP is used to tailor
3497
 the attribute value. Here is an example of a work
3498
 around for using ASP in attribute values:
3499
3500
  href='<%=rsSchool.Fields("ID").Value%>'
3501
3502
 where the ASP that generates the attribute value
3503
 is masked from Tidy by the quotemarks.
3504
3505
*/
3506
3507
static Node *ParseAsp( TidyDocImpl* doc )
3508
1.71k
{
3509
1.71k
    Lexer* lexer = doc->lexer;
3510
1.71k
    uint c;
3511
1.71k
    Node *asp = NULL;
3512
3513
1.71k
    lexer->txtstart = lexer->lexsize;
3514
3515
1.71k
    for (;;)
3516
1.13M
    {
3517
1.13M
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3518
87
            break;
3519
3520
1.13M
        TY_(AddCharToLexer)(lexer, c);
3521
3522
3523
1.13M
        if (c != '%')
3524
1.10M
            continue;
3525
3526
38.2k
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3527
7
            break;
3528
3529
38.2k
        TY_(AddCharToLexer)(lexer, c);
3530
3531
38.2k
        if (c == '>')
3532
1.62k
        {
3533
1.62k
            lexer->lexsize -= 2;
3534
1.62k
            break;
3535
1.62k
        }
3536
38.2k
    }
3537
3538
1.71k
    lexer->txtend = lexer->lexsize;
3539
1.71k
    if (lexer->txtend > lexer->txtstart)
3540
1.46k
        asp = AspToken(doc);
3541
3542
1.71k
    lexer->txtstart = lexer->txtend;
3543
1.71k
    return asp;
3544
1.71k
}   
3545
 
3546
3547
/*
3548
 PHP is like ASP but is based upon XML
3549
 processing instructions, e.g. <?php ... ?>
3550
*/
3551
static Node *ParsePhp( TidyDocImpl* doc )
3552
4.00k
{
3553
4.00k
    Lexer* lexer = doc->lexer;
3554
4.00k
    uint c;
3555
4.00k
    Node *php = NULL;
3556
3557
4.00k
    lexer->txtstart = lexer->lexsize;
3558
3559
4.00k
    for (;;)
3560
3.57M
    {
3561
3.57M
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3562
116
            break;
3563
3564
3.57M
        TY_(AddCharToLexer)(lexer, c);
3565
3566
3567
3.57M
        if (c != '?')
3568
3.56M
            continue;
3569
3570
6.41k
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3571
9
            break;
3572
3573
6.40k
        TY_(AddCharToLexer)(lexer, c);
3574
3575
6.40k
        if (c == '>')
3576
3.88k
        {
3577
3.88k
            lexer->lexsize -= 2;
3578
3.88k
            break;
3579
3.88k
        }
3580
6.40k
    }
3581
3582
4.00k
    lexer->txtend = lexer->lexsize;
3583
4.00k
    if (lexer->txtend > lexer->txtstart)
3584
3.75k
        php = PhpToken(doc);
3585
3586
4.00k
    lexer->txtstart = lexer->txtend;
3587
4.00k
    return php;
3588
4.00k
}   
3589
3590
/* consumes the '>' terminating start tags */
3591
/* @TODO: float the errors back to the calling method */
3592
static tmbstr  ParseAttribute( TidyDocImpl* doc, Bool *isempty,
3593
                              Node **asp, Node **php )
3594
682k
{
3595
682k
    Lexer* lexer = doc->lexer;
3596
682k
    int start, len = 0;
3597
682k
    tmbstr attr = NULL;
3598
682k
    uint c, lastc;
3599
3600
682k
    *asp = NULL;  /* clear asp pointer */
3601
682k
    *php = NULL;  /* clear php pointer */
3602
3603
 /* skip white space before the attribute */
3604
3605
682k
    for (;;)
3606
787k
    {
3607
787k
        c = TY_(ReadChar)( doc->docIn );
3608
3609
3610
787k
        if (c == '/')
3611
69.8k
        {
3612
69.8k
            c = TY_(ReadChar)( doc->docIn );
3613
3614
69.8k
            if (c == '>')
3615
12.7k
            {
3616
12.7k
                *isempty = yes;
3617
12.7k
                return NULL;
3618
12.7k
            }
3619
3620
57.0k
            TY_(UngetChar)(c, doc->docIn);
3621
57.0k
            c = '/';
3622
57.0k
            break;
3623
69.8k
        }
3624
3625
718k
        if (c == '>')
3626
78.9k
            return NULL;
3627
3628
639k
        if (c =='<')
3629
299k
        {
3630
299k
            c = TY_(ReadChar)(doc->docIn);
3631
3632
299k
            if (c == '%')
3633
1.71k
            {
3634
1.71k
                *asp = ParseAsp( doc );
3635
1.71k
                return NULL;
3636
1.71k
            }
3637
297k
            else if (c == '?')
3638
4.00k
            {
3639
4.00k
                *php = ParsePhp( doc );
3640
4.00k
                return NULL;
3641
4.00k
            }
3642
3643
293k
            TY_(UngetChar)(c, doc->docIn);
3644
293k
            TY_(UngetChar)('<', doc->docIn);
3645
293k
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3646
293k
            return NULL;
3647
299k
        }
3648
3649
340k
        if (c == '=')
3650
3.44k
        {
3651
3.44k
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN );
3652
3.44k
            continue;
3653
3.44k
        }
3654
3655
336k
        if (c == '"' || c == '\'')
3656
4.94k
        {
3657
4.94k
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3658
4.94k
            continue;
3659
4.94k
        }
3660
3661
331k
        if (c == EndOfStream)
3662
63
        {
3663
63
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3664
63
            TY_(UngetChar)(c, doc->docIn);
3665
63
            return NULL;
3666
63
        }
3667
3668
3669
331k
        if (!TY_(IsWhite)(c))
3670
234k
           break;
3671
331k
    }
3672
3673
291k
    start = lexer->lexsize;
3674
291k
    lastc = c;
3675
3676
291k
    for (;;)
3677
1.31M
    {
3678
     /* but push back '=' for parseValue() */
3679
1.31M
        if (c == '=' || c == '>')
3680
92.2k
        {
3681
92.2k
            TY_(UngetChar)(c, doc->docIn);
3682
92.2k
            break;
3683
92.2k
        }
3684
3685
1.21M
        if (c == '<' || c == EndOfStream)
3686
119k
        {
3687
119k
            TY_(UngetChar)(c, doc->docIn);
3688
119k
            break;
3689
119k
        }
3690
3691
1.09M
        if (lastc == '-' && (c == '"' || c == '\''))
3692
527
        {
3693
527
            lexer->lexsize--;
3694
527
            --len;
3695
527
            TY_(UngetChar)(c, doc->docIn);
3696
527
            break;
3697
527
        }
3698
3699
1.09M
        if (TY_(IsWhite)(c))
3700
78.3k
            break;
3701
3702
1.01M
        if (c == '/') /* Issue #395 - potential self closing tag */
3703
61.5k
        {
3704
61.5k
            c = TY_(ReadChar)(doc->docIn);  /* read next */
3705
61.5k
            if (c == '>')
3706
1.18k
            {
3707
                /* got a self closing tag - put is back and continue... */
3708
1.18k
                TY_(UngetChar)(c, doc->docIn);
3709
1.18k
                break;
3710
1.18k
            }
3711
60.3k
            else
3712
60.3k
            {
3713
                /* Not '/>' - put it back */
3714
60.3k
                TY_(UngetChar)(c, doc->docIn);
3715
60.3k
                c = '/';  /* restore original char */
3716
60.3k
            }
3717
61.5k
        }
3718
3719
        /* what should be done about non-namechar characters? */
3720
        /* currently these are incorporated into the attr name */
3721
3722
1.01M
        if ( cfg(doc, TidyUpperCaseAttrs) != TidyUppercasePreserve )
3723
1.01M
        {
3724
1.01M
            if ( !cfgBool(doc, TidyXmlTags) && TY_(IsUpper)(c) )
3725
167k
                c = TY_(ToLower)(c);
3726
1.01M
        }
3727
3728
1.01M
        TY_(AddCharToLexer)( lexer, c );
3729
1.01M
        lastc = c;
3730
1.01M
        c = TY_(ReadChar)(doc->docIn);
3731
1.01M
    }
3732
3733
    /* handle attribute names with multibyte chars */
3734
291k
    len = lexer->lexsize - start;
3735
291k
    attr = (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3736
291k
                                      lexer->lexbuf+start, len) : NULL);
3737
291k
    lexer->lexsize = start;
3738
291k
    return attr;
3739
682k
}
3740
3741
/*
3742
 invoked when < is seen in place of attribute value
3743
 but terminates on whitespace if not ASP, PHP or Tango
3744
 this routine recognizes ' and " quoted strings
3745
*/
3746
static int ParseServerInstruction( TidyDocImpl* doc )
3747
2.70k
{
3748
2.70k
    Lexer* lexer = doc->lexer;
3749
2.70k
    uint c;
3750
2.70k
    int delim = '"';
3751
2.70k
    Bool isrule = no;
3752
3753
2.70k
    c = TY_(ReadChar)(doc->docIn);
3754
2.70k
    TY_(AddCharToLexer)(lexer, c);
3755
3756
    /* check for ASP, PHP or Tango */
3757
2.70k
    if (c == '%' || c == '?' || c == '@')
3758
761
        isrule = yes;
3759
3760
2.70k
    for (;;)
3761
21.8k
    {
3762
21.8k
        c = TY_(ReadChar)(doc->docIn);
3763
3764
21.8k
        if (c == EndOfStream)
3765
133
            break;
3766
3767
21.7k
        if (c == '>')
3768
1.53k
        {
3769
1.53k
            if (isrule)
3770
724
                TY_(AddCharToLexer)(lexer, c);
3771
808
            else
3772
808
                TY_(UngetChar)(c, doc->docIn);
3773
3774
1.53k
            break;
3775
1.53k
        }
3776
3777
        /* if not recognized as ASP, PHP or Tango */
3778
        /* then also finish value on whitespace */
3779
20.1k
        if (!isrule)
3780
7.28k
        {
3781
7.28k
            if (TY_(IsWhite)(c))
3782
686
                break;
3783
7.28k
        }
3784
3785
19.5k
        TY_(AddCharToLexer)(lexer, c);
3786
3787
19.5k
        if (c == '"')
3788
482
        {
3789
482
            do
3790
1.71k
            {
3791
1.71k
                c = TY_(ReadChar)(doc->docIn);
3792
1.71k
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3793
27
                {
3794
27
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3795
27
                    TY_(UngetChar)(c, doc->docIn);
3796
27
                    return 0;
3797
27
                }
3798
1.69k
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3799
220
                {
3800
220
                    TY_(UngetChar)(c, doc->docIn);
3801
220
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3802
220
                    return 0;
3803
220
                }
3804
1.47k
                TY_(AddCharToLexer)(lexer, c);
3805
1.47k
            }
3806
1.47k
            while (c != '"');
3807
235
            delim = '\'';
3808
235
            continue;
3809
482
        }
3810
3811
19.0k
        if (c == '\'')
3812
411
        {
3813
411
            do
3814
771k
            {
3815
771k
                c = TY_(ReadChar)(doc->docIn);
3816
771k
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3817
26
                {
3818
26
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3819
26
                    TY_(UngetChar)(c, doc->docIn);
3820
26
                    return 0;
3821
26
                }
3822
771k
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3823
85
                {
3824
85
                    TY_(UngetChar)(c, doc->docIn);
3825
85
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3826
85
                    return 0;
3827
85
                }
3828
771k
                TY_(AddCharToLexer)(lexer, c);
3829
771k
            }
3830
771k
            while (c != '\'');
3831
411
        }
3832
19.0k
    }
3833
3834
2.35k
    return delim;
3835
2.70k
}
3836
3837
/* values start with "=" or " = " etc. */
3838
/* doesn't consume the ">" at end of start tag */
3839
3840
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name,
3841
                          Bool foldCase, Bool *isempty, int *pdelim)
3842
291k
{
3843
291k
    Lexer* lexer = doc->lexer;
3844
291k
    int len = 0, start;
3845
291k
    Bool seen_gt = no;
3846
291k
    Bool munge = yes;
3847
291k
    uint c, lastc, delim, quotewarning;
3848
291k
    tmbstr value;
3849
3850
291k
    delim = (tmbchar) 0;
3851
291k
    *pdelim = '"';
3852
3853
    /*
3854
     Henry Zrepa reports that some folk are using the
3855
     embed element with script attributes where newlines
3856
     are significant and must be preserved
3857
    */
3858
291k
    if ( cfgBool(doc, TidyLiteralAttribs) )
3859
0
        munge = no;
3860
3861
 /* skip white space before the '=' */
3862
3863
291k
    for (;;)
3864
820k
    {
3865
820k
        c = TY_(ReadChar)(doc->docIn);
3866
3867
820k
        if (c == EndOfStream)
3868
1.18k
        {
3869
1.18k
            TY_(UngetChar)(c, doc->docIn);
3870
1.18k
            break;
3871
1.18k
        }
3872
3873
819k
        if (!TY_(IsWhite)(c))
3874
290k
           break;
3875
819k
    }
3876
3877
/*
3878
  c should be '=' if there is a value
3879
  other legal possibilities are white
3880
  space, '/' and '>'
3881
*/
3882
3883
291k
    if (c != '=' && c != '"' && c != '\'')
3884
229k
    {
3885
229k
        TY_(UngetChar)(c, doc->docIn);
3886
229k
        return NULL;
3887
229k
    }
3888
3889
 /* skip white space after '=' */
3890
3891
61.7k
    for (;;)
3892
64.0k
    {
3893
64.0k
        c = TY_(ReadChar)(doc->docIn);
3894
3895
64.0k
        if (c == EndOfStream)
3896
60
        {
3897
60
            TY_(UngetChar)(c, doc->docIn);
3898
60
            break;
3899
60
        }
3900
3901
64.0k
        if (!TY_(IsWhite)(c))
3902
61.6k
           break;
3903
64.0k
    }
3904
3905
 /* check for quote marks */
3906
3907
61.7k
    if (c == '"' || c == '\'')
3908
19.6k
        delim = c;
3909
42.0k
    else if (c == '<')
3910
2.70k
    {
3911
2.70k
        start = lexer->lexsize;
3912
2.70k
        TY_(AddCharToLexer)(lexer, c);
3913
2.70k
        *pdelim = ParseServerInstruction( doc );
3914
2.70k
        len = lexer->lexsize - start;
3915
2.70k
        lexer->lexsize = start;
3916
2.70k
        return (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3917
2.70k
                                          lexer->lexbuf+start, len) : NULL);
3918
2.70k
    }
3919
39.3k
    else
3920
39.3k
        TY_(UngetChar)(c, doc->docIn);
3921
3922
 /*
3923
   and read the value string
3924
   check for quote mark if needed
3925
 */
3926
3927
59.0k
    quotewarning = 0;
3928
59.0k
    start = lexer->lexsize;
3929
59.0k
    c = '\0';
3930
3931
59.0k
    for (;;)
3932
2.08M
    {
3933
2.08M
        lastc = c;  /* track last character */
3934
2.08M
        c = TY_(ReadChar)(doc->docIn);
3935
3936
2.08M
        if (c == EndOfStream)
3937
1.10k
        {
3938
1.10k
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3939
1.10k
            TY_(UngetChar)(c, doc->docIn);
3940
1.10k
            break;
3941
1.10k
        }
3942
3943
2.08M
        if (delim == (tmbchar)0)
3944
257k
        {
3945
257k
            if (c == '>')
3946
2.67k
            {
3947
2.67k
                TY_(UngetChar)(c, doc->docIn);
3948
2.67k
                break;
3949
2.67k
            }
3950
3951
255k
            if (c == '"' || c == '\'')
3952
4.19k
            {
3953
4.19k
                uint q = c;
3954
3955
                /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */
3956
                /* this doesn't handle <a title=foo"/> which browsers treat as  */
3957
                /* 'foo"/' nor  <a title=foo" /> which browser treat as 'foo"'  */
3958
                
3959
4.19k
                c = TY_(ReadChar)(doc->docIn);
3960
4.19k
                if (c == '>')
3961
637
                {
3962
637
                    TY_(AddCharToLexer)(lexer, q);
3963
637
                    TY_(UngetChar)(c, doc->docIn);
3964
637
                    break;
3965
637
                }
3966
3.55k
                else
3967
3.55k
                {
3968
3.55k
                    TY_(UngetChar)(c, doc->docIn);
3969
3.55k
                    c = q;
3970
3.55k
                }
3971
4.19k
            }
3972
3973
254k
            if (c == '<')
3974
23.8k
            {
3975
23.8k
                TY_(UngetChar)(c, doc->docIn);
3976
23.8k
                c = '>';
3977
23.8k
                TY_(UngetChar)(c, doc->docIn);
3978
23.8k
                TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3979
23.8k
                break;
3980
23.8k
            }
3981
3982
            /*
3983
             For cases like <br clear=all/> need to avoid treating /> as
3984
             part of the attribute value, however care is needed to avoid
3985
             so treating <a href=http://www.acme.com/> in this way, which
3986
             would map the <a> tag to <a href="http://www.acme.com"/>
3987
            */
3988
230k
            if (c == '/')
3989
2.78k
            {
3990
                /* peek ahead in case of /> */
3991
2.78k
                c = TY_(ReadChar)(doc->docIn);
3992
3993
2.78k
                if ( c == '>' && !TY_(IsUrl)(doc, name) )
3994
1.07k
                {
3995
1.07k
                    *isempty = yes;
3996
1.07k
                    TY_(UngetChar)(c, doc->docIn);
3997
1.07k
                    break;
3998
1.07k
                }
3999
4000
                /* unget peeked character */
4001
1.70k
                TY_(UngetChar)(c, doc->docIn);
4002
1.70k
                c = '/';
4003
1.70k
            }
4004
230k
        }
4005
1.82M
        else  /* delim is '\'' or '"' */
4006
1.82M
        {
4007
1.82M
            if (c == delim)
4008
19.4k
                break;
4009
4010
1.80M
            if (c == '\n' || c == '<' || c == '>')
4011
849k
                ++quotewarning;
4012
4013
1.80M
            if (c == '>')
4014
23.7k
                seen_gt = yes;
4015
1.80M
        }
4016
4017
2.03M
        if (c == '&')
4018
10.0k
        {
4019
10.0k
            TY_(AddCharToLexer)(lexer, c);
4020
10.0k
            ParseEntity( doc, IgnoreWhitespace );
4021
10.0k
            if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge)
4022
977
                ChangeChar(lexer, ' ');
4023
10.0k
            continue;
4024
10.0k
        }
4025
4026
        /*
4027
         kludge for JavaScript attribute values
4028
         with line continuations in string literals
4029
        */
4030
2.02M
        if (c == '\\')
4031
1.19k
        {
4032
1.19k
            c = TY_(ReadChar)(doc->docIn);
4033
4034
1.19k
            if (c != '\n')
4035
661
            {
4036
661
                TY_(UngetChar)(c, doc->docIn);
4037
661
                c = '\\';
4038
661
            }
4039
1.19k
        }
4040
4041
2.02M
        if (TY_(IsWhite)(c))
4042
1.59M
        {
4043
1.59M
            if ( delim == 0 )
4044
10.1k
                break;
4045
4046
1.58M
            if (munge)
4047
1.58M
            {
4048
                /* discard line breaks in quoted URLs */ 
4049
                /* #438650 - fix by Randy Waki */
4050
1.58M
                if ( c == '\n' && TY_(IsUrl)(doc, name) )
4051
765
                {
4052
                    /* warn that we discard this newline */
4053
765
                    TY_(ReportAttrError)( doc, lexer->token, NULL, NEWLINE_IN_URI);
4054
765
                    continue;
4055
765
                }
4056
                
4057
1.58M
                c = ' ';
4058
4059
1.58M
                if (lastc == ' ')
4060
1.57M
                {
4061
1.57M
                    if (TY_(IsUrl)(doc, name) )
4062
1.28k
                        TY_(ReportAttrError)( doc, lexer->token, NULL, WHITE_IN_URI);
4063
1.57M
                    continue;
4064
1.57M
                }
4065
1.58M
            }
4066
1.58M
        }
4067
433k
        else if (foldCase && TY_(IsUpper)(c))
4068
455
            c = TY_(ToLower)(c);
4069
4070
439k
        TY_(AddCharToLexer)(lexer, c);
4071
439k
    }
4072
4073
59.0k
    if (quotewarning > 10 && seen_gt && munge)
4074
3.32k
    {
4075
        /*
4076
           there is almost certainly a missing trailing quote mark
4077
           as we have see too many newlines, < or > characters.
4078
4079
           an exception is made for Javascript attributes and the
4080
           javascript URL scheme which may legitimately include < and >,
4081
           and for attributes starting with "<xml " as generated by
4082
           Microsoft Office.
4083
        */
4084
3.32k
        if ( !TY_(IsScript)(doc, name) &&
4085
3.32k
             !(TY_(IsUrl)(doc, name) && TY_(tmbstrncmp)(lexer->lexbuf+start, "javascript:", 11) == 0) &&
4086
3.32k
             !(TY_(tmbstrncmp)(lexer->lexbuf+start, "<xml ", 5) == 0)
4087
3.32k
           )
4088
884
            TY_(Report)( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE ); 
4089
3.32k
    }
4090
4091
59.0k
    len = lexer->lexsize - start;
4092
59.0k
    lexer->lexsize = start;
4093
4094
4095
59.0k
    if (len > 0 || delim)
4096
57.2k
    {
4097
        /* ignore leading and trailing white space for all but title, alt, value */
4098
        /* and prompts attributes unless --literal-attributes is set to yes      */
4099
        /* #994841 - Whitespace is removed from value attributes                 */
4100
4101
        /* Issue #217 - Also only if/while (len > 0) - MUST NEVER GO NEGATIVE! */
4102
57.2k
        if ((len > 0) && munge &&
4103
57.2k
            TY_(tmbstrcasecmp)(name, "alt") &&
4104
57.2k
            TY_(tmbstrcasecmp)(name, "title") &&
4105
57.2k
            TY_(tmbstrcasecmp)(name, "value") &&
4106
57.2k
            TY_(tmbstrcasecmp)(name, "prompt"))
4107
55.6k
        {
4108
56.6k
            while (TY_(IsWhite)(lexer->lexbuf[start+len-1]) && (len > 0))
4109
958
                --len;
4110
4111
            /* Issue #497 - Fix leading space trimming */
4112
56.1k
            while (TY_(IsWhite)(lexer->lexbuf[start]) && (len > 0))
4113
460
            {
4114
460
                ++start;
4115
460
                --len;
4116
460
            }
4117
55.6k
        }
4118
4119
57.2k
        value = TY_(tmbstrndup)(doc->allocator, lexer->lexbuf + start, len);
4120
57.2k
    }
4121
1.83k
    else
4122
1.83k
        value = NULL;
4123
4124
    /* note delimiter if given */
4125
59.0k
    *pdelim = delim;
4126
4127
59.0k
    return value;
4128
61.7k
}
4129
4130
/* attr must be non-NULL */
4131
static Bool IsValidAttrName( ctmbstr attr )
4132
288k
{
4133
288k
    uint i, c = attr[0];
4134
4135
    /* first character should be a letter */
4136
288k
    if (!TY_(IsLetter)(c))
4137
134k
        return no;
4138
4139
    /* remaining characters should be namechars */
4140
514k
    for( i = 1; i < TY_(tmbstrlen)(attr); i++)
4141
380k
    {
4142
380k
        c = attr[i];
4143
4144
380k
        if (TY_(IsNamechar)(c))
4145
360k
            continue;
4146
4147
19.2k
        return no;
4148
380k
    }
4149
4150
134k
    return yes;
4151
153k
}
4152
4153
/* create a new attribute */
4154
AttVal *TY_(NewAttribute)( TidyDocImpl* doc )
4155
503k
{
4156
503k
    AttVal *av = (AttVal*) TidyDocAlloc( doc, sizeof(AttVal) );
4157
503k
    TidyClearMemory( av, sizeof(AttVal) );
4158
503k
    return av;
4159
503k
}
4160
4161
/* create a new attribute with given name and value */
4162
AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
4163
                             int delim )
4164
25.1k
{
4165
25.1k
    AttVal *av = TY_(NewAttribute)(doc);
4166
25.1k
    av->attribute = TY_(tmbstrdup)(doc->allocator, name);
4167
25.1k
    av->value = TY_(tmbstrdup)(doc->allocator, value);
4168
25.1k
    av->delim = delim;
4169
25.1k
    av->dict = TY_(FindAttribute)( doc, av );
4170
25.1k
    return av;
4171
25.1k
}
4172
4173
static void AddAttrToList( AttVal** list, AttVal* av )
4174
148k
{
4175
148k
  if ( *list == NULL )
4176
117k
    *list = av;
4177
30.8k
  else
4178
30.8k
  {
4179
30.8k
    AttVal* here = *list;
4180
467k
    while ( here->next )
4181
436k
      here = here->next;
4182
30.8k
    here->next = av;
4183
30.8k
  }
4184
148k
}
4185
4186
void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av )
4187
6.07k
{
4188
6.07k
    AddAttrToList(&node->attributes, av);
4189
6.07k
}
4190
4191
void TY_(InsertAttributeAtStart)( Node *node, AttVal *av )
4192
25.1k
{
4193
25.1k
    av->next = node->attributes;
4194
25.1k
    node->attributes = av;
4195
25.1k
}
4196
4197
/* swallows closing '>' */
4198
4199
static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty )
4200
391k
{
4201
391k
    Lexer* lexer = doc->lexer;
4202
391k
    AttVal *av, *list;
4203
391k
    tmbstr value;
4204
391k
    int delim;
4205
391k
    Node *asp, *php;
4206
4207
391k
    list = NULL;
4208
4209
683k
    while ( !EndOfInput(doc) )
4210
675k
    {
4211
675k
        tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php );
4212
4213
675k
        if (attribute == NULL)
4214
387k
        {
4215
            /* check if attributes are created by ASP markup */
4216
387k
            if (asp)
4217
806
            {
4218
806
                av = TY_(NewAttribute)(doc);
4219
806
                av->asp = asp;
4220
806
                AddAttrToList( &list, av ); 
4221
806
                continue;
4222
806
            }
4223
4224
            /* check if attributes are created by PHP markup */
4225
386k
            if (php)
4226
2.03k
            {
4227
2.03k
                av = TY_(NewAttribute)(doc);
4228
2.03k
                av->php = php;
4229
2.03k
                AddAttrToList( &list, av ); 
4230
2.03k
                continue;
4231
2.03k
            }
4232
4233
384k
            break;
4234
386k
        }
4235
4236
288k
        value = ParseValue( doc, attribute, no, isempty, &delim );
4237
4238
288k
        if (attribute && (IsValidAttrName(attribute) ||
4239
288k
            (cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute))))
4240
134k
        {
4241
134k
            av = TY_(NewAttribute)(doc);
4242
134k
            av->delim = delim ? delim : '"';
4243
134k
            av->attribute = attribute;
4244
134k
            av->value = value;
4245
134k
            av->dict = TY_(FindAttribute)( doc, av );
4246
134k
            AddAttrToList( &list, av );
4247
134k
            if ( !delim && value )
4248
33.4k
                TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK_OPEN);
4249
134k
        }
4250
153k
        else
4251
153k
        {
4252
153k
            av = TY_(NewAttribute)(doc);
4253
153k
            av->attribute = attribute;
4254
153k
            av->value = value;
4255
4256
153k
            if (LastChar(attribute) == '"')
4257
3.15k
                TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK);
4258
150k
            else if (value == NULL)
4259
131k
                TY_(ReportAttrError)(doc, lexer->token, av, MISSING_ATTR_VALUE);
4260
18.5k
            else
4261
18.5k
                TY_(ReportAttrError)(doc, lexer->token, av, INVALID_ATTRIBUTE);
4262
4263
153k
            TY_(FreeAttribute)( doc, av );
4264
153k
        }
4265
288k
    }
4266
4267
391k
    return list;
4268
391k
}
4269
4270
/*
4271
  Returns document type declarations like
4272
4273
  <!DOCTYPE foo PUBLIC "fpi" "sysid">
4274
  <!DOCTYPE bar SYSTEM "sysid">
4275
  <!DOCTYPE baz [ <!ENTITY ouml "&#246"> ]>
4276
4277
  as
4278
4279
  <foo PUBLIC="fpi" SYSTEM="sysid" />
4280
  <bar SYSTEM="sysid" />
4281
  <baz> &lt;!ENTITY ouml &quot;&amp;#246&quot;&gt; </baz>
4282
*/
4283
static Node *ParseDocTypeDecl(TidyDocImpl* doc)
4284
55.0k
{
4285
55.0k
    Lexer *lexer = doc->lexer;
4286
55.0k
    int start = lexer->lexsize;
4287
55.0k
    ParseDocTypeDeclState state = DT_DOCTYPENAME;
4288
55.0k
    uint c;
4289
55.0k
    uint delim = 0;
4290
55.0k
    Bool hasfpi = yes;
4291
4292
55.0k
    Node* node = TY_(NewNode)(lexer->allocator, lexer);
4293
55.0k
    node->type = DocTypeTag;
4294
55.0k
    node->start = lexer->txtstart;
4295
55.0k
    node->end = lexer->txtend;
4296
4297
55.0k
    lexer->waswhite = no;
4298
4299
    /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */
4300
4301
633k
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
4302
633k
    {
4303
        /* convert newlines to spaces */
4304
633k
        if (state != DT_INTSUBSET)
4305
533k
            c = c == '\n' ? ' ' : c;
4306
4307
        /* convert white-space sequences to single space character */
4308
633k
        if (TY_(IsWhite)(c) && state != DT_INTSUBSET)
4309
297k
        {
4310
297k
            if (!lexer->waswhite)
4311
10.2k
            {
4312
10.2k
                TY_(AddCharToLexer)(lexer, c);
4313
10.2k
                lexer->waswhite = yes;
4314
10.2k
            }
4315
286k
            else
4316
286k
            {
4317
                /* discard space */
4318
286k
                continue;
4319
286k
            }
4320
297k
        }
4321
336k
        else
4322
336k
        {
4323
336k
            TY_(AddCharToLexer)(lexer, c);
4324
336k
            lexer->waswhite = no;
4325
336k
        }
4326
4327
346k
        switch(state)
4328
346k
        {
4329
90.4k
        case DT_INTERMEDIATE:
4330
            /* determine what's next */
4331
90.4k
            if (TY_(ToUpper)(c) == 'P' || TY_(ToUpper)(c) == 'S')
4332
2.00k
            {
4333
2.00k
                start = lexer->lexsize - 1;
4334
2.00k
                state = DT_PUBLICSYSTEM;
4335
2.00k
                continue;
4336
2.00k
            }
4337
88.4k
            else if (c == '[')
4338
1.38k
            {
4339
1.38k
                start = lexer->lexsize;
4340
1.38k
                state = DT_INTSUBSET;
4341
1.38k
                continue;
4342
1.38k
            }
4343
87.0k
            else if (c == '\'' || c == '"')
4344
6.11k
            {
4345
6.11k
                start = lexer->lexsize;
4346
6.11k
                delim = c;
4347
6.11k
                state = DT_QUOTEDSTRING;
4348
6.11k
                continue;
4349
6.11k
            }
4350
80.9k
            else if (c == '>')
4351
54.7k
            {
4352
54.7k
                AttVal* si;
4353
4354
54.7k
                node->end = --(lexer->lexsize);
4355
4356
54.7k
                si = TY_(GetAttrByName)(node, "SYSTEM");
4357
54.7k
                if (si)
4358
844
                    TY_(CheckUrl)(doc, node, si);
4359
4360
54.7k
                if (!node->element || !IsValidXMLElemName(node->element))
4361
43.4k
                {
4362
43.4k
                    TY_(Report)(doc, NULL, NULL, MALFORMED_DOCTYPE);
4363
43.4k
                    TY_(FreeNode)(doc, node);
4364
43.4k
                    return NULL;
4365
43.4k
                }
4366
11.3k
                return node;
4367
54.7k
            }
4368
26.1k
            else
4369
26.1k
            {
4370
                /* error */
4371
26.1k
            }
4372
26.1k
            break;
4373
123k
        case DT_DOCTYPENAME:
4374
            /* read document type name */
4375
123k
            if (TY_(IsWhite)(c) || c == '>' || c == '[')
4376
54.9k
            {
4377
54.9k
                node->element = TY_(tmbstrndup)(doc->allocator,
4378
54.9k
                                                lexer->lexbuf + start,
4379
54.9k
                                                lexer->lexsize - start - 1);
4380
54.9k
                if (c == '>' || c == '[')
4381
50.2k
                {
4382
50.2k
                    --(lexer->lexsize);
4383
50.2k
                    TY_(UngetChar)(c, doc->docIn);
4384
50.2k
                }
4385
4386
54.9k
                state = DT_INTERMEDIATE;
4387
54.9k
                continue;
4388
54.9k
            }
4389
68.9k
            break;
4390
68.9k
        case DT_PUBLICSYSTEM:
4391
            /* read PUBLIC/SYSTEM */
4392
7.06k
            if (TY_(IsWhite)(c) || c == '>')
4393
1.98k
            {
4394
1.98k
                char *attname = TY_(tmbstrndup)(doc->allocator,
4395
1.98k
                                                lexer->lexbuf + start,
4396
1.98k
                                                lexer->lexsize - start - 1);
4397
1.98k
                hasfpi = !(TY_(tmbstrcasecmp)(attname, "SYSTEM") == 0);
4398
4399
1.98k
                TidyDocFree(doc, attname);
4400
4401
                /* todo: report an error if SYSTEM/PUBLIC not uppercase */
4402
4403
1.98k
                if (c == '>')
4404
769
                {
4405
769
                    --(lexer->lexsize);
4406
769
                    TY_(UngetChar)(c, doc->docIn);
4407
769
                }
4408
4409
1.98k
                state = DT_INTERMEDIATE;
4410
1.98k
                continue;
4411
1.98k
            }
4412
5.07k
            break;
4413
25.4k
        case DT_QUOTEDSTRING:
4414
            /* read quoted string */
4415
25.4k
            if (c == delim)
4416
6.07k
            {
4417
6.07k
                char *value = TY_(tmbstrndup)(doc->allocator,
4418
6.07k
                                              lexer->lexbuf + start,
4419
6.07k
                                              lexer->lexsize - start - 1);
4420
6.07k
                AttVal* att = TY_(AddAttribute)(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value);
4421
6.07k
                TidyDocFree(doc, value);
4422
6.07k
                att->delim = delim;
4423
6.07k
                hasfpi = no;
4424
6.07k
                state = DT_INTERMEDIATE;
4425
6.07k
                delim = 0;
4426
6.07k
                continue;
4427
6.07k
            }
4428
19.3k
            break;
4429
99.7k
        case DT_INTSUBSET:
4430
            /* read internal subset */
4431
99.7k
            if (c == ']')
4432
1.34k
            {
4433
1.34k
                Node* subset;
4434
1.34k
                lexer->txtstart = start;
4435
1.34k
                lexer->txtend = lexer->lexsize - 1;
4436
1.34k
                subset = TY_(TextToken)(lexer);
4437
1.34k
                TY_(InsertNodeAtEnd)(node, subset);
4438
1.34k
                state = DT_INTERMEDIATE;
4439
1.34k
            }
4440
99.7k
            break;
4441
346k
        }
4442
346k
    }
4443
4444
    /* document type declaration not finished */
4445
248
    TY_(Report)(doc, NULL, NULL, MALFORMED_DOCTYPE);
4446
248
    TY_(FreeNode)(doc, node);
4447
248
    return NULL;
4448
55.0k
}
4449
4450
4451
/****************************************************************************//*
4452
 ** MARK: - Node Stack
4453
 ***************************************************************************/
4454
4455
4456
/**
4457
 * Create a new stack with a given starting capacity. If memory allocation
4458
 * fails, then the allocator will panic the program automatically.
4459
 */
4460
Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
4461
17.0k
{
4462
17.0k
    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));
4463
17.0k
    stack->top = -1;
4464
17.0k
    stack->capacity = capacity;
4465
17.0k
    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node**));
4466
17.0k
    stack->allocator = doc->allocator;
4467
17.0k
    return stack;
4468
17.0k
}
4469
 
4470
4471
/**
4472
 *  Increase the stack size. This will be called automatically when the
4473
 *  current stack is full. If memory allocation fails, then the allocator
4474
 *  will panic the program automatically.
4475
 */
4476
void TY_(growStack)(Stack *stack)
4477
44
{
4478
44
    uint new_capacity = stack->capacity * 2;
4479
    
4480
44
    Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node**));
4481
    
4482
44
    memcpy( firstNode, stack->firstNode, sizeof(Node**) * (stack->top + 1) );
4483
44
    TidyFree(stack->allocator, stack->firstNode);
4484
4485
44
    stack->firstNode = firstNode;
4486
44
    stack->capacity = new_capacity;
4487
44
}
4488
4489
4490
/**
4491
 * Stack is full when top is equal to the last index.
4492
 */
4493
Bool TY_(stackFull)(Stack *stack)
4494
1.68M
{
4495
1.68M
    return stack->top == stack->capacity - 1;
4496
1.68M
}
4497
4498
4499
/**
4500
 * Stack is empty when top is equal to -1
4501
 */
4502
Bool TY_(stackEmpty)(Stack *stack)
4503
140k
{
4504
140k
    return stack->top == -1;
4505
140k
}
4506
 
4507
4508
/**
4509
 * Push an item to the stack.
4510
 */
4511
void TY_(push)(Stack *stack, Node *node)
4512
1.68M
{
4513
1.68M
    if (TY_(stackFull)(stack))
4514
44
        TY_(growStack)(stack);
4515
    
4516
1.68M
    if (node)
4517
123k
        stack->firstNode[++stack->top] = node;
4518
1.68M
}
4519
4520
4521
/**
4522
 * Pop an item from the stack.
4523
 */
4524
Node* TY_(pop)(Stack *stack)
4525
140k
{
4526
140k
    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
4527
140k
}
4528
4529
4530
/**
4531
 * Peek at the stack.
4532
 */
4533
FUNC_UNUSED Node* TY_(peek)(Stack *stack)
4534
0
{
4535
0
    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
4536
0
}
4537
4538
/**
4539
 *  Frees the stack when done.
4540
 */
4541
void TY_(freeStack)(Stack *stack)
4542
17.0k
{
4543
17.0k
    TidyFree( stack->allocator, stack->firstNode );
4544
17.0k
    stack->top = -1;
4545
17.0k
    stack->capacity = 0;
4546
17.0k
    stack->firstNode = NULL;
4547
17.0k
    stack->allocator = NULL;
4548
17.0k
}