Coverage Report

Created: 2025-10-10 06:53

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tidy-html5/src/lexer.c
Line
Count
Source
1
/* lexer.c -- Lexer for html parser
2
  
3
  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
4
  See tidy.h for the copyright notice.
5
6
*/
7
8
/*
9
  Given a file stream fp it returns a sequence of tokens.
10
11
     GetToken(fp) gets the next token
12
     UngetToken(fp) provides one level undo
13
14
  The tags include an attribute list:
15
16
    - linked list of attribute/value nodes
17
    - each node has 2 NULL-terminated strings.
18
    - entities are replaced in attribute values
19
20
  white space is compacted if not in preformatted mode
21
  If not in preformatted mode then leading white space
22
  is discarded and subsequent white space sequences
23
  compacted to single space characters.
24
25
  If XmlTags is no then Tag names are folded to upper
26
  case and attribute names to lower case.
27
28
 Not yet done:
29
    -   Doctype subset and marked sections
30
*/
31
32
#include "tidy-int.h"
33
#include "lexer.h"
34
#include "parser.h"
35
#include "entities.h"
36
#include "streamio.h"
37
#include "message.h"
38
#include "tmbstr.h"
39
#include "clean.h"
40
#include "utf8.h"
41
#include "streamio.h"
42
#include "sprtf.h"
43
44
#if defined(ENABLE_DEBUG_LOG)
45
/* #define DEBUG_ALLOCATION   special EXTRA allocation debug information - VERY NOISY */
46
static void check_me(char *name);
47
static Bool show_attrs = yes;
48
#define MX_TXT 8
49
static char buffer[(MX_TXT*4)+8]; /* NOTE extra for '...'\0 tail */
50
/* Debug helper: build a short printable form of a node's text.
   Reads the bytes lexer->lexbuf[node->start .. node->end) and writes them
   into the file-static `buffer`, which is returned.
     - '\n' and '\t' are written as the two characters "\n" and "\t"
     - a run of spaces is written as a single space
     - if the text is longer than (MX_TXT * 2) + 3 bytes, only the head
       (about MX_TXT output characters), up to three '.' characters and the
       last MX_TXT input bytes are written
   The result lives in a single static buffer, so each call overwrites the
   previous result. */
static tmbstr get_text_string(Lexer* lexer, Node *node)
51
{
52
    uint len = node->end - node->start;
53
    tmbstr cp = lexer->lexbuf + node->start;
54
    tmbstr end = lexer->lexbuf + node->end;
55
    unsigned char c;
56
    uint i = 0;
57
    Bool insp = no;
58
    if (len <= ((MX_TXT * 2) + 3)) {
59
        buffer[0] = 0;
60
        while (cp < end) {
61
            c = *cp;
62
            cp++;
63
            if (c == '\n') {
64
                buffer[i++] = '\\';
65
                buffer[i++] = 'n';
66
            } else if (c == '\t') {
67
                buffer[i++] = '\\';
68
                buffer[i++] = 't';
69
            } else if ( c == ' ' ) {
70
                if (!insp)
71
                    buffer[i++] = c;
72
                insp = yes;
73
            } else {
74
                buffer[i++] = c;
75
                insp = no;
76
            }
77
        }
78
    } else {
79
        char *end1 = cp + MX_TXT;
80
        char *bgn = cp + (len - MX_TXT);
81
        buffer[0] = 0;
82
        if (bgn < end1)
83
            bgn = end1;
84
        while (cp < end1) {
85
            c = *cp;
86
            cp++;
87
            if (c == '\n') {
88
                buffer[i++] = '\\';
89
                buffer[i++] = 'n';
90
            } else if (c == '\t') {
91
                buffer[i++] = '\\';
92
                buffer[i++] = 't';
93
            } else if ( c == ' ' ) {
94
                if (!insp)
95
                    buffer[i++] = c;
96
                insp = yes;
97
            } else {
98
                buffer[i++] = c;
99
                insp = no;
100
            }
101
            if (i >= MX_TXT)
102
                break;
103
        }
104
        c = '.';
105
        if ((i < len)&&(cp < bgn)) {
106
            buffer[i++] = c;
107
            cp++;
108
            if ((i < len)&&(cp < bgn)) {
109
                buffer[i++] = c;
110
                cp++;
111
                if ((i < len)&&(cp < bgn)) {
112
                    buffer[i++] = c;
113
                    cp++;
114
                }
115
            }
116
        }
117
        cp = bgn;
118
        insp = no;
119
        while (cp < end) {
120
            c = *cp;
121
            cp++;
122
            if (c == '\n') {
123
                buffer[i++] = '\\';
124
                buffer[i++] = 'n';
125
            } else if (c == '\t') {
126
                buffer[i++] = '\\';
127
                buffer[i++] = 't';
128
            } else if ( c == ' ' ) {
129
                if (!insp)
130
                    buffer[i++] = c;
131
                insp = yes;
132
            } else {
133
                buffer[i++] = c;
134
                insp = no;
135
            }
136
        }
137
    }
138
    buffer[i] = 0;
139
    return buffer;
140
}
141
/* Debug helper behind GTDBG(): print one line describing `node` via SPRTF.
   The line starts with the lexer's current line/column ("R=.. C=..").
   `msg` is echoed in the output; when it starts with "le" the line is
   tagged "lexer", otherwise "stream".
   Text nodes are shown with their abbreviated text and length; any other
   node is shown as <element attr="value" ...>.  When show_attrs is off only
   the node pointer (and element name) is printed.
   `node` may be NULL, in which case "NULL" is printed in its place. */
static void Show_Node( TidyDocImpl* doc, const char *msg, Node *node )
142
{
143
    Lexer* lexer = doc->lexer;
144
    Bool lex = ((msg[0] == 'l')&&(msg[1] == 'e')) ? yes : no;
145
    int line = ( doc->lexer ? doc->lexer->lines : 0 );
146
    int col  = ( doc->lexer ? doc->lexer->columns : 0 );
147
    tmbstr src = lex ? "lexer" : "stream";
148
    SPRTF("R=%d C=%d: ", line, col );
149
    /* DEBUG: Be able to set a TRAP on a SPECIFIC row,col */
150
    if ((line == 3) && (col == 1)) {
151
        check_me("Show_Node"); /* just a debug trap */
152
    }
153
    if (lexer && lexer->token && 
154
        ((lexer->token->type == TextNode)||(node && (node->type == TextNode)))) {
155
        if (show_attrs) {
156
            uint len = node ? node->end - node->start : 0;
157
            tmbstr cp = node ? get_text_string( lexer, node ) : "NULL";
158
            SPRTF("Returning %s TextNode [%s]%u %s\n", msg, cp, len, src );
159
        } else {
160
            SPRTF("Returning %s TextNode %p... %s\n", msg, node, src );
161
        }
162
    } else {
163
        tmbstr name = node ? node->element ? node->element : "blank" : "NULL";
164
        if (show_attrs) {
165
            AttVal* av;
166
            SPRTF("Returning %s node <%s", msg, name);
167
            if (node) {
168
                for (av = node->attributes; av; av = av->next) {
169
                    name = av->attribute;
170
                    if (name) {
171
                        SPRTF(" %s",name);
172
                        if (av->value) {
173
                            SPRTF("=\"%s\"", av->value);
174
                        }
175
                    }
176
                }
177
            }
178
            SPRTF("> %s\n", src);
179
        } else {
180
            SPRTF("Returning %s node %p <%s>... %s\n", msg, node,
181
                name, src );
182
        }
183
    }
184
}
185
#define GTDBG(a,b,c) Show_Node(a,b,c)
186
#else /* ENABLE_DEBUG_LOG */
187
#define GTDBG(a,b,c)
188
#endif /* defined(ENABLE_DEBUG_LOG) */
189
190
/* Forward references
191
*/
192
/* swallows closing '>' */
193
static AttVal *ParseAttrs( TidyDocImpl* doc, Bool *isempty );
194
195
static tmbstr ParseAttribute( TidyDocImpl* doc, Bool* isempty, 
196
                             Node **asp, Node **php );
197
198
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name, Bool foldCase,
199
                         Bool *isempty, int *pdelim );
200
201
static Node *ParseDocTypeDecl(TidyDocImpl* doc);
202
203
static void AddAttrToList( AttVal** list, AttVal* av );
204
205
/* used to classify characters for lexical purposes */
206
225M
/* Look up the lexical class bits for character code c.
   Codes below 128 index lexmap[]; any other code yields 0 (no class bits).
   The parameter is parenthesized so that an expression argument such as
   MAP(ch & 0x7f) or MAP(cond ? a : b) expands with the intended precedence.
   Note: the argument is evaluated twice, so avoid side effects in it. */
#define MAP(c) ((unsigned)(c) < 128 ? lexmap[(unsigned)(c)] : 0)
207
static uint lexmap[128];
208
209
20.5k
#define IsValidXMLAttrName(name) TY_(IsValidXMLID)(name)
210
36.5k
#define IsValidXMLElemName(name) TY_(IsValidXMLID)(name)
211
212
/* Table of the doctypes known to the lexer, terminated by an entry whose
   name is NULL (the lookup loops in this file stop on a NULL name).
     score    - rank used by HTMLVersion(): among matching entries the one
                with the lowest non-zero score is chosen
     vers     - version value/bit for this doctype (HT20, H41S, X10S, ...)
     vers_out - presumably the numeric version times 100 (200, 320, 401...)
                -- TODO confirm against its users
     xhtml    - yes for the XHTML entries
     name     - human readable name
     fpi      - identifier string matched by GetVersFromFPI(); NULL for HTML5
     si       - the DTD URL for this doctype, or NULL when there is none */
static struct _doctypes
213
{
214
    uint score;
215
    uint vers;
216
    uint vers_out;
217
    Bool xhtml;
218
    ctmbstr name;
219
    ctmbstr fpi;
220
    ctmbstr si;
221
} const W3C_Doctypes[] =
222
{
223
  {  2, HT20, 200, no,  "HTML 2.0",               "-//IETF//DTD HTML 2.0//EN",              NULL,                                                       },
224
  {  2, HT20, 200, no,  "HTML 2.0",               "-//IETF//DTD HTML//EN",                  NULL,                                                       },
225
  {  2, HT20, 200, no,  "HTML 2.0",               "-//W3C//DTD HTML 2.0//EN",               NULL,                                                       },
226
  {  1, HT32, 320, no,  "HTML 3.2",               "-//W3C//DTD HTML 3.2//EN",               NULL,                                                       },
227
  {  1, HT32, 320, no,  "HTML 3.2",               "-//W3C//DTD HTML 3.2 Final//EN",         NULL,                                                       },
228
  {  1, HT32, 320, no,  "HTML 3.2",               "-//W3C//DTD HTML 3.2 Draft//EN",         NULL,                                                       },
229
  {  6, H40S, 400, no,  "HTML 4.0 Strict",        "-//W3C//DTD HTML 4.0//EN",               "http://www.w3.org/TR/REC-html40/strict.dtd"                },
230
  {  8, H40T, 400, no,  "HTML 4.0 Transitional",  "-//W3C//DTD HTML 4.0 Transitional//EN",  "http://www.w3.org/TR/REC-html40/loose.dtd"                 },
231
  {  7, H40F, 400, no,  "HTML 4.0 Frameset",      "-//W3C//DTD HTML 4.0 Frameset//EN",      "http://www.w3.org/TR/REC-html40/frameset.dtd"              },
232
  {  3, H41S, 401, no,  "HTML 4.01 Strict",       "-//W3C//DTD HTML 4.01//EN",              "http://www.w3.org/TR/html4/strict.dtd"                     },
233
  {  5, H41T, 401, no,  "HTML 4.01 Transitional", "-//W3C//DTD HTML 4.01 Transitional//EN", "http://www.w3.org/TR/html4/loose.dtd"                      },
234
  {  4, H41F, 401, no,  "HTML 4.01 Frameset",     "-//W3C//DTD HTML 4.01 Frameset//EN",     "http://www.w3.org/TR/html4/frameset.dtd"                   },
235
  {  9, X10S, 100, yes, "XHTML 1.0 Strict",       "-//W3C//DTD XHTML 1.0 Strict//EN",       "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"         },
236
  { 11, X10T, 100, yes, "XHTML 1.0 Transitional", "-//W3C//DTD XHTML 1.0 Transitional//EN", "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"   },
237
  { 10, X10F, 100, yes, "XHTML 1.0 Frameset",     "-//W3C//DTD XHTML 1.0 Frameset//EN",     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-frameset.dtd"       },
238
  { 12, XH11, 110, yes, "XHTML 1.1",              "-//W3C//DTD XHTML 1.1//EN",              "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"              },
239
  { 13, XB10, 100, yes, "XHTML Basic 1.0",        "-//W3C//DTD XHTML Basic 1.0//EN",        "http://www.w3.org/TR/xhtml-basic/xhtml-basic10.dtd"        },
240
241
  { 20, HT50, 500, no,  "HTML5",                  NULL,                                     NULL                                                        },
242
  { 21, XH50, 500, yes, "XHTML5",                 NULL,                                     NULL                                                        },
243
244
  /* final entry */
245
  {  0,    0, 0,  no,  NULL,                     NULL,                                     NULL                                                        }
246
};
247
248
/* 
249
 * Issue #643 - Since VERS_FROM40 was extended to include VERS_HTML5
250
 * to be used in the expanded entity table some 155 times,
251
 * need a special macro here to denote just HTML 4 plus XHTML,
252
 * which is actually the former define of VERS_FROM40
253
 */
254
118k
#define VERS_HMTL40PX        (VERS_HTML40|VERS_XHTML11|VERS_BASIC)
255
256
/* Work out which HTML version applies to the document.
   Inputs read: doc->lexer->versions (the still-allowed version bits),
   doc->lexer->doctype, doc->lexer->isvoyager and the TidyDoctypeMode,
   TidyXmlOut and TidyHtmlOut options.
   Returns, in order of precedence:
     - XH50 when XHTML output applies and the doctype is VERS_UNKNOWN
     - HT50 when the doctype is VERS_UNKNOWN
     - HT50 when not XHTML and the doctype is VERS_HTML5
     - XH50 when XHTML, html5 mode, and the HTML5 bits left in `versions`
       are exactly XH50
     - otherwise the `vers` of the W3C_Doctypes entry with the lowest
       non-zero score whose `vers` bit is still set in `versions`
       (entries that are not XHTML are skipped for XHTML output, and
       entries outside VERS_HMTL40PX are skipped when html4 is set)
     - VERS_UNKNOWN when no entry matches
   Note on `html4`/`html5`: the conditional operator binds more loosely
   than ||, so each whole ||/&& chain is the condition of the ?: . */
int TY_(HTMLVersion)(TidyDocImpl* doc)
257
118k
{
258
118k
    uint i;
259
118k
    uint j = 0;
260
118k
    uint score = 0;
261
118k
    uint vers = doc->lexer->versions;
262
118k
    uint dtver = doc->lexer->doctype;
263
118k
    TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
264
118k
    Bool xhtml = (cfgBool(doc, TidyXmlOut) || doc->lexer->isvoyager) &&
265
118k
                 !cfgBool(doc, TidyHtmlOut);
266
118k
    Bool html4 = ((dtmode == TidyDoctypeStrict) || (dtmode == TidyDoctypeLoose) ||
267
118k
                  (VERS_HMTL40PX & dtver) ? yes : no);
268
118k
    Bool html5 = (!html4 && ((dtmode == TidyDoctypeAuto) ||
269
118k
                  (dtmode == TidyDoctypeHtml5)) ? yes : no);
270
271
118k
    if (xhtml && dtver == VERS_UNKNOWN) return XH50;
272
0
    if (dtver == VERS_UNKNOWN) return HT50;
273
    /* Issue #167 - if NOT XHTML, and doctype is default VERS_HTML5, then return HT50 */
274
0
    if (!xhtml && (dtver == VERS_HTML5)) return HT50;
275
    /* Issue #377 - If xhtml and (doctype == html5) and constrained vers contains XH50 return that,
276
       and really if tidy defaults to 'html5', then maybe 'auto' should also apply! */
277
0
    if (xhtml && html5 && ((vers & VERS_HTML5) == XH50)) return XH50;
278
279
0
    for (i = 0; W3C_Doctypes[i].name; ++i)
280
0
    {
281
0
        if ((xhtml && !(VERS_XHTML & W3C_Doctypes[i].vers)) ||
282
0
            (html4 && !(VERS_HMTL40PX & W3C_Doctypes[i].vers)))
283
0
            continue;
284
285
0
        if (vers & W3C_Doctypes[i].vers &&
286
0
            (W3C_Doctypes[i].score < score || !score))
287
0
        {
288
0
            score = W3C_Doctypes[i].score;
289
0
            j = i;
290
0
        }
291
0
    }
292
293
0
    if (score)
294
0
        return W3C_Doctypes[j].vers;
295
296
0
    return VERS_UNKNOWN;
297
0
}
298
299
/* Return the `fpi` string of the first W3C_Doctypes entry whose `vers`
   equals the given value, or NULL when no entry matches.  The returned
   string may itself be NULL (the HTML5 entries have no fpi). */
static ctmbstr GetFPIFromVers(uint vers)
300
0
{
301
0
    uint i;
302
303
0
    for (i = 0; W3C_Doctypes[i].name; ++i)
304
0
        if (W3C_Doctypes[i].vers == vers)
305
0
            return W3C_Doctypes[i].fpi;
306
307
0
    return NULL;
308
0
}
309
310
/* Return the `si` string of the first W3C_Doctypes entry whose `vers`
   equals the given value, or NULL when no entry matches.  The returned
   string may itself be NULL for entries that carry no si. */
static ctmbstr GetSIFromVers(uint vers)
311
0
{
312
0
    uint i;
313
314
0
    for (i = 0; W3C_Doctypes[i].name; ++i)
315
0
        if (W3C_Doctypes[i].vers == vers)
316
0
            return W3C_Doctypes[i].si;
317
318
0
    return NULL;
319
0
}
320
321
/* Return the human readable name of the first W3C_Doctypes entry whose
   `vers` equals the given value, or NULL when no entry matches. */
static ctmbstr GetNameFromVers(uint vers)
322
0
{
323
0
    uint i;
324
325
0
    for (i = 0; W3C_Doctypes[i].name; ++i)
326
0
        if (W3C_Doctypes[i].vers == vers)
327
0
            return W3C_Doctypes[i].name;
328
329
0
    return NULL;
330
0
}
331
332
/* Map an fpi string to its version value.
   Compares `fpi` with TY_(tmbstrcasecmp) against every W3C_Doctypes entry
   that has a non-NULL fpi and returns the `vers` of the first match,
   or 0 when none matches. */
static uint GetVersFromFPI(ctmbstr fpi)
333
0
{
334
0
    uint i;
335
336
0
    for (i = 0; W3C_Doctypes[i].name; ++i)
337
0
        if (W3C_Doctypes[i].fpi != NULL && TY_(tmbstrcasecmp)(W3C_Doctypes[i].fpi, fpi) == 0)
338
0
            return W3C_Doctypes[i].vers;
339
340
0
    return 0;
341
0
}
342
343
#ifdef ENABLE_DEBUG_LOG
344
#  ifndef EndBuf
345
#    define EndBuf(a)   ( a + strlen(a) )
346
#  endif
347
348
/* Issue #377 - Output diminishing version bits */
349
/* Debug only: pairs a version bit with the short label printed for it.
   The v2s[] table is walked in order by add_vers_string() and ends with an
   entry whose `val` is 0. */
typedef struct tagV2S {
350
    uint bit;
351
    ctmbstr val;
352
}V2S, *PV2S;
353
354
static V2S v2s[] = {
355
    { HT20, "HT20" },
356
    { HT32, "HT32" },
357
    { H40S, "H40S" },
358
    { H40T, "H40T" },
359
    { H40F, "H40F" },
360
    { H41S, "H41S" },
361
    { H41T, "H41T" },
362
    { H41F, "H41F" },
363
    { X10S, "X10S" },
364
    { X10T, "X10T" },
365
    { X10F, "X10F" },
366
    { XH11, "XH11" },
367
    { XB10, "XB10" }, /* 4096u */
368
    /* { VERS_SUN, "VSUN" }, */
369
    /* { VERS_NETSCAPE, "VNET" }, */
370
    /* { VERS_MICROSOFT, "VMIC" }, 32768u */
371
    { VERS_XML, "VXML" }, /* 65536u */
372
        /* HTML5 */
373
    { HT50, "HT50" }, /* 131072u */
374
    { XH50, "XH50" }, /* 262144u */
375
    { 0,     0  }
376
};
377
378
/* Process the above table, adding a bit name,
379
   or '----' when not present   */
380
/* Debug only: append a '|' separated description of `vers` to `buf`.
   For each v2s[] entry, in table order, the entry's label is appended when
   its bit is set in `vers`, otherwise "----".  The walk stops early once
   every set bit has been named.  Any bits left over that have no table
   entry are appended as a decimal number.
   `buf` must already hold a NUL-terminated string and be large enough for
   the result: strcat/sprintf are used with no bounds check.
   Returns `buf`. */
static char *add_vers_string( tmbstr buf, uint vers )
381
{
382
    PV2S pv2s = v2s;
383
    int len = (int)strlen(buf);
384
    while (pv2s->val) {
385
        if (vers & pv2s->bit) {
386
            if (len) {
387
                strcat(buf,"|");
388
                len++;
389
            }
390
            strcat(buf,pv2s->val);
391
            len += (int)strlen(pv2s->val);
392
            vers &= ~(pv2s->bit);
393
            if (!vers)
394
                break;
395
        } else {
396
            if (len) {
397
                strcat(buf,"|");
398
                len++;
399
            }
400
            strcat(buf,"----");
401
            len += 4;
402
403
        }
404
        pv2s++;
405
    }
406
    if (vers) { /* Should not have any here! */
407
        if (len)
408
            strcat(buf,"|");
409
        sprintf(EndBuf(buf),"%u",vers);
410
    }
411
    return buf;
412
413
}
414
415
/* Issue #377 - Show first Before: list, and then on any change
416
   Note that the VERS_PROPRIETARY bits are excluded since they always remain */
417
/* Debug build variant: restrict doc->lexer->versions to the bits in `vers`
   (the VERS_PROPRIETARY bits are always kept) and log the effect.
   When the mask actually changes, an "After :" line listing the remaining
   non-proprietary bits is printed; the very first change is preceded by a
   "Before:" line with the previous bits.
   Uses function-static state (vcur, dnfirst), so the "Before:" line is
   printed once per process, not once per document. */
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
418
{
419
    static char vcur[256];
420
    static Bool dnfirst = no;
421
    uint curr = doc->lexer->versions; /* get current */
422
    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
423
    if (curr != doc->lexer->versions) { /* only if different */
424
        if (!dnfirst) {
425
            dnfirst = yes;
426
            vcur[0] = 0;
427
            curr &= ~(VERS_PROPRIETARY);
428
            add_vers_string( vcur, curr );
429
            SPRTF("Before: %s\n", vcur);
430
        }
431
        vcur[0] = 0;
432
        curr = doc->lexer->versions;
433
        curr &= ~(VERS_PROPRIETARY);
434
        add_vers_string( vcur, curr );
435
        SPRTF("After : %s\n", vcur);
436
    }
437
}
438
#else /* !#if defined(ENABLE_DEBUG_LOG) */
439
/* everything is allowed in proprietary version of HTML */
440
/* this is handled here rather than in the tag/attr dicts */
441
/* Restrict doc->lexer->versions to the bits given in `vers`.
   The VERS_PROPRIETARY bits are OR-ed into the mask, so they are never
   cleared by this call. */
void TY_(ConstrainVersion)(TidyDocImpl* doc, uint vers)
442
475
{
443
475
    doc->lexer->versions &= (vers | VERS_PROPRIETARY);
444
475
}
445
#endif /* #if defined(ENABLE_DEBUG_LOG) y/n */
446
447
/* Return yes when the lexmap entry for `c` has the `white` bit set.
   MAP() yields 0 for codes of 128 and above, so those are never white. */
Bool TY_(IsWhite)(uint c)
448
212M
{
449
212M
    uint map = MAP(c);
450
451
212M
    return (map & white)!=0;
452
212M
}
453
454
/* Return yes when the lexmap entry for `c` has the `newline` bit set.
   Codes of 128 and above map to 0 and so return no. */
Bool TY_(IsNewline)(uint c)
455
0
{
456
0
    uint map = MAP(c);
457
0
    return (map & newline)!=0;
458
0
}
459
460
/* Return yes when the lexmap entry for `c` has the `digit` bit set.
   Codes of 128 and above map to 0 and so return no. */
Bool TY_(IsDigit)(uint c)
461
7
{
462
7
    uint map;
463
464
7
    map = MAP(c);
465
466
7
    return (map & digit)!=0;
467
7
}
468
469
/* Return yes when the lexmap entry for `c` has the `digithex` bit set.
   Codes of 128 and above map to 0 and so return no. */
static Bool IsDigitHex(uint c)
470
4.02k
{
471
4.02k
    uint map;
472
473
4.02k
    map = MAP(c);
474
475
4.02k
    return (map & digithex)!=0;
476
4.02k
}
477
478
/* Return yes when the lexmap entry for `c` has the `letter` bit set.
   Codes of 128 and above map to 0 and so return no. */
Bool TY_(IsLetter)(uint c)
479
1.63M
{
480
1.63M
    uint map;
481
482
1.63M
    map = MAP(c);
483
484
1.63M
    return (map & letter)!=0;
485
1.63M
}
486
487
/* Return non-zero when `c` is one of: space (0x20), tab (0x09),
   line feed (0x0a), form feed (0x0c) or carriage return (0x0d).
   Unlike the other classifiers here this does not consult lexmap. */
Bool TY_(IsHTMLSpace)(uint c)
488
0
{
489
0
    return c == 0x020 || c == 0x009 || c == 0x00a || c == 0x00c || c == 0x00d;
490
0
}
491
492
/* Return yes when the lexmap entry for `c` has the `namechar` bit set.
   Codes of 128 and above map to 0 and so return no. */
Bool TY_(IsNamechar)(uint c)
493
258k
{
494
258k
    uint map = MAP(c);
495
258k
    return (map & namechar)!=0;
496
258k
}
497
498
/* Return non-zero when code point `c` falls in one of the hard-coded
   letter ranges below (ASCII A-Z / a-z plus a long list of ranges and
   single code points up to 0xd7a3).
   Presumably this mirrors the "Letter" character class of the XML 1.0
   specification -- TODO confirm against the spec tables.
   NOTE(review): the last three tests (0x4e00-0x9fa5, 0x3007,
   0x3021-0x3029) repeat the three tests immediately before them. */
Bool TY_(IsXMLLetter)(uint c)
499
5.65M
{
500
5.65M
    return ((c >= 0x41 && c <= 0x5a) ||
501
5.61M
        (c >= 0x61 && c <= 0x7a) ||
502
5.33M
        (c >= 0xc0 && c <= 0xd6) ||
503
5.33M
        (c >= 0xd8 && c <= 0xf6) ||
504
5.33M
        (c >= 0xf8 && c <= 0xff) ||
505
5.33M
        (c >= 0x100 && c <= 0x131) ||
506
5.33M
        (c >= 0x134 && c <= 0x13e) ||
507
5.33M
        (c >= 0x141 && c <= 0x148) ||
508
5.33M
        (c >= 0x14a && c <= 0x17e) ||
509
5.33M
        (c >= 0x180 && c <= 0x1c3) ||
510
5.33M
        (c >= 0x1cd && c <= 0x1f0) ||
511
5.33M
        (c >= 0x1f4 && c <= 0x1f5) ||
512
5.33M
        (c >= 0x1fa && c <= 0x217) ||
513
5.33M
        (c >= 0x250 && c <= 0x2a8) ||
514
5.33M
        (c >= 0x2bb && c <= 0x2c1) ||
515
5.33M
        c == 0x386 ||
516
5.33M
        (c >= 0x388 && c <= 0x38a) ||
517
5.33M
        c == 0x38c ||
518
5.33M
        (c >= 0x38e && c <= 0x3a1) ||
519
5.33M
        (c >= 0x3a3 && c <= 0x3ce) ||
520
5.33M
        (c >= 0x3d0 && c <= 0x3d6) ||
521
5.33M
        c == 0x3da ||
522
5.33M
        c == 0x3dc ||
523
5.33M
        c == 0x3de ||
524
5.33M
        c == 0x3e0 ||
525
5.33M
        (c >= 0x3e2 && c <= 0x3f3) ||
526
5.33M
        (c >= 0x401 && c <= 0x40c) ||
527
5.33M
        (c >= 0x40e && c <= 0x44f) ||
528
5.33M
        (c >= 0x451 && c <= 0x45c) ||
529
5.33M
        (c >= 0x45e && c <= 0x481) ||
530
5.33M
        (c >= 0x490 && c <= 0x4c4) ||
531
5.33M
        (c >= 0x4c7 && c <= 0x4c8) ||
532
5.33M
        (c >= 0x4cb && c <= 0x4cc) ||
533
5.33M
        (c >= 0x4d0 && c <= 0x4eb) ||
534
5.33M
        (c >= 0x4ee && c <= 0x4f5) ||
535
5.33M
        (c >= 0x4f8 && c <= 0x4f9) ||
536
5.33M
        (c >= 0x531 && c <= 0x556) ||
537
5.33M
        c == 0x559 ||
538
5.33M
        (c >= 0x561 && c <= 0x586) ||
539
5.33M
        (c >= 0x5d0 && c <= 0x5ea) ||
540
5.33M
        (c >= 0x5f0 && c <= 0x5f2) ||
541
5.33M
        (c >= 0x621 && c <= 0x63a) ||
542
5.33M
        (c >= 0x641 && c <= 0x64a) ||
543
5.33M
        (c >= 0x671 && c <= 0x6b7) ||
544
5.33M
        (c >= 0x6ba && c <= 0x6be) ||
545
5.33M
        (c >= 0x6c0 && c <= 0x6ce) ||
546
5.33M
        (c >= 0x6d0 && c <= 0x6d3) ||
547
5.33M
        c == 0x6d5 ||
548
5.33M
        (c >= 0x6e5 && c <= 0x6e6) ||
549
5.33M
        (c >= 0x905 && c <= 0x939) ||
550
5.33M
        c == 0x93d ||
551
5.33M
        (c >= 0x958 && c <= 0x961) ||
552
5.33M
        (c >= 0x985 && c <= 0x98c) ||
553
5.33M
        (c >= 0x98f && c <= 0x990) ||
554
5.33M
        (c >= 0x993 && c <= 0x9a8) ||
555
5.33M
        (c >= 0x9aa && c <= 0x9b0) ||
556
5.33M
        c == 0x9b2 ||
557
5.33M
        (c >= 0x9b6 && c <= 0x9b9) ||
558
5.33M
        (c >= 0x9dc && c <= 0x9dd) ||
559
5.33M
        (c >= 0x9df && c <= 0x9e1) ||
560
5.33M
        (c >= 0x9f0 && c <= 0x9f1) ||
561
5.33M
        (c >= 0xa05 && c <= 0xa0a) ||
562
5.33M
        (c >= 0xa0f && c <= 0xa10) ||
563
5.33M
        (c >= 0xa13 && c <= 0xa28) ||
564
5.33M
        (c >= 0xa2a && c <= 0xa30) ||
565
5.33M
        (c >= 0xa32 && c <= 0xa33) ||
566
5.33M
        (c >= 0xa35 && c <= 0xa36) ||
567
5.33M
        (c >= 0xa38 && c <= 0xa39) ||
568
5.33M
        (c >= 0xa59 && c <= 0xa5c) ||
569
5.33M
        c == 0xa5e ||
570
5.33M
        (c >= 0xa72 && c <= 0xa74) ||
571
5.33M
        (c >= 0xa85 && c <= 0xa8b) ||
572
5.33M
        c == 0xa8d ||
573
5.33M
        (c >= 0xa8f && c <= 0xa91) ||
574
5.33M
        (c >= 0xa93 && c <= 0xaa8) ||
575
5.33M
        (c >= 0xaaa && c <= 0xab0) ||
576
5.33M
        (c >= 0xab2 && c <= 0xab3) ||
577
5.33M
        (c >= 0xab5 && c <= 0xab9) ||
578
5.33M
        c == 0xabd ||
579
5.33M
        c == 0xae0 ||
580
5.33M
        (c >= 0xb05 && c <= 0xb0c) ||
581
5.33M
        (c >= 0xb0f && c <= 0xb10) ||
582
5.33M
        (c >= 0xb13 && c <= 0xb28) ||
583
5.33M
        (c >= 0xb2a && c <= 0xb30) ||
584
5.33M
        (c >= 0xb32 && c <= 0xb33) ||
585
5.33M
        (c >= 0xb36 && c <= 0xb39) ||
586
5.33M
        c == 0xb3d ||
587
5.33M
        (c >= 0xb5c && c <= 0xb5d) ||
588
5.33M
        (c >= 0xb5f && c <= 0xb61) ||
589
5.33M
        (c >= 0xb85 && c <= 0xb8a) ||
590
5.33M
        (c >= 0xb8e && c <= 0xb90) ||
591
5.33M
        (c >= 0xb92 && c <= 0xb95) ||
592
5.33M
        (c >= 0xb99 && c <= 0xb9a) ||
593
5.33M
        c == 0xb9c ||
594
5.33M
        (c >= 0xb9e && c <= 0xb9f) ||
595
5.33M
        (c >= 0xba3 && c <= 0xba4) ||
596
5.33M
        (c >= 0xba8 && c <= 0xbaa) ||
597
5.33M
        (c >= 0xbae && c <= 0xbb5) ||
598
5.33M
        (c >= 0xbb7 && c <= 0xbb9) ||
599
5.33M
        (c >= 0xc05 && c <= 0xc0c) ||
600
5.33M
        (c >= 0xc0e && c <= 0xc10) ||
601
5.33M
        (c >= 0xc12 && c <= 0xc28) ||
602
5.33M
        (c >= 0xc2a && c <= 0xc33) ||
603
5.33M
        (c >= 0xc35 && c <= 0xc39) ||
604
5.33M
        (c >= 0xc60 && c <= 0xc61) ||
605
5.33M
        (c >= 0xc85 && c <= 0xc8c) ||
606
5.33M
        (c >= 0xc8e && c <= 0xc90) ||
607
5.33M
        (c >= 0xc92 && c <= 0xca8) ||
608
5.33M
        (c >= 0xcaa && c <= 0xcb3) ||
609
5.33M
        (c >= 0xcb5 && c <= 0xcb9) ||
610
5.33M
        c == 0xcde ||
611
5.33M
        (c >= 0xce0 && c <= 0xce1) ||
612
5.33M
        (c >= 0xd05 && c <= 0xd0c) ||
613
5.33M
        (c >= 0xd0e && c <= 0xd10) ||
614
5.33M
        (c >= 0xd12 && c <= 0xd28) ||
615
5.33M
        (c >= 0xd2a && c <= 0xd39) ||
616
5.33M
        (c >= 0xd60 && c <= 0xd61) ||
617
5.33M
        (c >= 0xe01 && c <= 0xe2e) ||
618
5.33M
        c == 0xe30 ||
619
5.33M
        (c >= 0xe32 && c <= 0xe33) ||
620
5.33M
        (c >= 0xe40 && c <= 0xe45) ||
621
5.33M
        (c >= 0xe81 && c <= 0xe82) ||
622
5.33M
        c == 0xe84 ||
623
5.33M
        (c >= 0xe87 && c <= 0xe88) ||
624
5.33M
        c == 0xe8a ||
625
5.33M
        c == 0xe8d ||
626
5.33M
        (c >= 0xe94 && c <= 0xe97) ||
627
5.33M
        (c >= 0xe99 && c <= 0xe9f) ||
628
5.33M
        (c >= 0xea1 && c <= 0xea3) ||
629
5.33M
        c == 0xea5 ||
630
5.33M
        c == 0xea7 ||
631
5.33M
        (c >= 0xeaa && c <= 0xeab) ||
632
5.33M
        (c >= 0xead && c <= 0xeae) ||
633
5.33M
        c == 0xeb0 ||
634
5.33M
        (c >= 0xeb2 && c <= 0xeb3) ||
635
5.33M
        c == 0xebd ||
636
5.33M
        (c >= 0xec0 && c <= 0xec4) ||
637
5.33M
        (c >= 0xf40 && c <= 0xf47) ||
638
5.33M
        (c >= 0xf49 && c <= 0xf69) ||
639
5.33M
        (c >= 0x10a0 && c <= 0x10c5) ||
640
5.33M
        (c >= 0x10d0 && c <= 0x10f6) ||
641
5.33M
        c == 0x1100 ||
642
5.33M
        (c >= 0x1102 && c <= 0x1103) ||
643
5.33M
        (c >= 0x1105 && c <= 0x1107) ||
644
5.33M
        c == 0x1109 ||
645
5.33M
        (c >= 0x110b && c <= 0x110c) ||
646
5.33M
        (c >= 0x110e && c <= 0x1112) ||
647
5.33M
        c == 0x113c ||
648
5.33M
        c == 0x113e ||
649
5.33M
        c == 0x1140 ||
650
5.33M
        c == 0x114c ||
651
5.33M
        c == 0x114e ||
652
5.33M
        c == 0x1150 ||
653
5.33M
        (c >= 0x1154 && c <= 0x1155) ||
654
5.33M
        c == 0x1159 ||
655
5.33M
        (c >= 0x115f && c <= 0x1161) ||
656
5.33M
        c == 0x1163 ||
657
5.33M
        c == 0x1165 ||
658
5.33M
        c == 0x1167 ||
659
5.33M
        c == 0x1169 ||
660
5.33M
        (c >= 0x116d && c <= 0x116e) ||
661
5.33M
        (c >= 0x1172 && c <= 0x1173) ||
662
5.33M
        c == 0x1175 ||
663
5.33M
        c == 0x119e ||
664
5.33M
        c == 0x11a8 ||
665
5.33M
        c == 0x11ab ||
666
5.33M
        (c >= 0x11ae && c <= 0x11af) ||
667
5.33M
        (c >= 0x11b7 && c <= 0x11b8) ||
668
5.33M
        c == 0x11ba ||
669
5.33M
        (c >= 0x11bc && c <= 0x11c2) ||
670
5.33M
        c == 0x11eb ||
671
5.33M
        c == 0x11f0 ||
672
5.33M
        c == 0x11f9 ||
673
5.33M
        (c >= 0x1e00 && c <= 0x1e9b) ||
674
5.33M
        (c >= 0x1ea0 && c <= 0x1ef9) ||
675
5.33M
        (c >= 0x1f00 && c <= 0x1f15) ||
676
5.33M
        (c >= 0x1f18 && c <= 0x1f1d) ||
677
5.33M
        (c >= 0x1f20 && c <= 0x1f45) ||
678
5.33M
        (c >= 0x1f48 && c <= 0x1f4d) ||
679
5.33M
        (c >= 0x1f50 && c <= 0x1f57) ||
680
5.33M
        c == 0x1f59 ||
681
5.33M
        c == 0x1f5b ||
682
5.33M
        c == 0x1f5d ||
683
5.33M
        (c >= 0x1f5f && c <= 0x1f7d) ||
684
5.33M
        (c >= 0x1f80 && c <= 0x1fb4) ||
685
5.33M
        (c >= 0x1fb6 && c <= 0x1fbc) ||
686
5.33M
        c == 0x1fbe ||
687
5.33M
        (c >= 0x1fc2 && c <= 0x1fc4) ||
688
5.33M
        (c >= 0x1fc6 && c <= 0x1fcc) ||
689
5.33M
        (c >= 0x1fd0 && c <= 0x1fd3) ||
690
5.33M
        (c >= 0x1fd6 && c <= 0x1fdb) ||
691
5.33M
        (c >= 0x1fe0 && c <= 0x1fec) ||
692
5.33M
        (c >= 0x1ff2 && c <= 0x1ff4) ||
693
5.33M
        (c >= 0x1ff6 && c <= 0x1ffc) ||
694
5.33M
        c == 0x2126 ||
695
5.33M
        (c >= 0x212a && c <= 0x212b) ||
696
5.33M
        c == 0x212e ||
697
5.33M
        (c >= 0x2180 && c <= 0x2182) ||
698
5.33M
        (c >= 0x3041 && c <= 0x3094) ||
699
5.33M
        (c >= 0x30a1 && c <= 0x30fa) ||
700
5.33M
        (c >= 0x3105 && c <= 0x312c) ||
701
5.33M
        (c >= 0xac00 && c <= 0xd7a3) ||
702
5.33M
        (c >= 0x4e00 && c <= 0x9fa5) ||
703
5.33M
        c == 0x3007 ||
704
5.33M
        (c >= 0x3021 && c <= 0x3029) ||
705
5.33M
        (c >= 0x4e00 && c <= 0x9fa5) ||
706
5.33M
        c == 0x3007 ||
707
5.33M
        (c >= 0x3021 && c <= 0x3029));
708
5.65M
}
709
710
/* Return non-zero when code point `c` may appear inside a name:
   anything accepted by TY_(IsXMLLetter), the punctuation '.', '_', ':'
   and '-', the ASCII digits 0x30-0x39, or one of the further hard-coded
   ranges and single code points listed below.
   Presumably these follow the NameChar classes of the XML 1.0
   specification -- TODO confirm against the spec tables. */
Bool TY_(IsXMLNamechar)(uint c)
711
5.60M
{
712
5.60M
    return (TY_(IsXMLLetter)(c) ||
713
5.31M
        c == '.' || c == '_' ||
714
5.31M
        c == ':' || c == '-' ||
715
5.30M
        (c >= 0x300 && c <= 0x345) ||
716
5.30M
        (c >= 0x360 && c <= 0x361) ||
717
5.30M
        (c >= 0x483 && c <= 0x486) ||
718
5.30M
        (c >= 0x591 && c <= 0x5a1) ||
719
5.30M
        (c >= 0x5a3 && c <= 0x5b9) ||
720
5.30M
        (c >= 0x5bb && c <= 0x5bd) ||
721
5.30M
        c == 0x5bf ||
722
5.30M
        (c >= 0x5c1 && c <= 0x5c2) ||
723
5.30M
        c == 0x5c4 ||
724
5.30M
        (c >= 0x64b && c <= 0x652) ||
725
5.30M
        c == 0x670 ||
726
5.30M
        (c >= 0x6d6 && c <= 0x6dc) ||
727
5.30M
        (c >= 0x6dd && c <= 0x6df) ||
728
5.30M
        (c >= 0x6e0 && c <= 0x6e4) ||
729
5.30M
        (c >= 0x6e7 && c <= 0x6e8) ||
730
5.30M
        (c >= 0x6ea && c <= 0x6ed) ||
731
5.30M
        (c >= 0x901 && c <= 0x903) ||
732
5.30M
        c == 0x93c ||
733
5.30M
        (c >= 0x93e && c <= 0x94c) ||
734
5.30M
        c == 0x94d ||
735
5.30M
        (c >= 0x951 && c <= 0x954) ||
736
5.30M
        (c >= 0x962 && c <= 0x963) ||
737
5.30M
        (c >= 0x981 && c <= 0x983) ||
738
5.30M
        c == 0x9bc ||
739
5.30M
        c == 0x9be ||
740
5.30M
        c == 0x9bf ||
741
5.30M
        (c >= 0x9c0 && c <= 0x9c4) ||
742
5.30M
        (c >= 0x9c7 && c <= 0x9c8) ||
743
5.30M
        (c >= 0x9cb && c <= 0x9cd) ||
744
5.30M
        c == 0x9d7 ||
745
5.30M
        (c >= 0x9e2 && c <= 0x9e3) ||
746
5.30M
        c == 0xa02 ||
747
5.30M
        c == 0xa3c ||
748
5.30M
        c == 0xa3e ||
749
5.30M
        c == 0xa3f ||
750
5.30M
        (c >= 0xa40 && c <= 0xa42) ||
751
5.30M
        (c >= 0xa47 && c <= 0xa48) ||
752
5.30M
        (c >= 0xa4b && c <= 0xa4d) ||
753
5.30M
        (c >= 0xa70 && c <= 0xa71) ||
754
5.30M
        (c >= 0xa81 && c <= 0xa83) ||
755
5.30M
        c == 0xabc ||
756
5.30M
        (c >= 0xabe && c <= 0xac5) ||
757
5.30M
        (c >= 0xac7 && c <= 0xac9) ||
758
5.30M
        (c >= 0xacb && c <= 0xacd) ||
759
5.30M
        (c >= 0xb01 && c <= 0xb03) ||
760
5.30M
        c == 0xb3c ||
761
5.30M
        (c >= 0xb3e && c <= 0xb43) ||
762
5.30M
        (c >= 0xb47 && c <= 0xb48) ||
763
5.30M
        (c >= 0xb4b && c <= 0xb4d) ||
764
5.30M
        (c >= 0xb56 && c <= 0xb57) ||
765
5.30M
        (c >= 0xb82 && c <= 0xb83) ||
766
5.30M
        (c >= 0xbbe && c <= 0xbc2) ||
767
5.30M
        (c >= 0xbc6 && c <= 0xbc8) ||
768
5.30M
        (c >= 0xbca && c <= 0xbcd) ||
769
5.30M
        c == 0xbd7 ||
770
5.30M
        (c >= 0xc01 && c <= 0xc03) ||
771
5.30M
        (c >= 0xc3e && c <= 0xc44) ||
772
5.30M
        (c >= 0xc46 && c <= 0xc48) ||
773
5.30M
        (c >= 0xc4a && c <= 0xc4d) ||
774
5.30M
        (c >= 0xc55 && c <= 0xc56) ||
775
5.30M
        (c >= 0xc82 && c <= 0xc83) ||
776
5.30M
        (c >= 0xcbe && c <= 0xcc4) ||
777
5.30M
        (c >= 0xcc6 && c <= 0xcc8) ||
778
5.30M
        (c >= 0xcca && c <= 0xccd) ||
779
5.30M
        (c >= 0xcd5 && c <= 0xcd6) ||
780
5.30M
        (c >= 0xd02 && c <= 0xd03) ||
781
5.30M
        (c >= 0xd3e && c <= 0xd43) ||
782
5.30M
        (c >= 0xd46 && c <= 0xd48) ||
783
5.30M
        (c >= 0xd4a && c <= 0xd4d) ||
784
5.30M
        c == 0xd57 ||
785
5.30M
        c == 0xe31 ||
786
5.30M
        (c >= 0xe34 && c <= 0xe3a) ||
787
5.30M
        (c >= 0xe47 && c <= 0xe4e) ||
788
5.30M
        c == 0xeb1 ||
789
5.30M
        (c >= 0xeb4 && c <= 0xeb9) ||
790
5.30M
        (c >= 0xebb && c <= 0xebc) ||
791
5.30M
        (c >= 0xec8 && c <= 0xecd) ||
792
5.30M
        (c >= 0xf18 && c <= 0xf19) ||
793
5.30M
        c == 0xf35 ||
794
5.30M
        c == 0xf37 ||
795
5.30M
        c == 0xf39 ||
796
5.30M
        c == 0xf3e ||
797
5.30M
        c == 0xf3f ||
798
5.30M
        (c >= 0xf71 && c <= 0xf84) ||
799
5.30M
        (c >= 0xf86 && c <= 0xf8b) ||
800
5.30M
        (c >= 0xf90 && c <= 0xf95) ||
801
5.30M
        c == 0xf97 ||
802
5.30M
        (c >= 0xf99 && c <= 0xfad) ||
803
5.30M
        (c >= 0xfb1 && c <= 0xfb7) ||
804
5.30M
        c == 0xfb9 ||
805
5.30M
        (c >= 0x20d0 && c <= 0x20dc) ||
806
5.30M
        c == 0x20e1 ||
807
5.30M
        (c >= 0x302a && c <= 0x302f) ||
808
5.30M
        c == 0x3099 ||
809
5.30M
        c == 0x309a ||
810
5.30M
        (c >= 0x30 && c <= 0x39) ||
811
1.60M
        (c >= 0x660 && c <= 0x669) ||
812
1.60M
        (c >= 0x6f0 && c <= 0x6f9) ||
813
1.60M
        (c >= 0x966 && c <= 0x96f) ||
814
1.60M
        (c >= 0x9e6 && c <= 0x9ef) ||
815
1.60M
        (c >= 0xa66 && c <= 0xa6f) ||
816
1.60M
        (c >= 0xae6 && c <= 0xaef) ||
817
1.60M
        (c >= 0xb66 && c <= 0xb6f) ||
818
1.60M
        (c >= 0xbe7 && c <= 0xbef) ||
819
1.60M
        (c >= 0xc66 && c <= 0xc6f) ||
820
1.60M
        (c >= 0xce6 && c <= 0xcef) ||
821
1.60M
        (c >= 0xd66 && c <= 0xd6f) ||
822
1.60M
        (c >= 0xe50 && c <= 0xe59) ||
823
1.60M
        (c >= 0xed0 && c <= 0xed9) ||
824
1.60M
        (c >= 0xf20 && c <= 0xf29) ||
825
1.60M
        c == 0xb7 ||
826
1.60M
        c == 0x2d0 ||
827
1.60M
        c == 0x2d1 ||
828
1.60M
        c == 0x387 ||
829
1.60M
        c == 0x640 ||
830
1.60M
        c == 0xe46 ||
831
1.60M
        c == 0xec6 ||
832
1.60M
        c == 0x3005 ||
833
1.60M
        (c >= 0x3031 && c <= 0x3035) ||
834
1.60M
        (c >= 0x309d && c <= 0x309e) ||
835
1.60M
        (c >= 0x30fc && c <= 0x30fe));
836
5.60M
}
837
838
/* Return yes when the lexmap entry for `c` has the `uppercase` bit set.
   Codes of 128 and above map to 0 and so return no. */
Bool TY_(IsUpper)(uint c)
839
258
{
840
258
    uint map = MAP(c);
841
842
258
    return (map & uppercase)!=0;
843
258
}
844
845
/* Return the lower-case form of `c`.
   Only codes whose lexmap entry has the `uppercase` bit are changed, by
   adding 'a' - 'A'; every other code (including all codes of 128 and
   above) is returned unchanged. */
uint TY_(ToLower)(uint c)
846
10.9M
{
847
10.9M
    uint map = MAP(c);
848
849
10.9M
    if (map & uppercase)
850
181k
        c += 'a' - 'A';
851
852
10.9M
    return c;
853
10.9M
}
854
855
/* Return the upper-case form of `c`.
   Only codes whose lexmap entry has the `lowercase` bit are changed;
   every other code (including all codes of 128 and above) is returned
   unchanged.  'A' - 'a' is negative, so the cast to uint makes the
   addition wrap around, which subtracts the case offset. */
uint TY_(ToUpper)(uint c)
856
152k
{
857
152k
    uint map = MAP(c);
858
859
152k
    if (map & lowercase)
860
4.00k
        c += (uint) ('A' - 'a' );
861
862
152k
    return c;
863
152k
}
864
865
/*
866
 return last character in string
867
 this is useful when trailing quotemark
868
 is missing on an attribute
869
*/
870
static tmbchar LastChar( tmbstr str )
871
20.3k
{
872
20.3k
    if ( str && *str )
873
20.3k
    {
874
20.3k
        int n = TY_(tmbstrlen)(str);
875
20.3k
        return str[n-1];
876
20.3k
    }
877
0
    return 0;
878
20.3k
}
879
880
/* Allocate a zeroed Lexer for doc and give it its starting state:
** position 1:1, LEX_CONTENT state, all versions allowed, unknown
** doctype, and a root pointer referring to doc->root.
** Returns NULL when the allocation yields NULL.
*/
Lexer* TY_(NewLexer)( TidyDocImpl* doc )
{
    Lexer* lex = (Lexer*) TidyDocAlloc( doc, sizeof(Lexer) );

    if ( lex == NULL )
        return NULL;

    TidyClearMemory( lex, sizeof(Lexer) );

    lex->allocator = doc->allocator;
    lex->root      = &doc->root;

    lex->state   = LEX_CONTENT;
    lex->lines   = 1;
    lex->columns = 1;

    lex->doctype  = VERS_UNKNOWN;
    lex->versions = (VERS_ALL|VERS_PROPRIETARY);

    return lex;
}
899
900
/* True when the input stream has nothing pushed back and
** TY_(IsEOF) reports end of file for it.
*/
static Bool EndOfInput( TidyDocImpl* doc )
{
    assert( doc->docIn != NULL );

    if ( doc->docIn->pushed )
        return no;

    return TY_(IsEOF)(doc->docIn) ? yes : no;
}
905
906
/* Release doc->lexer and everything it holds (styles, pending tokens,
** inline stack, text buffer), then clear doc->lexer.  Does nothing
** when the document has no lexer.
*/
void TY_(FreeLexer)( TidyDocImpl* doc )
{
    Lexer *lex = doc->lexer;

    if ( lex == NULL )
        return;

    TY_(FreeStyles)( doc );

    /* See GetToken() */
    if ( lex->pushed || lex->itoken )
    {
        if ( lex->pushed )
            TY_(FreeNode)( doc, lex->itoken );
        TY_(FreeNode)( doc, lex->token );
    }

    /* unwind whatever is left on the inline stack */
    while ( lex->istacksize > 0 )
        TY_(PopInline)( doc, NULL );

    TidyDocFree( doc, lex->istack );
    TidyDocFree( doc, lex->lexbuf );
    TidyDocFree( doc, lex );
    doc->lexer = NULL;
}
930
931
/* Lexer uses bigger memory chunks than pprint as
932
** it must hold the entire input document. not just
933
** the last line or three.
934
*/
935
static void AddByte( Lexer *lexer, tmbchar ch )
936
256M
{
937
256M
    if ( lexer->lexsize + 2 >= lexer->lexlength )
938
967
    {
939
967
        tmbstr buf = NULL;
940
967
        uint allocAmt = lexer->lexlength;
941
967
        uint prev = allocAmt; /* Is. #761 */
942
1.93k
        while ( lexer->lexsize + 2 >= allocAmt )
943
967
        {
944
967
            if ( allocAmt == 0 )
945
103
                allocAmt = 8192;
946
864
            else
947
864
                allocAmt *= 2;
948
967
            if (allocAmt < prev) /* Is. #761 - watch for wrap - and */
949
0
                TidyPanic(lexer->allocator, "\nPanic: out of internal memory!\nDocument input too big!\n");
950
967
        }
951
967
        buf = (tmbstr) TidyRealloc( lexer->allocator, lexer->lexbuf, allocAmt );
952
967
        if ( buf )
953
967
        {
954
967
          TidyClearMemory( buf + lexer->lexlength, 
955
967
                           allocAmt - lexer->lexlength );
956
967
          lexer->lexbuf = buf;
957
967
          lexer->lexlength = allocAmt;
958
967
        }
959
967
    }
960
961
256M
    lexer->lexbuf[ lexer->lexsize++ ] = ch;
962
256M
    lexer->lexbuf[ lexer->lexsize ]   = '\0';  /* debug */
963
256M
}
964
965
/* Overwrite the most recently stored byte of the lexer buffer with c.
** An empty buffer is left alone.
*/
static void ChangeChar( Lexer *lexer, tmbchar c )
{
    if ( lexer->lexsize < 1 )
        return;

    lexer->lexbuf[ lexer->lexsize - 1 ] = c;
}
972
973
/* store character c as UTF-8 encoded byte stream */
974
void TY_(AddCharToLexer)( Lexer *lexer, uint c )
975
256M
{
976
256M
    int i, err, count = 0;
977
256M
    tmbchar buf[10] = {0};
978
    
979
256M
    err = TY_(EncodeCharToUTF8Bytes)( c, buf, NULL, &count );
980
256M
    if (err)
981
468
    {
982
        /* replacement character 0xFFFD encoded as UTF-8 */
983
468
        buf[0] = (byte) 0xEF;
984
468
        buf[1] = (byte) 0xBF;
985
468
        buf[2] = (byte) 0xBD;
986
468
        count = 3;
987
468
    }
988
    
989
513M
    for ( i = 0; i < count; ++i )
990
256M
        AddByte( lexer, buf[i] );
991
256M
}
992
993
/* Feed every char of the NUL-terminated string str to
** TY_(AddCharToLexer), one at a time.
*/
static void AddStringToLexer( Lexer *lexer, ctmbstr str )
{
    /*  Each char goes through unsigned char first: a plain (signed)
    **  char would otherwise be sign-extended when widened to uint.
    */
    for ( ; *str != '\0'; ++str )
        TY_(AddCharToLexer)( lexer, (uint) (unsigned char) *str );
}
1004
1005
1006
/* Copy the input stream's current line and column into the lexer. */
static void SetLexerLocus( TidyDocImpl* doc, Lexer *lexer )
{
    lexer->columns = doc->docIn->curcol;
    lexer->lines   = doc->docIn->curline;
}
1011
1012
/*
1013
    Issue #483
1014
    Have detected the first of a surrogate pair...
1015
    Try to find, decode the second...
1016
    Already have '&' start...
1017
*/
1018
1019
/* Outcome of GetSurrogatePair() */
typedef enum
{
    SP_ok,      /* full success - combined pair value returned */
    SP_failed,  /* pair read but out of range - substitute returned */
    SP_error    /* no usable second entity - input was put back */
} SPStatus;
1024
1025
/*
  Try to read the second numeric character reference of a surrogate
  pair.  The caller has already consumed the leading '&'; on entry
  *pch holds the value decoded from the first reference.

  Accepted input is "#" digits ";" or "#x" hexdigits ";" (an upper
  case 'X' is also accepted unless isXml is set).

  Returns
    SP_ok      *pch receives the combined character.
    SP_failed  the two values do not combine to a valid character;
               *pch receives 0xFFFD and BAD_SURROGATE_PAIR is reported.
    SP_error   no usable reference was found (or no memory); every
               character read here has been pushed back on the input
               and *pch is unchanged.
*/
static SPStatus GetSurrogatePair(TidyDocImpl* doc, Bool isXml, uint *pch)
{
    Lexer* lexer = doc->lexer;
    uint bufSize = 32;
    uint c, ch = 0, offset = 0;
    tmbstr buf = NULL;
    SPStatus status = SP_error;  /* assume failed */
    Bool isHex = no;             /* assume decimal digits */
    uint fch = *pch;
    int i;  /* has to be signed due to for i >= 0 */

    if (!lexer)
        return status;
    buf = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize);
    if (!buf)
        return status;

    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
    {
        Bool accept;

        if (c == ';')
            break;  /* reached end of entity */

        /* decide whether c may appear at this position */
        if (offset == 0)
            accept = (c == '#') ? yes : no;  /* must be numeric entity */
        else if (offset == 1 && ((c == 'x') || (!isXml && c == 'X')))
        {
            isHex = yes;
            accept = yes;
        }
        else if (isHex)
            accept = IsDigitHex(c);
        else
            accept = TY_(IsDigit)(c);

        /* make room for c plus the terminating NUL */
        if (accept && (offset + 2) > bufSize)
        {
            tmbstr grown = (tmbstr)TidyRealloc(lexer->allocator, buf, bufSize * 2);
            if (grown)
            {
                buf = grown;
                bufSize *= 2;
            }
            else
            {
                /* old buffer is still valid - give up but keep it so
                   its contents can be pushed back and freed below */
                accept = no;
            }
        }

        if (!accept)
        {
            /* Push the offending char back as the full uint value; it
               is never stored in the char buffer, so a non-ASCII char
               is neither truncated nor sign-extended on its way back */
            TY_(UngetChar)(c, doc->docIn);
            break;
        }

        buf[offset++] = (tmbchar) c;  /* add char to buffer */
    }

    if (c == ';')
    {
        int scanned = 0;

        buf[offset] = 0;
        /* only scan when at least one digit follows the "#" / "#x"
           prefix, so sscanf never starts at or beyond the terminator */
        if (isHex)
        {
            if (offset > 2)
                scanned = sscanf(buf + 2, "%x", &ch);
        }
        else
        {
            if (offset > 1)
                scanned = sscanf(buf + 1, "%u", &ch);
        }

        if (scanned == 1 && TY_(IsHighSurrogate)(ch))
        {
            ch = TY_(CombineSurrogatePair)(ch, fch);
            if (TY_(IsValidCombinedChar)(ch))
            {
                *pch = ch;  /* return combined pair value */
                status = SP_ok; /* full success - pair used */
            }
            else
            {
                status = SP_failed; /* is one of the 32 out-of-range pairs */
                *pch = 0xFFFD;  /* return substitute character */
                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_PAIR, fch, ch); /* SP WARNING: -  */
            }
        }
    }

    if (status == SP_error)
    {
        /* Error condition - can only put back all the chars */
        if (c == ';') /* if last, not added to buffer */
            TY_(UngetChar)(c, doc->docIn);

        /* correct the order for unget - last first */
        for (i = (int)offset - 1; i >= 0; i--)
            TY_(UngetChar)((uint)(unsigned char) buf[i], doc->docIn);
    }

    TidyFree(lexer->allocator, buf);

    return status;
}
1128
1129
/*
1130
  No longer attempts to insert missing ';' for unknown
1131
 entities unless one was present already, since this
1132
  gives unexpected results.
1133
1134
  For example:   <a href="something.htm?foo&bar&fred">
1135
  was tidied to: <a href="something.htm?foo&amp;bar;&amp;fred;">
1136
  rather than:   <a href="something.htm?foo&amp;bar&amp;fred">
1137
1138
  My thanks for Maurice Buxton for spotting this.
1139
1140
  Also Randy Waki pointed out the following case for the
1141
  04 Aug 00 version (bug #433012):
1142
  
1143
  For example:   <a href="something.htm?id=1&lang=en">
1144
  was tidied to: <a href="something.htm?id=1&lang;=en">
1145
  rather than:   <a href="something.htm?id=1&amp;lang=en">
1146
  
1147
  where "lang" is a known entity (#9001), but browsers would
1148
  misinterpret "&lang;" because it had a value > 256.
1149
  
1150
  So the case of an apparently known entity with a value > 256 and
1151
  missing a semicolon is handled specially.
1152
  
1153
  "ParseEntity" is also a bit of a misnomer - it handles entities and
1154
  numeric character references. Invalid NCR's are now reported.
1155
*/
1156
static void ParseEntity( TidyDocImpl* doc, GetTokenMode mode )
{
    /* scanning state: selects which characters may continue the name */
    typedef enum
    {
        ENT_default,
        ENT_numdec,
        ENT_numhex
    } ENTState;

    typedef Bool (*ENTfn)(uint);
    /* indexed by ENTState */
    const ENTfn entFn[] = {
        TY_(IsNamechar),
        TY_(IsDigit),
        IsDigitHex
    };

    ENTState entState = ENT_default;
    uint nread = 0;
    Bool sawSemicolon = no, known = no;
    Bool isXml = cfgBool( doc, TidyXmlTags );
    Bool keepEntities = cfgBool( doc, TidyPreserveEntities );
    uint c, code, vers = 0;
    Lexer* lexer = doc->lexer;
    uint ampPos = lexer->lexsize - 1;        /* to start at "&" */
    uint ampCol = doc->docIn->curcol - 1;

    /* collect the entity name / number into the lexer buffer */
    while ( (c = TY_(ReadChar)(doc->docIn)) != EndOfStream )
    {
        if ( c == ';' )
        {
            sawSemicolon = yes;
            break;
        }
        ++nread;

        if ( nread == 1 && c == '#' )
        {
            if ( !cfgBool(doc, TidyNCR) ||
                 cfg(doc, TidyInCharEncoding) == BIG5 ||
                 cfg(doc, TidyInCharEncoding) == SHIFTJIS )
            {
                TY_(UngetChar)('#', doc->docIn);
                return;
            }
            entState = ENT_numdec;
        }
        else if ( nread == 2 && entState == ENT_numdec
                  && (c == 'x' || (!isXml && c == 'X')) )
        {
            entState = ENT_numhex;
        }
        else if ( !entFn[entState](c) )
        {
            /* not part of the entity - put it back */
            TY_(UngetChar)( c, doc->docIn );
            break;
        }

        TY_(AddCharToLexer)( lexer, c );
    }

    /* make sure entity is NULL terminated */
    lexer->lexbuf[lexer->lexsize] = '\0';

    /* Should constrain version to XML/XHTML if &apos;
    ** is encountered.  But this is not possible with
    ** Tidy's content model bit mask.
    */
    if ( TY_(tmbstrcmp)(lexer->lexbuf+ampPos, "&apos") == 0
         && !cfgBool(doc, TidyXmlOut)
         && !lexer->isvoyager
         && !cfgBool(doc, TidyXhtmlOut)
         && TY_(HTMLVersion)(doc) != HT50 ) /* Issue #239 - no warning if in HTML5++ mode */
    {
        TY_(ReportEntityError)( doc, APOS_UNDEFINED, lexer->lexbuf+ampPos, 39 );
    }

    if ( mode == OtherNamespace && c == ';' )
    {
        /* #130 MathML attr and entity fix! */
        known = yes;
        code = 255;
        vers = XH50|HT50;
        keepEntities = yes;
    }
    else
    {
        /* Lookup entity code and version */
        known = TY_(EntityInfo)( lexer->lexbuf+ampPos, isXml, &code, &vers );
    }

    /* Issue #483 - Deal with 'surrogate pairs' */
    /* TODO: Maybe warning/error, like found a leading surrogate
       but no following surrogate! Maybe should avoid outputting
       invalid utf-8 for this entity - maybe substitute?  */
    if ( !keepEntities && known )
    {
        if ( TY_(IsLowSurrogate)(code) )
        {
            uint next = TY_(ReadChar)(doc->docIn);

            if ( next == '&' )
            {
                /* Have a following entity,
                   so there is a chance of having a valid surrogate pair */
                uint first = code;    /* keep first value, in case of error */

                if ( GetSurrogatePair(doc, isXml, &code) == SP_error )
                {
                    TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, first, 0); /* SP WARNING: - using substitute character */
                    TY_(UngetChar)('&', doc->docIn);  /* otherwise put it back */
                }
            }
            else
            {
                /* put this non-entity lead char back */
                TY_(UngetChar)(next, doc->docIn);
                /* Have leading surrogate pair, with no tail */
                TY_(ReportSurrogateError)(doc, BAD_SURROGATE_TAIL, code, 0); /* SP WARNING: - using substitute character */
                code = 0xFFFD;
            }
        }
        else if ( TY_(IsHighSurrogate)(code) )
        {
            /* Have trailing surrogate pair, with no lead */
            TY_(ReportSurrogateError)(doc, BAD_SURROGATE_LEAD, code, 0); /* SP WARNING: - using substitute character */
            code = 0xFFFD;
        }
    }

    /* deal with unrecognized or invalid entities */
    /* #433012 - fix by Randy Waki 17 Feb 01 */
    /* report invalid NCR's - Terry Teague 01 Sep 01 */
    if ( !known || (code >= 128 && code <= 159) || (code >= 256 && c != ';') )
    {
        /* set error position just before offending character */
        SetLexerLocus( doc, lexer );
        lexer->columns = ampCol;

        if ( lexer->lexsize <= ampPos + 1 )
        {
            /*\
             *  Issue #207 - A naked & is allowed in HTML5, as an unambiguous ampersand!
            \*/
            if ( TY_(HTMLVersion)(doc) != HT50 )
            {
                TY_(ReportEntityError)( doc, UNESCAPED_AMPERSAND,
                                        lexer->lexbuf+ampPos, code );
            }
        }
        else
        {
            if ( code >= 128 && code <= 159 )
            {
                /* invalid numeric character reference */

                /* Always assume Win1252 in this circumstance. */
                uint repl = TY_(DecodeWin1252)( code );
                int replaceMode = repl ? REPLACED_CHAR : DISCARDED_CHAR;

                if ( c != ';' )  /* issue warning if not terminated by ';' */
                    TY_(ReportEntityError)( doc, MISSING_SEMICOLON_NCR,
                                            lexer->lexbuf+ampPos, c );

                TY_(ReportEncodingError)(doc, INVALID_NCR, code, replaceMode == DISCARDED_CHAR);

                /* drop the reference text; store the replacement if any */
                lexer->lexsize = ampPos;
                if ( repl )
                    TY_(AddCharToLexer)( lexer, repl );
                sawSemicolon = no;
            }
            else
            {
                TY_(ReportEntityError)( doc, UNKNOWN_ENTITY,
                                        lexer->lexbuf+ampPos, code );
            }

            if ( sawSemicolon )
                TY_(AddCharToLexer)( lexer, ';' );
        }
    }
    else
    {
        if ( c != ';' )    /* issue warning if not terminated by ';' */
        {
            /* set error position just before offending character */
            SetLexerLocus( doc, lexer );
            lexer->columns = ampCol;
            TY_(ReportEntityError)( doc, MISSING_SEMICOLON, lexer->lexbuf+ampPos, c );
        }

        if ( keepEntities )
        {
            TY_(AddCharToLexer)( lexer, ';' );
        }
        else
        {
            /* replace the reference text by the character itself */
            lexer->lexsize = ampPos;
            if ( code == 160 && mode == Preformatted )
                code = ' ';
            TY_(AddCharToLexer)( lexer, code );

            if ( code == '&' && !cfgBool(doc, TidyQuoteAmpersand) )
                AddStringToLexer( lexer, "amp;" );
        }

        /* Detect extended vs. basic entities */
        TY_(ConstrainVersion)( doc, vers );
    }
}
1381
1382
/* Read the remainder of a tag name from the input into the lexer
** buffer (its first character is already stored at txtstart), set
** txtend to the end of the name, and return the character that
** stopped the scan.  Unless TidyXmlTags is set, upper case characters
** are folded to lower case.
*/
static tmbchar ParseTagName( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;
    Bool xml = cfgBool(doc, TidyXmlTags);
    uint c = lexer->lexbuf[ lexer->txtstart ];

    /* fold case of first character in buffer */
    if (!xml && TY_(IsUpper)(c))
        lexer->lexbuf[lexer->txtstart] = (tmbchar) TY_(ToLower)(c);

    for (;;)
    {
        c = TY_(ReadChar)(doc->docIn);
        if (c == EndOfStream)
            break;

        /* stop at the first character that cannot be part of a name */
        if (xml ? !TY_(IsXMLNamechar)(c) : !TY_(IsNamechar)(c))
            break;

        /* fold case of subsequent characters */
        if (!xml && TY_(IsUpper)(c))
            c = TY_(ToLower)(c);

        TY_(AddCharToLexer)(lexer, c);
    }

    lexer->txtend = lexer->lexsize;
    return (tmbchar) c;
}
1408
1409
/*
1410
  Used for elements and text nodes
1411
  element name is NULL for text nodes
1412
  start and end are offsets into lexbuf
1413
  which contains the textual content of
1414
  all elements in the parse tree.
1415
1416
  parent and content allow traversal
1417
  of the parse tree in any direction.
1418
  attributes are represented as a linked
1419
  list of AttVal nodes which hold the
1420
  strings for attribute/value pairs.
1421
*/
1422
1423
1424
/* Allocate a zeroed Node of type TextNode.  When a lexer is supplied,
** the node is stamped with the lexer's current line and column.
*/
Node *TY_(NewNode)(TidyAllocator* allocator, Lexer *lexer)
{
    Node* fresh = (Node*) TidyAlloc( allocator, sizeof(Node) );

    TidyClearMemory( fresh, sizeof(Node) );
    fresh->type = TextNode;

    if ( lexer != NULL )
    {
        fresh->line   = lexer->lines;
        fresh->column = lexer->columns;
    }
#if defined(ENABLE_DEBUG_LOG) && defined(DEBUG_ALLOCATION)
    SPRTF("Allocated node %p\n", fresh );
#endif
    return fresh;
}
1439
1440
/* used to clone heading nodes when split by an <HR> */
1441
Node *TY_(CloneNode)( TidyDocImpl* doc, Node *element )
1442
0
{
1443
0
    Lexer* lexer = doc->lexer;
1444
0
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1445
1446
0
    node->start = lexer->lexsize;
1447
0
    node->end   = lexer->lexsize;
1448
1449
0
    if ( element )
1450
0
    {
1451
0
        node->parent     = element->parent;
1452
0
        node->type       = element->type;
1453
0
        node->closed     = element->closed;
1454
0
        node->implicit   = element->implicit;
1455
0
        node->tag        = element->tag;
1456
0
        node->element    = TY_(tmbstrdup)( doc->allocator, element->element );
1457
0
        node->attributes = TY_(DupAttrs)( doc, element->attributes );
1458
0
    }
1459
0
    return node;
1460
0
}
1461
1462
/* free node's attributes */
1463
void TY_(FreeAttrs)( TidyDocImpl* doc, Node *node )
1464
146k
{
1465
159k
    while ( node->attributes )
1466
13.3k
    {
1467
13.3k
        AttVal *av = node->attributes;
1468
1469
13.3k
        if ( av->attribute )
1470
13.2k
        {
1471
13.2k
            if ( (attrIsID(av) || attrIsNAME(av)) &&
1472
2
                 TY_(IsAnchorElement)(doc, node) )
1473
0
            {
1474
0
                TY_(RemoveAnchorByNode)( doc, av->value, node );
1475
0
            }
1476
13.2k
        }
1477
1478
13.3k
        node->attributes = av->next;
1479
13.3k
        TY_(FreeAttribute)( doc, av );
1480
13.3k
    }
1481
146k
}
1482
1483
/* doesn't repair attribute list linkage */
1484
void TY_(FreeAttribute)( TidyDocImpl* doc, AttVal *av )
1485
34.0k
{
1486
34.0k
    TY_(FreeNode)( doc, av->asp );
1487
34.0k
    TY_(FreeNode)( doc, av->php );
1488
34.0k
    TidyDocFree( doc, av->attribute );
1489
34.0k
    TidyDocFree( doc, av->value );
1490
34.0k
    TidyDocFree( doc, av );
1491
34.0k
}
1492
1493
/* detach attribute from node
1494
*/
1495
void TY_(DetachAttribute)( Node *node, AttVal *attr )
1496
351
{
1497
351
    AttVal *av, *prev = NULL;
1498
1499
796k
    for ( av = node->attributes; av; av = av->next )
1500
796k
    {
1501
796k
        if ( av == attr )
1502
351
        {
1503
351
            if ( prev )
1504
306
                prev->next = attr->next;
1505
45
            else
1506
45
                node->attributes = attr->next;
1507
351
            break;
1508
351
        }
1509
796k
        prev = av;
1510
796k
    }
1511
351
}
1512
1513
/* detach attribute from node then free it
1514
*/
1515
void TY_(RemoveAttribute)( TidyDocImpl* doc, Node *node, AttVal *attr )
1516
351
{
1517
351
    TY_(DetachAttribute)( node, attr );
1518
351
    TY_(FreeAttribute)( doc, attr );
1519
351
}
1520
1521
/*
1522
  Free document nodes by iterating through peers and recursing
1523
  through children. Set next to NULL before calling TY_(FreeNode)()
1524
  to avoid freeing peer nodes. Doesn't patch up prev/next links.
1525
 */
1526
void TY_(FreeNode)( TidyDocImpl* doc, Node *node )
1527
215k
{
1528
#if defined(ENABLE_DEBUG_LOG) && defined(DEBUG_ALLOCATION)
1529
    /* avoid showing free of root node! */
1530
    if (node) {
1531
        if (RootNode != node->type) {
1532
            SPRTF("Free node %p\n", node);
1533
        }
1534
        else {
1535
            SPRTF("Root node %p\n", node);
1536
        }
1537
    }
1538
#endif
1539
1540
361k
    while ( node )
1541
146k
    {
1542
146k
        Node* next = node->next;
1543
1544
146k
        TY_(FreeAttrs)( doc, node );
1545
146k
        TY_(FreeNode)( doc, node->content );
1546
146k
        TidyDocFree( doc, node->element );
1547
146k
        if (RootNode != node->type)
1548
146k
            TidyDocFree( doc, node );
1549
206
        else
1550
206
            node->content = NULL;
1551
1552
146k
        node = next;
1553
146k
    }
1554
215k
}
1555
1556
/* Create a node covering the lexer's current text range
** (txtstart .. txtend).
*/
Node* TY_(TextToken)( Lexer *lexer )
{
    Node *text = TY_(NewNode)( lexer->allocator, lexer );

    text->start = lexer->txtstart;
    text->end   = lexer->txtend;

    return text;
}
1563
1564
/* used for creating preformatted text from Word2000 */
1565
Node *TY_(NewLineNode)( Lexer *lexer )
1566
0
{
1567
0
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1568
0
    node->start = lexer->lexsize;
1569
0
    TY_(AddCharToLexer)( lexer, (uint)'\n' );
1570
0
    node->end = lexer->lexsize;
1571
0
    return node;
1572
0
}
1573
1574
/* used for adding a &nbsp; for Word2000 */
1575
Node* TY_(NewLiteralTextNode)( Lexer *lexer, ctmbstr txt )
1576
0
{
1577
0
    Node *node = TY_(NewNode)( lexer->allocator, lexer );
1578
0
    node->start = lexer->lexsize;
1579
0
    AddStringToLexer( lexer, txt );
1580
0
    node->end = lexer->lexsize;
1581
0
    return node;
1582
0
}
1583
1584
/* Create a node of the given type whose element name is a copy of
** the lexer text txtstart..txtend.  The node's own text range is
** empty (start == end == txtstart).  Start, start-end and end tags
** are additionally passed to TY_(FindTag).
*/
static Node* TagToken( TidyDocImpl* doc, NodeType type )
{
    Lexer* lexer = doc->lexer;
    Node* tok = TY_(NewNode)( lexer->allocator, lexer );

    tok->type    = type;
    tok->element = TY_(tmbstrndup)( doc->allocator,
                                    lexer->lexbuf + lexer->txtstart,
                                    lexer->txtend - lexer->txtstart );
    tok->start = lexer->txtstart;
    tok->end   = lexer->txtstart;

    switch ( type )
    {
    case StartTag:
    case StartEndTag:
    case EndTag:
        TY_(FindTag)(doc, tok);
        break;

    default:
        break;
    }

    return tok;
}
1600
1601
/* Create a node of the given type covering the lexer's current
** text range (txtstart .. txtend).
*/
static Node* NewToken(TidyDocImpl* doc, NodeType type)
{
    Lexer* lexer = doc->lexer;
    Node* tok = TY_(NewNode)(lexer->allocator, lexer);

    tok->type  = type;
    tok->start = lexer->txtstart;
    tok->end   = lexer->txtend;

    return tok;
}
1610
1611
28
/* Convenience wrappers: NewToken() for one fixed node type each */
#define CommentToken(doc) NewToken((doc), CommentTag)
#define DocTypeToken(doc) NewToken((doc), DocTypeTag)
#define PIToken(doc)      NewToken((doc), ProcInsTag)
#define AspToken(doc)     NewToken((doc), AspTag)
#define JsteToken(doc)    NewToken((doc), JsteTag)
#define PhpToken(doc)     NewToken((doc), PhpTag)
#define XmlDeclToken(doc) NewToken((doc), XmlDecl)
#define SectionToken(doc) NewToken((doc), SectionTag)
#define CDATAToken(doc)   NewToken((doc), CDATATag)
1620
1621
/* Append the bytes of str to the lexer buffer unchanged. */
void TY_(AddStringLiteral)( Lexer* lexer, ctmbstr str )
{
    /*\
     *  Issue #286
     *  The data comes from an internal location and is already
     *  'translated', so each byte goes straight through AddByte()
     *  instead of being re-encoded by TY_(AddCharToLexer)().
    \*/
    for ( ; *str != '\0'; ++str )
        AddByte( lexer, (byte) *str );
}
1635
1636
/*
1637
void AddStringLiteralLen( Lexer* lexer, ctmbstr str, int len )
1638
{
1639
    byte c;
1640
    int ix;
1641
1642
    for ( ix=0; ix < len && (c = *str++); ++ix )
1643
        TY_(AddCharToLexer)(lexer, c);
1644
}
1645
*/
1646
1647
/* find doctype element */
1648
Node *TY_(FindDocType)( TidyDocImpl* doc )
1649
0
{
1650
0
    Node* node;
1651
0
    for ( node = (doc ? doc->root.content : NULL);
1652
0
          node && node->type != DocTypeTag; 
1653
0
          node = node->next )
1654
0
        /**/;
1655
0
    return node;
1656
0
}
1657
1658
/* find parent container element */
1659
Node* TY_(FindContainer)( Node* node )
1660
0
{
1661
0
    for ( node = (node ? node->parent : NULL);
1662
0
          node && TY_(nodeHasCM)(node, CM_INLINE);
1663
0
          node = node->parent )
1664
0
        /**/;
1665
1666
0
    return node;
1667
0
}
1668
1669
1670
/* find html element */
1671
Node *TY_(FindHTML)( TidyDocImpl* doc )
1672
0
{
1673
0
    Node *node;
1674
0
    for ( node = (doc ? doc->root.content : NULL);
1675
0
          node && !nodeIsHTML(node); 
1676
0
          node = node->next )
1677
0
        /**/;
1678
1679
0
    return node;
1680
0
}
1681
1682
/* find XML Declaration */
1683
Node *TY_(FindXmlDecl)(TidyDocImpl* doc)
1684
0
{
1685
0
    Node *node;
1686
0
    for ( node = (doc ? doc->root.content : NULL);
1687
0
          node && !(node->type == XmlDecl);
1688
0
          node = node->next )
1689
0
        /**/;
1690
1691
0
    return node;
1692
0
}
1693
1694
1695
/* Find the head element among the children of the html element.
** Returns NULL when either element is missing.
*/
Node *TY_(FindHEAD)( TidyDocImpl* doc )
{
    Node *node = TY_(FindHTML)( doc );

    if ( node == NULL )
        return NULL;

    node = node->content;
    while ( node && !nodeIsHEAD(node) )
        node = node->next;

    return node;
}
1709
1710
/* Find the title element among the children of the head element.
** Returns NULL when either element is missing.
*/
Node *TY_(FindTITLE)(TidyDocImpl* doc)
{
    Node *node = TY_(FindHEAD)(doc);

    if (node == NULL)
        return NULL;

    node = node->content;
    while (node && !nodeIsTITLE(node))
        node = node->next;

    return node;
}
1721
1722
/* Find the body element.  The search goes through the html element
** and stops at the first body or frameset child.  For a frameset,
** the body is looked up inside the frameset's noframes child.
** Returns NULL when no body can be found.
*/
Node *TY_(FindBody)( TidyDocImpl* doc )
{
    Node *node;

    /* locate <html> below the root */
    for ( node = ( doc ? doc->root.content : NULL );
          node && !nodeIsHTML(node);
          node = node->next )
        /**/;

    if ( node == NULL )
        return NULL;

    /* first <body> or <frameset> below <html> */
    for ( node = node->content;
          node && !nodeIsBODY(node) && !nodeIsFRAMESET(node);
          node = node->next )
        /**/;

    if ( !(node && nodeIsFRAMESET(node)) )
        return node;

    /* <frameset>: descend into <noframes> ... */
    for ( node = node->content;
          node && !nodeIsNOFRAMES(node);
          node = node->next )
        /**/;

    if ( node == NULL )
        return NULL;

    /* ... and look for <body> there */
    for ( node = node->content;
          node && !nodeIsBODY(node);
          node = node->next )
        /**/;

    return node;
}
1752
1753
/* add meta element for Tidy */
1754
Bool TY_(AddGenerator)( TidyDocImpl* doc )
1755
0
{
1756
0
    AttVal *attval;
1757
0
    Node *node;
1758
0
    Node *head = TY_(FindHEAD)( doc );
1759
0
    tmbchar buf[256];
1760
    
1761
0
    if (head)
1762
0
    {
1763
0
#ifdef PLATFORM_NAME
1764
0
        TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 for "PLATFORM_NAME" version %s",
1765
0
                         tidyLibraryVersion());
1766
#else
1767
        TY_(tmbsnprintf)(buf, sizeof(buf), "HTML Tidy for HTML5 version %s", tidyLibraryVersion());
1768
#endif
1769
1770
0
        for ( node = head->content; node; node = node->next )
1771
0
        {
1772
0
            if ( nodeIsMETA(node) )
1773
0
            {
1774
0
                attval = TY_(AttrGetById)(node, TidyAttr_NAME);
1775
1776
0
                if (AttrValueIs(attval, "generator"))
1777
0
                {
1778
0
                    attval = TY_(AttrGetById)(node, TidyAttr_CONTENT);
1779
1780
0
                    if (AttrHasValue(attval) &&
1781
0
                        TY_(tmbstrncasecmp)(attval->value, "HTML Tidy", 9) == 0)
1782
0
                    {
1783
                        /* update the existing content to reflect the */
1784
                        /* actual version of Tidy currently being used */
1785
                        
1786
0
                        TidyDocFree(doc, attval->value);
1787
0
                        attval->value = TY_(tmbstrdup)(doc->allocator, buf);
1788
0
                        return no;
1789
0
                    }
1790
0
                }
1791
0
            }
1792
0
        }
1793
1794
0
        if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
1795
0
        {
1796
0
            node = TY_(InferredTag)(doc, TidyTag_META);
1797
0
            TY_(AddAttribute)( doc, node, "name", "generator" );
1798
0
            TY_(AddAttribute)( doc, node, "content", buf );
1799
0
            TY_(InsertNodeAtStart)( head, node );
1800
0
            return yes;
1801
0
        }
1802
0
    }
1803
1804
0
    return no;
1805
0
}
1806
1807
/*\ examine <!DOCTYPE ...> to identify version 
1808
 *  Issue #167 and #169
1809
 *   If HTML5
1810
 *        <!DOCTYPE html>
1811
 *       <!DOCTYPE html SYSTEM "about:legacy-compat">
1812
 *   else others
1813
\*/
1814
static uint FindGivenVersion( TidyDocImpl* doc, Node* doctype )
1815
0
{
1816
0
    AttVal * fpi = TY_(GetAttrByName)(doctype, "PUBLIC");
1817
0
    uint vers;
1818
1819
0
    if (!fpi || !fpi->value) 
1820
0
    {
1821
        /*\
1822
         * Is. #815 - change to case-insensitive test
1823
         * See REC: https://www.w3.org/TR/html5/syntax.html#the-doctype
1824
        \*/
1825
0
        if (doctype->element && (TY_(tmbstrcasecmp)(doctype->element,"html") == 0))
1826
0
        {
1827
0
            return VERS_HTML5;  /* TODO: do we need to check MORE? */
1828
0
        }
1829
        /* TODO: Consider warning, error message */
1830
0
        return VERS_UNKNOWN;
1831
0
    }
1832
0
    vers = GetVersFromFPI(fpi->value);
1833
1834
0
    if (VERS_XHTML & vers)
1835
0
    {
1836
0
        TY_(SetOptionBool)(doc, TidyXmlOut, yes);
1837
0
        TY_(SetOptionBool)(doc, TidyXhtmlOut, yes);
1838
0
        doc->lexer->isvoyager = yes;
1839
0
    }
1840
1841
    /* todo: add a warning if case does not match? */
1842
0
    TidyDocFree(doc, fpi->value);
1843
0
    fpi->value = TY_(tmbstrdup)(doc->allocator, GetFPIFromVers(vers));
1844
1845
0
    return vers;
1846
0
}
1847
1848
/* return guessed version */
1849
/* Return the guessed version of the document.
** A declared XHTML 1.1 or XHTML Basic 1.0 doctype is returned as-is
** when it is still among the lexer's possible versions; otherwise
** the answer comes from HTMLVersion().
*/
uint TY_(ApparentVersion)( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;
    Bool declaredModular = (lexer->doctype == XH11 ||
                            lexer->doctype == XB10) ? yes : no;

    if (declaredModular && (lexer->versions & lexer->doctype))
        return lexer->doctype;

    return TY_(HTMLVersion)(doc);
}
1858
1859
/* Map a version code to its name via GetNameFromVers().
** The isXhtml argument is accepted for interface reasons but unused.
*/
ctmbstr TY_(HTMLVersionNameFromCode)( uint vers, Bool ARG_UNUSED(isXhtml) )
{
    return GetNameFromVers(vers);
}
1864
1865
/* Look up `vers` in the W3C_Doctypes table and return the matching
** entry's vers_out field; VERS_UNKNOWN when no entry matches.
** The table is scanned up to its first entry with a NULL name.
*/
uint TY_(HTMLVersionNumberFromCode)( uint vers )
{
    uint idx = 0;

    while (W3C_Doctypes[idx].name)
    {
        if (W3C_Doctypes[idx].vers == vers)
            return W3C_Doctypes[idx].vers_out;
        ++idx;
    }

    return VERS_UNKNOWN;
}
1875
1876
/* Decide whether a "missing system identifier" warning applies to
** the emitted doctype.  Returns yes only when all of these hold:
**   - the lexer is not in XHTML (voyager) mode,
**   - the emitted version has a name (i.e. is not proprietary),
**   - a system identifier exists for the emitted version,
**   - the document has a DOCTYPE node without a SYSTEM attribute.
*/
Bool TY_(WarnMissingSIInEmittedDocType)( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;
    Bool xhtmlMode = lexer->isvoyager;
    Node *doctype;

    /* never warn in XHTML mode */
    if ( xhtmlMode )
        return no;

    /* a proprietary emitted doctype has no name: nothing to warn about */
    if ( TY_(HTMLVersionNameFromCode)(lexer->versionEmitted, xhtmlMode ) == NULL )
        return no;

    /* no system identifier is possible for this version */
    if ( GetSIFromVers(lexer->versionEmitted) == NULL )
        return no;

    doctype = TY_(FindDocType)( doc );
    if ( doctype == NULL )
        return no;

    if ( TY_(GetAttrByName)(doctype, "SYSTEM") != NULL )
        return no;

    return yes;
}
1899
1900
1901
/* Put DOCTYPE declaration between the
1902
** <?xml version "1.0" ... ?> declaration, if any,
1903
** and the <html> tag.  Should also work for any comments, 
1904
** etc. that may precede the <html> tag.
1905
*/
1906
1907
static Node* NewDocTypeNode( TidyDocImpl* doc )
1908
0
{
1909
0
    Node* doctype = NULL;
1910
0
    Node* html = TY_(FindHTML)( doc );
1911
1912
0
    if ( !html )
1913
0
        return NULL;
1914
1915
0
    doctype = TY_(NewNode)( doc->allocator, NULL );
1916
0
    doctype->type = DocTypeTag;
1917
0
    TY_(InsertNodeBeforeElement)(html, doctype);
1918
0
    return doctype;
1919
0
}
1920
1921
/* Set the DOCTYPE for XHTML output according to the configured
** doctype mode and record the result in lexer->versionEmitted.
**
** - TidyDoctypeOmit: any existing DOCTYPE node is discarded; returns yes.
** - TidyDoctypeUser without a configured doctype string: returns no.
** - Otherwise a DOCTYPE node is created if missing (its element name
**   is lower-cased if it exists) and its PUBLIC / SYSTEM attributes
**   are repaired for the selected or detected version.
**
** Returns yes only from the omit path and from the "auto" branches
** that return early; every other path returns no.  If a DOCTYPE node
** is needed but cannot be created because the document has no <html>
** element, the function returns no without touching the tree.
*/
Bool TY_(SetXHTMLDocType)( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;
    Node *doctype = TY_(FindDocType)( doc );
    TidyDoctypeModes dtmode = (TidyDoctypeModes)cfg(doc, TidyDoctypeMode);
    ctmbstr pub = "PUBLIC";
    ctmbstr sys = "SYSTEM";

    lexer->versionEmitted = TY_(ApparentVersion)( doc );

    if (dtmode == TidyDoctypeOmit)
    {
        if (doctype)
            TY_(DiscardElement)(doc, doctype);
        return yes;
    }

    if (dtmode == TidyDoctypeUser && !cfgStr(doc, TidyDoctype))
        return no;

    if (!doctype)
    {
        doctype = NewDocTypeNode(doc);
        /* NewDocTypeNode() yields NULL when there is no <html> element;
           without a node there is nothing to repair. */
        if (!doctype)
            return no;
        doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
    }
    else
    {
        doctype->element = TY_(tmbstrtolower)(doctype->element);
    }

    switch(dtmode)
    {
    case TidyDoctypeHtml5:
        /* HTML5 */
        TY_(RepairAttrValue)(doc, doctype, pub, NULL);
        TY_(RepairAttrValue)(doc, doctype, sys, NULL);
        lexer->versionEmitted = XH50;
        break;
    case TidyDoctypeStrict:
        /* XHTML 1.0 Strict */
        TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
        TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
        lexer->versionEmitted = X10S;
        break;
    case TidyDoctypeLoose:
        /* XHTML 1.0 Transitional */
        TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
        TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
        lexer->versionEmitted = X10T;
        break;
    case TidyDoctypeUser:
        /* user defined document type declaration */
        TY_(RepairAttrValue)(doc, doctype, pub, cfgStr(doc, TidyDoctype));
        TY_(RepairAttrValue)(doc, doctype, sys, "");
        break;
    case TidyDoctypeAuto:
        if (lexer->doctype == VERS_UNKNOWN || lexer->doctype == VERS_HTML5) {
          lexer->versionEmitted = XH50;
          return yes;
        }
        else if (lexer->versions & XH11 && lexer->doctype == XH11)
        {
            /* keep the declared XHTML 1.1 doctype; only add a missing SI */
            if (!TY_(GetAttrByName)(doctype, sys))
                TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
            lexer->versionEmitted = XH11;
            return yes;
        }
        else if (lexer->versions & XH11 && !(lexer->versions & VERS_HTML40))
        {
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(XH11));
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XH11));
            lexer->versionEmitted = XH11;
        }
        else if (lexer->versions & XB10 && lexer->doctype == XB10)
        {
            /* keep the declared XHTML Basic doctype; only add a missing SI */
            if (!TY_(GetAttrByName)(doctype, sys))
                TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(XB10));
            lexer->versionEmitted = XB10;
            return yes;
        }
        else if (lexer->versions & VERS_HTML40_STRICT)
        {
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10S));
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10S));
            lexer->versionEmitted = X10S;
        }
        else if (lexer->versions & VERS_FRAMESET)
        {
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10F));
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10F));
            lexer->versionEmitted = X10F;
        }
        else if (lexer->versions & VERS_LOOSE)
        {
            TY_(RepairAttrValue)(doc, doctype, pub, GetFPIFromVers(X10T));
            TY_(RepairAttrValue)(doc, doctype, sys, GetSIFromVers(X10T));
            lexer->versionEmitted = X10T;
        }
        else if (lexer->versions & VERS_HTML5)
        {
            /*\
             *  Issue #273 - If still a html5/xhtml5 bit
             *  existing, that is the 'ConstrainVersion' has
             *  not eliminated all HTML5, then nothing to do here.
             *  Certainly do **not** delete the DocType node!
             *  see: http://www.w3.org/QA/Tips/Doctype
            \*/
        }
        else
        {
            /* no usable version left: drop the DOCTYPE altogether */
            if (doctype)
                TY_(DiscardElement)(doc, doctype);
            return no;
        }
        break;
    case TidyDoctypeOmit:
        /* handled by the early return above */
        assert(0);
        break;
    }

    return no;
}
2043
2044
/* fixup doctype if missing */
2045
Bool TY_(FixDocType)( TidyDocImpl* doc )
2046
0
{
2047
0
    Lexer* lexer = doc->lexer;
2048
0
    Node* doctype = TY_(FindDocType)( doc );
2049
0
    uint dtmode = cfg( doc, TidyDoctypeMode );
2050
0
    uint guessed = VERS_UNKNOWN;
2051
0
    Bool hadSI = no;
2052
2053
    /* Issue #167 - found doctype, and doctype is default VERS_HTML5, set VERS_HTML5 and return yes */
2054
0
    if (doctype && (dtmode == TidyDoctypeAuto) &&
2055
0
        (lexer->doctype == VERS_HTML5) )
2056
0
    {
2057
        /* The version emitted cannot be a composite value! */
2058
0
        lexer->versionEmitted = HT50;
2059
0
        return yes;
2060
0
    }
2061
0
    if (dtmode == TidyDoctypeAuto &&
2062
0
        lexer->versions & lexer->doctype &&
2063
0
        !(VERS_XHTML & lexer->doctype && !lexer->isvoyager)
2064
0
        && TY_(FindDocType)(doc))
2065
0
    {
2066
0
        lexer->versionEmitted = lexer->doctype;
2067
0
        return yes;
2068
0
    }
2069
2070
0
    if (dtmode == TidyDoctypeOmit)
2071
0
    {
2072
0
        if (doctype)
2073
0
            TY_(DiscardElement)( doc, doctype );
2074
0
        lexer->versionEmitted = TY_(ApparentVersion)( doc );
2075
0
        return yes;
2076
0
    }
2077
2078
0
    if (cfgBool(doc, TidyXmlOut))
2079
0
        return yes;
2080
2081
0
    if (doctype)
2082
0
        hadSI = TY_(GetAttrByName)(doctype, "SYSTEM") != NULL;
2083
2084
0
    if ((dtmode == TidyDoctypeStrict ||
2085
0
         dtmode == TidyDoctypeLoose) && doctype)
2086
0
    {
2087
0
        TY_(DiscardElement)(doc, doctype);
2088
0
        doctype = NULL;
2089
0
    }
2090
2091
0
    switch (dtmode)
2092
0
    {
2093
0
    case TidyDoctypeHtml5:
2094
0
        guessed = HT50;
2095
0
        break;
2096
0
    case TidyDoctypeStrict:
2097
0
        guessed = H41S;
2098
0
        break;
2099
0
    case TidyDoctypeLoose:
2100
0
        guessed = H41T;
2101
0
        break;
2102
0
    case TidyDoctypeAuto:
2103
0
        guessed = TY_(HTMLVersion)(doc);
2104
0
        break;
2105
0
    }
2106
2107
0
    lexer->versionEmitted = guessed;
2108
0
    if (guessed == VERS_UNKNOWN)
2109
0
        return no;
2110
2111
0
    if (doctype)
2112
0
    {
2113
0
        doctype->element = TY_(tmbstrtolower)(doctype->element);
2114
0
    }
2115
0
    else
2116
0
    {
2117
0
        doctype = NewDocTypeNode(doc);
2118
0
        doctype->element = TY_(tmbstrdup)(doc->allocator, "html");
2119
0
    }
2120
2121
0
    TY_(RepairAttrValue)(doc, doctype, "PUBLIC", GetFPIFromVers(guessed));
2122
2123
0
    if (hadSI)
2124
0
        TY_(RepairAttrValue)(doc, doctype, "SYSTEM", GetSIFromVers(guessed));
2125
2126
0
    return yes;
2127
0
}
2128
2129
/* ensure XML document starts with <?xml version="1.0"?> */
2130
/* add encoding attribute if not using ASCII or UTF-8 output */
2131
Bool TY_(FixXmlDecl)( TidyDocImpl* doc )
2132
0
{
2133
0
    Node* xml;
2134
0
    AttVal *version, *encoding;
2135
0
    Lexer*lexer = doc->lexer;
2136
0
    Node* root = &doc->root;
2137
2138
0
    if ( root->content && root->content->type == XmlDecl )
2139
0
    {
2140
0
        xml = root->content;
2141
0
    }
2142
0
    else
2143
0
    {
2144
0
        xml = TY_(NewNode)(lexer->allocator, lexer);
2145
0
        xml->type = XmlDecl;
2146
0
        if ( root->content )
2147
0
            TY_(InsertNodeBeforeElement)(root->content, xml);
2148
0
        else
2149
0
            root->content = xml;
2150
0
    }
2151
2152
0
    version = TY_(GetAttrByName)(xml, "version");
2153
0
    encoding = TY_(GetAttrByName)(xml, "encoding");
2154
2155
    /*
2156
      We need to insert a check if declared encoding 
2157
      and output encoding mismatch and fix the XML
2158
      declaration accordingly!!!
2159
    */
2160
2161
0
    if ( encoding == NULL && cfg(doc, TidyOutCharEncoding) != UTF8 )
2162
0
    {
2163
0
        ctmbstr enc = TY_(GetEncodingNameFromTidyId)(cfg(doc, TidyOutCharEncoding));
2164
0
        if ( enc )
2165
0
            TY_(AddAttribute)( doc, xml, "encoding", enc );
2166
0
    }
2167
2168
0
    if ( version == NULL )
2169
0
        TY_(AddAttribute)( doc, xml, "version", "1.0" );
2170
0
    return yes;
2171
0
}
2172
2173
/* Build a start-tag node for the element identified by `id` and mark
** it as implicit.  The element name is duplicated from the tag
** definition and the node's text range is taken from the lexer's
** current txtstart / txtend.  The tag definition must exist
** (asserted).
*/
Node* TY_(InferredTag)(TidyDocImpl* doc, TidyTagId id)
{
    Lexer *lexer = doc->lexer;
    Node *inferred = TY_(NewNode)( lexer->allocator, lexer );
    const Dict *def = TY_(LookupTagDef)(id);

    assert( def != NULL );

    inferred->tag      = def;
    inferred->element  = TY_(tmbstrdup)(doc->allocator, def->name);
    inferred->type     = StartTag;
    inferred->implicit = yes;
    inferred->start    = lexer->txtstart;
    inferred->end      = lexer->txtend;

    return inferred;
}
2190
2191
/* Tell whether a node is expected to have content: it must be a
** start tag, and either an unknown element (no tag definition) or
** one whose content model lacks the CM_EMPTY flag.
*/
static Bool ExpectsContent(Node *node)
{
    if (node->type == StartTag)
    {
        /* unknown element? */
        if (node->tag == NULL)
            return yes;

        if ((node->tag->model & CM_EMPTY) == 0)
            return yes;
    }

    return no;
}
2205
2206
/*
2207
  create a text node for the contents of
2208
  a CDATA element like style or script
2209
  which ends with </foo> for some foo.
2210
*/
2211
2212
/* Scanner states used by GetCDATA() while looking for the end tag. */
typedef enum
2213
{
2214
    CDATA_INTERMEDIATE, /* plain content, not inside a tag name */
2215
    CDATA_STARTTAG,     /* '<' + Letter found */
2216
    CDATA_ENDTAG        /* '<' + '/' + Letter found */
2217
} CDATAState;
2218
2219
/* Read the raw content of a CDATA-style container (e.g. script, style)
** from doc->docIn into the lexer buffer, up to the matching end tag
** or end of stream.
**
** - Start tags with the container's own name increase `nested`
**   (unless the skip-nested option applies to a script/style
**   container); matching end tags decrease it.  The scan stops at a
**   matching end tag seen while `nested` is <= 0; that end tag is
**   pushed back onto the input and removed from the buffer.
** - Returns NULL (after pushing back the characters read) when a
**   script container with a src attribute has seen only white space
**   and then meets '<' followed by a letter.
** - Otherwise returns TY_(TextToken)(lexer); MISSING_ENDTAG_FOR is
**   reported when the stream ended first.
*/
static Node *GetCDATA( TidyDocImpl* doc, Node *container )
2220
0
{
2221
0
    Lexer* lexer = doc->lexer;
2222
0
    uint start = 0;
2223
0
    int nested = 0;
2224
0
    CDATAState state = CDATA_INTERMEDIATE;
2225
0
    uint i;
2226
0
    Bool isEmpty = yes;
2227
0
    Bool matches = no;
2228
0
    uint c;
2229
0
    Bool hasSrc = (TY_(AttrGetById)(container, TidyAttr_SRC) != NULL) ? yes : no;
2230
    /*\ Issue #65 (1642186) and #280 - is script or style, and the option on
2231
     *  If yes, then avoid incrementing nested...
2232
    \*/
2233
0
    Bool nonested = ((nodeIsSCRIPT(container) || (nodeIsSTYLE(container))) && 
2234
0
        cfgBool(doc, TidySkipNested)) ? yes : no;
2235
2236
0
    SetLexerLocus( doc, lexer );
2237
0
    lexer->waswhite = no;
2238
0
    lexer->txtstart = lexer->txtend = lexer->lexsize;
2239
2240
    /* seen start tag, look for matching end tag */
2241
0
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
2242
0
    {
2243
0
        TY_(AddCharToLexer)(lexer, c);
2244
0
        lexer->txtend = lexer->lexsize;
2245
2246
0
        if (state == CDATA_INTERMEDIATE)
2247
0
        {
2248
0
            if (c != '<')
2249
0
            {
2250
                /* isEmpty stays yes until the first non-white character */
0
                if (isEmpty && !TY_(IsWhite)(c))
2251
0
                    isEmpty = no;
2252
0
                continue;
2253
0
            }
2254
2255
0
            c = TY_(ReadChar)(doc->docIn);
2256
2257
0
            if (TY_(IsLetter)(c))
2258
0
            {
2259
                /* <head><script src=foo><meta name=foo content=bar>*/
2260
0
                if (hasSrc && isEmpty && nodeIsSCRIPT(container))
2261
0
                {
2262
                    /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
2263
0
                    lexer->lexsize = lexer->txtstart;
2264
0
                    TY_(UngetChar)(c, doc->docIn);
2265
0
                    TY_(UngetChar)('<', doc->docIn);
2266
0
                    return NULL;
2267
0
                }
2268
0
                TY_(AddCharToLexer)(lexer, c);
2269
                /* start indexes the first letter of the tag name */
0
                start = lexer->lexsize - 1;
2270
0
                state = CDATA_STARTTAG;
2271
0
            }
2272
0
            else if (c == '/')
2273
0
            {
2274
0
                TY_(AddCharToLexer)(lexer, c);
2275
2276
0
                c = TY_(ReadChar)(doc->docIn);
2277
                
2278
0
                if (!TY_(IsLetter)(c))
2279
0
                {
2280
0
                    TY_(UngetChar)(c, doc->docIn);
2281
0
                    continue;
2282
0
                }
2283
                /* only peeked: the letter is re-read by the main loop */
0
                TY_(UngetChar)(c, doc->docIn);
2284
2285
0
                start = lexer->lexsize;
2286
0
                state = CDATA_ENDTAG;
2287
0
            }
2288
0
            else if (c == '\\')
2289
0
            {
2290
                /* recognize document.write("<script><\/script>") */
2291
0
                TY_(AddCharToLexer)(lexer, c);
2292
2293
0
                c = TY_(ReadChar)(doc->docIn);
2294
2295
0
                if (c != '/')
2296
0
                {
2297
0
                    TY_(UngetChar)(c, doc->docIn);
2298
0
                    continue;
2299
0
                }
2300
2301
0
                TY_(AddCharToLexer)(lexer, c);
2302
2303
0
                if (nonested) {
2304
                    /*\ 
2305
                     *  Issue #65 - for version 5.1.14.EXP2
2306
                     *  If the nonested option is ON then the <script> 
2307
                     *  tag did not bump nested, so no need to treat this as 
2308
                     *  an end tag just to decrease nested, just continue!
2309
                    \*/
2310
0
                    continue;
2311
0
                }
2312
2313
0
                c = TY_(ReadChar)(doc->docIn);
2314
                
2315
0
                if (!TY_(IsLetter)(c))
2316
0
                {
2317
0
                    TY_(UngetChar)(c, doc->docIn);
2318
0
                    continue;
2319
0
                }
2320
0
                TY_(UngetChar)(c, doc->docIn);
2321
2322
0
                start = lexer->lexsize;
2323
0
                state = CDATA_ENDTAG;
2324
0
            }
2325
0
            else
2326
0
            {
2327
0
                TY_(UngetChar)(c, doc->docIn);
2328
0
            }
2329
0
        }
2330
        /* '<' + Letter found */
2331
0
        else if (state == CDATA_STARTTAG)
2332
0
        {
2333
0
            if (TY_(IsLetter)(c))
2334
0
                continue;
2335
2336
            /* tag name ended: compare it with the container's element name */
0
            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
2337
0
                                          TY_(tmbstrlen)(container->element)) == 0;
2338
0
            if (matches && !nonested)
2339
0
                nested++;
2340
2341
0
            state = CDATA_INTERMEDIATE;
2342
0
        }
2343
        /* '<' + '/' + Letter found */
2344
0
        else if (state == CDATA_ENDTAG)
2345
0
        {
2346
0
            if (TY_(IsLetter)(c))
2347
0
                continue;
2348
2349
0
            matches = TY_(tmbstrncasecmp)(container->element, lexer->lexbuf + start,
2350
0
                                          TY_(tmbstrlen)(container->element)) == 0;
2351
2352
0
            if (isEmpty && !matches)
2353
0
            {
2354
                /* ReportError(doc, container, NULL, MISSING_ENDTAG_FOR); */
2355
2356
                /* push the foreign end tag back onto the input, last char first */
0
                for (i = lexer->lexsize - 1; i >= start; --i)
2357
0
                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
2358
0
                TY_(UngetChar)('/', doc->docIn);
2359
0
                TY_(UngetChar)('<', doc->docIn);
2360
0
                break;
2361
0
            }
2362
2363
0
            if (matches && nested-- <= 0)
2364
0
            {
2365
0
                for (i = lexer->lexsize - 1; i >= start; --i)
2366
0
                    TY_(UngetChar)((uint)lexer->lexbuf[i], doc->docIn);
2367
0
                TY_(UngetChar)('/', doc->docIn);
2368
0
                TY_(UngetChar)('<', doc->docIn);
2369
                /* drop the end tag name plus the two chars before it from the buffer */
0
                lexer->lexsize -= (lexer->lexsize - start) + 2;
2370
0
                break;
2371
0
            }
2372
0
            else if (lexer->lexbuf[start - 2] != '\\')
2373
0
            {
2374
                /* if the end tag is not already escaped using backslash */
2375
0
                SetLexerLocus( doc, lexer );
2376
0
                lexer->columns -= 3;
2377
2378
                /*\ if javascript insert backslash before / 
2379
                 *  Issue #348 - Add option, escape-scripts, to skip
2380
                \*/
2381
0
                if ((TY_(IsJavaScript)(container)) && cfgBool(doc, TidyEscapeScripts) &&
2382
0
                    !TY_(IsHTML5Mode)(doc) )    /* Is #700 - This only applies to legacy html4 mode */
2383
0
                {
2384
                    /* Issue #281 - only warn if adding the escape! */
2385
0
                    TY_(Report)(doc, NULL, NULL, BAD_CDATA_CONTENT);
2386
2387
                    /* shift the buffer tail up by one to make room at start-1 */
0
                    for (i = lexer->lexsize; i > start-1; --i)
2388
0
                        lexer->lexbuf[i] = lexer->lexbuf[i-1];
2389
2390
0
                    lexer->lexbuf[start-1] = '\\';
2391
0
                    lexer->lexsize++;
2392
0
                }
2393
0
            }
2394
0
            state = CDATA_INTERMEDIATE;
2395
0
        }
2396
0
    }
2397
    /* only white space was collected: discard it from the buffer */
0
    if (isEmpty)
2398
0
        lexer->lexsize = lexer->txtstart = lexer->txtend;
2399
0
    else
2400
0
        lexer->txtend = lexer->lexsize;
2401
2402
    /* the loop ran to end of input instead of leaving through a break */
0
    if (c == EndOfStream)
2403
0
        TY_(Report)(doc, container, NULL, MISSING_ENDTAG_FOR );
2404
2405
0
    return TY_(TextToken)(lexer);
2406
0
}
2407
2408
/* Provide one level of undo for GetToken(): flag the lexer so that
** the next GetToken() call sees the `pushed` state.
*/
void TY_(UngetToken)( TidyDocImpl* doc )
{
    Lexer *lexer = doc->lexer;

    lexer->pushed = yes;
}
2412
2413
/* CondReturnTextNode(doc, skip): if the lexer has pending text
** (txtend > txtstart), turn it into a text token, store it in
** lexer->token and return it from the enclosing function.
** The macro expects a local variable named `lexer` to be in scope;
** the `skip` argument is not used by either variant.  The debug
** variant additionally logs the node through GTDBG.
*/
#if defined(ENABLE_DEBUG_LOG)
2414
#  define CondReturnTextNode(doc, skip) \
2415
            if (lexer->txtend > lexer->txtstart) { \
2416
                Node *_node = TY_(TextToken)(lexer); \
2417
                lexer->token = _node; \
2418
                GTDBG(doc,"text_node",_node); \
2419
                return _node; \
2420
            }
2421
2422
#else
2423
#  define CondReturnTextNode(doc, skip) \
2424
104k
            if (lexer->txtend > lexer->txtstart) \
2425
104k
            { \
2426
5.74k
                lexer->token = TY_(TextToken)(lexer); \
2427
5.74k
                return lexer->token; \
2428
5.74k
            }
2429
#endif
2430
2431
/*
2432
  modes for GetToken()
2433
2434
  MixedContent   -- for elements which don't accept PCDATA
2435
  Preformatted   -- white space preserved as is
2436
  IgnoreMarkup   -- for CDATA elements such as script, style
2437
*/
2438
static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode );
2439
2440
/* Return the next token for the parser.
**
** Order of precedence:
**  1. a previously returned token that was pushed back, or a pending
**     duplicated inline token (lexer->itoken);
**  2. an inline element re-inserted from the inline stack;
**  3. raw CDATA content when mode is CdataContent;
**  4. otherwise the next token read from the input stream.
*/
Node* TY_(GetToken)( TidyDocImpl* doc, GetTokenMode mode )
{
    Lexer *lexer = doc->lexer;
    Node *result;

    /* the duplicate inline token was rejected: hand it out again */
    if (lexer->itoken && lexer->pushed)
    {
        lexer->pushed = no;
        result = lexer->itoken;
        GTDBG(doc,"lex-itoken", result);
        return result;
    }

    if (lexer->itoken || lexer->pushed)
    {
        /* any pending itoken has been accepted by now */
        lexer->itoken = NULL;
        lexer->pushed = no;

        /* duplicate inlines in preference to pushed text nodes when appropriate */
        if (lexer->token->type == TextNode
            && (lexer->insert || lexer->inode))
        {
            lexer->itoken = TY_(InsertedToken)( doc );
            result = lexer->itoken;
            GTDBG(doc,"lex-inserted", result);
        }
        else
        {
            result = lexer->token;
            GTDBG(doc,"lex-token", result);
        }
        return result;
    }

    assert( !(lexer->pushed || lexer->itoken) );

    /* at start of block elements, unclosed inline
       elements are inserted into the token stream
       Issue #341 - Can NOT insert a token if NO istacksize
     */
    if (lexer->istacksize && (lexer->insert || lexer->inode))
    {
        /*\ Issue #92: could fix by the following, but instead chose not to stack these 2
         *  if ( !(lexer->insert && (nodeIsINS(lexer->insert) || nodeIsDEL(lexer->insert))) ) {
        \*/
        lexer->token = TY_(InsertedToken)( doc );
        result = lexer->token;
        GTDBG(doc,"lex-inserted2", result);
        return result;
    }

    if (mode != CdataContent)
        return GetTokenFromStream( doc, mode );

    assert( lexer->parent != NULL );
    result = GetCDATA(doc, lexer->parent);
    GTDBG(doc,"lex-cdata", result);
    return result;
}
2503
2504
#if defined(ENABLE_DEBUG_LOG)
2505
/* Debug-log helper (only built with ENABLE_DEBUG_LOG): writes
** "Have node <name>" through SPRTF.
*/
static void check_me(char *name)
2506
{
2507
    SPRTF("Have node %s\n", name);
2508
}
2509
#endif
2510
2511
static Node* GetTokenFromStream( TidyDocImpl* doc, GetTokenMode mode )
2512
116k
{
2513
116k
    Lexer* lexer = doc->lexer;
2514
116k
    uint c, lexdump, badcomment = 0;
2515
116k
    Bool isempty = no;
2516
116k
    AttVal *attributes = NULL;
2517
116k
    Node *node;
2518
116k
    Bool fixComments;
2519
    
2520
116k
    switch ( cfgAutoBool(doc, TidyFixComments) )
2521
116k
    {
2522
0
        case TidyYesState:
2523
0
            fixComments = yes;
2524
0
            break;
2525
2526
0
        case TidyNoState:
2527
0
            fixComments = no;
2528
0
            break;
2529
2530
116k
        default:
2531
116k
            fixComments = (TY_(HTMLVersion)(doc) & HT50) == 0;
2532
116k
            break;
2533
116k
    }
2534
2535
    /* Lexer->token must be set on return. Nullify it for safety. */
2536
116k
    lexer->token = NULL;
2537
2538
116k
    SetLexerLocus( doc, lexer );
2539
116k
    lexer->waswhite = no;
2540
2541
116k
    lexer->txtstart = lexer->txtend = lexer->lexsize;
2542
2543
112M
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
2544
112M
    {
2545
112M
        if (lexer->insertspace)
2546
0
        {
2547
0
            TY_(AddCharToLexer)(lexer, ' ');
2548
0
            lexer->waswhite = yes;
2549
0
            lexer->insertspace = no;
2550
0
        }
2551
2552
112M
        if (c == 160 && (mode == Preformatted))
2553
0
            c = ' ';
2554
2555
112M
        TY_(AddCharToLexer)(lexer, c);
2556
2557
112M
        switch (lexer->state)
2558
112M
        {
2559
72.6M
            case LEX_CONTENT:  /* element content */
2560
2561
                /*
2562
                 Discard white space if appropriate. It's cheaper
2563
                 to do this here rather than in parser methods
2564
                 for elements that don't have mixed content.
2565
                */
2566
72.6M
                if (TY_(IsWhite)(c) && (mode == IgnoreWhitespace) 
2567
1.36M
                      && lexer->lexsize == lexer->txtstart + 1)
2568
343k
                {
2569
343k
                    --(lexer->lexsize);
2570
343k
                    lexer->waswhite = no;
2571
343k
                    SetLexerLocus( doc, lexer );
2572
343k
                    continue;
2573
343k
                }
2574
2575
72.3M
                if (c == '<')
2576
1.69M
                {
2577
1.69M
                    lexer->state = LEX_GT;
2578
1.69M
                    continue;
2579
1.69M
                }
2580
2581
70.6M
                if (TY_(IsWhite)(c))
2582
66.8M
                {
2583
                    /* was previous character white? */
2584
66.8M
                    if (lexer->waswhite)
2585
65.0M
                    {
2586
65.0M
                        if (mode != Preformatted && mode != IgnoreMarkup)
2587
511k
                        {
2588
511k
                            --(lexer->lexsize);
2589
511k
                            SetLexerLocus( doc, lexer );
2590
511k
                        }
2591
65.0M
                    }
2592
1.74M
                    else /* prev character wasn't white */
2593
1.74M
                    {
2594
1.74M
                        lexer->waswhite = yes;
2595
2596
1.74M
                        if (mode != Preformatted && mode != IgnoreMarkup && c != ' ')
2597
35.0k
                            ChangeChar(lexer, ' ');
2598
1.74M
                    }
2599
2600
66.8M
                    continue;
2601
66.8M
                }
2602
3.81M
                else if (c == '&' && mode != IgnoreMarkup)
2603
1.34k
                    ParseEntity( doc, mode );
2604
2605
                /* this is needed to avoid trimming trailing whitespace */
2606
3.81M
                if (mode == IgnoreWhitespace)
2607
4.95k
                    mode = MixedContent;
2608
2609
3.81M
                lexer->waswhite = no;
2610
3.81M
                continue;
2611
2612
1.69M
            case LEX_GT:  /* < */
2613
2614
                /* check for endtag */
2615
1.69M
                if (c == '/')
2616
156
                {
2617
156
                    if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
2618
0
                    {
2619
0
                        TY_(UngetChar)(c, doc->docIn);
2620
0
                        continue;
2621
0
                    }
2622
2623
156
                    TY_(AddCharToLexer)(lexer, c);
2624
2625
156
                    if (TY_(IsLetter)(c) || (cfgBool(doc, TidyXmlTags) && TY_(IsXMLNamechar)(c)))
2626
146
                    {
2627
146
                        lexer->lexsize -= 3;
2628
146
                        lexer->txtend = lexer->lexsize;
2629
146
                        TY_(UngetChar)(c, doc->docIn);
2630
146
                        lexer->state = LEX_ENDTAG;
2631
146
                        lexer->lexbuf[lexer->lexsize] = '\0';  /* debug */
2632
146
                        doc->docIn->curcol -= 2;
2633
2634
                        /* if some text before the </ return it now */
2635
146
                        if (lexer->txtend > lexer->txtstart)
2636
40
                        {
2637
                            /* trim space character before end tag */
2638
40
                            if (mode == IgnoreWhitespace && lexer->lexbuf[lexer->lexsize - 1] == ' ')
2639
0
                            {
2640
0
                                lexer->lexsize -= 1;
2641
0
                                lexer->txtend = lexer->lexsize;
2642
0
                            }
2643
40
                            lexer->token = TY_(TextToken)(lexer);
2644
40
                            node = lexer->token;
2645
40
                            GTDBG(doc,"text", node);
2646
40
                            return node;
2647
40
                        }
2648
2649
106
                        continue;       /* no text so keep going */
2650
146
                    }
2651
2652
                    /* otherwise treat as CDATA */
2653
10
                    lexer->waswhite = no;
2654
10
                    lexer->state = LEX_CONTENT;
2655
10
                    continue;
2656
156
                }
2657
2658
1.69M
                if (mode == IgnoreMarkup)
2659
0
                {
2660
                    /* otherwise treat as CDATA */
2661
0
                    lexer->waswhite = no;
2662
0
                    lexer->state = LEX_CONTENT;
2663
0
                    continue;
2664
0
                }
2665
2666
                /*
2667
                   look out for comments, doctype or marked sections
2668
                   this isn't quite right, but it's getting there ...
2669
                */
2670
1.69M
                if (c == '!')
2671
97.9k
                {
2672
97.9k
                    c = TY_(ReadChar)(doc->docIn);
2673
2674
97.9k
                    if (c == '-')
2675
28
                    {
2676
28
                        c = TY_(ReadChar)(doc->docIn);
2677
2678
28
                        if (c == '-')
2679
28
                        {
2680
28
                            lexer->state = LEX_COMMENT;  /* comment */
2681
28
                            lexer->lexsize -= 2;
2682
28
                            lexer->txtend = lexer->lexsize;
2683
2684
28
                            CondReturnTextNode(doc, 4)
2685
2686
24
                            lexer->txtstart = lexer->lexsize;
2687
24
                            continue;
2688
28
                        }
2689
2690
                        /*
2691
                           TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_DROPPING );
2692
                           Warning now done later - see issue #487
2693
                         */
2694
28
                    }
2695
97.8k
                    else if (c == 'd' || c == 'D')
2696
36.5k
                    {
2697
                        /* todo: check for complete "<!DOCTYPE" not just <!D */
2698
2699
36.5k
                        uint skip = 0;
2700
2701
36.5k
                        lexer->state = LEX_DOCTYPE; /* doctype */
2702
36.5k
                        lexer->lexsize -= 2;
2703
36.5k
                        lexer->txtend = lexer->lexsize;
2704
36.5k
                        mode = IgnoreWhitespace;
2705
2706
                        /* skip until white space or '>' */
2707
2708
36.5k
                        for (;;)
2709
36.6k
                        {
2710
36.6k
                            c = TY_(ReadChar)(doc->docIn);
2711
36.6k
                            ++skip;
2712
2713
36.6k
                            if (c == EndOfStream || c == '>')
2714
17
                            {
2715
17
                                TY_(UngetChar)(c, doc->docIn);
2716
17
                                break;
2717
17
                            }
2718
2719
2720
36.5k
                            if (!TY_(IsWhite)(c))
2721
36
                                continue;
2722
2723
                            /* and skip to end of whitespace */
2724
2725
36.5k
                            for (;;)
2726
38.5k
                            {
2727
38.5k
                                c = TY_(ReadChar)(doc->docIn);
2728
38.5k
                                ++skip;
2729
2730
38.5k
                                if (c == EndOfStream || c == '>')
2731
0
                                {
2732
0
                                    TY_(UngetChar)(c, doc->docIn);
2733
0
                                    break;
2734
0
                                }
2735
2736
2737
38.5k
                                if (TY_(IsWhite)(c))
2738
2.00k
                                    continue;
2739
2740
36.5k
                                TY_(UngetChar)(c, doc->docIn);
2741
36.5k
                                break;
2742
38.5k
                            }
2743
2744
36.5k
                            break;
2745
36.5k
                        }
2746
2747
36.5k
                        CondReturnTextNode(doc, (skip + 3))
2748
2749
36.3k
                        lexer->txtstart = lexer->lexsize;
2750
36.3k
                        continue;
2751
36.5k
                    }
2752
61.3k
                    else if (c == '[')
2753
61.2k
                    {
2754
                        /* Word 2000 embeds <![if ...]> ... <![endif]> sequences */
2755
61.2k
                        lexer->lexsize -= 2;
2756
61.2k
                        lexer->state = LEX_SECTION;
2757
61.2k
                        lexer->txtend = lexer->lexsize;
2758
2759
61.2k
                        CondReturnTextNode(doc, 2)
2760
2761
61.2k
                        lexer->txtstart = lexer->lexsize;
2762
61.2k
                        continue;
2763
61.2k
                    }
2764
2765
2766
                    /*
2767
                       We only print this message if there's a missing
2768
                       starting hyphen; this comment will be dropped.
2769
                     */
2770
47
                    TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_DROPPING ); /* Is. #487 */
2771
2772
                    /* else swallow characters up to and including next '>' */
2773
11.3k
                    while ((c = TY_(ReadChar)(doc->docIn)) != '>')
2774
11.2k
                    {
2775
11.2k
                        if (c == EndOfStream)
2776
1
                        {
2777
1
                            TY_(UngetChar)(c, doc->docIn);
2778
1
                            break;
2779
1
                        }
2780
11.2k
                    }
2781
2782
47
                    lexer->lexsize -= 2;
2783
47
                    lexer->lexbuf[lexer->lexsize] = '\0';
2784
47
                    lexer->state = LEX_CONTENT;
2785
47
                    continue;
2786
97.9k
                }
2787
2788
                /*
2789
                   processing instructions
2790
                */
2791
2792
1.60M
                if (c == '?')
2793
92
                {
2794
92
                    lexer->lexsize -= 2;
2795
92
                    lexer->state = LEX_PROCINSTR;
2796
92
                    lexer->txtend = lexer->lexsize;
2797
2798
92
                    CondReturnTextNode(doc, 2)
2799
2800
27
                    lexer->txtstart = lexer->lexsize;
2801
27
                    continue;
2802
92
                }
2803
2804
                /* Microsoft ASP's e.g. <% ... server-code ... %> */
2805
1.60M
                if (c == '%')
2806
44
                {
2807
44
                    lexer->lexsize -= 2;
2808
44
                    lexer->state = LEX_ASP;
2809
44
                    lexer->txtend = lexer->lexsize;
2810
2811
44
                    CondReturnTextNode(doc, 2)
2812
2813
13
                    lexer->txtstart = lexer->lexsize;
2814
13
                    continue;
2815
44
                }
2816
2817
                /* Netscape's JSTE e.g. <# ... server-code ... #> */
2818
1.60M
                if (c == '#')
2819
12
                {
2820
12
                    lexer->lexsize -= 2;
2821
12
                    lexer->state = LEX_JSTE;
2822
12
                    lexer->txtend = lexer->lexsize;
2823
2824
12
                    CondReturnTextNode(doc, 2)
2825
2826
0
                    lexer->txtstart = lexer->lexsize;
2827
0
                    continue;
2828
12
                }
2829
2830
                /* check for start tag */
2831
1.60M
                if (TY_(IsLetter)(c) || (cfgBool(doc, TidyXmlTags) && TY_(IsXMLNamechar)(c)))
2832
6.05k
                {
2833
6.05k
                    TY_(UngetChar)(c, doc->docIn);     /* push back letter */
2834
6.05k
                    TY_(UngetChar)('<', doc->docIn);
2835
6.05k
                    lexer->lexsize -= 2;      /* discard "<" + letter */
2836
6.05k
                    lexer->txtend = lexer->lexsize;
2837
6.05k
                    lexer->state = LEX_STARTTAG;         /* ready to read tag name */
2838
2839
6.05k
                    CondReturnTextNode(doc, 2)
2840
2841
                    /* lexer->txtstart = lexer->lexsize; missing here? */
2842
639
                    continue;       /* no text so keep going */
2843
6.05k
                }
2844
2845
                /* otherwise treat as CDATA */
2846
                /* fix for bug 762102 (486) */
2847
                /* Issue #384 - Fix skipping parsing character, particularly '<<' */
2848
1.59M
                TY_(UngetChar)(c, doc->docIn);
2849
1.59M
                lexer->lexsize -= 1;
2850
1.59M
                lexer->state = LEX_CONTENT;
2851
1.59M
                lexer->waswhite = no;
2852
1.59M
                continue;
2853
2854
146
            case LEX_ENDTAG:  /* </letter */
2855
146
                lexer->txtstart = lexer->lexsize - 1;
2856
146
                doc->docIn->curcol += 2;
2857
146
                c = ParseTagName( doc );
2858
146
                lexer->token = TagToken( doc, EndTag );  /* create endtag token */
2859
146
                lexer->lexsize = lexer->txtend = lexer->txtstart;
2860
2861
                /* skip to '>' */
2862
1.25k
                while ( c != '>' && c != EndOfStream )
2863
1.11k
                {
2864
1.11k
                    c = TY_(ReadChar)(doc->docIn);
2865
1.11k
                }
2866
2867
146
                if (c == EndOfStream)
2868
0
                {
2869
0
                    TY_(FreeNode)( doc, lexer->token );
2870
0
                    continue;
2871
0
                }
2872
2873
146
                lexer->state = LEX_CONTENT;
2874
146
                lexer->waswhite = no;
2875
146
                node = lexer->token;
2876
146
                GTDBG(doc,"endtag", node);
2877
146
                return node;  /* the endtag token */
2878
2879
6.05k
            case LEX_STARTTAG: /* first letter of tagname */
2880
6.05k
                c = TY_(ReadChar)(doc->docIn);
2881
6.05k
                ChangeChar(lexer, (tmbchar)c);
2882
6.05k
                lexer->txtstart = lexer->lexsize - 1; /* set txtstart to first letter */
2883
6.05k
                c = ParseTagName( doc );
2884
6.05k
                isempty = no;
2885
6.05k
                attributes = NULL;
2886
6.05k
                lexer->token = TagToken( doc, StartTag ); /* [i_a]2 'isempty' is always false, thanks to code 2 lines above */
2887
2888
                /* parse attributes, consuming closing ">" */
2889
6.05k
                if (c != '>')
2890
5.98k
                {
2891
5.98k
                    if (c == '/')
2892
248
                        TY_(UngetChar)(c, doc->docIn);
2893
2894
5.98k
                    attributes = ParseAttrs( doc, &isempty );
2895
5.98k
                }
2896
2897
6.05k
                if (isempty)
2898
0
                    lexer->token->type = StartEndTag;
2899
2900
6.05k
                lexer->token->attributes = attributes;
2901
6.05k
                lexer->lexsize = lexer->txtend = lexer->txtstart;
2902
2903
                /* swallow newline following start tag */
2904
                /* special check needed for CRLF sequence */
2905
                /* this doesn't apply to empty elements */
2906
                /* nor to preformatted content that needs escaping */
2907
                /*\
2908
                 * Issue #230: Need to KEEP this user newline character in certain 
2909
                 * circumstances, certainly for <pre>, <script>, <style>...
2910
                 * Any others?
2911
                 * Issue #238: maybe **ONLY** for <pre>
2912
                \*/
2913
6.05k
                if ( nodeIsPRE(lexer->token) )
2914
0
                {
2915
0
                    mode = Preformatted;
2916
0
                }
2917
2918
6.05k
                if ((mode != Preformatted && ExpectsContent(lexer->token))
2919
6.05k
                    || nodeIsBR(lexer->token) || nodeIsHR(lexer->token))
2920
6.05k
                {
2921
6.05k
                    c = TY_(ReadChar)(doc->docIn);
2922
2923
6.05k
                    if ((c == '\n') && (mode != IgnoreWhitespace)) /* Issue #329 - Can NOT afford to lose this newline */
2924
0
                        TY_(UngetChar)(c, doc->docIn);  /* Issue #329 - make sure the newline is maintained for now */
2925
6.05k
                    else if (c != '\n' && c != '\f')
2926
6.05k
                        TY_(UngetChar)(c, doc->docIn);
2927
2928
6.05k
                    lexer->waswhite = yes;  /* to swallow leading whitespace */
2929
6.05k
                }
2930
3
                else
2931
3
                    lexer->waswhite = no;
2932
2933
6.05k
                lexer->state = LEX_CONTENT;
2934
6.05k
                if (lexer->token->tag == NULL) 
2935
0
                {
2936
0
                    if (mode != OtherNamespace) /* [i_a]2 only issue warning if NOT 'OtherNamespace', and tag null */
2937
0
                    {
2938
                        /* Special case for HTML5 unknown tags: if it looks 
2939
                           like an autonomous custom tag, then emit a variation
2940
                           of the standard message. We don't want to do this
2941
                           for older HTML, because it's not truly supported
2942
                           by the standard, although Tidy will allow it. */
2943
0
                        if ( (doc->lexer->doctype & VERS_HTML5) > 0 && TY_(elementIsAutonomousCustomFormat)( lexer->token->element ) )
2944
0
                            TY_(Report)( doc, NULL, lexer->token, UNKNOWN_ELEMENT_LOOKS_CUSTOM );
2945
0
                        else
2946
0
                            TY_(Report)( doc, NULL, lexer->token, UNKNOWN_ELEMENT );
2947
0
                    }
2948
0
                }
2949
6.05k
                else if ( !cfgBool(doc, TidyXmlTags) )
2950
0
                {
2951
0
                    TY_(ConstrainVersion)( doc, lexer->token->tag->versions );
2952
0
                    TY_(RepairDuplicateAttributes)( doc, lexer->token, no );
2953
0
                } else 
2954
6.05k
                    TY_(RepairDuplicateAttributes)( doc, lexer->token, yes );
2955
6.05k
                node = lexer->token;
2956
6.05k
                GTDBG(doc,"starttag", node);
2957
6.05k
                return node;  /* return start tag */
2958
2959
1.27M
            case LEX_COMMENT:  /* seen <!-- so look for --> */
2960
2961
1.27M
                if (c != '-')
2962
1.01M
                    continue;
2963
2964
260k
                c = TY_(ReadChar)(doc->docIn);
2965
2966
                /* Fix hyphens at beginning of tag */
2967
260k
                if ( c != '-' && fixComments && lexer->lexsize - lexer->txtstart == 1 )
2968
1
                {
2969
1
                    lexer->lexbuf[lexer->lexsize - 1] = '=';
2970
1
                }
2971
2972
260k
                TY_(AddCharToLexer)(lexer, c);
2973
2974
260k
                if (c != '-')
2975
109k
                    continue;
2976
2977
441k
            end_comment:
2978
441k
                c = TY_(ReadChar)(doc->docIn);
2979
2980
441k
                if (c == '>')
2981
24
                {
2982
24
                    if (badcomment)
2983
24
                    {
2984
                        /*
2985
                           We've got bad comments that we either fixed or
2986
                           ignored; provide proper user feedback based on
2987
                           doctype and whether or not we fixed them.
2988
                         */
2989
24
                        if ( (TY_(HTMLVersion)(doc) & HT50) )
2990
0
                        {
2991
0
                            if ( fixComments )
2992
0
                                TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT );
2993
                            /* Otherwise for HTML5, it's safe to ignore. */
2994
0
                        }
2995
24
                        else
2996
24
                        {
2997
24
                            if ( fixComments )
2998
24
                                TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT );
2999
0
                            else
3000
0
                                TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_WARN );
3001
24
                        }
3002
24
                    }
3003
3004
                    /* do not store closing -- in lexbuf */
3005
24
                    lexer->lexsize -= 2;
3006
24
                    lexer->txtend = lexer->lexsize;
3007
24
                    lexer->lexbuf[lexer->lexsize] = '\0';
3008
24
                    lexer->state = LEX_CONTENT;
3009
24
                    lexer->waswhite = no;
3010
24
                    lexer->token = CommentToken(doc);
3011
3012
                    /* now look for a line break */
3013
3014
24
                    c = TY_(ReadChar)(doc->docIn);
3015
3016
24
                    if (c == '\n')
3017
0
                        lexer->token->linebreak = yes;
3018
24
                    else
3019
24
                        TY_(UngetChar)(c, doc->docIn);
3020
3021
24
                    node = lexer->token;
3022
24
                    GTDBG(doc,"comment", node);
3023
24
                    return node;
3024
24
                }
3025
3026
                /* note position of first such error in the comment */
3027
441k
                if (!badcomment)
3028
28
                {
3029
28
                    SetLexerLocus( doc, lexer );
3030
28
                    lexer->columns -= 3;
3031
28
                }
3032
3033
441k
                badcomment++;
3034
3035
                /* fix hyphens in the middle */
3036
441k
                if ( fixComments )
3037
441k
                    lexer->lexbuf[lexer->lexsize - 2] = '=';
3038
3039
                /* if '-' then look for '>' to end the comment */
3040
441k
                if (c == '-')
3041
291k
                {
3042
291k
                    TY_(AddCharToLexer)(lexer, c);
3043
291k
                    goto end_comment;
3044
291k
                }
3045
3046
                /* fix hyphens end, and continue to look for --> */
3047
150k
                if ( fixComments )
3048
150k
                    lexer->lexbuf[lexer->lexsize - 1] = '=';
3049
3050
                /* http://tidy.sf.net/bug/1266647 */
3051
150k
                TY_(AddCharToLexer)(lexer, c);
3052
3053
150k
                continue; 
3054
3055
36.5k
            case LEX_DOCTYPE:  /* seen <!d so look for '>' munging whitespace */
3056
3057
                /* use ParseDocTypeDecl() to tokenize doctype declaration */
3058
36.5k
                TY_(UngetChar)(c, doc->docIn);
3059
36.5k
                lexer->lexsize -= 1;
3060
36.5k
                lexer->token = ParseDocTypeDecl(doc);
3061
3062
36.5k
                lexer->txtend = lexer->lexsize;
3063
36.5k
                lexer->lexbuf[lexer->lexsize] = '\0';
3064
36.5k
                lexer->state = LEX_CONTENT;
3065
36.5k
                lexer->waswhite = no;
3066
3067
                /* make a note of the version named by the 1st doctype */
3068
36.5k
                if (lexer->doctype == VERS_UNKNOWN && lexer->token && !cfgBool(doc, TidyXmlTags))
3069
0
                {
3070
0
                    lexer->doctype = FindGivenVersion(doc, lexer->token);
3071
0
                    if (lexer->doctype != VERS_HTML5)
3072
0
                    {
3073
                        /*\
3074
                         *  Back to legacy HTML4 mode for -
3075
                         *  Issue #167 & #169 - TidyTag_A
3076
                         *  Issue #196        - TidyTag_CAPTION
3077
                         *  others?
3078
                        \*/ 
3079
0
                        TY_(AdjustTags)(doc); /* Dynamically modify the tags table  */
3080
0
                    }
3081
0
                }
3082
36.5k
                node = lexer->token;
3083
36.5k
                GTDBG(doc,"doctype", node);
3084
36.5k
                return node;
3085
3086
34.3M
            case LEX_PROCINSTR:  /* seen <? so look for '>' */
3087
                /* check for PHP preprocessor instructions <?php ... ?> */
3088
3089
34.3M
                if  (lexer->lexsize - lexer->txtstart == 3)
3090
88
                {
3091
88
                    if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "php", 3) == 0)
3092
0
                    {
3093
0
                        lexer->state = LEX_PHP;
3094
0
                        continue;
3095
0
                    }
3096
88
                }
3097
3098
34.3M
                if  (lexer->lexsize - lexer->txtstart == 4)
3099
78
                {
3100
78
                    if (TY_(tmbstrncmp)(lexer->lexbuf + lexer->txtstart, "xml", 3) == 0 &&
3101
74
                        TY_(IsWhite)(lexer->lexbuf[lexer->txtstart + 3]))
3102
63
                    {
3103
63
                        lexer->state = LEX_XMLDECL;
3104
63
                        attributes = NULL;
3105
63
                        continue;
3106
63
                    }
3107
78
                }
3108
3109
34.3M
                if (cfgBool(doc, TidyXmlPIs) || lexer->isvoyager) /* insist on ?> as terminator */
3110
34.3M
                {
3111
34.3M
                    if (c != '?')
3112
34.3M
                        continue;
3113
3114
                    /* now look for '>' */
3115
1.78k
                    c = TY_(ReadChar)(doc->docIn);
3116
3117
1.78k
                    if (c == EndOfStream)
3118
1
                    {
3119
1
                        TY_(Report)(doc, NULL, NULL, UNEXPECTED_END_OF_FILE );
3120
1
                        TY_(UngetChar)(c, doc->docIn);
3121
1
                        continue;
3122
1
                    }
3123
3124
1.78k
                    TY_(AddCharToLexer)(lexer, c);
3125
1.78k
                }
3126
3127
3128
1.78k
                if (c != '>')
3129
1.76k
                    continue;
3130
3131
15
                lexer->lexsize -= 1;
3132
3133
15
                if (lexer->lexsize)
3134
15
                {
3135
15
                    uint i;
3136
15
                    Bool closed;
3137
3138
102
                    for (i = 0; i < lexer->lexsize - lexer->txtstart &&
3139
98
                        !TY_(IsWhite)(lexer->lexbuf[i + lexer->txtstart]); ++i)
3140
87
                        /**/;
3141
3142
15
                    closed = lexer->lexbuf[lexer->lexsize - 1] == '?';
3143
3144
15
                    if (closed)
3145
15
                        lexer->lexsize -= 1;
3146
3147
15
                    lexer->txtstart += i;
3148
15
                    lexer->txtend = lexer->lexsize;
3149
15
                    lexer->lexbuf[lexer->lexsize] = '\0';
3150
3151
15
                    lexer->token = PIToken(doc);
3152
15
                    lexer->token->closed = closed;
3153
15
                    lexer->token->element = TY_(tmbstrndup)(doc->allocator,
3154
15
                                                            lexer->lexbuf +
3155
15
                                                            lexer->txtstart - i, i);
3156
15
                }
3157
0
                else
3158
0
                {
3159
0
                    lexer->txtend = lexer->lexsize;
3160
0
                    lexer->lexbuf[lexer->lexsize] = '\0';
3161
0
                    lexer->token = PIToken(doc);
3162
0
                }
3163
3164
15
                lexer->state = LEX_CONTENT;
3165
15
                lexer->waswhite = no;
3166
15
                node = lexer->token;
3167
15
                GTDBG(doc,"procinstr", node);
3168
15
                return node;
3169
3170
231
            case LEX_ASP:  /* seen <% so look for "%>" */
3171
231
                if (c != '%')
3172
167
                    continue;
3173
3174
                /* now look for '>' */
3175
64
                c = TY_(ReadChar)(doc->docIn);
3176
3177
3178
64
                if (c != '>')
3179
21
                {
3180
21
                    TY_(UngetChar)(c, doc->docIn);
3181
21
                    continue;
3182
21
                }
3183
3184
43
                lexer->lexsize -= 1;
3185
43
                lexer->txtend = lexer->lexsize;
3186
43
                lexer->lexbuf[lexer->lexsize] = '\0';
3187
43
                lexer->state = LEX_CONTENT;
3188
43
                lexer->waswhite = no;
3189
43
                lexer->token = AspToken(doc);
3190
43
                node = lexer->token;
3191
43
                GTDBG(doc,"ASP", node);
3192
43
                return node;  /* the endtag token */
3193
3194
3195
3196
486
            case LEX_JSTE:  /* seen <# so look for "#>" */
3197
486
                if (c != '#')
3198
480
                    continue;
3199
3200
                /* now look for '>' */
3201
6
                c = TY_(ReadChar)(doc->docIn);
3202
3203
3204
6
                if (c != '>')
3205
0
                {
3206
0
                    TY_(UngetChar)(c, doc->docIn);
3207
0
                    continue;
3208
0
                }
3209
3210
6
                lexer->lexsize -= 1;
3211
6
                lexer->txtend = lexer->lexsize;
3212
6
                lexer->lexbuf[lexer->lexsize] = '\0';
3213
6
                lexer->state = LEX_CONTENT;
3214
6
                lexer->waswhite = no;
3215
6
                lexer->token = JsteToken(doc);
3216
6
                node = lexer->token;
3217
6
                GTDBG(doc,"JSTE", node);
3218
6
                return node;  /* the JSTE token */
3219
3220
3221
0
            case LEX_PHP: /* seen "<?php" so look for "?>" */
3222
0
                if (c != '?')
3223
0
                    continue;
3224
3225
                /* now look for '>' */
3226
0
                c = TY_(ReadChar)(doc->docIn);
3227
3228
0
                if (c != '>')
3229
0
                {
3230
0
                    TY_(UngetChar)(c, doc->docIn);
3231
0
                    continue;
3232
0
                }
3233
3234
0
                lexer->lexsize -= 1;
3235
0
                lexer->txtend = lexer->lexsize;
3236
0
                lexer->lexbuf[lexer->lexsize] = '\0';
3237
0
                lexer->state = LEX_CONTENT;
3238
0
                lexer->waswhite = no;
3239
0
                lexer->token = PhpToken(doc);
3240
0
                node = lexer->token;
3241
0
                GTDBG(doc,"PHP", node);
3242
0
                return node;  /* the PHP token */
3243
3244
165
            case LEX_XMLDECL: /* seen "<?xml" so look for "?>" */
3245
3246
165
                if (TY_(IsWhite)(c) && c != '?')
3247
1
                    continue;
3248
3249
                /* get pseudo-attribute */
3250
164
                if (c != '?')
3251
130
                {
3252
130
                    tmbstr name;
3253
130
                    Node *asp, *php;
3254
130
                    AttVal *av = NULL;
3255
130
                    int pdelim = 0;
3256
130
                    isempty = no;
3257
3258
130
                    TY_(UngetChar)(c, doc->docIn);
3259
3260
130
                    name = ParseAttribute( doc, &isempty, &asp, &php );
3261
3262
130
                    if (!name)
3263
47
                    {
3264
                        /* check if attributes are created by ASP markup */
3265
47
                        if (asp)
3266
4
                        {
3267
4
                            av = TY_(NewAttribute)(doc);
3268
4
                            av->asp = asp;
3269
4
                            AddAttrToList( &attributes, av ); 
3270
4
                        }
3271
3272
                        /* check if attributes are created by PHP markup */
3273
47
                        if (php)
3274
24
                        {
3275
24
                            av = TY_(NewAttribute)(doc);
3276
24
                            av->php = php;
3277
24
                            AddAttrToList( &attributes, av ); 
3278
24
                        }
3279
                      
3280
                        /* fix for http://tidy.sf.net/bug/788031 */
3281
47
                        lexer->lexsize -= 1;
3282
47
                        lexer->txtend = lexer->txtstart;
3283
47
                        lexer->lexbuf[lexer->txtend] = '\0';
3284
47
                        lexer->state = LEX_CONTENT;
3285
47
                        lexer->waswhite = no;
3286
47
                        lexer->token = XmlDeclToken(doc);
3287
47
                        lexer->token->attributes = attributes;
3288
47
                        node = lexer->token;
3289
47
                        GTDBG(doc,"xml", node);
3290
47
                        return node;  /* the xml token */
3291
47
                    }
3292
3293
83
                    av = TY_(NewAttribute)(doc);
3294
83
                    av->attribute = name;
3295
83
                    av->value = ParseValue( doc, name, yes, &isempty, &pdelim );
3296
83
                    av->delim = pdelim;
3297
83
                    av->dict = TY_(FindAttribute)( doc, av );
3298
3299
83
                    AddAttrToList( &attributes, av );
3300
                    /* continue; */
3301
83
                }
3302
3303
                /* now look for '>' */
3304
117
                c = TY_(ReadChar)(doc->docIn);
3305
3306
117
                if (c != '>')
3307
101
                {
3308
101
                    TY_(UngetChar)(c, doc->docIn);
3309
101
                    continue;
3310
101
                }
3311
16
                lexer->lexsize -= 1;
3312
16
                lexer->txtend = lexer->txtstart;
3313
16
                lexer->lexbuf[lexer->txtend] = '\0';
3314
16
                lexer->state = LEX_CONTENT;
3315
16
                lexer->waswhite = no;
3316
16
                lexer->token = XmlDeclToken(doc);
3317
16
                lexer->token->attributes = attributes;
3318
16
                node = lexer->token;
3319
16
                GTDBG(doc,"XML", node);
3320
16
                return node;  /* the XML token */
3321
3322
2.83M
            case LEX_SECTION: /* seen "<![" so look for "]>" */
3323
2.83M
                if (c == '[')
3324
499k
                {
3325
499k
                    if (lexer->lexsize == (lexer->txtstart + 6) &&
3326
2
                        TY_(tmbstrncmp)(lexer->lexbuf+lexer->txtstart, "CDATA[", 6) == 0)
3327
0
                    {
3328
0
                        lexer->state = LEX_CDATA;
3329
0
                        lexer->lexsize -= 6;
3330
0
                        continue;
3331
0
                    }
3332
499k
                }
3333
3334
2.83M
                if (c == '>')
3335
61.2k
                {
3336
                    /* Is. #462 - reached '>' before ']' */
3337
61.2k
                    TY_(UngetChar)(c, doc->docIn);
3338
2.77M
                } else if (c != ']')
3339
2.77M
                    continue;
3340
3341
                /* now look for '>' */
3342
61.4k
                c = TY_(ReadChar)(doc->docIn);
3343
3344
61.4k
                lexdump = 1;
3345
61.4k
                if (c != '>')
3346
140
                {
3347
                    /* Issue #153 - can also be ]'-->' */
3348
140
                    if (c == '-') 
3349
10
                    {
3350
10
                        c = TY_(ReadChar)(doc->docIn);
3351
10
                        if (c == '-')
3352
10
                        {
3353
10
                            c = TY_(ReadChar)(doc->docIn);
3354
10
                            if (c != '>')
3355
10
                            {
3356
10
                                TY_(UngetChar)(c, doc->docIn);
3357
10
                                TY_(UngetChar)('-', doc->docIn);
3358
10
                                TY_(UngetChar)('-', doc->docIn);
3359
10
                                continue;
3360
10
                            }
3361
                            /* this failed!
3362
                               TY_(AddCharToLexer)(lexer, '-'); TY_(AddCharToLexer)(lexer, '-'); lexdump = 0; 
3363
                               got output <![endif]--]> - needs further fix in pprint section output
3364
                             */
3365
10
                        }
3366
0
                        else
3367
0
                        {
3368
0
                            TY_(UngetChar)(c, doc->docIn);
3369
0
                            TY_(UngetChar)('-', doc->docIn);
3370
0
                            continue;
3371
0
                        }
3372
10
                    } 
3373
130
                    else 
3374
130
                    {
3375
130
                        TY_(UngetChar)(c, doc->docIn);
3376
130
                        continue;
3377
130
                    }
3378
140
                }
3379
 
3380
61.2k
                lexer->lexsize -= lexdump;
3381
61.2k
                lexer->txtend = lexer->lexsize;
3382
61.2k
                lexer->lexbuf[lexer->lexsize] = '\0';
3383
61.2k
                lexer->state = LEX_CONTENT;
3384
61.2k
                lexer->waswhite = no;
3385
61.2k
                lexer->token = SectionToken(doc);
3386
61.2k
                node = lexer->token;
3387
61.2k
                GTDBG(doc,"SECTION", node);
3388
61.2k
                return node;  /* the SECTION token */
3389
3390
0
            case LEX_CDATA: /* seen "<![CDATA[" so look for "]]>" */
3391
0
                if (c != ']')
3392
0
                    continue;
3393
3394
                /* now look for ']' */
3395
0
                c = TY_(ReadChar)(doc->docIn);
3396
3397
0
                if (c != ']')
3398
0
                {
3399
0
                    TY_(UngetChar)(c, doc->docIn);
3400
0
                    continue;
3401
0
                }
3402
3403
                /* now look for '>' */
3404
0
                c = TY_(ReadChar)(doc->docIn);
3405
3406
0
                if (c != '>')
3407
0
                {
3408
0
                    TY_(UngetChar)(c, doc->docIn);
3409
0
                    TY_(UngetChar)(']', doc->docIn);
3410
0
                    continue;
3411
0
                }
3412
3413
0
                lexer->lexsize -= 1;
3414
0
                lexer->txtend = lexer->lexsize;
3415
0
                lexer->lexbuf[lexer->lexsize] = '\0';
3416
0
                lexer->state = LEX_CONTENT;
3417
0
                lexer->waswhite = no;
3418
0
                lexer->token = CDATAToken(doc);
3419
0
                node = lexer->token;
3420
0
                GTDBG(doc,"CDATA", node);
3421
0
                return node;  /* the CDATA token */
3422
112M
        }
3423
112M
    }
3424
3425
6.20k
    if (lexer->state == LEX_CONTENT)  /* text string */
3426
5.74k
    {
3427
5.74k
        lexer->txtend = lexer->lexsize;
3428
3429
5.74k
        if (lexer->txtend > lexer->txtstart)
3430
16
        {
3431
16
            TY_(UngetChar)(c, doc->docIn);
3432
3433
16
            if (lexer->lexbuf[lexer->lexsize - 1] == ' ')
3434
5
            {
3435
5
                lexer->lexsize -= 1;
3436
5
                lexer->txtend = lexer->lexsize;
3437
5
            }
3438
16
            lexer->token = TY_(TextToken)(lexer);
3439
16
            node = lexer->token;
3440
16
            GTDBG(doc,"textstring", node);
3441
16
            return node;  /* the textstring token */
3442
16
        }
3443
5.74k
    }
3444
460
    else if (lexer->state == LEX_COMMENT) /* comment */
3445
4
    {
3446
4
        if (c == EndOfStream)
3447
4
        {
3448
            /* We print this if we reached end of the stream mid-comment. */
3449
4
            TY_(Report)(doc, NULL, NULL, MALFORMED_COMMENT_EOS );
3450
4
        }
3451
3452
4
        lexer->txtend = lexer->lexsize;
3453
4
        lexer->lexbuf[lexer->lexsize] = '\0';
3454
4
        lexer->state = LEX_CONTENT;
3455
4
        lexer->waswhite = no;
3456
4
        lexer->token = CommentToken(doc);
3457
4
        node = lexer->token;
3458
4
        GTDBG(doc,"COMMENT", node);
3459
4
        return node;  /* the COMMENT token */
3460
4
    }
3461
3462
    /* check attributes before return NULL */
3463
6.18k
    if (attributes)
3464
0
        TY_(FreeAttribute)( doc, attributes );
3465
3466
6.18k
    DEBUG_LOG(SPRTF("Returning NULL...\n"));
3467
6.18k
    return NULL;
3468
6.20k
}
3469
3470
/*
 OR the class bits in `code` into the lexmap entry of every
 character that occurs in the NUL-terminated string `str`.
 Each character is used as a byte-sized index into lexmap.
*/
static void MapStr( ctmbstr str, uint code )
{
    ctmbstr cp;

    for ( cp = str; *cp != '\0'; ++cp )
    {
        uint slot = (byte) *cp;
        lexmap[slot] |= code;
    }
}
3478
3479
void TY_(InitMap)(void)
3480
103
{
3481
103
    MapStr("\r\n\f", newline|white);
3482
103
    MapStr(" \t", white);
3483
103
    MapStr("-.:_", namechar);
3484
103
    MapStr("0123456789", digit|digithex|namechar);
3485
103
    MapStr("abcdefghijklmnopqrstuvwxyz", lowercase|letter|namechar);
3486
103
    MapStr("ABCDEFGHIJKLMNOPQRSTUVWXYZ", uppercase|letter|namechar);
3487
103
    MapStr("abcdefABCDEF", digithex);
3488
103
}
3489
3490
/*
3491
 parser for ASP within start tags
3492
3493
 Some people use ASP for to customize attributes
3494
 Tidy isn't really well suited to dealing with ASP
3495
 This is a workaround for attributes, but won't
3496
 deal with the case where the ASP is used to tailor
3497
 the attribute value. Here is an example of a work
3498
 around for using ASP in attribute values:
3499
3500
  href='<%=rsSchool.Fields("ID").Value%>'
3501
3502
 where the ASP that generates the attribute value
3503
 is masked from Tidy by the quotemarks.
3504
3505
*/
3506
3507
static Node *ParseAsp( TidyDocImpl* doc )
3508
14
{
3509
14
    Lexer* lexer = doc->lexer;
3510
14
    uint c;
3511
14
    Node *asp = NULL;
3512
3513
14
    lexer->txtstart = lexer->lexsize;
3514
3515
14
    for (;;)
3516
24.9M
    {
3517
24.9M
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3518
6
            break;
3519
3520
24.9M
        TY_(AddCharToLexer)(lexer, c);
3521
3522
3523
24.9M
        if (c != '%')
3524
24.9M
            continue;
3525
3526
13
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3527
0
            break;
3528
3529
13
        TY_(AddCharToLexer)(lexer, c);
3530
3531
13
        if (c == '>')
3532
8
        {
3533
8
            lexer->lexsize -= 2;
3534
8
            break;
3535
8
        }
3536
13
    }
3537
3538
14
    lexer->txtend = lexer->lexsize;
3539
14
    if (lexer->txtend > lexer->txtstart)
3540
14
        asp = AspToken(doc);
3541
3542
14
    lexer->txtstart = lexer->txtend;
3543
14
    return asp;
3544
14
}   
3545
 
3546
3547
/*
3548
 PHP is like ASP but is based upon XML
3549
 processing instructions, e.g. <?php ... ?>
3550
*/
3551
static Node *ParsePhp( TidyDocImpl* doc )
3552
41
{
3553
41
    Lexer* lexer = doc->lexer;
3554
41
    uint c;
3555
41
    Node *php = NULL;
3556
3557
41
    lexer->txtstart = lexer->lexsize;
3558
3559
41
    for (;;)
3560
50.5M
    {
3561
50.5M
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3562
16
            break;
3563
3564
50.5M
        TY_(AddCharToLexer)(lexer, c);
3565
3566
3567
50.5M
        if (c != '?')
3568
50.5M
            continue;
3569
3570
25
        if ((c = TY_(ReadChar)(doc->docIn)) == EndOfStream)
3571
0
            break;
3572
3573
25
        TY_(AddCharToLexer)(lexer, c);
3574
3575
25
        if (c == '>')
3576
25
        {
3577
25
            lexer->lexsize -= 2;
3578
25
            break;
3579
25
        }
3580
25
    }
3581
3582
41
    lexer->txtend = lexer->lexsize;
3583
41
    if (lexer->txtend > lexer->txtstart)
3584
41
        php = PhpToken(doc);
3585
3586
41
    lexer->txtstart = lexer->txtend;
3587
41
    return php;
3588
41
}   
3589
3590
/* consumes the '>' terminating start tags */
3591
/* @TODO: float the errors back to the calling method */
3592
static tmbstr  ParseAttribute( TidyDocImpl* doc, Bool *isempty,
3593
                              Node **asp, Node **php )
3594
39.9k
{
3595
39.9k
    Lexer* lexer = doc->lexer;
3596
39.9k
    int start, len = 0;
3597
39.9k
    tmbstr attr = NULL;
3598
39.9k
    uint c, lastc;
3599
3600
39.9k
    *asp = NULL;  /* clear asp pointer */
3601
39.9k
    *php = NULL;  /* clear php pointer */
3602
3603
 /* skip white space before the attribute */
3604
3605
39.9k
    for (;;)
3606
625k
    {
3607
625k
        c = TY_(ReadChar)( doc->docIn );
3608
3609
3610
625k
        if (c == '/')
3611
519
        {
3612
519
            c = TY_(ReadChar)( doc->docIn );
3613
3614
519
            if (c == '>')
3615
0
            {
3616
0
                *isempty = yes;
3617
0
                return NULL;
3618
0
            }
3619
3620
519
            TY_(UngetChar)(c, doc->docIn);
3621
519
            c = '/';
3622
519
            break;
3623
519
        }
3624
3625
625k
        if (c == '>')
3626
3.92k
            return NULL;
3627
3628
621k
        if (c =='<')
3629
2.09k
        {
3630
2.09k
            c = TY_(ReadChar)(doc->docIn);
3631
3632
2.09k
            if (c == '%')
3633
14
            {
3634
14
                *asp = ParseAsp( doc );
3635
14
                return NULL;
3636
14
            }
3637
2.07k
            else if (c == '?')
3638
41
            {
3639
41
                *php = ParsePhp( doc );
3640
41
                return NULL;
3641
41
            }
3642
3643
2.03k
            TY_(UngetChar)(c, doc->docIn);
3644
2.03k
            TY_(UngetChar)('<', doc->docIn);
3645
2.03k
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3646
2.03k
            return NULL;
3647
2.09k
        }
3648
3649
619k
        if (c == '=')
3650
0
        {
3651
0
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_EQUALSIGN );
3652
0
            continue;
3653
0
        }
3654
3655
619k
        if (c == '"' || c == '\'')
3656
114
        {
3657
114
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_QUOTEMARK );
3658
114
            continue;
3659
114
        }
3660
3661
619k
        if (c == EndOfStream)
3662
0
        {
3663
0
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3664
0
            TY_(UngetChar)(c, doc->docIn);
3665
0
            return NULL;
3666
0
        }
3667
3668
3669
619k
        if (!TY_(IsWhite)(c))
3670
33.4k
           break;
3671
619k
    }
3672
3673
33.9k
    start = lexer->lexsize;
3674
33.9k
    lastc = c;
3675
3676
33.9k
    for (;;)
3677
5.93M
    {
3678
     /* but push back '=' for parseValue() */
3679
5.93M
        if (c == '=' || c == '>')
3680
4.23k
        {
3681
4.23k
            TY_(UngetChar)(c, doc->docIn);
3682
4.23k
            break;
3683
4.23k
        }
3684
3685
5.92M
        if (c == '<' || c == EndOfStream)
3686
1.57k
        {
3687
1.57k
            TY_(UngetChar)(c, doc->docIn);
3688
1.57k
            break;
3689
1.57k
        }
3690
3691
5.92M
        if (lastc == '-' && (c == '"' || c == '\''))
3692
0
        {
3693
0
            lexer->lexsize--;
3694
0
            --len;
3695
0
            TY_(UngetChar)(c, doc->docIn);
3696
0
            break;
3697
0
        }
3698
3699
5.92M
        if (TY_(IsWhite)(c))
3700
28.1k
            break;
3701
3702
5.89M
        if (c == '/') /* Issue #395 - potential self closing tag */
3703
1.30k
        {
3704
1.30k
            c = TY_(ReadChar)(doc->docIn);  /* read next */
3705
1.30k
            if (c == '>')
3706
1
            {
3707
                /* got a self closing tag - put is back and continue... */
3708
1
                TY_(UngetChar)(c, doc->docIn);
3709
1
                break;
3710
1
            }
3711
1.30k
            else
3712
1.30k
            {
3713
                /* Not '/>' - put it back */
3714
1.30k
                TY_(UngetChar)(c, doc->docIn);
3715
1.30k
                c = '/';  /* restore original char */
3716
1.30k
            }
3717
1.30k
        }
3718
3719
        /* what should be done about non-namechar characters? */
3720
        /* currently these are incorporated into the attr name */
3721
3722
5.89M
        if ( cfg(doc, TidyUpperCaseAttrs) != TidyUppercasePreserve )
3723
5.89M
        {
3724
5.89M
            if ( !cfgBool(doc, TidyXmlTags) && TY_(IsUpper)(c) )
3725
0
                c = TY_(ToLower)(c);
3726
5.89M
        }
3727
3728
5.89M
        TY_(AddCharToLexer)( lexer, c );
3729
5.89M
        lastc = c;
3730
5.89M
        c = TY_(ReadChar)(doc->docIn);
3731
5.89M
    }
3732
3733
    /* handle attribute names with multibyte chars */
3734
33.9k
    len = lexer->lexsize - start;
3735
33.9k
    attr = (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3736
33.9k
                                      lexer->lexbuf+start, len) : NULL);
3737
33.9k
    lexer->lexsize = start;
3738
33.9k
    return attr;
3739
39.9k
}
3740
3741
/*
3742
 invoked when < is seen in place of attribute value
3743
 but terminates on whitespace if not ASP, PHP or Tango
3744
 this routine recognizes ' and " quoted strings
3745
*/
3746
static int ParseServerInstruction( TidyDocImpl* doc )
3747
8
{
3748
8
    Lexer* lexer = doc->lexer;
3749
8
    uint c;
3750
8
    int delim = '"';
3751
8
    Bool isrule = no;
3752
3753
8
    c = TY_(ReadChar)(doc->docIn);
3754
8
    TY_(AddCharToLexer)(lexer, c);
3755
3756
    /* check for ASP, PHP or Tango */
3757
8
    if (c == '%' || c == '?' || c == '@')
3758
3
        isrule = yes;
3759
3760
8
    for (;;)
3761
307k
    {
3762
307k
        c = TY_(ReadChar)(doc->docIn);
3763
3764
307k
        if (c == EndOfStream)
3765
2
            break;
3766
3767
307k
        if (c == '>')
3768
0
        {
3769
0
            if (isrule)
3770
0
                TY_(AddCharToLexer)(lexer, c);
3771
0
            else
3772
0
                TY_(UngetChar)(c, doc->docIn);
3773
3774
0
            break;
3775
0
        }
3776
3777
        /* if not recognized as ASP, PHP or Tango */
3778
        /* then also finish value on whitespace */
3779
307k
        if (!isrule)
3780
72
        {
3781
72
            if (TY_(IsWhite)(c))
3782
3
                break;
3783
72
        }
3784
3785
307k
        TY_(AddCharToLexer)(lexer, c);
3786
3787
307k
        if (c == '"')
3788
285k
        {
3789
285k
            do
3790
1.31M
            {
3791
1.31M
                c = TY_(ReadChar)(doc->docIn);
3792
1.31M
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3793
0
                {
3794
0
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3795
0
                    TY_(UngetChar)(c, doc->docIn);
3796
0
                    return 0;
3797
0
                }
3798
1.31M
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3799
1
                {
3800
1
                    TY_(UngetChar)(c, doc->docIn);
3801
1
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3802
1
                    return 0;
3803
1
                }
3804
1.31M
                TY_(AddCharToLexer)(lexer, c);
3805
1.31M
            }
3806
1.31M
            while (c != '"');
3807
285k
            delim = '\'';
3808
285k
            continue;
3809
285k
        }
3810
3811
21.5k
        if (c == '\'')
3812
13.9k
        {
3813
13.9k
            do
3814
306k
            {
3815
306k
                c = TY_(ReadChar)(doc->docIn);
3816
306k
                if (c == EndOfStream) /* #427840 - fix by Terry Teague 30 Jun 01 */
3817
2
                {
3818
2
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3819
2
                    TY_(UngetChar)(c, doc->docIn);
3820
2
                    return 0;
3821
2
                }
3822
306k
                if (c == '>') /* #427840 - fix by Terry Teague 30 Jun 01 */
3823
0
                {
3824
0
                    TY_(UngetChar)(c, doc->docIn);
3825
0
                    TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3826
0
                    return 0;
3827
0
                }
3828
306k
                TY_(AddCharToLexer)(lexer, c);
3829
306k
            }
3830
306k
            while (c != '\'');
3831
13.9k
        }
3832
21.5k
    }
3833
3834
5
    return delim;
3835
8
}
3836
3837
/* values start with "=" or " = " etc. */
3838
/* doesn't consume the ">" at end of start tag */
3839
3840
static tmbstr ParseValue( TidyDocImpl* doc, ctmbstr name,
3841
                          Bool foldCase, Bool *isempty, int *pdelim)
3842
33.9k
{
3843
33.9k
    Lexer* lexer = doc->lexer;
3844
33.9k
    int len = 0, start;
3845
33.9k
    Bool seen_gt = no;
3846
33.9k
    Bool munge = yes;
3847
33.9k
    uint c, lastc, delim, quotewarning;
3848
33.9k
    tmbstr value;
3849
3850
33.9k
    delim = (tmbchar) 0;
3851
33.9k
    *pdelim = '"';
3852
3853
    /*
3854
     Henry Zrepa reports that some folk are using the
3855
     embed element with script attributes where newlines
3856
     are significant and must be preserved
3857
    */
3858
33.9k
    if ( cfgBool(doc, TidyLiteralAttribs) )
3859
0
        munge = no;
3860
3861
 /* skip white space before the '=' */
3862
3863
33.9k
    for (;;)
3864
2.63M
    {
3865
2.63M
        c = TY_(ReadChar)(doc->docIn);
3866
3867
2.63M
        if (c == EndOfStream)
3868
13
        {
3869
13
            TY_(UngetChar)(c, doc->docIn);
3870
13
            break;
3871
13
        }
3872
3873
2.63M
        if (!TY_(IsWhite)(c))
3874
33.9k
           break;
3875
2.63M
    }
3876
3877
/*
3878
  c should be '=' if there is a value
3879
  other legal possibilities are white
3880
  space, '/' and '>'
3881
*/
3882
3883
33.9k
    if (c != '=' && c != '"' && c != '\'')
3884
33.4k
    {
3885
33.4k
        TY_(UngetChar)(c, doc->docIn);
3886
33.4k
        return NULL;
3887
33.4k
    }
3888
3889
 /* skip white space after '=' */
3890
3891
525
    for (;;)
3892
528
    {
3893
528
        c = TY_(ReadChar)(doc->docIn);
3894
3895
528
        if (c == EndOfStream)
3896
0
        {
3897
0
            TY_(UngetChar)(c, doc->docIn);
3898
0
            break;
3899
0
        }
3900
3901
528
        if (!TY_(IsWhite)(c))
3902
525
           break;
3903
528
    }
3904
3905
 /* check for quote marks */
3906
3907
525
    if (c == '"' || c == '\'')
3908
201
        delim = c;
3909
324
    else if (c == '<')
3910
8
    {
3911
8
        start = lexer->lexsize;
3912
8
        TY_(AddCharToLexer)(lexer, c);
3913
8
        *pdelim = ParseServerInstruction( doc );
3914
8
        len = lexer->lexsize - start;
3915
8
        lexer->lexsize = start;
3916
8
        return (len > 0 ? TY_(tmbstrndup)(doc->allocator,
3917
8
                                          lexer->lexbuf+start, len) : NULL);
3918
8
    }
3919
316
    else
3920
316
        TY_(UngetChar)(c, doc->docIn);
3921
3922
 /*
3923
   and read the value string
3924
   check for quote mark if needed
3925
 */
3926
3927
517
    quotewarning = 0;
3928
517
    start = lexer->lexsize;
3929
517
    c = '\0';
3930
3931
517
    for (;;)
3932
28.2k
    {
3933
28.2k
        lastc = c;  /* track last character */
3934
28.2k
        c = TY_(ReadChar)(doc->docIn);
3935
3936
28.2k
        if (c == EndOfStream)
3937
0
        {
3938
0
            TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_END_OF_FILE_ATTR );
3939
0
            TY_(UngetChar)(c, doc->docIn);
3940
0
            break;
3941
0
        }
3942
3943
28.2k
        if (delim == (tmbchar)0)
3944
1.90k
        {
3945
1.90k
            if (c == '>')
3946
2
            {
3947
2
                TY_(UngetChar)(c, doc->docIn);
3948
2
                break;
3949
2
            }
3950
3951
1.89k
            if (c == '"' || c == '\'')
3952
13
            {
3953
13
                uint q = c;
3954
3955
                /* handle <input onclick=s("btn1")> and <a title=foo""">...</a> */
3956
                /* this doesn't handle <a title=foo"/> which browsers treat as  */
3957
                /* 'foo"/' nor  <a title=foo" /> which browser treat as 'foo"'  */
3958
                
3959
13
                c = TY_(ReadChar)(doc->docIn);
3960
13
                if (c == '>')
3961
0
                {
3962
0
                    TY_(AddCharToLexer)(lexer, q);
3963
0
                    TY_(UngetChar)(c, doc->docIn);
3964
0
                    break;
3965
0
                }
3966
13
                else
3967
13
                {
3968
13
                    TY_(UngetChar)(c, doc->docIn);
3969
13
                    c = q;
3970
13
                }
3971
13
            }
3972
3973
1.89k
            if (c == '<')
3974
21
            {
3975
21
                TY_(UngetChar)(c, doc->docIn);
3976
21
                c = '>';
3977
21
                TY_(UngetChar)(c, doc->docIn);
3978
21
                TY_(ReportAttrError)( doc, lexer->token, NULL, UNEXPECTED_GT );
3979
21
                break;
3980
21
            }
3981
3982
            /*
3983
             For cases like <br clear=all/> need to avoid treating /> as
3984
             part of the attribute value, however care is needed to avoid
3985
             so treating <a href=http://www.acme.com/> in this way, which
3986
             would map the <a> tag to <a href="http://www.acme.com"/>
3987
            */
3988
1.87k
            if (c == '/')
3989
12
            {
3990
                /* peek ahead in case of /> */
3991
12
                c = TY_(ReadChar)(doc->docIn);
3992
3993
12
                if ( c == '>' && !TY_(IsUrl)(doc, name) )
3994
0
                {
3995
0
                    *isempty = yes;
3996
0
                    TY_(UngetChar)(c, doc->docIn);
3997
0
                    break;
3998
0
                }
3999
4000
                /* unget peeked character */
4001
12
                TY_(UngetChar)(c, doc->docIn);
4002
12
                c = '/';
4003
12
            }
4004
1.87k
        }
4005
26.3k
        else  /* delim is '\'' or '"' */
4006
26.3k
        {
4007
26.3k
            if (c == delim)
4008
201
                break;
4009
4010
26.1k
            if (c == '\n' || c == '<' || c == '>')
4011
3.78k
                ++quotewarning;
4012
4013
26.1k
            if (c == '>')
4014
358
                seen_gt = yes;
4015
26.1k
        }
4016
4017
27.9k
        if (c == '&')
4018
46
        {
4019
46
            TY_(AddCharToLexer)(lexer, c);
4020
46
            ParseEntity( doc, IgnoreWhitespace );
4021
46
            if (lexer->lexbuf[lexer->lexsize - 1] == '\n' && munge)
4022
0
                ChangeChar(lexer, ' ');
4023
46
            continue;
4024
46
        }
4025
4026
        /*
4027
         kludge for JavaScript attribute values
4028
         with line continuations in string literals
4029
        */
4030
27.9k
        if (c == '\\')
4031
2
        {
4032
2
            c = TY_(ReadChar)(doc->docIn);
4033
4034
2
            if (c != '\n')
4035
2
            {
4036
2
                TY_(UngetChar)(c, doc->docIn);
4037
2
                c = '\\';
4038
2
            }
4039
2
        }
4040
4041
27.9k
        if (TY_(IsWhite)(c))
4042
4.93k
        {
4043
4.93k
            if ( delim == 0 )
4044
293
                break;
4045
4046
4.63k
            if (munge)
4047
4.63k
            {
4048
                /* discard line breaks in quoted URLs */ 
4049
                /* #438650 - fix by Randy Waki */
4050
4.63k
                if ( c == '\n' && TY_(IsUrl)(doc, name) )
4051
286
                {
4052
                    /* warn that we discard this newline */
4053
286
                    TY_(ReportAttrError)( doc, lexer->token, NULL, NEWLINE_IN_URI);
4054
286
                    continue;
4055
286
                }
4056
                
4057
4.35k
                c = ' ';
4058
4059
4.35k
                if (lastc == ' ')
4060
3.27k
                {
4061
3.27k
                    if (TY_(IsUrl)(doc, name) )
4062
1.94k
                        TY_(ReportAttrError)( doc, lexer->token, NULL, WHITE_IN_URI);
4063
3.27k
                    continue;
4064
3.27k
                }
4065
4.35k
            }
4066
4.63k
        }
4067
23.0k
        else if (foldCase && TY_(IsUpper)(c))
4068
27
            c = TY_(ToLower)(c);
4069
4070
24.0k
        TY_(AddCharToLexer)(lexer, c);
4071
24.0k
    }
4072
4073
517
    if (quotewarning > 10 && seen_gt && munge)
4074
138
    {
4075
        /*
4076
           there is almost certainly a missing trailing quote mark
4077
           as we have see too many newlines, < or > characters.
4078
4079
           an exception is made for Javascript attributes and the
4080
           javascript URL scheme which may legitimately include < and >,
4081
           and for attributes starting with "<xml " as generated by
4082
           Microsoft Office.
4083
        */
4084
138
        if ( !TY_(IsScript)(doc, name) &&
4085
138
             !(TY_(IsUrl)(doc, name) && TY_(tmbstrncmp)(lexer->lexbuf+start, "javascript:", 11) == 0) &&
4086
138
             !(TY_(tmbstrncmp)(lexer->lexbuf+start, "<xml ", 5) == 0)
4087
138
           )
4088
138
            TY_(Report)( doc, NULL, NULL, SUSPECTED_MISSING_QUOTE ); 
4089
138
    }
4090
4091
517
    len = lexer->lexsize - start;
4092
517
    lexer->lexsize = start;
4093
4094
4095
517
    if (len > 0 || delim)
4096
517
    {
4097
        /* ignore leading and trailing white space for all but title, alt, value */
4098
        /* and prompts attributes unless --literal-attributes is set to yes      */
4099
        /* #994841 - Whitespace is removed from value attributes                 */
4100
4101
        /* Issue #217 - Also only if/while (len > 0) - MUST NEVER GO NEGATIVE! */
4102
517
        if ((len > 0) && munge &&
4103
517
            TY_(tmbstrcasecmp)(name, "alt") &&
4104
517
            TY_(tmbstrcasecmp)(name, "title") &&
4105
514
            TY_(tmbstrcasecmp)(name, "value") &&
4106
514
            TY_(tmbstrcasecmp)(name, "prompt"))
4107
514
        {
4108
610
            while (TY_(IsWhite)(lexer->lexbuf[start+len-1]) && (len > 0))
4109
96
                --len;
4110
4111
            /* Issue #497 - Fix leading space trimming */
4112
514
            while (TY_(IsWhite)(lexer->lexbuf[start]) && (len > 0))
4113
0
            {
4114
0
                ++start;
4115
0
                --len;
4116
0
            }
4117
514
        }
4118
4119
517
        value = TY_(tmbstrndup)(doc->allocator, lexer->lexbuf + start, len);
4120
517
    }
4121
0
    else
4122
0
        value = NULL;
4123
4124
    /* note delimiter if given */
4125
517
    *pdelim = delim;
4126
4127
517
    return value;
4128
525
}
4129
4130
/* attr must be non-NULL */
4131
static Bool IsValidAttrName( ctmbstr attr )
4132
33.8k
{
4133
33.8k
    uint i, c = attr[0];
4134
4135
    /* first character should be a letter */
4136
33.8k
    if (!TY_(IsLetter)(c))
4137
18.6k
        return no;
4138
4139
    /* remaining characters should be namechars */
4140
268k
    for( i = 1; i < TY_(tmbstrlen)(attr); i++)
4141
255k
    {
4142
255k
        c = attr[i];
4143
4144
255k
        if (TY_(IsNamechar)(c))
4145
253k
            continue;
4146
4147
1.87k
        return no;
4148
255k
    }
4149
4150
13.3k
    return yes;
4151
15.2k
}
4152
4153
/* create a new attribute */
4154
AttVal *TY_(NewAttribute)( TidyDocImpl* doc )
4155
34.0k
{
4156
34.0k
    AttVal *av = (AttVal*) TidyDocAlloc( doc, sizeof(AttVal) );
4157
34.0k
    TidyClearMemory( av, sizeof(AttVal) );
4158
34.0k
    return av;
4159
34.0k
}
4160
4161
/* create a new attribute with given name and value */
4162
AttVal* TY_(NewAttributeEx)( TidyDocImpl* doc, ctmbstr name, ctmbstr value,
4163
                             int delim )
4164
0
{
4165
0
    AttVal *av = TY_(NewAttribute)(doc);
4166
0
    av->attribute = TY_(tmbstrdup)(doc->allocator, name);
4167
0
    av->value = TY_(tmbstrdup)(doc->allocator, value);
4168
0
    av->delim = delim;
4169
0
    av->dict = TY_(FindAttribute)( doc, av );
4170
0
    return av;
4171
0
}
4172
4173
/* Append av at the tail of the singly linked attribute list *list.
   An empty list (*list == NULL) simply becomes av. */
static void AddAttrToList( AttVal** list, AttVal* av )
{
  AttVal** link = list;

  /* follow the chain of next pointers to the empty slot at the end */
  while ( *link != NULL )
    link = &(*link)->next;

  *link = av;
}
4185
4186
/* Append av after the last attribute already attached to node. */
void TY_(InsertAttributeAtEnd)( Node *node, AttVal *av )
{
    AttVal **head = &node->attributes;

    AddAttrToList( head, av );
}
4190
4191
/* Make av the first attribute of node; the attribute that used to be
   first (if any) becomes av's successor. */
void TY_(InsertAttributeAtStart)( Node *node, AttVal *av )
{
    AttVal *formerFirst = node->attributes;

    av->next = formerFirst;
    node->attributes = av;
}
4196
4197
/* swallows closing '>' */
4198
4199
static AttVal* ParseAttrs( TidyDocImpl* doc, Bool *isempty )
4200
5.98k
{
4201
5.98k
    Lexer* lexer = doc->lexer;
4202
5.98k
    AttVal *av, *list;
4203
5.98k
    tmbstr value;
4204
5.98k
    int delim;
4205
5.98k
    Node *asp, *php;
4206
4207
5.98k
    list = NULL;
4208
4209
39.8k
    while ( !EndOfInput(doc) )
4210
39.8k
    {
4211
39.8k
        tmbstr attribute = ParseAttribute( doc, isempty, &asp, &php );
4212
4213
39.8k
        if (attribute == NULL)
4214
5.97k
        {
4215
            /* check if attributes are created by ASP markup */
4216
5.97k
            if (asp)
4217
10
            {
4218
10
                av = TY_(NewAttribute)(doc);
4219
10
                av->asp = asp;
4220
10
                AddAttrToList( &list, av ); 
4221
10
                continue;
4222
10
            }
4223
4224
            /* check if attributes are created by PHP markup */
4225
5.96k
            if (php)
4226
17
            {
4227
17
                av = TY_(NewAttribute)(doc);
4228
17
                av->php = php;
4229
17
                AddAttrToList( &list, av ); 
4230
17
                continue;
4231
17
            }
4232
4233
5.94k
            break;
4234
5.96k
        }
4235
4236
33.8k
        value = ParseValue( doc, attribute, no, isempty, &delim );
4237
4238
33.8k
        if (attribute && (IsValidAttrName(attribute) ||
4239
20.5k
            (cfgBool(doc, TidyXmlTags) && IsValidXMLAttrName(attribute))))
4240
13.5k
        {
4241
13.5k
            av = TY_(NewAttribute)(doc);
4242
13.5k
            av->delim = delim ? delim : '"';
4243
13.5k
            av->attribute = attribute;
4244
13.5k
            av->value = value;
4245
13.5k
            av->dict = TY_(FindAttribute)( doc, av );
4246
13.5k
            AddAttrToList( &list, av );
4247
13.5k
            if ( !delim && value )
4248
145
                TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK_OPEN);
4249
13.5k
        }
4250
20.3k
        else
4251
20.3k
        {
4252
20.3k
            av = TY_(NewAttribute)(doc);
4253
20.3k
            av->attribute = attribute;
4254
20.3k
            av->value = value;
4255
4256
20.3k
            if (LastChar(attribute) == '"')
4257
8
                TY_(ReportAttrError)( doc, lexer->token, av, MISSING_QUOTEMARK);
4258
20.3k
            else if (value == NULL)
4259
20.1k
                TY_(ReportAttrError)(doc, lexer->token, av, MISSING_ATTR_VALUE);
4260
222
            else
4261
222
                TY_(ReportAttrError)(doc, lexer->token, av, INVALID_ATTRIBUTE);
4262
4263
20.3k
            TY_(FreeAttribute)( doc, av );
4264
20.3k
        }
4265
33.8k
    }
4266
4267
5.98k
    return list;
4268
5.98k
}
4269
4270
/*
4271
  Returns document type declarations like
4272
4273
  <!DOCTYPE foo PUBLIC "fpi" "sysid">
4274
  <!DOCTYPE bar SYSTEM "sysid">
4275
  <!DOCTYPE baz [ <!ENTITY ouml "&#246"> ]>
4276
4277
  as
4278
4279
  <foo PUBLIC="fpi" SYSTEM="sysid" />
4280
  <bar SYSTEM="sysid" />
4281
  <baz> &lt;!ENTITY ouml &quot;&amp;#246&quot;&gt; </baz>
4282
*/
4283
static Node *ParseDocTypeDecl(TidyDocImpl* doc)
4284
36.5k
{
4285
36.5k
    Lexer *lexer = doc->lexer;
4286
36.5k
    int start = lexer->lexsize;
4287
36.5k
    ParseDocTypeDeclState state = DT_DOCTYPENAME;
4288
36.5k
    uint c;
4289
36.5k
    uint delim = 0;
4290
36.5k
    Bool hasfpi = yes;
4291
4292
36.5k
    Node* node = TY_(NewNode)(lexer->allocator, lexer);
4293
36.5k
    node->type = DocTypeTag;
4294
36.5k
    node->start = lexer->txtstart;
4295
36.5k
    node->end = lexer->txtend;
4296
4297
36.5k
    lexer->waswhite = no;
4298
4299
    /* todo: reset lexer->lexsize when appropriate to avoid wasting memory */
4300
4301
59.4M
    while ((c = TY_(ReadChar)(doc->docIn)) != EndOfStream)
4302
59.4M
    {
4303
        /* convert newlines to spaces */
4304
59.4M
        if (state != DT_INTSUBSET)
4305
428k
            c = c == '\n' ? ' ' : c;
4306
4307
        /* convert white-space sequences to single space character */
4308
59.4M
        if (TY_(IsWhite)(c) && state != DT_INTSUBSET)
4309
853
        {
4310
853
            if (!lexer->waswhite)
4311
502
            {
4312
502
                TY_(AddCharToLexer)(lexer, c);
4313
502
                lexer->waswhite = yes;
4314
502
            }
4315
351
            else
4316
351
            {
4317
                /* discard space */
4318
351
                continue;
4319
351
            }
4320
853
        }
4321
59.4M
        else
4322
59.4M
        {
4323
59.4M
            TY_(AddCharToLexer)(lexer, c);
4324
59.4M
            lexer->waswhite = no;
4325
59.4M
        }
4326
4327
59.4M
        switch(state)
4328
59.4M
        {
4329
76.1k
        case DT_INTERMEDIATE:
4330
            /* determine what's next */
4331
76.1k
            if (TY_(ToUpper)(c) == 'P' || TY_(ToUpper)(c) == 'S')
4332
35
            {
4333
35
                start = lexer->lexsize - 1;
4334
35
                state = DT_PUBLICSYSTEM;
4335
35
                continue;
4336
35
            }
4337
76.1k
            else if (c == '[')
4338
36.6k
            {
4339
36.6k
                start = lexer->lexsize;
4340
36.6k
                state = DT_INTSUBSET;
4341
36.6k
                continue;
4342
36.6k
            }
4343
39.4k
            else if (c == '\'' || c == '"')
4344
16
            {
4345
16
                start = lexer->lexsize;
4346
16
                delim = c;
4347
16
                state = DT_QUOTEDSTRING;
4348
16
                continue;
4349
16
            }
4350
39.4k
            else if (c == '>')
4351
36.5k
            {
4352
36.5k
                AttVal* si;
4353
4354
36.5k
                node->end = --(lexer->lexsize);
4355
4356
36.5k
                si = TY_(GetAttrByName)(node, "SYSTEM");
4357
36.5k
                if (si)
4358
3
                    TY_(CheckUrl)(doc, node, si);
4359
4360
36.5k
                if (!node->element || !IsValidXMLElemName(node->element))
4361
58
                {
4362
58
                    TY_(Report)(doc, NULL, NULL, MALFORMED_DOCTYPE);
4363
58
                    TY_(FreeNode)(doc, node);
4364
58
                    return NULL;
4365
58
                }
4366
36.5k
                return node;
4367
36.5k
            }
4368
2.91k
            else
4369
2.91k
            {
4370
                /* error */
4371
2.91k
            }
4372
2.91k
            break;
4373
350k
        case DT_DOCTYPENAME:
4374
            /* read document type name */
4375
350k
            if (TY_(IsWhite)(c) || c == '>' || c == '[')
4376
36.5k
            {
4377
36.5k
                node->element = TY_(tmbstrndup)(doc->allocator,
4378
36.5k
                                                lexer->lexbuf + start,
4379
36.5k
                                                lexer->lexsize - start - 1);
4380
36.5k
                if (c == '>' || c == '[')
4381
36.2k
                {
4382
36.2k
                    --(lexer->lexsize);
4383
36.2k
                    TY_(UngetChar)(c, doc->docIn);
4384
36.2k
                }
4385
4386
36.5k
                state = DT_INTERMEDIATE;
4387
36.5k
                continue;
4388
36.5k
            }
4389
314k
            break;
4390
314k
        case DT_PUBLICSYSTEM:
4391
            /* read PUBLIC/SYSTEM */
4392
454
            if (TY_(IsWhite)(c) || c == '>')
4393
35
            {
4394
35
                char *attname = TY_(tmbstrndup)(doc->allocator,
4395
35
                                                lexer->lexbuf + start,
4396
35
                                                lexer->lexsize - start - 1);
4397
35
                hasfpi = !(TY_(tmbstrcasecmp)(attname, "SYSTEM") == 0);
4398
4399
35
                TidyDocFree(doc, attname);
4400
4401
                /* todo: report an error if SYSTEM/PUBLIC not uppercase */
4402
4403
35
                if (c == '>')
4404
0
                {
4405
0
                    --(lexer->lexsize);
4406
0
                    TY_(UngetChar)(c, doc->docIn);
4407
0
                }
4408
4409
35
                state = DT_INTERMEDIATE;
4410
35
                continue;
4411
35
            }
4412
419
            break;
4413
419
        case DT_QUOTEDSTRING:
4414
            /* read quoted string */
4415
211
            if (c == delim)
4416
16
            {
4417
16
                char *value = TY_(tmbstrndup)(doc->allocator,
4418
16
                                              lexer->lexbuf + start,
4419
16
                                              lexer->lexsize - start - 1);
4420
16
                AttVal* att = TY_(AddAttribute)(doc, node, hasfpi ? "PUBLIC" : "SYSTEM", value);
4421
16
                TidyDocFree(doc, value);
4422
16
                att->delim = delim;
4423
16
                hasfpi = no;
4424
16
                state = DT_INTERMEDIATE;
4425
16
                delim = 0;
4426
16
                continue;
4427
16
            }
4428
195
            break;
4429
59.0M
        case DT_INTSUBSET:
4430
            /* read internal subset */
4431
59.0M
            if (c == ']')
4432
36.6k
            {
4433
36.6k
                Node* subset;
4434
36.6k
                lexer->txtstart = start;
4435
36.6k
                lexer->txtend = lexer->lexsize - 1;
4436
36.6k
                subset = TY_(TextToken)(lexer);
4437
36.6k
                TY_(InsertNodeAtEnd)(node, subset);
4438
36.6k
                state = DT_INTERMEDIATE;
4439
36.6k
            }
4440
59.0M
            break;
4441
59.4M
        }
4442
59.4M
    }
4443
4444
    /* document type declaration not finished */
4445
2
    TY_(Report)(doc, NULL, NULL, MALFORMED_DOCTYPE);
4446
2
    TY_(FreeNode)(doc, node);
4447
2
    return NULL;
4448
36.5k
}
4449
4450
4451
/****************************************************************************//*
4452
 ** MARK: - Node Stack
4453
 ***************************************************************************/
4454
4455
4456
/**
4457
 * Create a new stack with a given starting capacity. If memory allocation
4458
 * fails, then the allocator will panic the program automatically.
4459
 */
4460
Stack* TY_(newStack)(TidyDocImpl *doc, uint capacity)
4461
0
{
4462
0
    Stack *stack = (Stack *)TidyAlloc(doc->allocator, sizeof(Stack));
4463
0
    stack->top = -1;
4464
0
    stack->capacity = capacity;
4465
0
    stack->firstNode = (Node **)TidyAlloc(doc->allocator, stack->capacity * sizeof(Node**));
4466
0
    stack->allocator = doc->allocator;
4467
0
    return stack;
4468
0
}
4469
 
4470
4471
/**
4472
 *  Increase the stack size. This will be called automatically when the
4473
 *  current stack is full. If memory allocation fails, then the allocator
4474
 *  will panic the program automatically.
4475
 */
4476
void TY_(growStack)(Stack *stack)
4477
0
{
4478
0
    uint new_capacity = stack->capacity * 2;
4479
    
4480
0
    Node **firstNode = (Node **)TidyAlloc(stack->allocator, new_capacity * sizeof(Node**));
4481
    
4482
0
    memcpy( firstNode, stack->firstNode, sizeof(Node**) * (stack->top + 1) );
4483
0
    TidyFree(stack->allocator, stack->firstNode);
4484
4485
0
    stack->firstNode = firstNode;
4486
0
    stack->capacity = new_capacity;
4487
0
}
4488
4489
4490
/**
4491
 * Stack is full when top is equal to the last index.
4492
 */
4493
Bool TY_(stackFull)(Stack *stack)
4494
0
{
4495
0
    return stack->top == stack->capacity - 1;
4496
0
}
4497
4498
4499
/**
4500
 * Stack is empty when top is equal to -1
4501
 */
4502
Bool TY_(stackEmpty)(Stack *stack)
4503
0
{
4504
0
    return stack->top == -1;
4505
0
}
4506
 
4507
4508
/**
4509
 * Push an item to the stack.
4510
 */
4511
void TY_(push)(Stack *stack, Node *node)
4512
0
{
4513
0
    if (TY_(stackFull)(stack))
4514
0
        TY_(growStack)(stack);
4515
    
4516
0
    if (node)
4517
0
        stack->firstNode[++stack->top] = node;
4518
0
}
4519
4520
4521
/**
4522
 * Pop an item from the stack.
4523
 */
4524
Node* TY_(pop)(Stack *stack)
4525
0
{
4526
0
    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
4527
0
}
4528
4529
4530
/**
4531
 * Peek at the stack.
4532
 */
4533
FUNC_UNUSED Node* TY_(peek)(Stack *stack)
4534
0
{
4535
0
    return TY_(stackEmpty)(stack) ? NULL : stack->firstNode[stack->top--];
4536
0
}
4537
4538
/**
4539
 *  Frees the stack when done.
4540
 */
4541
void TY_(freeStack)(Stack *stack)
4542
0
{
4543
0
    TidyFree( stack->allocator, stack->firstNode );
4544
0
    stack->top = -1;
4545
0
    stack->capacity = 0;
4546
0
    stack->firstNode = NULL;
4547
    stack->allocator = NULL;
4548
0
}