Coverage Report

Created: 2025-11-16 06:37

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tidy-html5/src/parser.c
Line
Count
Source
1
/* parser.c -- HTML Parser
2
3
  (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4
  See tidy.h for the copyright notice.
5
6
*/
7
8
#include "tidy-int.h"
9
#include "lexer.h"
10
#include "parser.h"
11
#include "message.h"
12
#include "clean.h"
13
#include "tags.h"
14
#include "tmbstr.h"
15
#include "sprtf.h"
16
17
18
/****************************************************************************//*
19
 ** MARK: - Configuration Options
20
 ***************************************************************************/
21
22
23
/**
 *  Issue #72  - Need to know to avoid error-reporting - no warning only if
 *               --show-body-only yes.
 *  Issue #132 - Likewise avoid warning if showing body only.
 *
 *  Evaluates to `yes` when the TidyBodyOnly option of `doc` is TidyYesState
 *  and to `no` otherwise. The whole expansion is parenthesised so that the
 *  conditional operator cannot re-associate with operators that surround
 *  the macro at the call site (e.g. `showingBodyOnly(doc) + 1`).
 */
#define showingBodyOnly(doc) ((cfgAutoBool((doc), TidyBodyOnly) == TidyYesState) ? yes : no)
29
30
31
/****************************************************************************//*
32
 ** MARK: - Forward Declarations
33
 ***************************************************************************/
34
35
36
static Node* ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode);
37
38
39
/****************************************************************************//*
40
 ** MARK: - Node Operations
41
 ***************************************************************************/
42
43
44
/**
45
 *  Generalised search for duplicate elements.
46
 *  Issue #166 - repeated <main> element.
47
 */
48
static Bool findNodeWithId( Node *node, TidyTagId tid )
49
2.06k
{
50
2.06k
    Node *content;
51
5.22k
    while (node)
52
4.01k
    {
53
4.01k
        if (TagIsId(node,tid))
54
284
            return yes;
55
        /*\
56
         *   Issue #459 - Under certain circumstances, with many node this use of
57
         *   'for (content = node->content; content; content = content->content)'
58
         *   would produce a **forever** circle, or at least a very extended loop...
59
         *   It is sufficient to test the content, if it exists,
60
         *   to quickly iterate all nodes. Now all nodes are tested only once.
61
        \*/
62
3.73k
        content = node->content;
63
3.73k
        if (content)
64
1.64k
        {
65
1.64k
            if ( findNodeWithId(content,tid) )
66
569
                return yes;
67
1.64k
        }
68
3.16k
        node = node->next;
69
3.16k
    }
70
1.20k
    return no;
71
2.06k
}
72
73
74
/**
75
 *  Perform a global search for an element.
76
 *  Issue #166 - repeated <main> element
77
 */
78
static Bool findNodeById( TidyDocImpl* doc, TidyTagId tid )
79
417
{
80
417
    Node *node = (doc ? doc->root.content : NULL);
81
417
    return findNodeWithId( node,tid );
82
417
}
83
84
85
/**
86
 *  Inserts node into element at an appropriate location based
87
 *  on the type of node being inserted.
88
 */
89
static Bool InsertMisc(Node *element, Node *node)
90
3.27M
{
91
3.27M
    if (node->type == CommentTag ||
92
3.06M
        node->type == ProcInsTag ||
93
3.00M
        node->type == CDATATag ||
94
3.00M
        node->type == SectionTag ||
95
2.98M
        node->type == AspTag ||
96
2.97M
        node->type == JsteTag ||
97
2.97M
        node->type == PhpTag )
98
301k
    {
99
301k
        TY_(InsertNodeAtEnd)(element, node);
100
301k
        return yes;
101
301k
    }
102
103
2.97M
    if ( node->type == XmlDecl )
104
60.8k
    {
105
60.8k
        Node* root = element;
106
184k
        while ( root && root->parent )
107
123k
            root = root->parent;
108
60.8k
        if ( root && !(root->content && root->content->type == XmlDecl))
109
2.26k
        {
110
2.26k
          TY_(InsertNodeAtStart)( root, node );
111
2.26k
          return yes;
112
2.26k
        }
113
60.8k
    }
114
115
    /* Declared empty tags seem to be slipping through
116
    ** the cracks.  This is an experiment to figure out
117
    ** a decent place to pick them up.
118
    */
119
2.97M
    if ( node->tag &&
120
2.50M
         TY_(nodeIsElement)(node) &&
121
2.40M
         TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN &&
122
0
         (node->tag->versions & VERS_PROPRIETARY) != 0 )
123
0
    {
124
0
        TY_(InsertNodeAtEnd)(element, node);
125
0
        return yes;
126
0
    }
127
128
2.97M
    return no;
129
2.97M
}
130
131
132
/**
133
 *  Insert "node" into markup tree in place of "element"
134
 *  which is moved to become the child of the node
135
 */
136
static void InsertNodeAsParent(Node *element, Node *node)
137
780
{
138
780
    node->content = element;
139
780
    node->last = element;
140
780
    node->parent = element->parent;
141
780
    element->parent = node;
142
143
780
    if (node->parent->content == element)
144
409
        node->parent->content = node;
145
146
780
    if (node->parent->last == element)
147
510
        node->parent->last = node;
148
149
780
    node->prev = element->prev;
150
780
    element->prev = NULL;
151
152
780
    if (node->prev)
153
371
        node->prev->next = node;
154
155
780
    node->next = element->next;
156
780
    element->next = NULL;
157
158
780
    if (node->next)
159
270
        node->next->prev = node;
160
780
}
161
162
163
/**
164
 *  Unexpected content in table row is moved to just before the table in
165
 *  in accordance with Netscape and IE. This code assumes that node hasn't
166
 *  been inserted into the row.
167
 */
168
static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
169
                            Node *node )
170
42.9k
{
171
42.9k
    Node *table;
172
173
    /* first find the table element */
174
1.92M
    for (table = row->parent; table; table = table->parent)
175
1.89M
    {
176
1.89M
        if ( nodeIsTABLE(table) )
177
10.5k
        {
178
10.5k
            TY_(InsertNodeBeforeElement)( table, node );
179
10.5k
            return;
180
10.5k
        }
181
1.89M
    }
182
    /* No table element */
183
32.4k
    TY_(InsertNodeBeforeElement)( row->parent, node );
184
32.4k
}
185
186
187
/**
188
 *  Moves given node to end of body element.
189
 */
190
static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
191
1.09k
{
192
1.09k
    Node* body = TY_(FindBody)( doc );
193
1.09k
    if ( body )
194
662
    {
195
662
        TY_(RemoveNode)( node );
196
662
        TY_(InsertNodeAtEnd)( body, node );
197
662
    }
198
1.09k
}
199
200
201
/**
202
 *  Move node to the head, where element is used as starting
203
 *  point in hunt for head. Normally called during parsing.
204
 */
205
static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
206
6.65k
{
207
6.65k
    Node *head = NULL;
208
209
6.65k
    TY_(RemoveNode)( node );  /* make sure that node is isolated */
210
211
6.65k
    if ( TY_(nodeIsElement)(node) )
212
6.57k
    {
213
6.57k
        TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN );
214
215
6.57k
        head = TY_(FindHEAD)(doc);
216
6.57k
        assert(head != NULL);
217
218
6.57k
        TY_(InsertNodeAtEnd)(head, node);
219
220
6.57k
        if ( node->tag->parser )
221
6.57k
        {
222
            /* Only one of the existing test cases as of 2021-08-14 invoke
223
               MoveToHead, and it doesn't go deeper than one level. The
224
               parser() call is supposed to return a node if additional
225
               parsing is needed. Keep this in mind if we start to get bug
226
               reports.
227
             */
228
6.57k
            Parser* parser = node->tag->parser;
229
6.57k
            parser( doc, node, IgnoreWhitespace );
230
6.57k
        }
231
6.57k
    }
232
75
    else
233
75
    {
234
75
        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
235
75
        TY_(FreeNode)( doc, node );
236
75
    }
237
6.65k
}
238
239
240
/***************************************************************************//*
241
 ** MARK: - Decision Making
242
 ***************************************************************************/
243
244
245
/**
246
 *  Indicates whether or not element can be pruned based on content,
247
 *  user settings, etc.
248
 */
249
static Bool CanPrune( TidyDocImpl* doc, Node *element )
250
2.08M
{
251
2.08M
    if ( !cfgBool(doc, TidyDropEmptyElems) )
252
0
        return no;
253
254
2.08M
    if ( TY_(nodeIsText)(element) )
255
11.3k
        return yes;
256
257
2.07M
    if ( element->content )
258
1.49M
        return no;
259
260
581k
    if ( element->tag == NULL )
261
337
        return no;
262
263
581k
    if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
264
61.9k
        return no;
265
266
519k
    if ( nodeIsA(element) && element->attributes != NULL )
267
3.94k
        return no;
268
269
515k
    if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
270
0
        return no;
271
272
515k
    if ( element->tag->model & CM_ROW )
273
6.44k
        return no;
274
275
509k
    if ( element->tag->model & CM_EMPTY )
276
55.0k
        return no;
277
278
454k
    if ( nodeIsAPPLET(element) )
279
737
        return no;
280
281
453k
    if ( nodeIsOBJECT(element) )
282
586
        return no;
283
284
452k
    if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
285
0
        return no;
286
287
452k
    if ( nodeIsTITLE(element) )
288
18.9k
        return no;
289
290
    /* #433359 - fix by Randy Waki 12 Mar 01 */
291
434k
    if ( nodeIsIFRAME(element) )
292
184
        return no;
293
294
    /* fix for bug 770297 */
295
433k
    if (nodeIsTEXTAREA(element))
296
473
        return no;
297
298
    /* fix for ISSUE #7 https://github.com/w3c/tidy-html5/issues/7 */
299
433k
    if (nodeIsCANVAS(element))
300
207
        return no;
301
    
302
433k
    if (nodeIsPROGRESS(element))
303
337
        return no;
304
305
432k
    if ( attrGetID(element) || attrGetNAME(element) )
306
19.8k
        return no;
307
308
    /* fix for bug 695408; a better fix would look for unknown and    */
309
    /* known proprietary attributes that make the element significant */
310
412k
    if (attrGetDATAFLD(element))
311
0
        return no;
312
313
    /* fix for bug 723772, don't trim new-...-tags */
314
412k
    if (element->tag->id == TidyTag_UNKNOWN)
315
0
        return no;
316
317
412k
    if (nodeIsBODY(element))
318
7.23k
        return no;
319
320
405k
    if (nodeIsCOLGROUP(element))
321
1.90k
        return no;
322
323
    /* HTML5 - do NOT drop empty option if it has attributes */
324
403k
    if ( nodeIsOPTION(element) && element->attributes != NULL )
325
123
        return no;
326
327
    /* fix for #103 - don't drop empty dd tags lest document not validate */
328
403k
    if (nodeIsDD(element))
329
2.02k
        return no;
330
331
401k
    return yes;
332
403k
}
333
334
335
/**
336
 *  Indicates whether or not node is a descendant of a tag of the given tid.
337
 */
338
static Bool DescendantOf( Node *element, TidyTagId tid )
339
275k
{
340
275k
    Node *parent;
341
275k
    for ( parent = element->parent;
342
149M
         parent != NULL;
343
149M
         parent = parent->parent )
344
149M
    {
345
149M
        if ( TagIsId(parent, tid) )
346
51.9k
            return yes;
347
149M
    }
348
223k
    return no;
349
275k
}
350
351
352
/**
353
 *  Indicates whether or not node is a descendant of a pre tag.
354
 */
355
static Bool IsPreDescendant(Node* node)
356
1.12M
{
357
1.12M
    Node *parent = node->parent;
358
359
1.42G
    while (parent)
360
1.42G
    {
361
1.42G
        if (parent->tag && parent->tag->parser == TY_(ParsePre))
362
52.4k
            return yes;
363
364
1.42G
        parent = parent->parent;
365
1.42G
    }
366
367
1.07M
    return no;
368
1.12M
}
369
370
371
/**
372
 *  Indicates whether or not the only content model for the given node
373
 *  is CM_INLINE.
374
 */
375
static Bool nodeCMIsOnlyInline( Node* node )
376
0
{
377
0
    return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK );
378
0
}
379
380
381
/**
382
 *  Indicates whether or not the content of the given node is acceptable
383
 *  content for pre elements
384
 */
385
static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
386
134k
{
387
    /* p is coerced to br's, Text OK too */
388
134k
    if ( nodeIsP(node) || TY_(nodeIsText)(node) )
389
6.17k
        return yes;
390
391
128k
    if ( node->tag == NULL ||
392
128k
         nodeIsPARAM(node) ||
393
128k
         !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) )
394
123k
        return no;
395
396
5.36k
    return yes;
397
128k
}
398
399
400
/**
401
 *  Indicates whether or not leading whitespace should be cleaned.
402
 */
403
static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
404
222k
{
405
222k
    if (!TY_(nodeIsText)(node))
406
0
        return no;
407
408
222k
    if (node->parent->type == DocTypeTag)
409
564
        return no;
410
411
221k
    if (IsPreDescendant(node))
412
10.8k
        return no;
413
414
210k
    if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
415
4.39k
        return no;
416
    
417
    /* #523, prevent blank spaces after script if the next item is script.
418
     * This is actually more generalized as, if the preceding element is
419
     * a body level script, then indicate that we want to clean leading
420
     * whitespace.
421
     */
422
206k
    if ( node->prev && nodeIsSCRIPT(node->prev) && nodeIsBODY(node->prev->parent) )
423
251
        return yes;
424
425
    /* <p>...<br> <em>...</em>...</p> */
426
206k
    if (nodeIsBR(node->prev))
427
2.32k
        return yes;
428
429
    /* <p> ...</p> */
430
203k
    if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE))
431
61.9k
        return yes;
432
433
    /* <h4>...</h4> <em>...</em> */
434
141k
    if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) &&
435
88.3k
        TY_(nodeIsElement)(node->prev))
436
7.04k
        return yes;
437
438
    /* <p><span> ...</span></p> */
439
134k
    if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE))
440
6.80k
        return yes;
441
442
127k
    return no;
443
134k
}
444
445
446
/**
447
 *  Indicates whether or not trailing whitespace should be cleaned.
448
 */
449
static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
450
222k
{
451
222k
    Node* next;
452
453
222k
    if (!TY_(nodeIsText)(node))
454
0
        return no;
455
456
222k
    if (node->parent->type == DocTypeTag)
457
564
        return no;
458
459
221k
    if (IsPreDescendant(node))
460
10.8k
        return no;
461
462
210k
    if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
463
4.39k
        return no;
464
465
    /* #523, prevent blank spaces after script if the next item is script.
466
     * This is actually more generalized as, if the next element is
467
     * a body level script, then indicate that we want to clean trailing
468
     * whitespace.
469
     */
470
206k
    if ( node->next && nodeIsSCRIPT(node->next) && nodeIsBODY(node->next->parent) )
471
295
        return yes;
472
473
206k
    next = node->next;
474
475
    /* <p>... </p> */
476
206k
    if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE))
477
43.2k
        return yes;
478
479
    /* <div><small>... </small><h3>...</h3></div> */
480
162k
    if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE))
481
6.10k
        return yes;
482
483
156k
    if (!next)
484
21.0k
        return no;
485
486
135k
    if (nodeIsBR(next))
487
1.91k
        return yes;
488
489
133k
    if (TY_(nodeHasCM)(next, CM_INLINE))
490
33.7k
        return no;
491
492
    /* <a href='/'>...</a> <p>...</p> */
493
99.9k
    if (next->type == StartTag)
494
16.7k
        return yes;
495
496
    /* <strong>...</strong> <hr /> */
497
83.1k
    if (next->type == StartEndTag)
498
1.32k
        return yes;
499
500
    /* evil adjacent text nodes, Tidy should not generate these :-( */
501
81.8k
    if (TY_(nodeIsText)(next) && next->start < next->end
502
51.6k
        && TY_(IsWhite)(doc->lexer->lexbuf[next->start]))
503
20.4k
        return yes;
504
505
61.4k
    return no;
506
81.8k
}
507
508
509
/***************************************************************************//*
510
 ** MARK: - Information Accumulation
511
 ***************************************************************************/
512
513
514
/**
515
 *  Errors in positioning of form start or end tags
516
 *  generally require human intervention to fix.
517
 *  Issue #166 - repeated <main> element also uses this flag
518
 *  to indicate duplicates, discarded.
519
 */
520
static void BadForm( TidyDocImpl* doc )
521
1.66k
{
522
1.66k
    doc->badForm |= flg_BadForm;
523
1.66k
}
524
525
526
/***************************************************************************//*
527
 ** MARK: - Fixes and Touchup
528
 ***************************************************************************/
529
530
531
/**
532
 *  Adds style information as a class in the document or a property
533
 *  of the node to prevent indentation of inferred UL tags.
534
 */
535
static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
536
752
{
537
752
    ctmbstr sprop =
538
752
    "padding-left: 2ex; margin-left: 0ex"
539
752
    "; margin-top: 0ex; margin-bottom: 0ex";
540
752
    if ( !cfgBool(doc, TidyDecorateInferredUL) )
541
752
        return;
542
0
    if ( cfgBool(doc, TidyMakeClean) )
543
0
        TY_(AddStyleAsClass)( doc, node, sprop );
544
0
    else
545
0
        TY_(AddStyleProperty)( doc, node, sprop );
546
0
}
547
548
549
/**
550
 *  Cleans whitespace from text nodes, and drops such nodes if emptied
551
 *  completely as a result.
552
 */
553
static void CleanSpaces(TidyDocImpl* doc, Node* node)
554
16.9k
{
555
16.9k
    Stack *stack = TY_(newStack)(doc, 16);
556
16.9k
    Node *next;
557
    
558
2.24M
    while (node)
559
2.23M
    {
560
2.23M
        next = node->next;
561
562
2.23M
        if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node))
563
81.5k
            while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
564
3.17k
                ++(node->start);
565
566
2.23M
        if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node))
567
124k
            while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
568
34.6k
                --(node->end);
569
570
2.23M
        if (TY_(nodeIsText)(node) && !(node->start < node->end))
571
21.5k
        {
572
21.5k
            TY_(RemoveNode)(node);
573
21.5k
            TY_(FreeNode)(doc, node);
574
21.5k
            node = next ? next : TY_(pop)(stack);
575
21.5k
            continue;
576
21.5k
        }
577
578
2.21M
        if (node->content)
579
1.51M
        {
580
1.51M
            TY_(push)(stack, next);
581
1.51M
            node = node->content;
582
1.51M
            continue;
583
1.51M
        }
584
585
699k
        node = next ? next : TY_(pop)(stack);
586
699k
    }
587
16.9k
    TY_(freeStack)(stack);
588
16.9k
}
589
590
591
/**
592
 *  If a table row is empty then insert an empty cell. This practice is
593
 *  consistent with browser behavior and avoids potential problems with
594
 *  row spanning cells.
595
 */
596
static void FixEmptyRow(TidyDocImpl* doc, Node *row)
597
9.16k
{
598
9.16k
    Node *cell;
599
600
9.16k
    if (row->content == NULL)
601
4.34k
    {
602
4.34k
        cell = TY_(InferredTag)(doc, TidyTag_TD);
603
4.34k
        TY_(InsertNodeAtEnd)(row, cell);
604
4.34k
        TY_(Report)(doc, row, cell, MISSING_STARTTAG);
605
4.34k
    }
606
9.16k
}
607
608
609
/**
610
 *  The doctype has been found after other tags,
611
 *  and needs moving to before the html element
612
 */
613
static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
614
8.87k
{
615
8.87k
    Node* existing = TY_(FindDocType)( doc );
616
8.87k
    if ( existing )
617
4.31k
    {
618
4.31k
        TY_(Report)(doc, element, doctype, DISCARDING_UNEXPECTED );
619
4.31k
        TY_(FreeNode)( doc, doctype );
620
4.31k
    }
621
4.55k
    else
622
4.55k
    {
623
4.55k
        TY_(Report)(doc, element, doctype, DOCTYPE_AFTER_TAGS );
624
9.15k
        while ( !nodeIsHTML(element) )
625
4.60k
            element = element->parent;
626
4.55k
        TY_(InsertNodeBeforeElement)( element, doctype );
627
4.55k
    }
628
8.87k
}
629
630
631
/**
632
 *  This maps
633
 *     <p>hello<em> world</em>
634
 *  to
635
 *     <p>hello <em>world</em>
636
 *
637
 *  Trims initial space, by moving it before the
638
 *  start tag, or if this element is the first in
639
 *  parent's content, then by discarding the space
640
 */
641
static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
642
65.3k
{
643
65.3k
    Lexer* lexer = doc->lexer;
644
65.3k
    Node *prev, *node;
645
646
65.3k
    if ( TY_(nodeIsText)(text) &&
647
65.3k
         lexer->lexbuf[text->start] == ' ' &&
648
10.5k
         text->start < text->end )
649
9.25k
    {
650
9.25k
        if ( (element->tag->model & CM_INLINE) &&
651
8.55k
             !(element->tag->model & CM_FIELD) )
652
8.44k
        {
653
8.44k
            prev = element->prev;
654
655
8.44k
            if (TY_(nodeIsText)(prev))
656
917
            {
657
917
                if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
658
592
                    lexer->lexbuf[(prev->end)++] = ' ';
659
660
917
                ++(element->start);
661
917
            }
662
7.53k
            else /* create new node */
663
7.53k
            {
664
7.53k
                node = TY_(NewNode)(lexer->allocator, lexer);
665
7.53k
                node->start = (element->start)++;
666
7.53k
                node->end = element->start;
667
7.53k
                lexer->lexbuf[node->start] = ' ';
668
7.53k
                TY_(InsertNodeBeforeElement)(element ,node);
669
7.53k
                DEBUG_LOG(SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n",
670
7.53k
                    (element->element ? element->element : "unknown")));
671
7.53k
            }
672
8.44k
        }
673
674
        /* discard the space in current node */
675
9.25k
        ++(text->start);
676
9.25k
    }
677
65.3k
}
678
679
680
/**
681
 *  This maps
682
 *     <em>hello </em><strong>world</strong>
683
 *  to
684
 *     <em>hello</em> <strong>world</strong>
685
 *
686
 *  If last child of element is a text node
687
 *  then trim trailing white space character
688
 *  moving it to after element's end tag.
689
 */
690
static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
691
42.5k
{
692
42.5k
    Lexer* lexer = doc->lexer;
693
42.5k
    byte c;
694
695
42.5k
    if (TY_(nodeIsText)(last))
696
42.5k
    {
697
42.5k
        if (last->end > last->start)
698
39.9k
        {
699
39.9k
            c = (byte) lexer->lexbuf[ last->end - 1 ];
700
701
39.9k
            if ( c == ' ' )
702
5.05k
            {
703
5.05k
                last->end -= 1;
704
5.05k
                if ( (element->tag->model & CM_INLINE) &&
705
4.50k
                     !(element->tag->model & CM_FIELD) )
706
4.43k
                    lexer->insertspace = yes;
707
5.05k
            }
708
39.9k
        }
709
42.5k
    }
710
42.5k
}
711
712
713
/**
714
 *  Move initial and trailing space out.
715
 *  This routine maps:
716
 *     hello<em> world</em>
717
 *  to
718
 *     hello <em>world</em>
719
 *  and
720
 *     <em>hello </em><strong>world</strong>
721
 *  to
722
 *     <em>hello</em> <strong>world</strong>
723
 */
724
static void TrimSpaces( TidyDocImpl* doc, Node *element)
725
700k
{
726
700k
    Node* text = element->content;
727
728
700k
    if (nodeIsPRE(element) || IsPreDescendant(element))
729
46.1k
        return;
730
731
654k
    if (TY_(nodeIsText)(text))
732
64.8k
        TrimInitialSpace(doc, element, text);
733
734
654k
    text = element->last;
735
736
654k
    if (TY_(nodeIsText)(text))
737
42.5k
        TrimTrailingSpace(doc, element, text);
738
654k
}
739
740
741
/***************************************************************************//*
742
 ** MARK: - Parsers Support
743
 ***************************************************************************/
744
745
746
/**
747
 *  Structure used by FindDescendant_cb.
748
 */
749
struct MatchingDescendantData
750
{
751
    Node *found_node;
752
    Bool *passed_marker_node;
753
754
    /* input: */
755
    TidyTagId matching_tagId;
756
    Node *node_to_find;
757
    Node *marker_node;
758
};
759
760
761
/**
762
 *  The main engine for FindMatchingDescendant.
763
 */
764
static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate)
765
52.5k
{
766
52.5k
    struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate;
767
768
52.5k
    if (TagId(node) == cb_data->matching_tagId)
769
9.66k
    {
770
        /* make sure we match up 'unknown' tags exactly! */
771
9.66k
        if (cb_data->matching_tagId != TidyTag_UNKNOWN ||
772
5.88k
            (node->element != NULL &&
773
5.32k
            cb_data->node_to_find != NULL &&
774
5.32k
            cb_data->node_to_find->element != NULL &&
775
5.32k
            0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element)))
776
4.33k
        {
777
4.33k
            cb_data->found_node = node;
778
4.33k
            return ExitTraversal;
779
4.33k
        }
780
9.66k
    }
781
782
48.2k
    if (cb_data->passed_marker_node && node == cb_data->marker_node)
783
0
        *cb_data->passed_marker_node = yes;
784
785
48.2k
    return VisitParent;
786
52.5k
}
787
788
789
/**
790
 *  Search the parent chain (from `parent` upwards up to the root) for a node
791
 *  matching the given 'node'.
792
 *
793
 *  When the search passes beyond the `marker_node` (which is assumed to sit
794
 *  in the parent chain), this will be flagged by setting the boolean
795
 *  referenced by `is_parent_of_marker` to `yes`.
796
 *
797
 *  'is_parent_of_marker' and 'marker_node' are optional parameters and may
798
 *  be NULL.
799
 */
800
static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker )
801
5.72k
{
802
5.72k
    struct MatchingDescendantData cb_data = { 0 };
803
5.72k
    cb_data.matching_tagId = TagId(node);
804
5.72k
    cb_data.node_to_find = node;
805
5.72k
    cb_data.marker_node = marker_node;
806
807
5.72k
    assert(node);
808
809
5.72k
    if (is_parent_of_marker)
810
5.72k
        *is_parent_of_marker = no;
811
812
5.72k
    TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data);
813
5.72k
    return cb_data.found_node;
814
5.72k
}
815
816
817
/**
818
 *   Finds the last list item for the given list, providing it in the
819
 *   in-out parameter. Returns yes or no if the item was the last list
820
 *   item.
821
 */
822
static Bool FindLastLI( Node *list, Node **lastli )
823
18.7k
{
824
18.7k
    Node *node;
825
826
18.7k
    *lastli = NULL;
827
23.5k
    for ( node = list->content; node ; node = node->next )
828
4.79k
        if ( nodeIsLI(node) && node->type == StartTag )
829
671
            *lastli=node;
830
18.7k
    return *lastli ? yes:no;
831
18.7k
}
832
833
834
/***************************************************************************//*
835
 ** MARK: - Parser Stack
836
 ***************************************************************************/
837
838
839
/**
840
 *  Allocates and initializes the parser's stack.
841
 */
842
void TY_(InitParserStack)( TidyDocImpl* doc )
843
16.9k
{
844
16.9k
    enum { default_size = 32 };
845
16.9k
    TidyParserMemory *content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * default_size );
846
847
16.9k
    doc->stack.content = content;
848
16.9k
    doc->stack.size = default_size;
849
16.9k
    doc->stack.top = -1;
850
16.9k
}
851
852
853
/**
854
 *  Frees the parser's stack when done.
855
 */
856
void TY_(FreeParserStack)( TidyDocImpl* doc )
857
16.9k
{
858
16.9k
    TidyFree( doc->allocator, doc->stack.content );
859
860
16.9k
    doc->stack.content = NULL;
861
16.9k
    doc->stack.size = 0;
862
16.9k
    doc->stack.top = -1;
863
16.9k
}
864
865
866
/**
867
 *  Increase the stack size.
868
 */
869
static void growParserStack( TidyDocImpl* doc )
870
4.15k
{
871
4.15k
    TidyParserMemory *content;
872
4.15k
    content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * doc->stack.size * 2 );
873
874
4.15k
    memcpy( content, doc->stack.content, sizeof(TidyParserMemory) * (doc->stack.top + 1) );
875
4.15k
    TidyFree(doc->allocator, doc->stack.content);
876
877
4.15k
    doc->stack.content = content;
878
4.15k
    doc->stack.size = doc->stack.size * 2;
879
4.15k
}
880
881
882
/**
883
 *  Indicates whether or not the stack is empty.
884
 */
885
Bool TY_(isEmptyParserStack)( TidyDocImpl* doc )
886
3.63M
{
887
3.63M
    return doc->stack.top < 0;
888
3.63M
}
889
890
891
/**
892
 *  Peek at the parser memory.
893
 */
894
TidyParserMemory TY_(peekMemory)( TidyDocImpl* doc )
895
0
{
896
0
    return doc->stack.content[doc->stack.top];
897
0
}
898
899
900
/**
901
 *  Peek at the parser memory "identity" field. This is just a convenience
902
 *  to avoid having to create a new struct instance in the caller.
903
 */
904
Parser* TY_(peekMemoryIdentity)( TidyDocImpl* doc )
905
1.81M
{
906
1.81M
    return doc->stack.content[doc->stack.top].identity;
907
1.81M
}
908
909
910
/**
911
 *  Peek at the parser memory "mode" field. This is just a convenience
912
 *  to avoid having to create a new struct instance in the caller.
913
 */
914
GetTokenMode TY_(peekMemoryMode)( TidyDocImpl* doc )
915
17.5k
{
916
17.5k
    return doc->stack.content[doc->stack.top].mode;
917
17.5k
}
918
919
920
/**
921
 *  Pop out a parser memory.
922
 */
923
TidyParserMemory TY_(popMemory)( TidyDocImpl* doc )
924
1.81M
{
925
1.81M
    if ( !TY_(isEmptyParserStack)( doc ) )
926
1.81M
    {
927
1.81M
        TidyParserMemory data = doc->stack.content[doc->stack.top];
928
1.81M
        DEBUG_LOG(SPRTF("\n"
929
1.81M
                        "<--POP  original: %s @ %p\n"
930
1.81M
                        "         reentry: %s @ %p\n"
931
1.81M
                        "     stack depth: %lu @ %p\n"
932
1.81M
                        "            mode: %u\n"
933
1.81M
                        "      register 1: %i\n"
934
1.81M
                        "      register 2: %i\n\n",
935
1.81M
                        data.original_node ? data.original_node->element : "none", data.original_node,
936
1.81M
                        data.reentry_node ? data.reentry_node->element : "none", data.reentry_node,
937
1.81M
                        doc->stack.top, &doc->stack.content[doc->stack.top],
938
1.81M
                        data.mode,
939
1.81M
                        data.register_1,
940
1.81M
                        data.register_2
941
1.81M
                        ));
942
1.81M
        doc->stack.top = doc->stack.top - 1;
943
1.81M
        return data;
944
1.81M
    }
945
0
    TidyParserMemory blank = { NULL };
946
0
    return blank;
947
1.81M
}
948
949
950
/**
951
 * Push the parser memory to the stack.
952
 */
953
void TY_(pushMemory)( TidyDocImpl* doc, TidyParserMemory data )
954
1.94M
{
955
1.94M
    if ( doc->stack.top == doc->stack.size - 1 )
956
4.15k
        growParserStack( doc );
957
958
1.94M
    doc->stack.top++;
959
    
960
1.94M
    doc->stack.content[doc->stack.top] = data;
961
1.94M
    DEBUG_LOG(SPRTF("\n"
962
1.94M
                    "-->PUSH original: %s @ %p\n"
963
1.94M
                    "         reentry: %s @ %p\n"
964
1.94M
                    "     stack depth: %lu @ %p\n"
965
1.94M
                    "            mode: %u\n"
966
1.94M
                    "      register 1: %i\n"
967
1.94M
                    "      register 2: %i\n\n",
968
1.94M
                    data.original_node ? data.original_node->element : "none", data.original_node,
969
1.94M
                    data.reentry_node ? data.reentry_node->element : "none", data.reentry_node,
970
1.94M
                    doc->stack.top, &doc->stack.content[doc->stack.top],
971
1.94M
                    data.mode,
972
1.94M
                    data.register_1,
973
1.94M
                    data.register_2
974
1.94M
                    ));
975
1.94M
}
976
977
978
/***************************************************************************//*
979
 ** MARK: Convenience Logging Macros
980
 ***************************************************************************/
981
982
983
/*
 * Tracing helpers used by the parsers in this file.
 *
 * With ENABLE_DEBUG_LOG defined they expand to SPRTF() calls; otherwise they
 * expand to nothing. Every enabled macro carries its own trailing semicolon,
 * so a use site compiles with or without one. The enabled forms expect a
 * variable named `mode` to be in scope, and DEBUG_LOG_COUNTERS must appear
 * among the function's declarations before the other macros are used.
 *
 * DEBUG_LOG_GOT_TOKEN substitutes "none" when the node or its element name is
 * NULL, because handing a null pointer to a "%s" conversion is undefined.
 */
#if defined(ENABLE_DEBUG_LOG)
#  define DEBUG_LOG_COUNTERS \
     static int depth_parser = 0;\
     static int count_parser = 0;\
     int old_mode = IgnoreWhitespace;
#  define DEBUG_LOG_GET_OLD_MODE old_mode = mode;
#  define DEBUG_LOG_REENTER_WITH_NODE(NODE) SPRTF("\n>>>Re-Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, __LINE__, (NODE)->element, mode, ++depth_parser, ++count_parser);
#  define DEBUG_LOG_ENTER_WITH_NODE(NODE) SPRTF("\n>>>Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, __LINE__, (NODE)->element, mode, ++depth_parser, ++count_parser);
#  define DEBUG_LOG_CHANGE_MODE SPRTF("+++%s-%u Changing mode to %u (was %u)\n", __FUNCTION__, __LINE__, mode, old_mode);
#  define DEBUG_LOG_GOT_TOKEN(NODE) SPRTF("---%s-%u got token '%s' with mode '%u'.\n", __FUNCTION__, __LINE__, ((NODE) && (NODE)->element) ? (NODE)->element : "none", mode);
#  define DEBUG_LOG_EXIT_WITH_NODE(NODE) SPRTF("<<<Exit %s-%u with a node to parse: '%s', depth: %d\n", __FUNCTION__, __LINE__, (NODE)->element, depth_parser--);
#  define DEBUG_LOG_EXIT SPRTF("<<<Exit %s-%u, depth: %d\n", __FUNCTION__, __LINE__, depth_parser--);
#else
#  define DEBUG_LOG_COUNTERS
#  define DEBUG_LOG_GET_OLD_MODE
#  define DEBUG_LOG_REENTER_WITH_NODE(NODE)
#  define DEBUG_LOG_ENTER_WITH_NODE(NODE)
#  define DEBUG_LOG_CHANGE_MODE
#  define DEBUG_LOG_GOT_TOKEN(NODE)
#  define DEBUG_LOG_EXIT_WITH_NODE(NODE)
#  define DEBUG_LOG_EXIT
#endif
1005
1006
1007
/***************************************************************************//*
1008
 ** MARK: - Parser Search and Instantiation
1009
 ***************************************************************************/
1010
1011
1012
/**
1013
 *  Retrieves the correct parser for the given node, accounting for various
1014
 *  conditions, and readies the lexer for parsing that node.
1015
 */
1016
static Parser* GetParserForNode( TidyDocImpl* doc, Node *node )
1017
1.97M
{
1018
1.97M
    Lexer* lexer = doc->lexer;
1019
1020
1.97M
    if ( cfgBool( doc, TidyXmlTags ) )
1021
0
        return ParseXMLElement;
1022
    
1023
    /* [i_a]2 prevent crash for active content (php, asp) docs */
1024
1.97M
    if (!node || node->tag == NULL)
1025
5.96k
        return NULL;
1026
1027
    /*
1028
       Fix by GLP 2000-12-21.  Need to reset insertspace if this is both
1029
       a non-inline and empty tag (base, link, meta, isindex, hr, area).
1030
    */
1031
1.97M
    if (node->tag->model & CM_EMPTY)
1032
34.7k
    {
1033
34.7k
        lexer->waswhite = no;
1034
34.7k
        if (node->tag->parser == NULL)
1035
0
            return NULL;
1036
34.7k
    }
1037
1.93M
    else if (!(node->tag->model & CM_INLINE))
1038
471k
        lexer->insertspace = no;
1039
1040
1.97M
    if (node->tag->parser == NULL)
1041
0
        return NULL;
1042
1043
1.97M
    if (node->type == StartEndTag)
1044
10.9k
        return NULL;
1045
1046
    /* [i_a]2 added this - not sure why - CHECKME: */
1047
1.95M
    lexer->parent = node;
1048
1049
1.95M
    return (node->tag->parser);
1050
1.97M
}
1051
1052
1053
/**
1054
 *  This parser controller initiates the parsing process with the document's
1055
 *  root starting with the provided node, which should be the HTML node after
1056
 *  the pre-HTML stuff is handled at a higher level.
1057
 *
1058
 *  This controller is responsible for calling each of the individual parsers,
1059
 *  based on the tokens it pulls from the lexer, or the tokens passed back via
1060
 *  the parserMemory stack from each of the parsers. Having a main, central
1061
 *  looping dispatcher in this fashion allows the prevention of recursion.
1062
 */
1063
void ParseHTMLWithNode( TidyDocImpl* doc, Node* node )
1064
16.9k
{
1065
16.9k
    GetTokenMode mode = IgnoreWhitespace;
1066
16.9k
    Parser* parser = GetParserForNode( doc, node );
1067
16.9k
    Bool something_to_do = yes;
1068
1069
    /*
1070
     This main loop is only extinguished when all of the parser tokens are
1071
     consumed. Ideally, EVERY parser will return nodes to this loop for
1072
     dispatch to the appropriate parser, but some of the recursive parsers
1073
     still consume some tokens on their own.
1074
     */
1075
3.78M
    while (something_to_do)
1076
3.77M
    {
1077
3.77M
        node = parser ? parser( doc, node, mode ) : NULL;
1078
        
1079
        /*
1080
         We have a node, so anything deferred was already pushed to the stack
1081
         to be dealt with later.
1082
         */
1083
3.77M
        if ( node )
1084
1.94M
        {
1085
1.94M
            parser = GetParserForNode( doc, node );
1086
1.94M
            continue;
1087
1.94M
        }
1088
1089
        /*
1090
         We weren't given a node, which means this particular leaf is bottomed
1091
         out. We'll re-enter the parsers using information from the stack.
1092
         */
1093
1.82M
        if ( !TY_(isEmptyParserStack)(doc))
1094
1.81M
        {
1095
1.81M
            parser = TY_(peekMemoryIdentity)(doc);
1096
1.81M
            if (parser)
1097
1.79M
            {
1098
1.79M
                continue;
1099
1.79M
            }
1100
17.5k
            else
1101
17.5k
            {
1102
                /* No parser means we're only passing back a parsing mode. */
1103
17.5k
                mode = TY_(peekMemoryMode)( doc );
1104
17.5k
                TY_(popMemory)( doc );
1105
17.5k
            }
1106
1.81M
        }
1107
        
1108
        /*
1109
         At this point, there's nothing being returned from parsers, and
1110
         nothing on the stack, so we can draw a new node from the lexer.
1111
         */
1112
30.0k
        node = TY_(GetToken)( doc, mode );
1113
30.0k
        DEBUG_LOG_GOT_TOKEN(node);
1114
1115
30.0k
        if (node)
1116
13.0k
            parser = GetParserForNode( doc, node );
1117
16.9k
        else
1118
16.9k
            something_to_do = no;
1119
30.0k
    }
1120
16.9k
}
1121
1122
1123
/***************************************************************************//*
1124
 ** MARK: - Parsers
1125
 ***************************************************************************/
1126
1127
1128
/** MARK: TY_(ParseBlock)
1129
 *  `element` is a node created by the lexer upon seeing the start tag, or
1130
 *  by the parser when the start tag is inferred
1131
 *
1132
 *  This is a non-recursing parser. It uses the document's parser memory stack
1133
 *  to send subsequent nodes back to the controller for dispatching to parsers.
1134
 *  This parser is also re-enterable, so that post-processing can occur after
1135
 *  such dispatching.
1136
 */
1137
Node* TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
1138
363k
{
1139
363k
    Lexer* lexer = doc->lexer;
1140
363k
    Node *node = NULL;
1141
363k
    Bool checkstack = yes;
1142
363k
    uint istackbase = 0;
1143
363k
    DEBUG_LOG_COUNTERS;
1144
    
1145
363k
    if ( element == NULL )
1146
179k
    {
1147
179k
        TidyParserMemory memory = TY_(popMemory)( doc );
1148
179k
        node = memory.reentry_node; /* Throwaway, because the loop overwrites this immediately. */
1149
179k
        DEBUG_LOG_REENTER_WITH_NODE(node);
1150
179k
        element = memory.original_node;
1151
179k
        DEBUG_LOG_GET_OLD_MODE;
1152
179k
        mode = memory.reentry_mode;
1153
179k
        DEBUG_LOG_CHANGE_MODE;
1154
179k
    }
1155
183k
    else
1156
183k
    {
1157
183k
        DEBUG_LOG_ENTER_WITH_NODE(element);
1158
1159
183k
        if ( element->tag->model & CM_EMPTY )
1160
231
        {
1161
231
            DEBUG_LOG_EXIT;
1162
231
            return NULL;
1163
231
        }
1164
1165
183k
        if ( nodeIsDIV(element) && nodeIsDL(element->parent) && TY_(IsHTML5Mode)(doc) )
1166
43
        {
1167
43
            DEBUG_LOG_EXIT;
1168
43
            return TY_(ParseDefList)(doc, element, mode); /* @warning: possible recursion! */
1169
43
        }
1170
        
1171
183k
        if ( nodeIsFORM(element) && DescendantOf(element, TidyTag_FORM) )
1172
4.31k
        {
1173
4.31k
            TY_(Report)(doc, element, NULL, ILLEGAL_NESTING );
1174
4.31k
        }
1175
1176
        /*
1177
         InlineDup() asks the lexer to insert inline emphasis tags
1178
         currently pushed on the istack, but take care to avoid
1179
         propagating inline emphasis inside OBJECT or APPLET.
1180
         For these elements a fresh inline stack context is created
1181
         and disposed of upon reaching the end of the element.
1182
         They thus behave like table cells in this respect.
1183
        */
1184
183k
        if (element->tag->model & CM_OBJECT)
1185
13.4k
        {
1186
13.4k
            istackbase = lexer->istackbase;
1187
13.4k
            lexer->istackbase = lexer->istacksize;
1188
13.4k
        }
1189
1190
183k
        if (!(element->tag->model & CM_MIXED))
1191
173k
        {
1192
173k
            TY_(InlineDup)( doc, NULL );
1193
173k
        }
1194
1195
        /*\
1196
         *  Issue #212 - If it is likely that it may be necessary
1197
         *  to move a leading space into a text node before this
1198
         *  element, then keep the mode MixedContent to keep any
1199
         *  leading space
1200
        \*/
1201
183k
        if ( !(element->tag->model & CM_INLINE) ||
1202
127k
              (element->tag->model & CM_FIELD ) )
1203
56.4k
        {
1204
56.4k
            DEBUG_LOG_GET_OLD_MODE;
1205
56.4k
            mode = IgnoreWhitespace;
1206
56.4k
            DEBUG_LOG_CHANGE_MODE;
1207
56.4k
        }
1208
127k
        else if (mode == IgnoreWhitespace)
1209
127k
        {
1210
            /* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace'
1211
               when such a leading space may need to be inserted before this element to
1212
               preserve the browser view */
1213
127k
            DEBUG_LOG_GET_OLD_MODE;
1214
127k
            mode = MixedContent;
1215
127k
            DEBUG_LOG_CHANGE_MODE;
1216
127k
        }
1217
183k
    } /* Re-Entering */
1218
    
1219
    /*
1220
     Main Loop
1221
     */
1222
    
1223
436k
    while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
1224
299k
    {
1225
299k
        DEBUG_LOG_GOT_TOKEN(node);
1226
        /* end tag for this element */
1227
299k
        if (node->type == EndTag && node->tag &&
1228
18.0k
            (node->tag == element->tag || element->was == node->tag))
1229
6.00k
        {
1230
6.00k
            TY_(FreeNode)( doc, node );
1231
1232
6.00k
            if (element->tag->model & CM_OBJECT)
1233
630
            {
1234
                /* pop inline stack */
1235
3.69k
                while (lexer->istacksize > lexer->istackbase)
1236
3.06k
                    TY_(PopInline)( doc, NULL );
1237
630
                lexer->istackbase = istackbase;
1238
630
            }
1239
1240
6.00k
            element->closed = yes;
1241
6.00k
            TrimSpaces( doc, element );
1242
6.00k
            DEBUG_LOG_EXIT;
1243
6.00k
            return NULL;
1244
6.00k
        }
1245
1246
292k
        if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
1247
2.33k
        {
1248
2.33k
            if ( TY_(nodeIsElement)(node) )
1249
1.34k
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1250
2.33k
            TY_(FreeNode)( doc, node );
1251
2.33k
            continue;
1252
2.33k
        }
1253
1254
1255
290k
        if (node->type == EndTag)
1256
11.8k
        {
1257
11.8k
            if (node->tag == NULL)
1258
765
            {
1259
765
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1260
765
                TY_(FreeNode)( doc, node );
1261
765
                continue;
1262
765
            }
1263
11.0k
            else if ( nodeIsBR(node) )
1264
658
            {
1265
658
                node->type = StartTag;
1266
658
            }
1267
10.3k
            else if ( nodeIsP(node) )
1268
1.05k
            {
1269
                /* Cannot have a block inside a paragraph, so no checking
1270
                   for an ancestor is necessary -- but we _can_ have
1271
                   paragraphs inside a block, so change it to an implicit
1272
                   empty paragraph, to be dealt with according to the user's
1273
                   options
1274
                */
1275
1.05k
                node->type = StartEndTag;
1276
1.05k
                node->implicit = yes;
1277
1.05k
            }
1278
9.32k
            else if (DescendantOf( element, node->tag->id ))
1279
4.47k
            {
1280
                /*
1281
                  if this is the end tag for an ancestor element
1282
                  then infer end tag for this element
1283
                */
1284
4.47k
                TY_(UngetToken)( doc );
1285
4.47k
                break;
1286
4.47k
            }
1287
4.85k
            else
1288
4.85k
            {
1289
                /* special case </tr> etc. for stuff moved in front of table */
1290
4.85k
                if ( lexer->exiled
1291
1.40k
                     && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
1292
802
                {
1293
802
                    TY_(UngetToken)( doc );
1294
802
                    TrimSpaces( doc, element );
1295
802
                    DEBUG_LOG_EXIT;
1296
802
                    return NULL;
1297
802
                }
1298
4.85k
            }
1299
11.8k
        }
1300
1301
        /* mixed content model permits text */
1302
284k
        if (TY_(nodeIsText)(node))
1303
28.9k
        {
1304
28.9k
            if ( checkstack )
1305
23.2k
            {
1306
23.2k
                checkstack = no;
1307
23.2k
                if (!(element->tag->model & CM_MIXED))
1308
19.4k
                {
1309
19.4k
                    if ( TY_(InlineDup)(doc, node) > 0 )
1310
2.86k
                        continue;
1311
19.4k
                }
1312
23.2k
            }
1313
1314
26.0k
            TY_(InsertNodeAtEnd)(element, node);
1315
26.0k
            DEBUG_LOG_GET_OLD_MODE
1316
26.0k
            mode = MixedContent;
1317
26.0k
            DEBUG_LOG_CHANGE_MODE;
1318
            /*
1319
              HTML4 strict doesn't allow mixed content for
1320
              elements with %block; as their content model
1321
            */
1322
            /*
1323
              But only body, map, blockquote, form and
1324
              noscript have content model %block;
1325
            */
1326
26.0k
            if ( nodeIsBODY(element)       ||
1327
26.0k
                 nodeIsMAP(element)        ||
1328
26.0k
                 nodeIsBLOCKQUOTE(element) ||
1329
26.0k
                 nodeIsFORM(element)       ||
1330
18.5k
                 nodeIsNOSCRIPT(element) )
1331
7.80k
                TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
1332
26.0k
            continue;
1333
28.9k
        }
1334
1335
255k
        if ( InsertMisc(element, node) )
1336
4.45k
            continue;
1337
1338
        /* allow PARAM elements? */
1339
251k
        if ( nodeIsPARAM(node) )
1340
7.20k
        {
1341
7.20k
            if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
1342
6.93k
            {
1343
6.93k
                TY_(InsertNodeAtEnd)(element, node);
1344
6.93k
                continue;
1345
6.93k
            }
1346
1347
            /* otherwise discard it */
1348
261
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1349
261
            TY_(FreeNode)( doc, node );
1350
261
            continue;
1351
7.20k
        }
1352
1353
        /* allow AREA elements? */
1354
244k
        if ( nodeIsAREA(node) )
1355
3.96k
        {
1356
3.96k
            if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
1357
1.78k
            {
1358
1.78k
                TY_(InsertNodeAtEnd)(element, node);
1359
1.78k
                continue;
1360
1.78k
            }
1361
1362
            /* otherwise discard it */
1363
2.17k
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1364
2.17k
            TY_(FreeNode)( doc, node );
1365
2.17k
            continue;
1366
3.96k
        }
1367
1368
        /* ignore unknown start/end tags */
1369
240k
        if ( node->tag == NULL )
1370
14.9k
        {
1371
14.9k
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1372
14.9k
            TY_(FreeNode)( doc, node );
1373
14.9k
            continue;
1374
14.9k
        }
1375
1376
        /*
1377
          Allow CM_INLINE elements here.
1378
1379
          Allow CM_BLOCK elements here unless
1380
          lexer->excludeBlocks is yes.
1381
1382
          LI and DD are special cased.
1383
1384
          Otherwise infer end tag for this element.
1385
        */
1386
1387
225k
        if ( !TY_(nodeHasCM)(node, CM_INLINE) )
1388
89.5k
        {
1389
89.5k
            if ( !TY_(nodeIsElement)(node) )
1390
895
            {
1391
895
                if ( nodeIsFORM(node) )
1392
208
                    BadForm( doc );
1393
1394
895
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1395
895
                TY_(FreeNode)( doc, node );
1396
895
                continue;
1397
895
            }
1398
            
1399
            /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1400
            /*
1401
             If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1402
             start tag, discard the start tag and let the subsequent content get
1403
             parsed as content of the enclosing LI.  This seems to mimic IE and
1404
             Netscape, and avoids an infinite loop: without this check,
1405
             ParseBlock (which is parsing the LI's content) and ParseList (which
1406
             is parsing the LI's parent's content) repeatedly defer to each
1407
             other to parse the illegal start tag, each time inferring a missing
1408
             </li> or <li> respectively.
1409
1410
             NOTE: This check is a bit fragile.  It specifically checks for the
1411
             four tags that happen to weave their way through the current series
1412
             of tests performed by ParseBlock and ParseList to trigger the
1413
             infinite loop.
1414
            */
1415
88.6k
            if ( nodeIsLI(element) )
1416
23.3k
            {
1417
23.3k
                if ( nodeIsFRAME(node)    ||
1418
23.3k
                     nodeIsFRAMESET(node) ||
1419
23.3k
                     nodeIsOPTGROUP(node) ||
1420
22.5k
                     nodeIsOPTION(node) )
1421
937
                {
1422
937
                    TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1423
937
                    TY_(FreeNode)( doc, node );  /* DSR - 27Apr02 avoid memory leak */
1424
937
                    continue;
1425
937
                }
1426
23.3k
            }
1427
1428
87.6k
            if ( nodeIsTD(element) || nodeIsTH(element) )
1429
14.3k
            {
1430
                /* if parent is a table cell, avoid inferring the end of the cell */
1431
1432
14.3k
                if ( TY_(nodeHasCM)(node, CM_HEAD) )
1433
603
                {
1434
603
                    MoveToHead( doc, element, node );
1435
603
                    continue;
1436
603
                }
1437
1438
13.7k
                if ( TY_(nodeHasCM)(node, CM_LIST) )
1439
279
                {
1440
279
                    TY_(UngetToken)( doc );
1441
279
                    node = TY_(InferredTag)(doc, TidyTag_UL);
1442
279
                    AddClassNoIndent(doc, node);
1443
279
                    lexer->excludeBlocks = yes;
1444
279
                }
1445
13.4k
                else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1446
5.09k
                {
1447
5.09k
                    TY_(UngetToken)( doc );
1448
5.09k
                    node = TY_(InferredTag)(doc, TidyTag_DL);
1449
5.09k
                    lexer->excludeBlocks = yes;
1450
5.09k
                }
1451
1452
                /* infer end of current table cell */
1453
13.7k
                if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
1454
2.92k
                {
1455
2.92k
                    TY_(UngetToken)( doc );
1456
2.92k
                    TrimSpaces( doc, element );
1457
2.92k
                    DEBUG_LOG_EXIT;
1458
2.92k
                    return NULL;
1459
2.92k
                }
1460
13.7k
            }
1461
73.3k
            else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
1462
42.2k
            {
1463
42.2k
                if ( lexer->excludeBlocks )
1464
3.24k
                {
1465
3.24k
                    if ( !TY_(nodeHasCM)(element, CM_OPT) )
1466
2.67k
                        TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
1467
1468
3.24k
                    TY_(UngetToken)( doc );
1469
1470
3.24k
                    if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1471
50
                        lexer->istackbase = istackbase;
1472
1473
3.24k
                    TrimSpaces( doc, element );
1474
3.24k
                    DEBUG_LOG_EXIT;
1475
3.24k
                    return NULL;
1476
3.24k
                }
1477
42.2k
            }
1478
31.0k
            else if ( ! nodeIsTEMPLATE( element ) )/* things like list items */
1479
31.0k
            {
1480
31.0k
                if (node->tag->model & CM_HEAD)
1481
1.64k
                {
1482
1.64k
                    MoveToHead( doc, element, node );
1483
1.64k
                    continue;
1484
1.64k
                }
1485
1486
                /*
1487
                 special case where a form start tag
1488
                 occurs in a tr and is followed by td or th
1489
                */
1490
1491
29.3k
                if ( nodeIsFORM(element) &&
1492
29.3k
                     nodeIsTD(element->parent) &&
1493
2.69k
                     element->parent->implicit )
1494
771
                {
1495
771
                    if ( nodeIsTD(node) )
1496
88
                    {
1497
88
                        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1498
88
                        TY_(FreeNode)( doc, node );
1499
88
                        continue;
1500
88
                    }
1501
1502
683
                    if ( nodeIsTH(node) )
1503
308
                    {
1504
308
                        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1505
308
                        TY_(FreeNode)( doc, node );
1506
308
                        node = element->parent;
1507
308
                        TidyDocFree(doc, node->element);
1508
308
                        node->element = TY_(tmbstrdup)(doc->allocator, "th");
1509
308
                        node->tag = TY_(LookupTagDef)( TidyTag_TH );
1510
308
                        continue;
1511
308
                    }
1512
683
                }
1513
1514
29.0k
                if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
1515
14.6k
                    TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
1516
1517
                /* #521, warn on missing optional end-tags if not omitting them. */
1518
29.0k
                if ( cfgBool( doc, TidyOmitOptionalTags ) == no && TY_(nodeHasCM)(element, CM_OPT) )
1519
13.9k
                    TY_(Report)(doc, element, node, MISSING_ENDTAG_OPTIONAL );
1520
1521
1522
29.0k
                TY_(UngetToken)( doc );
1523
1524
29.0k
                if ( TY_(nodeHasCM)(node, CM_LIST) )
1525
2.16k
                {
1526
2.16k
                    if ( element->parent && element->parent->tag &&
1527
2.08k
                         element->parent->tag->parser == TY_(ParseList) )
1528
1.86k
                    {
1529
1.86k
                        TrimSpaces( doc, element );
1530
1.86k
                        DEBUG_LOG_EXIT;
1531
1.86k
                        return NULL;
1532
1.86k
                    }
1533
1534
307
                    node = TY_(InferredTag)(doc, TidyTag_UL);
1535
307
                    AddClassNoIndent(doc, node);
1536
307
                }
1537
26.8k
                else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1538
3.35k
                {
1539
3.35k
                    if ( nodeIsDL(element->parent) )
1540
2.87k
                    {
1541
2.87k
                        TrimSpaces( doc, element );
1542
2.87k
                        DEBUG_LOG_EXIT;
1543
2.87k
                        return NULL;
1544
2.87k
                    }
1545
1546
479
                    node = TY_(InferredTag)(doc, TidyTag_DL);
1547
479
                }
1548
23.4k
                else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
1549
15.8k
                {
1550
                    /* http://tidy.sf.net/issue/1316307 */
1551
                    /* In exiled mode, return so table processing can
1552
                       continue. */
1553
15.8k
                    if (lexer->exiled)
1554
5.08k
                    {
1555
5.08k
                        DEBUG_LOG_EXIT;
1556
5.08k
                        return NULL;
1557
5.08k
                    }
1558
10.7k
                    node = TY_(InferredTag)(doc, TidyTag_TABLE);
1559
10.7k
                }
1560
7.64k
                else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1561
597
                {
1562
                    /* pop inline stack */
1563
6.64k
                    while ( lexer->istacksize > lexer->istackbase )
1564
6.04k
                        TY_(PopInline)( doc, NULL );
1565
597
                    lexer->istackbase = istackbase;
1566
597
                    TrimSpaces( doc, element );
1567
597
                    DEBUG_LOG_EXIT;
1568
597
                    return NULL;
1569
1570
597
                }
1571
7.05k
                else
1572
7.05k
                {
1573
7.05k
                    TrimSpaces( doc, element );
1574
7.05k
                    DEBUG_LOG_EXIT;
1575
7.05k
                    return NULL;
1576
7.05k
                }
1577
29.0k
            }
1578
87.6k
        }
1579
1580
        /*\
1581
         *  Issue #307 - an <A> tag to ends any open <A> element
1582
         *  Like #427827 - fixed by Randy Waki and Bjoern Hoehrmann 23 Aug 00
1583
         *  in ParseInline(), fix copied HERE to ParseBlock()
1584
         *  href: http://www.w3.org/TR/html-markup/a.html
1585
         *  The interactive element a must not appear as a descendant of the a element.
1586
        \*/
1587
196k
        if ( nodeIsA(node) && !node->implicit &&
1588
14.4k
             (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
1589
10.3k
        {
1590
10.3k
            if (node->type != EndTag && node->attributes == NULL
1591
3.86k
                && cfgBool(doc, TidyCoerceEndTags) )
1592
3.86k
            {
1593
3.86k
                node->type = EndTag;
1594
3.86k
                TY_(Report)(doc, element, node, COERCE_TO_ENDTAG);
1595
3.86k
                TY_(UngetToken)( doc );
1596
3.86k
                continue;
1597
3.86k
            }
1598
1599
6.45k
            if (nodeIsA(element))
1600
3.49k
            {
1601
3.49k
                TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
1602
3.49k
                TY_(UngetToken)( doc );
1603
3.49k
            }
1604
2.96k
            else
1605
2.96k
            {
1606
                /* Issue #597 - if we not 'UngetToken' then it is being discarded.
1607
                   Add message, and 'FreeNode' - thanks @ralfjunker */
1608
2.96k
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
1609
2.96k
                TY_(FreeNode)(doc, node);
1610
2.96k
            }
1611
1612
6.45k
            if (!(mode & Preformatted))
1613
6.45k
                TrimSpaces(doc, element);
1614
1615
6.45k
            DEBUG_LOG_EXIT;
1616
6.45k
            return NULL;
1617
10.3k
        }
1618
1619
        /* parse known element */
1620
186k
        if (TY_(nodeIsElement)(node))
1621
185k
        {
1622
185k
            if (node->tag->model & CM_INLINE)
1623
123k
            {
1624
123k
                if (checkstack && !node->implicit)
1625
43.4k
                {
1626
43.4k
                    checkstack = no;
1627
1628
43.4k
                    if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1629
41.9k
                    {
1630
41.9k
                        if ( TY_(InlineDup)(doc, node) > 0 )
1631
947
                            continue;
1632
41.9k
                    }
1633
43.4k
                }
1634
1635
123k
                DEBUG_LOG_GET_OLD_MODE;
1636
123k
                mode = MixedContent;
1637
123k
                DEBUG_LOG_CHANGE_MODE;
1638
123k
            }
1639
61.4k
            else
1640
61.4k
            {
1641
61.4k
                checkstack = yes;
1642
61.4k
                DEBUG_LOG_GET_OLD_MODE;
1643
61.4k
                mode = IgnoreWhitespace;
1644
61.4k
                DEBUG_LOG_CHANGE_MODE;
1645
61.4k
            }
1646
1647
            /* trim white space before <br> */
1648
184k
            if ( nodeIsBR(node) )
1649
2.25k
                TrimSpaces( doc, element );
1650
1651
184k
            TY_(InsertNodeAtEnd)(element, node);
1652
1653
184k
            if (node->implicit)
1654
93.7k
                TY_(Report)(doc, element, node, INSERTING_TAG );
1655
1656
            /* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an
1657
               effort has been made above to set a 'MixedContent' mode in some cases?
1658
               WHY IS THE 'mode' VARIABLE NOT USED HERE???? */
1659
1660
184k
            {
1661
184k
                TidyParserMemory memory = {0};
1662
184k
                memory.identity = TY_(ParseBlock);
1663
184k
                memory.reentry_node = node;
1664
184k
                memory.reentry_mode = mode;
1665
184k
                memory.original_node = element;
1666
184k
                TY_(pushMemory)(doc, memory);
1667
184k
                DEBUG_LOG_EXIT_WITH_NODE(node);
1668
184k
            }
1669
184k
            return node;
1670
185k
        }
1671
1672
        /* discard unexpected tags */
1673
1.26k
        if (node->type == EndTag)
1674
1.26k
            TY_(PopInline)( doc, node );  /* if inline end tag */
1675
1676
1.26k
        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1677
1.26k
        TY_(FreeNode)( doc, node );
1678
1.26k
        continue;
1679
186k
    }
1680
1681
141k
    if (!(element->tag->model & CM_OPT))
1682
110k
        TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR);
1683
1684
141k
    if (element->tag->model & CM_OBJECT)
1685
10.9k
    {
1686
        /* pop inline stack */
1687
11.4k
        while ( lexer->istacksize > lexer->istackbase )
1688
499
            TY_(PopInline)( doc, NULL );
1689
10.9k
        lexer->istackbase = istackbase;
1690
10.9k
    }
1691
1692
141k
    TrimSpaces( doc, element );
1693
1694
141k
    DEBUG_LOG_EXIT;
1695
141k
    return NULL;
1696
363k
}
1697
1698
1699
/** MARK: TY_(ParseBody)
1700
 *  Parses the `body` tag.
1701
 *
1702
 *  This is a non-recursing parser. It uses the document's parser memory stack
1703
 *  to send subsequent nodes back to the controller for dispatching to parsers.
1704
 *  This parser is also re-enterable, so that post-processing can occur after
1705
 *  such dispatching.
1706
 */
1707
Node* TY_(ParseBody)( TidyDocImpl* doc, Node *body, GetTokenMode mode )
1708
169k
{
1709
169k
    Lexer* lexer = doc->lexer;
1710
169k
    Node *node = NULL;
1711
169k
    Bool checkstack = no;
1712
169k
    Bool iswhitenode = no;
1713
169k
    DEBUG_LOG_COUNTERS;
1714
1715
169k
    mode = IgnoreWhitespace;
1716
169k
    checkstack = yes;
1717
1718
    /*
1719
     If we're re-entering, then we need to setup from a previous state,
1720
     instead of starting fresh. We can pull what we need from the document's
1721
     stack.
1722
     */
1723
169k
    if ( body == NULL )
1724
108k
    {
1725
108k
        TidyParserMemory memory = TY_(popMemory)( doc );
1726
108k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
1727
108k
        DEBUG_LOG_REENTER_WITH_NODE(node);
1728
108k
        body = memory.original_node;
1729
108k
        checkstack = memory.register_1;
1730
108k
        iswhitenode = memory.register_2;
1731
108k
        DEBUG_LOG_GET_OLD_MODE;
1732
108k
        mode = memory.mode;
1733
108k
        DEBUG_LOG_CHANGE_MODE;
1734
108k
    }
1735
60.5k
    else
1736
60.5k
    {
1737
60.5k
        DEBUG_LOG_ENTER_WITH_NODE(body);
1738
60.5k
        TY_(BumpObject)( doc, body->parent );
1739
60.5k
    }
1740
    
1741
643k
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
1742
591k
    {
1743
591k
        DEBUG_LOG_GOT_TOKEN(node);
1744
        /* find and discard multiple <body> elements */
1745
591k
        if (node->tag == body->tag && node->type == StartTag)
1746
2.76k
        {
1747
2.76k
            TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
1748
2.76k
            TY_(FreeNode)(doc, node);
1749
2.76k
            continue;
1750
2.76k
        }
1751
1752
        /* #538536 Extra endtags not detected */
1753
588k
        if ( nodeIsHTML(node) )
1754
2.43k
        {
1755
2.43k
            if (TY_(nodeIsElement)(node) || lexer->seenEndHtml)
1756
2.40k
                TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
1757
36
            else
1758
36
                lexer->seenEndHtml = 1;
1759
1760
2.43k
            TY_(FreeNode)( doc, node);
1761
2.43k
            continue;
1762
2.43k
        }
1763
1764
586k
        if ( lexer->seenEndBody &&
1765
12.2k
             ( node->type == StartTag ||
1766
9.16k
               node->type == EndTag   ||
1767
6.83k
               node->type == StartEndTag ) )
1768
10.0k
        {
1769
10.0k
            TY_(Report)(doc, body, node, CONTENT_AFTER_BODY );
1770
10.0k
        }
1771
1772
586k
        if ( node->tag == body->tag && node->type == EndTag )
1773
2.21k
        {
1774
2.21k
            body->closed = yes;
1775
2.21k
            TrimSpaces(doc, body);
1776
2.21k
            TY_(FreeNode)( doc, node);
1777
2.21k
            lexer->seenEndBody = 1;
1778
2.21k
            DEBUG_LOG_GET_OLD_MODE;
1779
2.21k
            mode = IgnoreWhitespace;
1780
2.21k
            DEBUG_LOG_CHANGE_MODE;
1781
1782
2.21k
            if ( nodeIsNOFRAMES(body->parent) )
1783
1.06k
                break;
1784
1785
1.14k
            continue;
1786
2.21k
        }
1787
1788
583k
        if ( nodeIsNOFRAMES(node) )
1789
10.6k
        {
1790
10.6k
            if (node->type == StartTag)
1791
7.41k
            {
1792
7.41k
                TidyParserMemory memory = {0};
1793
1794
7.41k
                TY_(InsertNodeAtEnd)(body, node);
1795
                
1796
7.41k
                memory.identity = TY_(ParseBody);
1797
7.41k
                memory.original_node = body;
1798
7.41k
                memory.reentry_node = node;
1799
7.41k
                memory.register_1 = checkstack;
1800
7.41k
                memory.register_2 = iswhitenode;
1801
7.41k
                memory.mode = mode;
1802
7.41k
                TY_(pushMemory)( doc, memory );
1803
7.41k
                DEBUG_LOG_EXIT_WITH_NODE(node);
1804
7.41k
                return node;
1805
7.41k
            }
1806
1807
3.27k
            if (node->type == EndTag && nodeIsNOFRAMES(body->parent) )
1808
606
            {
1809
606
                TrimSpaces(doc, body);
1810
606
                TY_(UngetToken)( doc );
1811
606
                break;
1812
606
            }
1813
3.27k
        }
1814
1815
575k
        if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
1816
3.42k
             && nodeIsNOFRAMES(body->parent) )
1817
1.17k
        {
1818
1.17k
            TrimSpaces(doc, body);
1819
1.17k
            TY_(UngetToken)( doc );
1820
1.17k
            break;
1821
1.17k
        }
1822
1823
574k
        iswhitenode = no;
1824
1825
574k
        if ( TY_(nodeIsText)(node) &&
1826
79.2k
             node->end <= node->start + 1 &&
1827
55.0k
             lexer->lexbuf[node->start] == ' ' )
1828
14.1k
            iswhitenode = yes;
1829
1830
        /* deal with comments etc. */
1831
574k
        if (InsertMisc(body, node))
1832
222k
            continue;
1833
1834
        /* mixed content model permits text */
1835
352k
        if (TY_(nodeIsText)(node))
1836
79.2k
        {
1837
79.2k
            if (iswhitenode && mode == IgnoreWhitespace)
1838
159
            {
1839
159
                TY_(FreeNode)( doc, node);
1840
159
                continue;
1841
159
            }
1842
1843
            /* HTML 2 and HTML4 strict don't allow text here */
1844
79.0k
            TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20));
1845
1846
79.0k
            if (checkstack)
1847
43.7k
            {
1848
43.7k
                checkstack = no;
1849
1850
43.7k
                if ( TY_(InlineDup)(doc, node) > 0 )
1851
1.33k
                    continue;
1852
43.7k
            }
1853
1854
77.7k
            TY_(InsertNodeAtEnd)(body, node);
1855
77.7k
            DEBUG_LOG_GET_OLD_MODE;
1856
77.7k
            mode = MixedContent;
1857
77.7k
            DEBUG_LOG_CHANGE_MODE;
1858
77.7k
            continue;
1859
79.0k
        }
1860
1861
273k
        if (node->type == DocTypeTag)
1862
6.35k
        {
1863
6.35k
            InsertDocType(doc, body, node);
1864
6.35k
            continue;
1865
6.35k
        }
1866
        /* discard unknown  and PARAM tags */
1867
267k
        if ( node->tag == NULL || nodeIsPARAM(node) )
1868
150k
        {
1869
150k
            TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
1870
150k
            TY_(FreeNode)( doc, node);
1871
150k
            continue;
1872
150k
        }
1873
1874
        /*
1875
          Netscape allows LI and DD directly in BODY
1876
          We infer UL or DL respectively and use this
1877
          Bool to exclude block-level elements so as
1878
          to match Netscape's observed behaviour.
1879
        */
1880
116k
        lexer->excludeBlocks = no;
1881
1882
116k
        if ((( nodeIsINPUT(node) ||
1883
115k
             (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE))
1884
116k
           ) && !TY_(IsHTML5Mode)(doc)) || nodeIsLI(node) )
1885
5.52k
        {
1886
            /* avoid this error message being issued twice */
1887
5.52k
            if (!(node->tag->model & CM_HEAD))
1888
4.74k
                TY_(Report)(doc, body, node, TAG_NOT_ALLOWED_IN);
1889
1890
5.52k
            if (node->tag->model & CM_HTML)
1891
1.68k
            {
1892
                /* copy body attributes if current body was inferred */
1893
1.68k
                if ( nodeIsBODY(node) && body->implicit
1894
530
                     && body->attributes == NULL )
1895
327
                {
1896
327
                    body->attributes = node->attributes;
1897
327
                    node->attributes = NULL;
1898
327
                }
1899
1900
1.68k
                TY_(FreeNode)( doc, node);
1901
1.68k
                continue;
1902
1.68k
            }
1903
1904
3.83k
            if (node->tag->model & CM_HEAD)
1905
775
            {
1906
775
                MoveToHead(doc, body, node);
1907
775
                continue;
1908
775
            }
1909
1910
3.05k
            if (node->tag->model & CM_LIST)
1911
166
            {
1912
166
                TY_(UngetToken)( doc );
1913
166
                node = TY_(InferredTag)(doc, TidyTag_UL);
1914
166
                AddClassNoIndent(doc, node);
1915
166
                lexer->excludeBlocks = yes;
1916
166
            }
1917
2.89k
            else if (node->tag->model & CM_DEFLIST)
1918
373
            {
1919
373
                TY_(UngetToken)( doc );
1920
373
                node = TY_(InferredTag)(doc, TidyTag_DL);
1921
373
                lexer->excludeBlocks = yes;
1922
373
            }
1923
2.52k
            else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
1924
567
            {
1925
                /* http://tidy.sf.net/issue/2855621 */
1926
567
                if (node->type != EndTag) {
1927
328
                    TY_(UngetToken)( doc );
1928
328
                    node = TY_(InferredTag)(doc, TidyTag_TABLE);
1929
328
                }
1930
567
                lexer->excludeBlocks = yes;
1931
567
            }
1932
1.95k
            else if ( nodeIsINPUT(node) )
1933
400
            {
1934
400
                TY_(UngetToken)( doc );
1935
400
                node = TY_(InferredTag)(doc, TidyTag_FORM);
1936
400
                lexer->excludeBlocks = yes;
1937
400
            }
1938
1.55k
            else
1939
1.55k
            {
1940
1.55k
                if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) )
1941
362
                {
1942
362
                    TY_(UngetToken)( doc );
1943
362
                    DEBUG_LOG_EXIT;
1944
362
                    return NULL;
1945
362
                }
1946
1947
                /* ignore </td> </th> <option> etc. */
1948
1.19k
                TY_(FreeNode)( doc, node );
1949
1.19k
                continue;
1950
1.55k
            }
1951
3.05k
        }
1952
1953
112k
        if (node->type == EndTag)
1954
5.70k
        {
1955
5.70k
            if ( nodeIsBR(node) )
1956
292
            {
1957
292
                node->type = StartTag;
1958
292
            }
1959
5.40k
            else if ( nodeIsP(node) )
1960
426
            {
1961
426
                node->type = StartEndTag;
1962
426
                node->implicit = yes;
1963
426
            }
1964
4.98k
            else if ( TY_(nodeHasCM)(node, CM_INLINE) )
1965
1.31k
                TY_(PopInline)( doc, node );
1966
5.70k
        }
1967
1968
112k
        if (TY_(nodeIsElement)(node))
1969
107k
        {
1970
107k
            if (nodeIsMAIN(node))
1971
417
            {
1972
                /*\ Issue #166 - repeated <main> element
1973
                 *  How to efficiently search for a previous main element?
1974
                \*/
1975
417
                if ( findNodeById(doc, TidyTag_MAIN) )
1976
284
                {
1977
284
                    doc->badForm |= flg_BadMain; /* this is an ERROR in format */
1978
284
                    TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
1979
284
                    TY_(FreeNode)( doc, node);
1980
284
                    continue;
1981
284
                }
1982
417
            }
1983
            /* Issue #20 - merging from Ger Hobbelt fork put back CM_MIXED, which had been
1984
               removed to fix this issue - reverting to fix 880221e
1985
             */
1986
107k
            if ( TY_(nodeHasCM)(node, CM_INLINE) )
1987
36.8k
            {
1988
                /* HTML4 strict doesn't allow inline content here */
1989
                /* but HTML2 does allow img elements as children of body */
1990
36.8k
                if ( nodeIsIMG(node) )
1991
1.40k
                    TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT);
1992
35.4k
                else
1993
35.4k
                    TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20));
1994
1995
36.8k
                if (checkstack && !node->implicit)
1996
6.76k
                {
1997
6.76k
                    checkstack = no;
1998
1999
6.76k
                    if ( TY_(InlineDup)(doc, node) > 0 )
2000
478
                        continue;
2001
6.76k
                }
2002
                
2003
36.4k
                DEBUG_LOG_GET_OLD_MODE;
2004
36.4k
                mode = MixedContent;
2005
36.4k
                DEBUG_LOG_CHANGE_MODE;
2006
36.4k
            }
2007
70.2k
            else
2008
70.2k
            {
2009
70.2k
                checkstack = yes;
2010
70.2k
                DEBUG_LOG_GET_OLD_MODE;
2011
70.2k
                mode = IgnoreWhitespace;
2012
70.2k
                DEBUG_LOG_CHANGE_MODE;
2013
70.2k
            }
2014
2015
106k
            if (node->implicit)
2016
3.92k
            {
2017
3.92k
                TY_(Report)(doc, body, node, INSERTING_TAG);
2018
3.92k
            }
2019
2020
106k
            TY_(InsertNodeAtEnd)(body, node);
2021
            
2022
106k
            {
2023
106k
                TidyParserMemory memory = {0};
2024
106k
                memory.identity = TY_(ParseBody);
2025
106k
                memory.original_node = body;
2026
106k
                memory.reentry_node = node;
2027
106k
                memory.register_1 = checkstack;
2028
106k
                memory.register_2 = iswhitenode;
2029
106k
                memory.mode = mode;
2030
106k
                TY_(pushMemory)( doc, memory );
2031
106k
            }
2032
106k
            DEBUG_LOG_EXIT_WITH_NODE(node);
2033
106k
            return node;
2034
107k
        }
2035
2036
        /* discard unexpected tags */
2037
4.98k
        TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
2038
4.98k
        TY_(FreeNode)( doc, node);
2039
4.98k
    }
2040
54.9k
    DEBUG_LOG_EXIT;
2041
54.9k
    return NULL;
2042
169k
}
2043
2044
2045
/** MARK: TY_(ParseColGroup)
2046
 *  Parses the `colgroup` tag.
2047
 *
2048
 *  This is a non-recursing parser. It uses the document's parser memory stack
2049
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2050
 *  This parser is also re-enterable, so that post-processing can occur after
2051
 *  such dispatching.
2052
 */
2053
Node* TY_(ParseColGroup)( TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode) )
2054
2.97k
{
2055
2.97k
    Node *node, *parent;
2056
2.97k
    DEBUG_LOG_COUNTERS;
2057
2058
    /*
2059
     If we're re-entering, then we need to setup from a previous state,
2060
     instead of starting fresh. We can pull what we need from the document's
2061
     stack.
2062
     */
2063
2.97k
    if ( colgroup == NULL )
2064
738
    {
2065
738
        TidyParserMemory memory = TY_(popMemory)( doc );
2066
738
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
2067
738
        DEBUG_LOG_REENTER_WITH_NODE(node);
2068
738
        colgroup = memory.original_node;
2069
738
        DEBUG_LOG_GET_OLD_MODE;
2070
738
        mode = memory.mode;
2071
738
        DEBUG_LOG_CHANGE_MODE;
2072
738
    }
2073
2.24k
    else
2074
2.24k
    {
2075
2.24k
        DEBUG_LOG_ENTER_WITH_NODE(colgroup);
2076
2.24k
        if (colgroup->tag->model & CM_EMPTY)
2077
0
            return NULL;
2078
2.24k
    }
2079
2080
5.00k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2081
4.42k
    {
2082
4.42k
        DEBUG_LOG_GOT_TOKEN(node);
2083
2084
4.42k
        if (node->tag == colgroup->tag && node->type == EndTag)
2085
196
        {
2086
196
            TY_(FreeNode)( doc, node);
2087
196
            colgroup->closed = yes;
2088
196
            return NULL;
2089
196
        }
2090
2091
        /*
2092
          if this is the end tag for an ancestor element
2093
          then infer end tag for this element
2094
        */
2095
4.22k
        if (node->type == EndTag)
2096
1.33k
        {
2097
1.33k
            if ( nodeIsFORM(node) )
2098
243
            {
2099
243
                BadForm( doc );
2100
243
                TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2101
243
                TY_(FreeNode)( doc, node);
2102
243
                continue;
2103
243
            }
2104
2105
1.09k
            for ( parent = colgroup->parent;
2106
111k
                  parent != NULL;
2107
110k
                  parent = parent->parent )
2108
110k
            {
2109
110k
                if (node->tag == parent->tag)
2110
223
                {
2111
223
                    TY_(UngetToken)( doc );
2112
223
                    DEBUG_LOG_EXIT;
2113
223
                    return NULL;
2114
223
                }
2115
110k
            }
2116
1.09k
        }
2117
2118
3.76k
        if (TY_(nodeIsText)(node))
2119
425
        {
2120
425
            TY_(UngetToken)( doc );
2121
425
            DEBUG_LOG_EXIT;
2122
425
            return NULL;
2123
425
        }
2124
2125
        /* deal with comments etc. */
2126
3.33k
        if (InsertMisc(colgroup, node))
2127
73
            continue;
2128
2129
        /* discard unknown tags */
2130
3.26k
        if (node->tag == NULL)
2131
837
        {
2132
837
            TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2133
837
            TY_(FreeNode)( doc, node);
2134
837
            continue;
2135
837
        }
2136
2137
2.42k
        if ( !nodeIsCOL(node) )
2138
817
        {
2139
817
            TY_(UngetToken)( doc );
2140
817
            DEBUG_LOG_EXIT;
2141
817
            return NULL;
2142
817
        }
2143
2144
1.60k
        if (node->type == EndTag)
2145
871
        {
2146
871
            TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2147
871
            TY_(FreeNode)( doc, node);
2148
871
            continue;
2149
871
        }
2150
2151
        /* node should be <COL> */
2152
738
        TY_(InsertNodeAtEnd)(colgroup, node);
2153
        
2154
738
        {
2155
738
            TidyParserMemory memory = {0};
2156
738
            memory.identity = TY_(ParseColGroup);
2157
738
            memory.original_node = colgroup;
2158
738
            memory.reentry_node = node;
2159
738
            memory.mode = mode;
2160
738
            TY_(pushMemory)( doc, memory );
2161
738
            DEBUG_LOG_EXIT_WITH_NODE(node);
2162
738
        }
2163
738
        DEBUG_LOG_EXIT;
2164
738
        return node;
2165
1.60k
    }
2166
580
    DEBUG_LOG_EXIT;
2167
580
    return NULL;
2168
2.97k
}
2169
2170
2171
/** MARK: TY_(ParseDatalist)
2172
 *  Parses the `datalist` tag.
2173
 *
2174
 *  This is a non-recursing parser. It uses the document's parser memory stack
2175
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2176
 *  This parser is also re-enterable, so that post-processing can occur after
2177
 *  such dispatching.
2178
*/
2179
Node* TY_(ParseDatalist)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) )
2180
3.58k
{
2181
3.58k
    Lexer* lexer = doc->lexer;
2182
3.58k
    Node *node;
2183
3.58k
    DEBUG_LOG_COUNTERS;
2184
2185
3.58k
    if ( field == NULL )
2186
1.97k
    {
2187
1.97k
        TidyParserMemory memory = TY_(popMemory)( doc );
2188
1.97k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
2189
1.97k
        DEBUG_LOG_REENTER_WITH_NODE(node);
2190
1.97k
        field = memory.original_node;
2191
1.97k
        DEBUG_LOG_GET_OLD_MODE;
2192
1.97k
        mode = memory.mode;
2193
1.97k
        DEBUG_LOG_CHANGE_MODE;
2194
1.97k
    }
2195
1.60k
    else
2196
1.60k
    {
2197
1.60k
        DEBUG_LOG_ENTER_WITH_NODE(field);
2198
1.60k
    }
2199
    
2200
3.58k
    lexer->insert = NULL;  /* defer implicit inline start tags */
2201
2202
6.23k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2203
4.91k
    {
2204
4.91k
        if (node->tag == field->tag && node->type == EndTag)
2205
284
        {
2206
284
            TY_(FreeNode)( doc, node);
2207
284
            field->closed = yes;
2208
284
            TrimSpaces(doc, field);
2209
2210
284
            DEBUG_LOG_EXIT;
2211
284
            return NULL;
2212
284
        }
2213
2214
        /* deal with comments etc. */
2215
4.62k
        if (InsertMisc(field, node))
2216
317
            continue;
2217
2218
4.30k
        if ( node->type == StartTag &&
2219
3.28k
             ( nodeIsOPTION(node)   ||
2220
3.28k
               nodeIsOPTGROUP(node) ||
2221
3.28k
               nodeIsDATALIST(node) ||
2222
3.28k
               nodeIsSCRIPT(node))
2223
4.30k
           )
2224
1.97k
        {
2225
1.97k
            TidyParserMemory memory = {0};
2226
1.97k
            memory.identity = TY_(ParseDatalist);
2227
1.97k
            memory.original_node = field;
2228
1.97k
            memory.reentry_node = node;
2229
1.97k
            memory.reentry_mode = IgnoreWhitespace;
2230
2231
1.97k
            TY_(InsertNodeAtEnd)(field, node);
2232
1.97k
            TY_(pushMemory)(doc, memory);
2233
1.97k
            DEBUG_LOG_EXIT_WITH_NODE(node);
2234
1.97k
            return node;
2235
1.97k
        }
2236
2237
        /* discard unexpected tags */
2238
2.33k
        TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED);
2239
2.33k
        TY_(FreeNode)( doc, node);
2240
2.33k
    }
2241
2242
1.32k
    TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR);
2243
2244
1.32k
    DEBUG_LOG_EXIT;
2245
1.32k
    return NULL;
2246
3.58k
}
2247
2248
2249
/** MARK: TY_(ParseDefList)
2250
 *  Parses the `dl` tag.
2251
 *
2252
 *  This is a non-recursing parser. It uses the document's parser memory stack
2253
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2254
 *  This parser is also re-enterable, so that post-processing can occur after
2255
 *  such dispatching.
2256
*/
2257
Node* TY_(ParseDefList)( TidyDocImpl* doc, Node *list, GetTokenMode mode )
2258
27.8k
{
2259
27.8k
    Lexer* lexer = doc->lexer;
2260
27.8k
    Node *node = NULL;
2261
27.8k
    Node *parent = NULL;
2262
27.8k
    DEBUG_LOG_COUNTERS;
2263
2264
27.8k
    enum parserState {
2265
27.8k
        STATE_INITIAL,                /* This is the initial state for every parser. */
2266
27.8k
        STATE_POST_NODEISCENTER,      /* To-do after re-entering after checks. */
2267
27.8k
        STATE_COMPLETE,               /* Done with the switch. */
2268
27.8k
    } state = STATE_INITIAL;
2269
2270
27.8k
    if ( list == NULL )
2271
17.6k
    {
2272
17.6k
        TidyParserMemory memory = TY_(popMemory)( doc );
2273
17.6k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
2274
17.6k
        DEBUG_LOG_REENTER_WITH_NODE(node);
2275
17.6k
        list = memory.original_node;
2276
17.6k
        state = memory.reentry_state;
2277
17.6k
        DEBUG_LOG_GET_OLD_MODE;
2278
17.6k
        mode = memory.mode;
2279
17.6k
        DEBUG_LOG_CHANGE_MODE;
2280
17.6k
    }
2281
10.1k
    else
2282
10.1k
    {
2283
10.1k
        DEBUG_LOG_ENTER_WITH_NODE(list);
2284
10.1k
    }
2285
2286
27.8k
    if (list->tag->model & CM_EMPTY)
2287
0
        return NULL;
2288
2289
27.8k
    lexer->insert = NULL;  /* defer implicit inline start tags */
2290
2291
39.6k
    while ( state != STATE_COMPLETE )
2292
32.2k
    {
2293
32.2k
        if ( state == STATE_INITIAL )
2294
31.1k
            node = TY_(GetToken)( doc, IgnoreWhitespace);
2295
        
2296
32.2k
        switch ( state)
2297
32.2k
        {
2298
31.1k
            case STATE_INITIAL:
2299
31.1k
            {
2300
31.1k
                if ( node == NULL)
2301
7.35k
                {
2302
7.35k
                    state = STATE_COMPLETE;
2303
7.35k
                    continue;
2304
7.35k
                }
2305
2306
23.7k
                if (node->tag == list->tag && node->type == EndTag)
2307
114
                {
2308
114
                    TY_(FreeNode)( doc, node);
2309
114
                    list->closed = yes;
2310
114
                    DEBUG_LOG_EXIT;
2311
114
                    return NULL;
2312
114
                }
2313
2314
                /* deal with comments etc. */
2315
23.6k
                if (InsertMisc(list, node))
2316
287
                    continue;
2317
2318
23.3k
                if (TY_(nodeIsText)(node))
2319
2.90k
                {
2320
2.90k
                    TY_(UngetToken)( doc );
2321
2.90k
                    node = TY_(InferredTag)(doc, TidyTag_DT);
2322
2.90k
                    TY_(Report)(doc, list, node, MISSING_STARTTAG);
2323
2.90k
                }
2324
2325
23.3k
                if (node->tag == NULL)
2326
545
                {
2327
545
                    TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
2328
545
                    TY_(FreeNode)( doc, node);
2329
545
                    continue;
2330
545
                }
2331
2332
                /*
2333
                  if this is the end tag for an ancestor element
2334
                  then infer end tag for this element
2335
                */
2336
22.8k
                if (node->type == EndTag)
2337
3.44k
                {
2338
3.44k
                    Bool discardIt = no;
2339
3.44k
                    if ( nodeIsFORM(node) )
2340
200
                    {
2341
200
                        BadForm( doc );
2342
200
                        TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
2343
200
                        TY_(FreeNode)( doc, node );
2344
200
                        continue;
2345
200
                    }
2346
2347
3.24k
                    for (parent = list->parent;
2348
195k
                            parent != NULL; parent = parent->parent)
2349
194k
                    {
2350
                       /* Do not match across BODY to avoid infinite loop
2351
                          between ParseBody and this parser,
2352
                          See http://tidy.sf.net/bug/1098012. */
2353
194k
                        if (nodeIsBODY(parent))
2354
869
                        {
2355
869
                            discardIt = yes;
2356
869
                            break;
2357
869
                        }
2358
193k
                        if (node->tag == parent->tag)
2359
799
                        {
2360
799
                            TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE);
2361
799
                            TY_(UngetToken)( doc );
2362
2363
799
                            DEBUG_LOG_EXIT;
2364
799
                            return NULL;
2365
799
                        }
2366
193k
                    }
2367
2.44k
                    if (discardIt)
2368
869
                    {
2369
869
                        TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
2370
869
                        TY_(FreeNode)( doc, node);
2371
869
                        continue;
2372
869
                    }
2373
2.44k
                }
2374
2375
                /* center in a dt or a dl breaks the dl list in two */
2376
20.9k
                if ( nodeIsCENTER(node) )
2377
1.13k
                {
2378
1.13k
                    if (list->content)
2379
900
                        TY_(InsertNodeAfterElement)(list, node);
2380
238
                    else /* trim empty dl list */
2381
238
                    {
2382
238
                        TY_(InsertNodeBeforeElement)(list, node);
2383
238
                    }
2384
2385
                    /* #426885 - fix by Glenn Carroll 19 Apr 00, and
2386
                                 Gary Dechaines 11 Aug 00 */
2387
                    /* ParseTag can destroy node, if it finds that
2388
                     * this <center> is followed immediately by </center>.
2389
                     * It's awkward but necessary to determine if this
2390
                     * has happened.
2391
                     */
2392
1.13k
                    parent = node->parent;
2393
2394
                    /* and parse contents of center */
2395
1.13k
                    lexer->excludeBlocks = no;
2396
2397
1.13k
                    {
2398
1.13k
                        TidyParserMemory memory = {0};
2399
1.13k
                        memory.identity = TY_(ParseDefList);
2400
1.13k
                        memory.original_node = list;
2401
1.13k
                        memory.reentry_node = node;
2402
1.13k
                        memory.reentry_state = STATE_POST_NODEISCENTER;
2403
1.13k
                        TY_(pushMemory)( doc, memory );
2404
1.13k
                        DEBUG_LOG_EXIT_WITH_NODE(node);
2405
1.13k
                        return node;
2406
1.13k
                    }
2407
1.13k
                }
2408
2409
19.8k
                if ( !( nodeIsDT(node) || nodeIsDD(node) || ( nodeIsDIV(node) && TY_(IsHTML5Mode)(doc) ) ) )
2410
5.75k
                {
2411
5.75k
                    TY_(UngetToken)( doc );
2412
2413
5.75k
                    if (!(node->tag->model & (CM_BLOCK | CM_INLINE)))
2414
1.08k
                    {
2415
1.08k
                        TY_(Report)(doc, list, node, TAG_NOT_ALLOWED_IN);
2416
1.08k
                        DEBUG_LOG_EXIT;
2417
1.08k
                        return NULL;
2418
1.08k
                    }
2419
2420
                    /* if DD appeared directly in BODY then exclude blocks */
2421
4.67k
                    if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks)
2422
838
                    {
2423
838
                        DEBUG_LOG_EXIT;
2424
838
                        return NULL;
2425
838
                    }
2426
2427
3.83k
                    node = TY_(InferredTag)(doc, TidyTag_DD);
2428
3.83k
                    TY_(Report)(doc, list, node, MISSING_STARTTAG);
2429
3.83k
                }
2430
2431
17.9k
                if (node->type == EndTag)
2432
1.39k
                {
2433
1.39k
                    TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
2434
1.39k
                    TY_(FreeNode)( doc, node);
2435
1.39k
                    continue;
2436
1.39k
                }
2437
2438
                /* node should be <DT> or <DD> or <DIV>*/
2439
16.5k
                TY_(InsertNodeAtEnd)(list, node);
2440
16.5k
                {
2441
16.5k
                    TidyParserMemory memory = {0};
2442
16.5k
                    memory.identity = TY_(ParseDefList);
2443
16.5k
                    memory.original_node = list;
2444
16.5k
                    memory.reentry_node = node;
2445
16.5k
                    memory.reentry_state = STATE_INITIAL;
2446
16.5k
                    TY_(pushMemory)( doc, memory );
2447
16.5k
                    DEBUG_LOG_EXIT;
2448
16.5k
                    return node;
2449
17.9k
                }
2450
17.9k
            } break;
2451
2452
2453
1.13k
            case STATE_POST_NODEISCENTER:
2454
1.13k
            {
2455
1.13k
                lexer->excludeBlocks = yes;
2456
2457
                /* now create a new dl element,
2458
                 * unless node has been blown away because the
2459
                 * center was empty, as above.
2460
                 */
2461
1.13k
                if (parent && parent->last == node)
2462
0
                {
2463
0
                    list = TY_(InferredTag)(doc, TidyTag_DL);
2464
0
                    TY_(InsertNodeAfterElement)(node, list);
2465
0
                }
2466
1.13k
                state = STATE_INITIAL;
2467
1.13k
                continue;
2468
17.9k
            } break;
2469
2470
2471
0
            default:
2472
0
                break;
2473
32.2k
        } /* switch */
2474
32.2k
    } /* while */
2475
2476
7.35k
    TY_(Report)(doc, list, node, MISSING_ENDTAG_FOR);
2477
7.35k
    DEBUG_LOG_EXIT;
2478
7.35k
    return NULL;
2479
27.8k
}
2480
2481
2482
/** MARK: TY_(ParseEmpty)
2483
 *  Parse empty element nodes.
2484
 *
2485
 *  This is a non-recursing parser. It uses the document's parser memory stack
2486
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2487
 *  This parser is also re-enterable, so that post-processing can occur after
2488
 *  such dispatching.
2489
  */
2490
Node* TY_(ParseEmpty)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
2491
39.5k
{
2492
39.5k
    Lexer* lexer = doc->lexer;
2493
39.5k
    if ( lexer->isvoyager )
2494
695
    {
2495
695
        Node *node = TY_(GetToken)( doc, mode);
2496
695
        if ( node )
2497
642
        {
2498
642
            if ( !(node->type == EndTag && node->tag == element->tag) )
2499
554
            {
2500
                /* TY_(Report)(doc, element, node, ELEMENT_NOT_EMPTY); */
2501
554
                TY_(UngetToken)( doc );
2502
554
            }
2503
88
            else
2504
88
            {
2505
88
                TY_(FreeNode)( doc, node );
2506
88
            }
2507
642
        }
2508
695
    }
2509
39.5k
    return NULL;
2510
39.5k
}
2511
2512
2513
/** MARK: TY_(ParseFrameSet)
2514
 *  Parses the `frameset` tag.
2515
 *
2516
 *  This is a non-recursing parser. It uses the document's parser memory stack
2517
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2518
 *  This parser is also re-enterable, so that post-processing can occur after
2519
 *  such dispatching.
2520
 */
2521
Node* TY_(ParseFrameSet)( TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode) )
2522
40.2k
{
2523
40.2k
    Lexer* lexer = doc->lexer;
2524
40.2k
    Node *node;
2525
40.2k
    DEBUG_LOG_COUNTERS;
2526
2527
    /*
2528
     If we're re-entering, then we need to setup from a previous state,
2529
     instead of starting fresh. We can pull what we need from the document's
2530
     stack.
2531
     */
2532
40.2k
    if ( frameset == NULL )
2533
18.6k
    {
2534
18.6k
        TidyParserMemory memory = TY_(popMemory)( doc );
2535
18.6k
        node = memory.reentry_node; /* Throwaway, because we replace it entering the loop. */
2536
18.6k
        DEBUG_LOG_REENTER_WITH_NODE(node);
2537
18.6k
        frameset = memory.original_node;
2538
18.6k
        DEBUG_LOG_GET_OLD_MODE;
2539
18.6k
        mode = memory.mode;
2540
18.6k
        DEBUG_LOG_CHANGE_MODE;
2541
18.6k
    }
2542
21.6k
    else
2543
21.6k
    {
2544
21.6k
        DEBUG_LOG_ENTER_WITH_NODE(frameset);
2545
21.6k
        if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
2546
21.6k
        {
2547
21.6k
            doc->badAccess |= BA_USING_FRAMES;
2548
21.6k
        }
2549
21.6k
    }
2550
2551
53.4k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2552
32.2k
    {
2553
32.2k
        if (node->tag == frameset->tag && node->type == EndTag)
2554
452
        {
2555
452
            TY_(FreeNode)( doc, node);
2556
452
            frameset->closed = yes;
2557
452
            TrimSpaces(doc, frameset);
2558
452
            DEBUG_LOG_EXIT;
2559
452
            return NULL;
2560
452
        }
2561
2562
        /* deal with comments etc. */
2563
31.7k
        if (InsertMisc(frameset, node))
2564
2.86k
            continue;
2565
2566
28.9k
        if (node->tag == NULL)
2567
5.52k
        {
2568
5.52k
            TY_(Report)(doc, frameset, node, DISCARDING_UNEXPECTED);
2569
5.52k
            TY_(FreeNode)( doc, node);
2570
5.52k
            continue;
2571
5.52k
        }
2572
2573
23.3k
        if (TY_(nodeIsElement)(node))
2574
22.4k
        {
2575
22.4k
            if (node->tag && node->tag->model & CM_HEAD)
2576
465
            {
2577
465
                MoveToHead(doc, frameset, node);
2578
465
                continue;
2579
465
            }
2580
22.4k
        }
2581
2582
22.9k
        if ( nodeIsBODY(node) )
2583
984
        {
2584
984
            TY_(UngetToken)( doc );
2585
984
            node = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
2586
984
            TY_(Report)(doc, frameset, node, INSERTING_TAG);
2587
984
        }
2588
2589
22.9k
        if (node->type == StartTag && (node->tag && node->tag->model & CM_FRAMES))
2590
18.6k
        {
2591
18.6k
            TY_(InsertNodeAtEnd)(frameset, node);
2592
18.6k
            lexer->excludeBlocks = no;
2593
            
2594
            /*
2595
             * We don't really have to do anything when re-entering, except
2596
             * setting up the state when we left. No post-processing means
2597
             * this stays simple.
2598
             */
2599
18.6k
            TidyParserMemory memory = {0};
2600
18.6k
            memory.identity = TY_(ParseFrameSet);
2601
18.6k
            memory.original_node = frameset;
2602
18.6k
            memory.reentry_node = node;
2603
18.6k
            memory.mode = MixedContent;
2604
18.6k
            TY_(pushMemory)( doc, memory );
2605
18.6k
            DEBUG_LOG_EXIT_WITH_NODE(node);
2606
18.6k
            return node;
2607
18.6k
        }
2608
4.27k
        else if (node->type == StartEndTag && (node->tag && node->tag->model & CM_FRAMES))
2609
695
        {
2610
695
            TY_(InsertNodeAtEnd)(frameset, node);
2611
695
            continue;
2612
695
        }
2613
2614
        /* discard unexpected tags */
2615
        /* WAI [6.5.1.4] link is being discarded outside of NOFRAME */
2616
3.58k
        if ( nodeIsA(node) )
2617
214
           doc->badAccess |= BA_INVALID_LINK_NOFRAMES;
2618
2619
3.58k
        TY_(Report)(doc, frameset, node, DISCARDING_UNEXPECTED);
2620
3.58k
        TY_(FreeNode)( doc, node);
2621
3.58k
    }
2622
2623
21.2k
    TY_(Report)(doc, frameset, node, MISSING_ENDTAG_FOR);
2624
21.2k
    DEBUG_LOG_EXIT;
2625
21.2k
    return NULL;
2626
40.2k
}
2627
2628
2629
/** MARK: TY_(ParseHead)
2630
 *  Parses the `head` tag.
2631
 *
2632
 *  This is a non-recursing parser. It uses the document's parser memory stack
2633
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2634
 *  This parser is also re-enterable, so that post-processing can occur after
2635
 *  such dispatching.
2636
 */
2637
Node* TY_(ParseHead)( TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode) )
2638
39.8k
{
2639
39.8k
    Lexer* lexer = doc->lexer;
2640
39.8k
    Node *node;
2641
39.8k
    int HasTitle = 0;
2642
39.8k
    int HasBase = 0;
2643
39.8k
    DEBUG_LOG_COUNTERS;
2644
2645
39.8k
    if ( head == NULL )
2646
14.5k
    {
2647
14.5k
        TidyParserMemory memory = TY_(popMemory)( doc );
2648
14.5k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
2649
14.5k
        DEBUG_LOG_REENTER_WITH_NODE(node);
2650
14.5k
        head = memory.original_node;
2651
14.5k
        HasTitle = memory.register_1;
2652
14.5k
        HasBase = memory.register_2;
2653
14.5k
        DEBUG_LOG_GET_OLD_MODE;
2654
14.5k
        mode = memory.mode;
2655
14.5k
        DEBUG_LOG_CHANGE_MODE;
2656
14.5k
    }
2657
25.3k
    else
2658
25.3k
    {
2659
25.3k
        DEBUG_LOG_ENTER_WITH_NODE(head);
2660
25.3k
    }
2661
    
2662
53.1k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2663
49.6k
    {
2664
49.6k
        if (node->tag == head->tag && node->type == EndTag)
2665
339
        {
2666
339
            TY_(FreeNode)( doc, node);
2667
339
            head->closed = yes;
2668
339
            break;
2669
339
        }
2670
2671
        /* find and discard multiple <head> elements */
2672
        /* find and discard <html> in <head> elements */
2673
49.2k
        if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag)
2674
1.13k
        {
2675
1.13k
            TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
2676
1.13k
            TY_(FreeNode)(doc, node);
2677
1.13k
            continue;
2678
1.13k
        }
2679
2680
48.1k
        if (TY_(nodeIsText)(node))
2681
7.53k
        {
2682
            /*\ Issue #132 - avoid warning for missing body tag,
2683
             *  if configured to --omit-optional-tags yes
2684
             *  Issue #314 - and if --show-body-only
2685
            \*/
2686
7.53k
            if (!cfgBool( doc, TidyOmitOptionalTags ) &&
2687
7.53k
                !showingBodyOnly(doc) )
2688
7.53k
            {
2689
7.53k
                TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN);
2690
7.53k
            }
2691
7.53k
            TY_(UngetToken)( doc );
2692
7.53k
            break;
2693
7.53k
        }
2694
2695
40.6k
        if (node->type == ProcInsTag && node->element &&
2696
1.06k
            TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0)
2697
707
        {
2698
707
            TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN);
2699
707
            TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node);
2700
707
            continue;
2701
707
        }
2702
2703
        /* deal with comments etc. */
2704
39.9k
        if (InsertMisc(head, node))
2705
710
            continue;
2706
2707
39.1k
        if (node->type == DocTypeTag)
2708
2.51k
        {
2709
2.51k
            InsertDocType(doc, head, node);
2710
2.51k
            continue;
2711
2.51k
        }
2712
2713
        /* discard unknown tags */
2714
36.6k
        if (node->tag == NULL)
2715
7.86k
        {
2716
7.86k
            TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
2717
7.86k
            TY_(FreeNode)( doc, node);
2718
7.86k
            continue;
2719
7.86k
        }
2720
2721
        /*
2722
         if it doesn't belong in the head then
2723
         treat as implicit end of head and deal
2724
         with as part of the body
2725
        */
2726
28.8k
        if (!(node->tag->model & CM_HEAD))
2727
13.5k
        {
2728
            /* #545067 Implicit closing of head broken - warn only for XHTML input */
2729
13.5k
            if ( lexer->isvoyager )
2730
151
                TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN );
2731
13.5k
            TY_(UngetToken)( doc );
2732
13.5k
            break;
2733
13.5k
        }
2734
2735
15.2k
        if (TY_(nodeIsElement)(node))
2736
14.9k
        {
2737
14.9k
            if ( nodeIsTITLE(node) )
2738
1.11k
            {
2739
1.11k
                ++HasTitle;
2740
2741
1.11k
                if (HasTitle > 1)
2742
790
                    TY_(Report)(doc, head, node,
2743
790
                                     head ?
2744
790
                                     TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
2745
1.11k
            }
2746
13.8k
            else if ( nodeIsBASE(node) )
2747
3.00k
            {
2748
3.00k
                ++HasBase;
2749
2750
3.00k
                if (HasBase > 1)
2751
1.93k
                    TY_(Report)(doc, head, node,
2752
1.93k
                                     head ?
2753
1.93k
                                     TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
2754
3.00k
            }
2755
2756
14.9k
            TY_(InsertNodeAtEnd)(head, node);
2757
2758
14.9k
            {
2759
14.9k
                TidyParserMemory memory = {0};
2760
14.9k
                memory.identity = TY_(ParseHead);
2761
14.9k
                memory.original_node = head;
2762
14.9k
                memory.reentry_node = node;
2763
14.9k
                memory.register_1 = HasTitle;
2764
14.9k
                memory.register_2 = HasBase;
2765
14.9k
                TY_(pushMemory)( doc, memory );
2766
14.9k
                DEBUG_LOG_EXIT_WITH_NODE(node);
2767
14.9k
                return node;
2768
14.9k
            }
2769
14.9k
        }
2770
2771
        /* discard unexpected text nodes and end tags */
2772
303
        TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
2773
303
        TY_(FreeNode)( doc, node);
2774
303
    }
2775
24.9k
    DEBUG_LOG_EXIT;
2776
24.9k
    return NULL;
2777
39.8k
}
2778
2779
2780
/** MARK: TY_(ParseHTML)
2781
 *  Parses the `html` tag. At this point, other root-level stuff (doctype,
2782
 *  comments) is already set up, and here we handle all of the complexities
2783
 *  of things such as frameset documents, etc.
2784
 *
2785
 *  This is a non-recursing parser. It uses the document's parser memory stack
2786
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2787
 *  This parser is also re-enterable, so that post-processing can occur after
2788
 *  such dispatching.
2789
 */
2790
Node* TY_(ParseHTML)( TidyDocImpl *doc, Node *html, GetTokenMode mode )
2791
59.3k
{
2792
59.3k
    Node *node = NULL;
2793
59.3k
    Node *head = NULL;
2794
59.3k
    Node *frameset = NULL;
2795
59.3k
    Node *noframes = NULL;
2796
59.3k
    DEBUG_LOG_COUNTERS;
2797
2798
59.3k
    enum parserState {
2799
59.3k
        STATE_INITIAL,                /* This is the initial state for every parser. */
2800
59.3k
        STATE_COMPLETE,               /* Complete! */
2801
59.3k
        STATE_PRE_BODY,               /* In this state, we'll consider frames vs. body. */
2802
59.3k
        STATE_PARSE_BODY,             /* In this state, we can parse the body. */
2803
59.3k
        STATE_PARSE_HEAD,             /* In this state, we will setup head for parsing. */
2804
59.3k
        STATE_PARSE_HEAD_REENTER,     /* Resume here after parsing head. */
2805
59.3k
        STATE_PARSE_NOFRAMES,         /* In this state, we can parse noframes content. */
2806
59.3k
        STATE_PARSE_NOFRAMES_REENTER, /* In this state, we can restore more state. */
2807
59.3k
        STATE_PARSE_FRAMESET,         /* In this state, we will parse frameset content. */
2808
59.3k
        STATE_PARSE_FRAMESET_REENTER, /* We need to cleanup some things after parsing frameset. */
2809
59.3k
    } state = STATE_INITIAL;
2810
2811
59.3k
    TY_(SetOptionBool)( doc, TidyXmlTags, no );
2812
2813
59.3k
    if ( html == NULL )
2814
34.7k
    {
2815
34.7k
        TidyParserMemory memory = TY_(popMemory)( doc );
2816
34.7k
        node = memory.reentry_node;
2817
34.7k
        DEBUG_LOG_REENTER_WITH_NODE(node);
2818
34.7k
        html = memory.original_node;
2819
34.7k
        state = memory.reentry_state;
2820
34.7k
        DEBUG_LOG_GET_OLD_MODE;
2821
34.7k
        mode = memory.reentry_mode;
2822
34.7k
        DEBUG_LOG_CHANGE_MODE;
2823
34.7k
    }
2824
24.5k
    else
2825
24.5k
    {
2826
24.5k
        DEBUG_LOG_ENTER_WITH_NODE(html);
2827
24.5k
    }
2828
2829
    /*
2830
     This main loop pulls tokens from the lexer until we're out of tokens,
2831
     or until there's no more work to do.
2832
     */
2833
163k
    while ( state != STATE_COMPLETE )
2834
162k
    {
2835
162k
        if ( state == STATE_INITIAL || state == STATE_PRE_BODY )
2836
69.0k
        {
2837
69.0k
            node = TY_(GetToken)( doc, IgnoreWhitespace );
2838
69.0k
            DEBUG_LOG_GOT_TOKEN(node);
2839
69.0k
        }
2840
2841
162k
        switch ( state )
2842
162k
        {
2843
            /**************************************************************
2844
             This case is all about finding a head tag and dealing with
2845
             cases where we don't, so that we can move on to parsing a head
2846
             tag.
2847
             **************************************************************/
2848
28.1k
            case STATE_INITIAL:
2849
28.1k
            {
2850
                /*
2851
                 The only way we can possibly be here is if the lexer
2852
                 had nothing to give us. Thus we'll create our own
2853
                 head, and set the signal to start parsing it.
2854
                 */
2855
28.1k
                if (node == NULL)
2856
2.72k
                {
2857
2.72k
                    node = TY_(InferredTag)(doc, TidyTag_HEAD);
2858
2.72k
                    state = STATE_PARSE_HEAD;
2859
2.72k
                    continue;
2860
2.72k
                }
2861
2862
                /* We found exactly what we expected: head. */
2863
25.4k
                if ( nodeIsHEAD(node) )
2864
248
                {
2865
248
                    state = STATE_PARSE_HEAD;
2866
248
                    continue;
2867
248
                }
2868
2869
                /* We did not expect to find an html closing tag here! */
2870
25.1k
                if (html && (node->tag == html->tag) && (node->type == EndTag))
2871
1.11k
                {
2872
1.11k
                    TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2873
1.11k
                    TY_(FreeNode)( doc, node);
2874
1.11k
                    continue;
2875
1.11k
                }
2876
2877
                /* Find and discard multiple <html> elements. */
2878
24.0k
                if (html && (node->tag == html->tag) && (node->type == StartTag))
2879
1.93k
                {
2880
1.93k
                    TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2881
1.93k
                    TY_(FreeNode)(doc, node);
2882
1.93k
                    continue;
2883
1.93k
                }
2884
2885
                /* Deal with comments, etc. */
2886
22.1k
                if (InsertMisc(html, node))
2887
528
                    continue;
2888
2889
                /*
2890
                 At this point, we didn't find a head tag, so put the
2891
                 token back and create our own head tag, so we can
2892
                 move on.
2893
                 */
2894
21.5k
                TY_(UngetToken)( doc );
2895
21.5k
                node = TY_(InferredTag)(doc, TidyTag_HEAD);
2896
21.5k
                state = STATE_PARSE_HEAD;
2897
21.5k
                continue;
2898
22.1k
            } break;
2899
2900
2901
            /**************************************************************
2902
             This case determines whether we're dealing with body or
2903
             frameset + noframes, and sets things up accordingly.
2904
             **************************************************************/
2905
40.9k
            case STATE_PRE_BODY:
2906
40.9k
            {
2907
40.9k
                if (node == NULL )
2908
4.31k
                {
2909
4.31k
                    if (frameset == NULL) /* Implied body. */
2910
3.37k
                    {
2911
3.37k
                        node = TY_(InferredTag)(doc, TidyTag_BODY);
2912
3.37k
                        state = STATE_PARSE_BODY;
2913
3.37k
                    } else {
2914
944
                        state = STATE_COMPLETE;
2915
944
                    }
2916
2917
4.31k
                    continue;
2918
4.31k
                }
2919
2920
                /* Robustly handle html tags. */
2921
36.5k
                if (node->tag == html->tag)
2922
1.86k
                {
2923
1.86k
                    if (node->type != StartTag && frameset == NULL)
2924
1.06k
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2925
2926
1.86k
                    TY_(FreeNode)( doc, node);
2927
1.86k
                    continue;
2928
1.86k
                }
2929
2930
                /* Deal with comments, etc. */
2931
34.7k
                if (InsertMisc(html, node))
2932
245
                    continue;
2933
2934
                /* If frameset document, coerce <body> to <noframes> */
2935
34.4k
                if ( nodeIsBODY(node) )
2936
805
                {
2937
805
                    if (node->type != StartTag)
2938
94
                    {
2939
94
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2940
94
                        TY_(FreeNode)( doc, node);
2941
94
                        continue;
2942
94
                    }
2943
2944
711
                    if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
2945
711
                    {
2946
711
                        if (frameset != NULL)
2947
530
                        {
2948
530
                            TY_(UngetToken)( doc );
2949
2950
530
                            if (noframes == NULL)
2951
356
                            {
2952
356
                                noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
2953
356
                                TY_(InsertNodeAtEnd)(frameset, noframes);
2954
356
                                TY_(Report)(doc, html, noframes, INSERTING_TAG);
2955
356
                            }
2956
174
                            else
2957
174
                            {
2958
174
                                if (noframes->type == StartEndTag)
2959
79
                                    noframes->type = StartTag;
2960
174
                            }
2961
2962
530
                            state = STATE_PARSE_NOFRAMES;
2963
530
                            continue;
2964
530
                        }
2965
711
                    }
2966
2967
181
                    TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
2968
181
                    state = STATE_PARSE_BODY;
2969
181
                    continue;
2970
711
                }
2971
2972
                /* Flag an error if we see more than one frameset. */
2973
33.6k
                if ( nodeIsFRAMESET(node) )
2974
5.21k
                {
2975
5.21k
                    if (node->type != StartTag)
2976
247
                    {
2977
247
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2978
247
                        TY_(FreeNode)( doc, node);
2979
247
                        continue;
2980
247
                    }
2981
2982
4.96k
                    if (frameset != NULL)
2983
3.41k
                        TY_(Report)(doc, html, node, DUPLICATE_FRAMESET);
2984
1.54k
                    else
2985
1.54k
                        frameset = node;
2986
2987
4.96k
                    state = STATE_PARSE_FRAMESET;
2988
4.96k
                    continue;
2989
5.21k
                }
2990
2991
                /* If not a frameset document coerce <noframes> to <body>. */
2992
28.4k
                if ( nodeIsNOFRAMES(node) )
2993
4.57k
                {
2994
4.57k
                    if (node->type != StartTag)
2995
252
                    {
2996
252
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2997
252
                        TY_(FreeNode)( doc, node);
2998
252
                        continue;
2999
252
                    }
3000
3001
4.32k
                    if (frameset == NULL)
3002
243
                    {
3003
243
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
3004
243
                        TY_(FreeNode)( doc, node);
3005
243
                        node = TY_(InferredTag)(doc, TidyTag_BODY);
3006
243
                        state = STATE_PARSE_BODY;
3007
243
                        continue;
3008
243
                    }
3009
3010
4.07k
                    if (noframes == NULL)
3011
1.79k
                    {
3012
1.79k
                        noframes = node;
3013
1.79k
                        TY_(InsertNodeAtEnd)(frameset, noframes);
3014
1.79k
                        state = STATE_PARSE_NOFRAMES;
3015
1.79k
                    }
3016
2.28k
                    else
3017
2.28k
                    {
3018
2.28k
                        TY_(FreeNode)( doc, node);
3019
2.28k
                    }
3020
3021
4.07k
                    continue;
3022
4.32k
                }
3023
3024
                /* Deal with some other element that we're not expecting. */
3025
23.8k
                if (TY_(nodeIsElement)(node))
3026
15.1k
                {
3027
15.1k
                    if (node->tag && node->tag->model & CM_HEAD)
3028
374
                    {
3029
374
                        MoveToHead(doc, html, node);
3030
374
                        continue;
3031
374
                    }
3032
3033
                    /* Discard illegal frame element following a frameset. */
3034
14.7k
                    if ( frameset != NULL && nodeIsFRAME(node) )
3035
753
                    {
3036
753
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
3037
753
                        TY_(FreeNode)(doc, node);
3038
753
                        continue;
3039
753
                    }
3040
14.7k
                }
3041
3042
22.7k
                TY_(UngetToken)( doc );
3043
3044
                /* Insert other content into noframes element. */
3045
22.7k
                if (frameset)
3046
3.93k
                {
3047
3.93k
                    if (noframes == NULL)
3048
2.18k
                    {
3049
2.18k
                        noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3050
2.18k
                        TY_(InsertNodeAtEnd)(frameset, noframes);
3051
2.18k
                    }
3052
1.75k
                    else
3053
1.75k
                    {
3054
1.75k
                        TY_(Report)(doc, html, node, NOFRAMES_CONTENT);
3055
1.75k
                        if (noframes->type == StartEndTag)
3056
39
                            noframes->type = StartTag;
3057
1.75k
                    }
3058
3059
3.93k
                    TY_(ConstrainVersion)(doc, VERS_FRAMESET);
3060
3.93k
                    state = STATE_PARSE_NOFRAMES;
3061
3.93k
                    continue;
3062
3.93k
                }
3063
3064
18.8k
                node = TY_(InferredTag)(doc, TidyTag_BODY);
3065
3066
                /* Issue #132 - disable inserting BODY tag warning
3067
                 BUT only if NOT --show-body-only yes */
3068
37.6k
                if (!showingBodyOnly(doc))
3069
18.8k
                    TY_(Report)(doc, html, node, INSERTING_TAG );
3070
3071
18.8k
                TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
3072
18.8k
                state = STATE_PARSE_BODY;
3073
18.8k
                continue;
3074
22.7k
            } break;
3075
3076
3077
            /**************************************************************
3078
             In this case, we're ready to parse the head, and move on to
3079
             look for the body or body alternative.
3080
             **************************************************************/
3081
24.5k
            case STATE_PARSE_HEAD:
3082
24.5k
            {
3083
24.5k
                TidyParserMemory memory = {0};
3084
24.5k
                memory.identity = TY_(ParseHTML);
3085
24.5k
                memory.mode = mode;
3086
24.5k
                memory.original_node = html;
3087
24.5k
                memory.reentry_node = node;
3088
24.5k
                memory.reentry_mode = mode;
3089
24.5k
                memory.reentry_state = STATE_PARSE_HEAD_REENTER;
3090
24.5k
                TY_(InsertNodeAtEnd)(html, node);
3091
24.5k
                TY_(pushMemory)( doc, memory );
3092
24.5k
                DEBUG_LOG_EXIT_WITH_NODE(node);
3093
24.5k
                return node;
3094
22.7k
            } break;
3095
3096
24.1k
            case STATE_PARSE_HEAD_REENTER:
3097
24.1k
            {
3098
24.1k
                head = node;
3099
24.1k
                state = STATE_PRE_BODY;
3100
24.1k
            } break;
3101
3102
3103
            /**************************************************************
3104
             In this case, we can finally parse a body.
3105
             **************************************************************/
3106
22.6k
            case STATE_PARSE_BODY:
3107
22.6k
            {
3108
22.6k
                TidyParserMemory memory = {0};
3109
22.6k
                memory.identity = NULL; /* we don't need to reenter */
3110
22.6k
                memory.mode = mode;
3111
22.6k
                memory.original_node = html;
3112
22.6k
                memory.reentry_node = NULL;
3113
22.6k
                memory.reentry_mode = mode;
3114
22.6k
                memory.reentry_state = STATE_COMPLETE;
3115
22.6k
                TY_(InsertNodeAtEnd)(html, node);
3116
22.6k
                TY_(pushMemory)( doc, memory );
3117
22.6k
                DEBUG_LOG_EXIT_WITH_NODE(node);
3118
22.6k
                return node;
3119
22.7k
            } break;
3120
3121
3122
            /**************************************************************
3123
             In this case, we will parse noframes. If necessary, the
3124
             node is already inserted in the proper spot.
3125
             **************************************************************/
3126
6.26k
            case STATE_PARSE_NOFRAMES:
3127
6.26k
            {
3128
6.26k
                TidyParserMemory memory = {0};
3129
6.26k
                memory.identity = TY_(ParseHTML);
3130
6.26k
                memory.mode = mode;
3131
6.26k
                memory.original_node = html;
3132
6.26k
                memory.reentry_node = frameset;
3133
6.26k
                memory.reentry_mode = mode;
3134
6.26k
                memory.reentry_state = STATE_PARSE_NOFRAMES_REENTER;
3135
6.26k
                TY_(pushMemory)( doc, memory );
3136
6.26k
                DEBUG_LOG_EXIT_WITH_NODE(node);
3137
6.26k
                return noframes;
3138
22.7k
            } break;
3139
3140
5.66k
            case STATE_PARSE_NOFRAMES_REENTER:
3141
5.66k
            {
3142
5.66k
                frameset = node;
3143
5.66k
                state = STATE_PRE_BODY;
3144
5.66k
            } break;
3145
3146
3147
            /**************************************************************
3148
             In this case, we parse the frameset, and look for noframes
3149
             content to merge later if necessary.
3150
             **************************************************************/
3151
4.96k
            case STATE_PARSE_FRAMESET:
3152
4.96k
            {
3153
4.96k
                TidyParserMemory memory = {0};
3154
4.96k
                memory.identity = TY_(ParseHTML);
3155
4.96k
                memory.mode = mode;
3156
4.96k
                memory.original_node = html;
3157
4.96k
                memory.reentry_node = frameset;
3158
4.96k
                memory.reentry_mode = mode;
3159
4.96k
                memory.reentry_state = STATE_PARSE_FRAMESET_REENTER;
3160
4.96k
                TY_(InsertNodeAtEnd)(html, node);
3161
4.96k
                TY_(pushMemory)( doc, memory );
3162
4.96k
                DEBUG_LOG_EXIT_WITH_NODE(node);
3163
4.96k
                return node;
3164
22.7k
            } break;
3165
3166
4.95k
            case (STATE_PARSE_FRAMESET_REENTER):
3167
4.95k
            {
3168
4.95k
                frameset = node;
3169
                /*
3170
                 See if it includes a noframes element so that
3171
                 we can merge subsequent noframes elements.
3172
                 */
3173
1.41M
                for (node = frameset->content; node; node = node->next)
3174
1.41M
                {
3175
1.41M
                    if ( nodeIsNOFRAMES(node) )
3176
1.40M
                        noframes = node;
3177
1.41M
                }
3178
4.95k
                state = STATE_PRE_BODY;
3179
4.95k
            } break;
3180
3181
3182
            /**************************************************************
3183
             We really shouldn't get here, but if we do, finish nicely.
3184
             **************************************************************/
3185
0
            default:
3186
0
            {
3187
0
                state = STATE_COMPLETE;
3188
0
            }
3189
162k
        } /* switch */
3190
162k
    } /* while */
3191
3192
944
    DEBUG_LOG_EXIT;
3193
944
    return NULL;
3194
59.3k
}
3195
3196
3197
/** MARK: TY_(ParseInline)
3198
 *  Parse inline element nodes.
3199
 *
3200
 *  This is a non-recursing parser. It uses the document's parser memory stack
3201
 *  to send subsequent nodes back to the controller for dispatching to parsers.
3202
 *  This parser is also re-enterable, so that post-processing can occur after
3203
 *  such dispatching.
3204
*/
3205
Node* TY_(ParseInline)( TidyDocImpl *doc, Node *element, GetTokenMode mode )
3206
2.56M
{
3207
2.56M
    Lexer* lexer = doc->lexer;
3208
2.56M
    Node *node = NULL;
3209
2.56M
    Node *parent = NULL;
3210
2.56M
    DEBUG_LOG_COUNTERS;
3211
    
3212
2.56M
    if ( element == NULL )
3213
1.19M
    {
3214
1.19M
        TidyParserMemory memory = TY_(popMemory)( doc );
3215
1.19M
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
3216
1.19M
        DEBUG_LOG_REENTER_WITH_NODE(node);
3217
1.19M
        element = memory.original_node;
3218
1.19M
        DEBUG_LOG_GET_OLD_MODE;
3219
1.19M
        mode = memory.reentry_mode;
3220
1.19M
        DEBUG_LOG_CHANGE_MODE;
3221
1.19M
    }
3222
1.37M
    else
3223
1.37M
    {
3224
1.37M
        DEBUG_LOG_ENTER_WITH_NODE(element);
3225
3226
1.37M
        if (element->tag->model & CM_EMPTY)
3227
0
        {
3228
0
            DEBUG_LOG_EXIT;
3229
0
            return NULL;
3230
0
        }
3231
3232
        /*
3233
         ParseInline is used for some block level elements like H1 to H6
3234
         For such elements we need to insert inline emphasis tags currently
3235
         on the inline stack. For Inline elements, we normally push them
3236
         onto the inline stack provided they aren't implicit or OBJECT/APPLET.
3237
         This test is carried out in PushInline and PopInline, see istack.c
3238
3239
         InlineDup(...) is not called for elements with a CM_MIXED (inline and
3240
         block) content model, e.g. <del> or <ins>, otherwise constructs like
3241
3242
           <p>111<a name='foo'>222<del>333</del>444</a>555</p>
3243
           <p>111<span>222<del>333</del>444</span>555</p>
3244
           <p>111<em>222<del>333</del>444</em>555</p>
3245
3246
         will get corrupted.
3247
        */
3248
1.37M
        if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
3249
69.6k
            !TY_(nodeHasCM)(element, CM_MIXED))
3250
63.4k
            TY_(InlineDup)(doc, NULL);
3251
1.31M
        else if (TY_(nodeHasCM)(element, CM_INLINE))
3252
1.30M
            TY_(PushInline)(doc, element);
3253
3254
1.37M
        if ( nodeIsNOBR(element) )
3255
1.18k
            doc->badLayout |= USING_NOBR;
3256
1.37M
        else if ( nodeIsFONT(element) )
3257
1.05M
            doc->badLayout |= USING_FONT;
3258
3259
        /* Inline elements may or may not be within a preformatted element */
3260
1.37M
        if (mode != Preformatted)
3261
1.37M
        {
3262
1.37M
            DEBUG_LOG_GET_OLD_MODE;
3263
1.37M
            mode = MixedContent;
3264
1.37M
            DEBUG_LOG_CHANGE_MODE;
3265
1.37M
        }
3266
1.37M
    }
3267
    
3268
2.71M
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
3269
1.86M
    {
3270
        /* end tag for current element */
3271
1.86M
        if (node->tag == element->tag && node->type == EndTag)
3272
11.2k
        {
3273
11.2k
            if (element->tag->model & CM_INLINE)
3274
8.02k
                TY_(PopInline)( doc, node );
3275
3276
11.2k
            TY_(FreeNode)( doc, node );
3277
3278
11.2k
            if (!(mode & Preformatted))
3279
11.2k
                TrimSpaces(doc, element);
3280
3281
            /*
3282
             if a font element wraps an anchor and nothing else
3283
             then move the font element inside the anchor since
3284
             otherwise it won't alter the anchor text color
3285
            */
3286
11.2k
            if ( nodeIsFONT(element) &&
3287
2.16k
                 element->content && element->content == element->last )
3288
1.69k
            {
3289
1.69k
                Node *child = element->content;
3290
3291
1.69k
                if ( nodeIsA(child) )
3292
989
                {
3293
989
                    child->parent = element->parent;
3294
989
                    child->next = element->next;
3295
989
                    child->prev = element->prev;
3296
3297
989
                    element->next = NULL;
3298
989
                    element->prev = NULL;
3299
989
                    element->parent = child;
3300
3301
989
                    element->content = child->content;
3302
989
                    element->last = child->last;
3303
989
                    child->content = element;
3304
3305
989
                    TY_(FixNodeLinks)(child);
3306
989
                    TY_(FixNodeLinks)(element);
3307
989
                }
3308
1.69k
            }
3309
3310
11.2k
            element->closed = yes;
3311
11.2k
            TrimSpaces( doc, element );
3312
3313
11.2k
            DEBUG_LOG_EXIT;
3314
11.2k
            return NULL;
3315
11.2k
        }
3316
3317
        /* <u>...<u>  map 2nd <u> to </u> if 1st is explicit */
3318
        /* (see additional conditions below) */
3319
        /* otherwise emphasis nesting is probably unintentional */
3320
        /* big, small, sub, sup have cumulative effect to leave them alone */
3321
1.85M
        if ( node->type == StartTag
3322
1.70M
             && node->tag == element->tag
3323
1.10M
             && TY_(IsPushed)( doc, node )
3324
1.05M
             && !node->implicit
3325
135k
             && !element->implicit
3326
132k
             && node->tag && (node->tag->model & CM_INLINE)
3327
1.85M
             && !nodeIsA(node)
3328
1.85M
             && !nodeIsFONT(node)
3329
1.85M
             && !nodeIsBIG(node)
3330
1.85M
             && !nodeIsSMALL(node)
3331
1.85M
             && !nodeIsSUB(node)
3332
1.85M
             && !nodeIsSUP(node)
3333
1.85M
             && !nodeIsQ(node)
3334
1.85M
             && !nodeIsSPAN(node)
3335
11.5k
             && cfgBool(doc, TidyCoerceEndTags)
3336
1.85M
           )
3337
11.5k
        {
3338
            /* proceeds only if "node" does not have any attribute and
3339
               follows a text node not finishing with a space */
3340
11.5k
            if (element->content != NULL && node->attributes == NULL
3341
3.58k
                && TY_(nodeIsText)(element->last)
3342
2.42k
                && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) )
3343
1.53k
            {
3344
1.53k
                TY_(Report)(doc, element, node, COERCE_TO_ENDTAG);
3345
1.53k
                node->type = EndTag;
3346
1.53k
                TY_(UngetToken)(doc);
3347
1.53k
                continue;
3348
1.53k
            }
3349
3350
9.99k
            if (node->attributes == NULL || element->attributes == NULL)
3351
8.27k
                TY_(Report)(doc, element, node, NESTED_EMPHASIS);
3352
9.99k
        }
3353
1.84M
        else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
3354
1.19M
                  nodeIsQ(node) )
3355
17.2k
        {
3356
            /*\
3357
             * Issue #215 - such nested quotes are NOT a problem if HTML5, so
3358
             * only issue this warning if NOT HTML5 mode.
3359
            \*/
3360
17.2k
            if (TY_(HTMLVersion)(doc) != HT50)
3361
17.2k
            {
3362
17.2k
                TY_(Report)(doc, element, node, NESTED_QUOTATION);
3363
17.2k
            }
3364
17.2k
        }
3365
3366
1.85M
        if ( TY_(nodeIsText)(node) )
3367
77.0k
        {
3368
            /* only called for 1st child */
3369
77.0k
            if ( element->content == NULL && !(mode & Preformatted) )
3370
53.7k
                TrimSpaces( doc, element );
3371
3372
77.0k
            if ( node->start >= node->end )
3373
278
            {
3374
278
                TY_(FreeNode)( doc, node );
3375
278
                continue;
3376
278
            }
3377
3378
76.7k
            TY_(InsertNodeAtEnd)(element, node);
3379
76.7k
            continue;
3380
77.0k
        }
3381
3382
        /* mixed content model so allow text */
3383
1.77M
        if (InsertMisc(element, node))
3384
8.95k
            continue;
3385
3386
        /* deal with HTML tags */
3387
1.77M
        if ( nodeIsHTML(node) )
3388
4.22k
        {
3389
4.22k
            if ( TY_(nodeIsElement)(node) )
3390
3.87k
            {
3391
3.87k
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
3392
3.87k
                TY_(FreeNode)( doc, node );
3393
3.87k
                continue;
3394
3.87k
            }
3395
3396
            /* otherwise infer end of inline element */
3397
348
            TY_(UngetToken)( doc );
3398
3399
348
            if (!(mode & Preformatted))
3400
348
                TrimSpaces(doc, element);
3401
3402
348
            DEBUG_LOG_EXIT;
3403
348
            return NULL;
3404
4.22k
        }
3405
3406
        /* within <dt> or <pre> map <p> to <br> */
3407
1.76M
        if ( nodeIsP(node) &&
3408
205k
             node->type == StartTag &&
3409
194k
             ( (mode & Preformatted) ||
3410
194k
               nodeIsDT(element) ||
3411
193k
               DescendantOf(element, TidyTag_DT )
3412
194k
             )
3413
1.76M
           )
3414
1.53k
        {
3415
1.53k
            node->tag = TY_(LookupTagDef)( TidyTag_BR );
3416
1.53k
            TidyDocFree(doc, node->element);
3417
1.53k
            node->element = TY_(tmbstrdup)(doc->allocator, "br");
3418
1.53k
            TrimSpaces(doc, element);
3419
1.53k
            TY_(InsertNodeAtEnd)(element, node);
3420
1.53k
            continue;
3421
1.53k
        }
3422
3423
        /* <p> allowed within <address> in HTML 4.01 Transitional */
3424
1.76M
        if ( nodeIsP(node) &&
3425
204k
             node->type == StartTag &&
3426
193k
             nodeIsADDRESS(element) )
3427
0
        {
3428
0
            TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
3429
0
            TY_(InsertNodeAtEnd)(element, node);
3430
0
            (*node->tag->parser)( doc, node, mode );
3431
0
            continue;
3432
0
        }
3433
3434
        /* ignore unknown and PARAM tags */
3435
1.76M
        if ( node->tag == NULL || nodeIsPARAM(node) )
3436
45.4k
        {
3437
45.4k
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3438
45.4k
            TY_(FreeNode)( doc, node );
3439
45.4k
            continue;
3440
45.4k
        }
3441
3442
1.71M
        if ( nodeIsBR(node) && node->type == EndTag )
3443
2.20k
            node->type = StartTag;
3444
3445
1.71M
        if ( node->type == EndTag )
3446
51.6k
        {
3447
           /* coerce </br> to <br> */
3448
51.6k
           if ( nodeIsBR(node) )
3449
0
                node->type = StartTag;
3450
51.6k
           else if ( nodeIsP(node) )
3451
10.8k
           {
3452
               /* coerce unmatched </p> to <br><br> */
3453
10.8k
                if ( !DescendantOf(element, TidyTag_P) )
3454
1.66k
                {
3455
1.66k
                    TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
3456
1.66k
                    TrimSpaces( doc, element );
3457
1.66k
                    TY_(InsertNodeAtEnd)( element, node );
3458
1.66k
                    node = TY_(InferredTag)(doc, TidyTag_BR);
3459
1.66k
                    TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
3460
1.66k
                    continue;
3461
1.66k
                }
3462
10.8k
           }
3463
40.8k
           else if ( TY_(nodeHasCM)(node, CM_INLINE)
3464
40.8k
                     && !nodeIsA(node)
3465
8.54k
                     && !TY_(nodeHasCM)(node, CM_OBJECT)
3466
4.87k
                     && TY_(nodeHasCM)(element, CM_INLINE) )
3467
4.03k
            {
3468
                /* allow any inline end tag to end current element */
3469
3470
                /* http://tidy.sf.net/issue/1426419 */
3471
                /* but, like the browser, retain an earlier inline element.
3472
                   This is implemented by setting the lexer into a mode
3473
                   where it gets tokens from the inline stack rather than
3474
                   from the input stream. Check if the scenerio fits. */
3475
4.03k
                if ( !nodeIsA(element)
3476
3.16k
                     && (node->tag != element->tag)
3477
3.16k
                     && TY_(IsPushed)( doc, node )
3478
2.54k
                     && TY_(IsPushed)( doc, element ) )
3479
2.07k
                {
3480
                    /* we have something like
3481
                       <b>bold <i>bold and italic</b> italics</i> */
3482
2.07k
                    if ( TY_(SwitchInline)( doc, element, node ) )
3483
714
                    {
3484
714
                        TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG);
3485
714
                        TY_(UngetToken)( doc ); /* put this back */
3486
714
                        TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */
3487
714
                        if (!(mode & Preformatted))
3488
714
                            TrimSpaces( doc, element );
3489
3490
714
                        DEBUG_LOG_EXIT;
3491
714
                        return NULL; /* close <i>, but will re-open it, after </b> */
3492
714
                    }
3493
2.07k
                }
3494
3.32k
                TY_(PopInline)( doc, element );
3495
3496
3.32k
                if ( !nodeIsA(element) )
3497
2.45k
                {
3498
2.45k
                    if ( nodeIsA(node) && node->tag != element->tag )
3499
0
                    {
3500
0
                       TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
3501
0
                       TY_(UngetToken)( doc );
3502
0
                    }
3503
2.45k
                    else
3504
2.45k
                    {
3505
2.45k
                        TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG);
3506
2.45k
                        TY_(FreeNode)( doc, node);
3507
2.45k
                    }
3508
3509
2.45k
                    if (!(mode & Preformatted))
3510
2.45k
                        TrimSpaces(doc, element);
3511
3512
2.45k
                    DEBUG_LOG_EXIT;
3513
2.45k
                    return NULL;
3514
2.45k
                }
3515
3516
                /* if parent is <a> then discard unexpected inline end tag */
3517
870
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3518
870
                TY_(FreeNode)( doc, node);
3519
870
                continue;
3520
3.32k
            }  /* special case </tr> etc. for stuff moved in front of table */
3521
36.7k
            else if ( lexer->exiled
3522
6.63k
                     && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
3523
1.01k
            {
3524
1.01k
                TY_(UngetToken)( doc );
3525
1.01k
                TrimSpaces(doc, element);
3526
3527
1.01k
                DEBUG_LOG_EXIT;
3528
1.01k
                return NULL;
3529
1.01k
            }
3530
51.6k
        }
3531
3532
        /* allow any header tag to end current header */
3533
1.71M
        if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
3534
2.07k
        {
3535
3536
2.07k
            if ( node->tag == element->tag )
3537
783
            {
3538
783
                TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG );
3539
783
                TY_(FreeNode)( doc, node);
3540
783
            }
3541
1.29k
            else
3542
1.29k
            {
3543
1.29k
                TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
3544
1.29k
                TY_(UngetToken)( doc );
3545
1.29k
            }
3546
3547
2.07k
            if (!(mode & Preformatted))
3548
2.07k
                TrimSpaces(doc, element);
3549
3550
2.07k
            DEBUG_LOG_EXIT;
3551
2.07k
            return NULL;
3552
2.07k
        }
3553
3554
        /*
3555
           an <A> tag to ends any open <A> element
3556
           but <A href=...> is mapped to </A><A href=...>
3557
        */
3558
        /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
3559
        /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */
3560
1.71M
        if ( nodeIsA(node) && !node->implicit &&
3561
37.3k
             (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
3562
33.1k
        {
3563
            /* coerce <a> to </a> unless it has some attributes */
3564
            /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
3565
            /* other fixes by Dave Raggett */
3566
            /* if (node->attributes == NULL) */
3567
33.1k
            if (node->type != EndTag && node->attributes == NULL
3568
3.54k
                && cfgBool(doc, TidyCoerceEndTags) )
3569
3.54k
            {
3570
3.54k
                node->type = EndTag;
3571
3.54k
                TY_(Report)(doc, element, node, COERCE_TO_ENDTAG);
3572
                /* TY_(PopInline)( doc, node ); */
3573
3.54k
                TY_(UngetToken)( doc );
3574
3.54k
                continue;
3575
3.54k
            }
3576
3577
29.5k
            TY_(UngetToken)( doc );
3578
29.5k
            TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
3579
            /* TY_(PopInline)( doc, element ); */
3580
3581
29.5k
            if (!(mode & Preformatted))
3582
29.5k
                TrimSpaces(doc, element);
3583
3584
29.5k
            DEBUG_LOG_EXIT;
3585
29.5k
            return NULL;
3586
33.1k
        }
3587
3588
1.67M
        if (element->tag->model & CM_HEADING)
3589
5.64k
        {
3590
5.64k
            if ( nodeIsCENTER(node) || nodeIsDIV(node) )
3591
2.09k
            {
3592
2.09k
                if (!TY_(nodeIsElement)(node))
3593
757
                {
3594
757
                    TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3595
757
                    TY_(FreeNode)( doc, node);
3596
757
                    continue;
3597
757
                }
3598
3599
1.34k
                TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN);
3600
3601
                /* insert center as parent if heading is empty */
3602
1.34k
                if (element->content == NULL)
3603
780
                {
3604
780
                    InsertNodeAsParent(element, node);
3605
780
                    continue;
3606
780
                }
3607
3608
                /* split heading and make center parent of 2nd part */
3609
562
                TY_(InsertNodeAfterElement)(element, node);
3610
3611
562
                if (!(mode & Preformatted))
3612
562
                    TrimSpaces(doc, element);
3613
3614
562
                element = TY_(CloneNode)( doc, element );
3615
562
                TY_(InsertNodeAtEnd)(node, element);
3616
562
                continue;
3617
1.34k
            }
3618
3619
3.55k
            if ( nodeIsHR(node) )
3620
1.46k
            {
3621
1.46k
                if ( !TY_(nodeIsElement)(node) )
3622
166
                {
3623
166
                    TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3624
166
                    TY_(FreeNode)( doc, node);
3625
166
                    continue;
3626
166
                }
3627
3628
1.29k
                TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN);
3629
3630
                /* insert hr before heading if heading is empty */
3631
1.29k
                if (element->content == NULL)
3632
315
                {
3633
315
                    TY_(InsertNodeBeforeElement)(element, node);
3634
315
                    continue;
3635
315
                }
3636
3637
                /* split heading and insert hr before 2nd part */
3638
979
                TY_(InsertNodeAfterElement)(element, node);
3639
3640
979
                if (!(mode & Preformatted))
3641
979
                    TrimSpaces(doc, element);
3642
3643
979
                element = TY_(CloneNode)( doc, element );
3644
979
                TY_(InsertNodeAfterElement)(node, element);
3645
979
                continue;
3646
1.29k
            }
3647
3.55k
        }
3648
3649
1.67M
        if ( nodeIsDT(element) )
3650
7.70k
        {
3651
7.70k
            if ( nodeIsHR(node) )
3652
2.17k
            {
3653
2.17k
                Node *dd;
3654
2.17k
                if ( !TY_(nodeIsElement)(node) )
3655
207
                {
3656
207
                    TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3657
207
                    TY_(FreeNode)( doc, node);
3658
207
                    continue;
3659
207
                }
3660
3661
1.96k
                TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN);
3662
1.96k
                dd = TY_(InferredTag)(doc, TidyTag_DD);
3663
3664
                /* insert hr within dd before dt if dt is empty */
3665
1.96k
                if (element->content == NULL)
3666
1.03k
                {
3667
1.03k
                    TY_(InsertNodeBeforeElement)(element, dd);
3668
1.03k
                    TY_(InsertNodeAtEnd)(dd, node);
3669
1.03k
                    continue;
3670
1.03k
                }
3671
3672
                /* split dt and insert hr within dd before 2nd part */
3673
933
                TY_(InsertNodeAfterElement)(element, dd);
3674
933
                TY_(InsertNodeAtEnd)(dd, node);
3675
3676
933
                if (!(mode & Preformatted))
3677
933
                    TrimSpaces(doc, element);
3678
3679
933
                element = TY_(CloneNode)( doc, element );
3680
933
                TY_(InsertNodeAfterElement)(dd, element);
3681
933
                continue;
3682
1.96k
            }
3683
7.70k
        }
3684
3685
3686
        /*
3687
          if this is the end tag for an ancestor element
3688
          then infer end tag for this element
3689
        */
3690
1.67M
        if (node->type == EndTag)
3691
21.1k
        {
3692
21.1k
            for (parent = element->parent;
3693
345k
                    parent != NULL; parent = parent->parent)
3694
343k
            {
3695
343k
                if (node->tag == parent->tag)
3696
19.1k
                {
3697
19.1k
                    if (!(element->tag->model & CM_OPT) && !element->implicit)
3698
7.29k
                        TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
3699
3700
19.1k
                    if( TY_(IsPushedLast)( doc, element, node ) )
3701
0
                        TY_(PopInline)( doc, element );
3702
19.1k
                    TY_(UngetToken)( doc );
3703
3704
19.1k
                    if (!(mode & Preformatted))
3705
19.1k
                        TrimSpaces(doc, element);
3706
3707
19.1k
                    DEBUG_LOG_EXIT;
3708
19.1k
                    return NULL;
3709
19.1k
                }
3710
343k
            }
3711
21.1k
        }
3712
3713
        /*\
3714
         *  block level tags end this element
3715
         *  Issue #333 - There seems an exception if the element is a 'span',
3716
         *  and the node just collected is a 'meta'. The 'meta' can not have
3717
         *  CM_INLINE added, nor can the 'span' have CM_MIXED added without
3718
         *  big consequences.
3719
         *  There may be other exceptions to be added...
3720
        \*/
3721
1.65M
        if (!(node->tag->model & CM_INLINE) &&
3722
352k
            !(element->tag->model & CM_MIXED) &&
3723
350k
            !(nodeIsSPAN(element) && nodeIsMETA(node)) )
3724
350k
        {
3725
350k
            if ( !TY_(nodeIsElement)(node) )
3726
977
            {
3727
977
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3728
977
                TY_(FreeNode)( doc, node);
3729
977
                continue;
3730
977
            }
3731
            /* HTML5 */
3732
349k
            if (nodeIsDATALIST(element)) {
3733
0
                TY_(ConstrainVersion)( doc, ~VERS_HTML5 );
3734
0
            } else
3735
349k
            if (!(element->tag->model & CM_OPT))
3736
297k
                TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
3737
3738
349k
            if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
3739
1.96k
            {
3740
1.96k
                MoveToHead(doc, element, node);
3741
1.96k
                continue;
3742
1.96k
            }
3743
3744
            /*
3745
               prevent anchors from propagating into block tags
3746
               except for headings h1 to h6
3747
            */
3748
347k
            if ( nodeIsA(element) )
3749
7.83k
            {
3750
7.83k
                if (node->tag && !(node->tag->model & CM_HEADING))
3751
7.29k
                    TY_(PopInline)( doc, element );
3752
541
                else if (!(element->content))
3753
282
                {
3754
282
                    TY_(DiscardElement)( doc, element );
3755
282
                    TY_(UngetToken)( doc );
3756
3757
282
                    DEBUG_LOG_EXIT;
3758
282
                    return NULL;
3759
282
                }
3760
7.83k
            }
3761
3762
347k
            TY_(UngetToken)( doc );
3763
3764
347k
            if (!(mode & Preformatted))
3765
347k
                TrimSpaces(doc, element);
3766
3767
347k
            DEBUG_LOG_EXIT;
3768
347k
            return NULL;
3769
347k
        }
3770
3771
        /* parse inline element */
3772
1.30M
        if (TY_(nodeIsElement)(node))
3773
1.30M
        {
3774
1.30M
            if (node->implicit)
3775
1.06M
                TY_(Report)(doc, element, node, INSERTING_TAG);
3776
3777
            /* trim white space before <br> */
3778
1.30M
            if ( nodeIsBR(node) )
3779
5.97k
                TrimSpaces(doc, element);
3780
3781
1.30M
            TY_(InsertNodeAtEnd)(element, node);
3782
            
3783
1.30M
            {
3784
1.30M
                TidyParserMemory memory = {0};
3785
1.30M
                memory.identity = TY_(ParseInline);
3786
1.30M
                memory.original_node = element;
3787
1.30M
                memory.reentry_node = node;
3788
1.30M
                memory.mode = mode;
3789
1.30M
                memory.reentry_mode = mode;
3790
1.30M
                TY_(pushMemory)( doc, memory );
3791
1.30M
                DEBUG_LOG_EXIT_WITH_NODE(node);
3792
1.30M
                return node;
3793
1.30M
            }
3794
1.30M
        }
3795
3796
        /* discard unexpected tags */
3797
1.03k
        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3798
1.03k
        TY_(FreeNode)( doc, node );
3799
1.03k
        continue;
3800
1.30M
    }
3801
3802
850k
    if (!(element->tag->model & CM_OPT))
3803
845k
        TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR);
3804
3805
850k
    DEBUG_LOG_EXIT;
3806
850k
    return NULL;
3807
2.56M
}
3808
3809
3810
/** MARK: TY_(ParseList)
3811
 *  Parses list tags.
3812
 *
3813
 *  This is a non-recursing parser. It uses the document's parser memory stack
3814
 *  to send subsequent nodes back to the controller for dispatching to parsers.
3815
 *  This parser is also re-enterable, so that post-processing can occur after
3816
 *  such dispatching.
3817
*/
3818
Node* TY_(ParseList)( TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode) )
3819
75.2k
{
3820
75.2k
    Lexer* lexer = doc->lexer;
3821
75.2k
    Node *node = NULL;
3822
75.2k
    Node *parent = NULL;
3823
75.2k
    Node *lastli = NULL;;
3824
75.2k
    Bool wasblock = no;
3825
75.2k
    Bool nodeisOL = nodeIsOL(list);
3826
75.2k
    DEBUG_LOG_COUNTERS;
3827
3828
75.2k
    if ( list == NULL )
3829
40.2k
    {
3830
40.2k
        TidyParserMemory memory = TY_(popMemory)( doc );
3831
40.2k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
3832
40.2k
        DEBUG_LOG_REENTER_WITH_NODE(node);
3833
40.2k
        list = memory.original_node;
3834
40.2k
        DEBUG_LOG_GET_OLD_MODE;
3835
40.2k
        mode = memory.mode;
3836
40.2k
        DEBUG_LOG_CHANGE_MODE;
3837
40.2k
    }
3838
35.0k
    else
3839
35.0k
    {
3840
35.0k
        DEBUG_LOG_ENTER_WITH_NODE(list);
3841
3842
35.0k
        if (list->tag->model & CM_EMPTY)
3843
0
        {
3844
0
            DEBUG_LOG_EXIT;
3845
0
            return NULL;
3846
0
        }
3847
35.0k
    }
3848
    
3849
75.2k
    lexer->insert = NULL;  /* defer implicit inline start tags */
3850
3851
82.0k
    while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
3852
56.3k
    {
3853
56.3k
        Bool foundLI = no;
3854
56.3k
        if (node->tag == list->tag && node->type == EndTag)
3855
709
        {
3856
709
            TY_(FreeNode)( doc, node);
3857
709
            list->closed = yes;
3858
709
            DEBUG_LOG_EXIT;
3859
709
            return NULL;
3860
709
        }
3861
3862
        /* deal with comments etc. */
3863
55.6k
        if (InsertMisc(list, node))
3864
1.27k
            continue;
3865
3866
54.3k
        if (node->type != TextNode && node->tag == NULL)
3867
3.01k
        {
3868
3.01k
            TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
3869
3.01k
            TY_(FreeNode)( doc, node);
3870
3.01k
            continue;
3871
3.01k
        }
3872
51.3k
        if (lexer && (node->type == TextNode))
3873
12.2k
        {
3874
12.2k
            uint ch, ix = node->start;
3875
            /* Issue #572 - Skip whitespace. */
3876
13.2k
            while (ix < node->end && (ch = (lexer->lexbuf[ix] & 0xff))
3877
12.6k
                && (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'))
3878
1.08k
                ++ix;
3879
12.2k
            if (ix >= node->end)
3880
302
            {
3881
                /* Issue #572 - Discard if ALL whitespace. */
3882
302
                TY_(FreeNode)(doc, node);
3883
302
                continue;
3884
302
            }
3885
12.2k
        }
3886
3887
3888
        /*
3889
          if this is the end tag for an ancestor element
3890
          then infer end tag for this element
3891
        */
3892
51.0k
        if (node->type == EndTag)
3893
2.49k
        {
3894
2.49k
            if ( nodeIsFORM(node) )
3895
221
            {
3896
221
                BadForm( doc );
3897
221
                TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
3898
221
                TY_(FreeNode)( doc, node );
3899
221
                continue;
3900
221
            }
3901
3902
2.27k
            if (TY_(nodeHasCM)(node,CM_INLINE))
3903
705
            {
3904
705
                TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
3905
705
                TY_(PopInline)( doc, node );
3906
705
                TY_(FreeNode)( doc, node);
3907
705
                continue;
3908
705
            }
3909
3910
1.56k
            for ( parent = list->parent;
3911
47.3k
                  parent != NULL; parent = parent->parent )
3912
46.8k
            {
3913
               /* Do not match across BODY to avoid infinite loop
3914
                  between ParseBody and this parser,
3915
                  See http://tidy.sf.net/bug/1053626. */
3916
46.8k
                if (nodeIsBODY(parent))
3917
700
                    break;
3918
46.1k
                if (node->tag == parent->tag)
3919
311
                {
3920
311
                    TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE);
3921
311
                    TY_(UngetToken)( doc );
3922
311
                    DEBUG_LOG_EXIT;
3923
311
                    return NULL;
3924
311
                }
3925
46.1k
            }
3926
3927
1.25k
            TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
3928
1.25k
            TY_(FreeNode)( doc, node);
3929
1.25k
            continue;
3930
1.56k
        }
3931
3932
48.5k
        if ( !nodeIsLI(node) && nodeisOL )
3933
11.8k
        {
3934
            /* Issue #572 - A <ol><li> can have nested <ol> elements */
3935
11.8k
            foundLI = FindLastLI(list, &lastli); /* find last <li> */
3936
11.8k
        }
3937
3938
48.5k
        if ( nodeIsLI(node) || (TY_(IsHTML5Mode)(doc) && !foundLI) )
3939
26.4k
        {
3940
            /* node is <LI> OR
3941
               Issue #396 - A <ul> can have Zero or more <li> elements
3942
             */
3943
26.4k
            TY_(InsertNodeAtEnd)(list,node);
3944
26.4k
        }
3945
22.1k
        else
3946
22.1k
        {
3947
22.1k
            TY_(UngetToken)( doc );
3948
3949
22.1k
            if (TY_(nodeHasCM)(node,CM_BLOCK) && lexer->excludeBlocks)
3950
512
            {
3951
512
                TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE);
3952
512
                DEBUG_LOG_EXIT;
3953
512
                return NULL;
3954
512
            }
3955
            /* http://tidy.sf.net/issue/1316307 */
3956
            /* In exiled mode, return so table processing can continue. */
3957
21.6k
            else if ( lexer->exiled
3958
8.39k
                      && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW)
3959
8.39k
                          || nodeIsTABLE(node)) )
3960
4.18k
            {
3961
4.18k
                DEBUG_LOG_EXIT;
3962
4.18k
                return NULL;
3963
4.18k
            }
3964
            /* http://tidy.sf.net/issue/836462
3965
               If "list" is an unordered list, insert the next tag within
3966
               the last <li> to preserve the numbering to match the visual
3967
               rendering of most browsers. */
3968
17.4k
            if ( nodeIsOL(list) && FindLastLI(list, &lastli) )
3969
381
            {
3970
                /* Create a node for error reporting */
3971
381
                node = TY_(InferredTag)(doc, TidyTag_LI);
3972
381
                TY_(Report)(doc, list, node, MISSING_STARTTAG );
3973
381
                TY_(FreeNode)( doc, node);
3974
381
                node = lastli;
3975
381
            }
3976
17.0k
            else
3977
17.0k
            {
3978
                /* Add an inferred <li> */
3979
17.0k
                wasblock = TY_(nodeHasCM)(node,CM_BLOCK);
3980
17.0k
                node = TY_(InferredTag)(doc, TidyTag_LI);
3981
                /* Add "display: inline" to avoid a blank line after <li> with
3982
                   Internet Explorer. See http://tidy.sf.net/issue/836462 */
3983
17.0k
                TY_(AddStyleProperty)( doc, node,
3984
17.0k
                                       wasblock
3985
17.0k
                                       ? "list-style: none; display: inline"
3986
17.0k
                                       : "list-style: none"
3987
17.0k
                                       );
3988
17.0k
                TY_(Report)(doc, list, node, MISSING_STARTTAG );
3989
17.0k
                TY_(InsertNodeAtEnd)(list,node);
3990
17.0k
            }
3991
17.4k
        }
3992
3993
43.8k
        {
3994
43.8k
            TidyParserMemory memory = {0};
3995
43.8k
            memory.identity = TY_(ParseList);
3996
43.8k
            memory.original_node = list;
3997
43.8k
            memory.reentry_node = node;
3998
43.8k
            memory.mode = IgnoreWhitespace;
3999
43.8k
            TY_(pushMemory)( doc, memory );
4000
43.8k
            DEBUG_LOG_EXIT_WITH_NODE(node);
4001
43.8k
            return node;
4002
48.5k
        }
4003
48.5k
    }
4004
4005
25.6k
    TY_(Report)(doc, list, node, MISSING_ENDTAG_FOR);
4006
25.6k
    DEBUG_LOG_EXIT;
4007
25.6k
    return NULL;
4008
75.2k
}
4009
4010
4011
/** MARK: TY_(ParseNamespace)
4012
 *  Act as a generic XML (sub)tree parser: collect each node and add it
4013
 *  to the DOM, without any further validation. It's useful for tags that
4014
 *  have XML-like content, such as `svg` and `math`.
4015
 *
4016
 *  @note Perhaps this is poorly named, as we're not parsing the namespace
4017
 *    of a particular tag, but a tag with XML-like content.
4018
 *
4019
 *  @todo Add schema- or other-hierarchy-definition-based validation
4020
 *    of the subtree here.
4021
 *
4022
 *  This is a non-recursing parser. It uses the document's parser memory stack
4023
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4024
 *  This parser is also re-enterable, so that post-processing can occur after
4025
 *  such dispatching.
4026
*/
4027
Node* TY_(ParseNamespace)( TidyDocImpl* doc, Node *basenode, GetTokenMode mode )
4028
17.0k
{
4029
17.0k
    Lexer* lexer = doc->lexer;
4030
17.0k
    Node *node;
4031
17.0k
    Node *parent = basenode;
4032
17.0k
    uint istackbase;
4033
17.0k
    AttVal* av; /* #130 MathML attr and entity fix! */
4034
4035
    /* a la <table>: defer popping elements off the inline stack */
4036
17.0k
    TY_(DeferDup)( doc );
4037
17.0k
    istackbase = lexer->istackbase;
4038
17.0k
    lexer->istackbase = lexer->istacksize;
4039
4040
17.0k
    mode = OtherNamespace; /* Preformatted; IgnoreWhitespace; */
4041
4042
76.4k
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
4043
59.7k
    {
4044
        /*
4045
        fix check to skip action in InsertMisc for regular/empty
4046
        nodes, which we don't want here...
4047
4048
        The way we do it here is by checking and processing everything
4049
        and only what remains goes into InsertMisc()
4050
        */
4051
4052
        /* is this a close tag? And does it match the current parent node? */
4053
59.7k
        if (node->type == EndTag)
4054
5.72k
        {
4055
            /*
4056
            to prevent end tags flowing from one 'alternate namespace' we
4057
            check this in two phases: first we check if the tag is a
4058
            descendant of the current node, and when it is, we check whether
4059
            it is the end tag for a node /within/ or /outside/ the basenode.
4060
            */
4061
5.72k
            Bool outside;
4062
5.72k
            Node *mp = FindMatchingDescendant(parent, node, basenode, &outside);
4063
4064
5.72k
            if (mp != NULL)
4065
4.33k
            {
4066
                /*
4067
                when mp != parent as we might expect,
4068
                infer end tags until we 'hit' the matched
4069
                parent or the basenode
4070
                */
4071
4.33k
                Node *n;
4072
4073
4.33k
                for (n = parent;
4074
5.31k
                     n != NULL && n != basenode->parent && n != mp;
4075
4.33k
                     n = n->parent)
4076
976
                {
4077
                    /* n->implicit = yes; */
4078
976
                    n->closed = yes;
4079
976
                    TY_(Report)(doc, n->parent, n, MISSING_ENDTAG_BEFORE);
4080
976
                }
4081
4082
                /* Issue #369 - Since 'assert' is DEBUG only, and there are
4083
                   simple cases where these can be fired, removing them
4084
                   pending feedback from the original author!
4085
                   assert(outside == no ? n == mp : 1);
4086
                   assert(outside == yes ? n == basenode->parent : 1);
4087
                   =================================================== */
4088
4089
4.33k
                if (outside == no)
4090
4.33k
                {
4091
                    /* EndTag for a node within the basenode subtree. Roll on... */
4092
4.33k
                    if (n)
4093
4.33k
                        n->closed = yes;
4094
4.33k
                    TY_(FreeNode)(doc, node);
4095
4096
4.33k
                    node = n;
4097
4.33k
                    parent = node ? node->parent : NULL;
4098
4.33k
                }
4099
0
                else
4100
0
                {
4101
                    /* EndTag for a node outside the basenode subtree: let the caller handle that. */
4102
0
                    TY_(UngetToken)( doc );
4103
0
                    node = basenode;
4104
0
                    parent = node->parent;
4105
0
                }
4106
4107
                /* when we've arrived at the end-node for the base node, it's quitting time */
4108
4.33k
                if (node == basenode)
4109
272
                {
4110
272
                    lexer->istackbase = istackbase;
4111
272
                    assert(basenode && basenode->closed == yes);
4112
272
                    return NULL;
4113
272
                }
4114
4.33k
            }
4115
1.39k
            else
4116
1.39k
            {
4117
                /* unmatched close tag: report an error and discard */
4118
                /* TY_(Report)(doc, parent, node, NON_MATCHING_ENDTAG); Issue #308 - Seems wrong warning! */
4119
1.39k
                TY_(Report)(doc, parent, node, DISCARDING_UNEXPECTED);
4120
1.39k
                assert(parent);
4121
                /* assert(parent->tag != node->tag); Issue #308 - Seems would always be true! */
4122
1.39k
                TY_(FreeNode)( doc, node); /* Issue #308 - Discard unexpected end tag memory */
4123
1.39k
            }
4124
5.72k
        }
4125
54.0k
        else if (node->type == StartTag)
4126
27.1k
        {
4127
            /* #130 MathML attr and entity fix!
4128
               care if it has attributes, and 'accidently' any of those attributes match known */
4129
35.3k
            for ( av = node->attributes; av; av = av->next )
4130
8.25k
            {
4131
8.25k
                av->dict = 0; /* does something need to be freed? */
4132
8.25k
            }
4133
            /* add another child to the current parent */
4134
27.1k
            TY_(InsertNodeAtEnd)(parent, node);
4135
27.1k
            parent = node;
4136
27.1k
        }
4137
26.9k
        else
4138
26.9k
        {
4139
            /* #130 MathML attr and entity fix!
4140
               care if it has attributes, and 'accidently' any of those attributes match known */
4141
36.5k
            for ( av = node->attributes; av; av = av->next )
4142
9.60k
            {
4143
9.60k
                av->dict = 0; /* does something need to be freed? */
4144
9.60k
            }
4145
26.9k
            TY_(InsertNodeAtEnd)(parent, node);
4146
26.9k
        }
4147
59.7k
    }
4148
4149
16.7k
    TY_(Report)(doc, basenode->parent, basenode, MISSING_ENDTAG_FOR);
4150
16.7k
    return NULL;
4151
17.0k
}
4152
4153
4154
/** MARK: TY_(ParseNoFrames)
4155
 *  Parses the `noframes` tag.
4156
 *
4157
 *  This is a non-recursing parser. It uses the document's parser memory stack
4158
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4159
 *  This parser is also re-enterable, so that post-processing can occur after
4160
 *  such dispatching.
4161
 */
4162
Node* TY_(ParseNoFrames)( TidyDocImpl* doc, Node *noframes, GetTokenMode mode )
4163
66.1k
{
4164
66.1k
    Lexer* lexer = doc->lexer;
4165
66.1k
    Node *node = NULL;
4166
66.1k
    Bool body_seen = no;
4167
66.1k
    DEBUG_LOG_COUNTERS;
4168
4169
66.1k
    enum parserState {
4170
66.1k
        STATE_INITIAL,                /* This is the initial state for every parser. */
4171
66.1k
        STATE_POST_NODEISBODY,        /* To-do after re-entering after checks. */
4172
66.1k
        STATE_COMPLETE,               /* Done with the switch. */
4173
66.1k
    } state = STATE_INITIAL;
4174
4175
    /*
4176
     If we're re-entering, then we need to setup from a previous state,
4177
     instead of starting fresh. We can pull what we need from the document's
4178
     stack.
4179
     */
4180
66.1k
    if ( noframes == NULL )
4181
45.1k
    {
4182
45.1k
        TidyParserMemory memory = TY_(popMemory)( doc );
4183
45.1k
        node = memory.reentry_node; /* Throwaway, because we replace it entering the loop anyway.*/
4184
45.1k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4185
45.1k
        noframes = memory.original_node;
4186
45.1k
        state = memory.reentry_state;
4187
45.1k
        body_seen = memory.register_1;
4188
45.1k
        DEBUG_LOG_GET_OLD_MODE;
4189
45.1k
        mode = memory.mode;
4190
45.1k
        DEBUG_LOG_CHANGE_MODE;
4191
45.1k
    }
4192
20.9k
    else
4193
20.9k
    {
4194
20.9k
        DEBUG_LOG_ENTER_WITH_NODE(noframes);
4195
20.9k
        if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
4196
20.9k
        {
4197
20.9k
            doc->badAccess |=  BA_USING_NOFRAMES;
4198
20.9k
        }
4199
20.9k
    }
4200
4201
66.1k
    mode = IgnoreWhitespace;
4202
4203
91.1k
    while ( state != STATE_COMPLETE )
4204
75.9k
    {
4205
75.9k
        if ( state == STATE_INITIAL )
4206
73.5k
        {
4207
73.5k
            node = TY_(GetToken)(doc, mode);
4208
73.5k
            DEBUG_LOG_GOT_TOKEN(node);
4209
73.5k
        }
4210
        
4211
75.9k
        switch ( state )
4212
75.9k
        {
4213
73.5k
            case STATE_INITIAL:
4214
73.5k
            {
4215
73.5k
                if ( node == NULL )
4216
15.2k
                {
4217
15.2k
                    state = STATE_COMPLETE;
4218
15.2k
                    continue;
4219
15.2k
                }
4220
                
4221
58.3k
                if ( node->tag == noframes->tag && node->type == EndTag )
4222
1.17k
                {
4223
1.17k
                    TY_(FreeNode)( doc, node);
4224
1.17k
                    noframes->closed = yes;
4225
1.17k
                    TrimSpaces(doc, noframes);
4226
1.17k
                    DEBUG_LOG_EXIT;
4227
1.17k
                    return NULL;
4228
1.17k
                }
4229
4230
57.1k
                if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) )
4231
3.89k
                {
4232
3.89k
                    TrimSpaces(doc, noframes);
4233
3.89k
                    if (node->type == EndTag)
4234
64
                    {
4235
64
                        TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED);
4236
64
                        TY_(FreeNode)( doc, node);       /* Throw it away */
4237
64
                    }
4238
3.82k
                    else
4239
3.82k
                    {
4240
3.82k
                        TY_(Report)(doc, noframes, node, MISSING_ENDTAG_BEFORE);
4241
3.82k
                        TY_(UngetToken)( doc );
4242
3.82k
                    }
4243
3.89k
                    DEBUG_LOG_EXIT;
4244
3.89k
                    return NULL;
4245
3.89k
                }
4246
4247
53.3k
                if ( nodeIsHTML(node) )
4248
219
                {
4249
219
                    if (TY_(nodeIsElement)(node))
4250
85
                        TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED);
4251
4252
219
                    TY_(FreeNode)( doc, node);
4253
219
                    continue;
4254
219
                }
4255
4256
                /* deal with comments etc. */
4257
53.0k
                if (InsertMisc(noframes, node))
4258
337
                    continue;
4259
4260
52.7k
                if ( nodeIsBODY(node) && node->type == StartTag )
4261
2.45k
                {
4262
2.45k
                    TidyParserMemory memory = {0};
4263
2.45k
                    memory.identity = TY_(ParseNoFrames);
4264
2.45k
                    memory.original_node = noframes;
4265
2.45k
                    memory.reentry_node = node;
4266
2.45k
                    memory.reentry_state = STATE_POST_NODEISBODY;
4267
2.45k
                    memory.register_1 = lexer->seenEndBody;
4268
2.45k
                    memory.mode = IgnoreWhitespace;
4269
4270
2.45k
                    TY_(InsertNodeAtEnd)(noframes, node);
4271
2.45k
                    TY_(pushMemory)( doc, memory );
4272
2.45k
                    DEBUG_LOG_EXIT_WITH_NODE(node);
4273
2.45k
                    return node;
4274
2.45k
                }
4275
4276
                /* implicit body element inferred */
4277
50.2k
                if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag))
4278
43.7k
                {
4279
43.7k
                    Node *body = TY_(FindBody)( doc );
4280
43.7k
                    if ( body || lexer->seenEndBody )
4281
8.81k
                    {
4282
8.81k
                        if ( body == NULL )
4283
323
                        {
4284
323
                            TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED);
4285
323
                            TY_(FreeNode)( doc, node);
4286
323
                            continue;
4287
323
                        }
4288
8.49k
                        if ( TY_(nodeIsText)(node) )
4289
2.35k
                        {
4290
2.35k
                            TY_(UngetToken)( doc );
4291
2.35k
                            node = TY_(InferredTag)(doc, TidyTag_P);
4292
2.35k
                            TY_(Report)(doc, noframes, node, CONTENT_AFTER_BODY );
4293
2.35k
                        }
4294
8.49k
                        TY_(InsertNodeAtEnd)( body, node );
4295
8.49k
                    }
4296
34.8k
                    else
4297
34.8k
                    {
4298
34.8k
                        TY_(UngetToken)( doc );
4299
34.8k
                        node = TY_(InferredTag)(doc, TidyTag_BODY);
4300
34.8k
                        if ( cfgBool(doc, TidyXmlOut) )
4301
34.8k
                            TY_(Report)(doc, noframes, node, INSERTING_TAG);
4302
34.8k
                        TY_(InsertNodeAtEnd)( noframes, node );
4303
34.8k
                    }
4304
4305
43.3k
                    {
4306
43.3k
                        TidyParserMemory memory = {0};
4307
43.3k
                        memory.identity = TY_(ParseNoFrames);
4308
43.3k
                        memory.original_node = noframes;
4309
43.3k
                        memory.reentry_node = node;
4310
43.3k
                        memory.mode = IgnoreWhitespace; /*MixedContent*/
4311
43.3k
                        memory.reentry_state = STATE_INITIAL;
4312
43.3k
                        TY_(pushMemory)( doc, memory );
4313
43.3k
                        DEBUG_LOG_EXIT_WITH_NODE(node);
4314
43.3k
                        return node;
4315
43.7k
                    }
4316
43.7k
                }
4317
4318
                /* discard unexpected end tags */
4319
6.58k
                TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED);
4320
6.58k
                TY_(FreeNode)( doc, node);
4321
6.58k
            } break;
4322
                
4323
                
4324
2.33k
            case STATE_POST_NODEISBODY:
4325
2.33k
            {
4326
                /* fix for bug http://tidy.sf.net/bug/887259 */
4327
2.33k
                if (body_seen && TY_(FindBody)(doc) != node)
4328
1.09k
                {
4329
1.09k
                    TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no);
4330
1.09k
                    MoveNodeToBody(doc, node);
4331
1.09k
                }
4332
2.33k
                state = STATE_INITIAL;
4333
2.33k
                continue;
4334
4335
50.2k
            } break;
4336
                
4337
                
4338
0
            default:
4339
0
                break;
4340
75.9k
        } /* switch */
4341
75.9k
    } /* while */
4342
4343
15.2k
    TY_(Report)(doc, noframes, node, MISSING_ENDTAG_FOR);
4344
15.2k
    DEBUG_LOG_EXIT;
4345
15.2k
    return NULL;
4346
66.1k
}
4347
4348
4349
/** MARK: TY_(ParseOptGroup)
4350
 *  Parses the `optgroup` tag.
4351
 *
4352
 *  This is a non-recursing parser. It uses the document's parser memory stack
4353
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4354
 *  This parser is also re-enterable, so that post-processing can occur after
4355
 *  such dispatching.
4356
 */
4357
Node* TY_(ParseOptGroup)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) )
4358
40.7k
{
4359
40.7k
    Lexer* lexer = doc->lexer;
4360
40.7k
    Node *node;
4361
40.7k
    DEBUG_LOG_COUNTERS;
4362
    
4363
40.7k
    if ( field == NULL )
4364
20.4k
    {
4365
20.4k
        TidyParserMemory memory = TY_(popMemory)( doc );
4366
20.4k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
4367
20.4k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4368
20.4k
        field = memory.original_node;
4369
20.4k
        DEBUG_LOG_GET_OLD_MODE;
4370
20.4k
        mode = memory.mode;
4371
20.4k
        DEBUG_LOG_CHANGE_MODE;
4372
20.4k
    }
4373
20.3k
    else
4374
20.3k
    {
4375
20.3k
        DEBUG_LOG_ENTER_WITH_NODE(field);
4376
20.3k
    }
4377
    
4378
40.7k
    lexer->insert = NULL;  /* defer implicit inline start tags */
4379
4380
59.2k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4381
39.8k
    {
4382
39.8k
        if (node->tag == field->tag && node->type == EndTag)
4383
997
        {
4384
997
            TY_(FreeNode)( doc, node);
4385
997
            field->closed = yes;
4386
997
            TrimSpaces(doc, field);
4387
997
            DEBUG_LOG_EXIT;
4388
997
            return NULL;
4389
997
        }
4390
4391
        /* deal with comments etc. */
4392
38.8k
        if (InsertMisc(field, node))
4393
11.1k
            continue;
4394
4395
27.7k
        if ( node->type == StartTag &&
4396
23.7k
             (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) )
4397
20.4k
        {
4398
20.4k
            TidyParserMemory memory = {0};
4399
4400
20.4k
            if ( nodeIsOPTGROUP(node) )
4401
19.7k
                TY_(Report)(doc, field, node, CANT_BE_NESTED);
4402
4403
20.4k
            TY_(InsertNodeAtEnd)(field, node);
4404
4405
20.4k
            memory.identity = TY_(ParseOptGroup);
4406
20.4k
            memory.original_node = field;
4407
20.4k
            memory.reentry_node = node;
4408
20.4k
            TY_(pushMemory)( doc, memory );
4409
20.4k
            DEBUG_LOG_EXIT_WITH_NODE(node);
4410
20.4k
            return node;
4411
20.4k
        }
4412
4413
        /* discard unexpected tags */
4414
7.28k
        TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED );
4415
7.28k
        TY_(FreeNode)( doc, node);
4416
7.28k
    }
4417
19.3k
    DEBUG_LOG_EXIT;
4418
19.3k
    return NULL;
4419
40.7k
}
4420
4421
4422
/** MARK: TY_(ParsePre)
4423
 *  Parses the `pre` tag.
4424
 *
4425
 *  This is a non-recursing parser. It uses the document's parser memory stack
4426
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4427
 *  This parser is also re-enterable, so that post-processing can occur after
4428
 *  such dispatching.
4429
 */
4430
Node* TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
4431
133k
{
4432
133k
    Node *node = NULL;
4433
133k
    DEBUG_LOG_COUNTERS;
4434
4435
133k
    enum parserState {
4436
133k
        STATE_INITIAL,                /* This is the initial state for every parser. */
4437
133k
        STATE_RENTRY_ACTION,          /* To-do after re-entering after checks. */
4438
133k
        STATE_COMPLETE,               /* Done with the switch. */
4439
133k
    } state = STATE_INITIAL;
4440
4441
4442
133k
    if ( pre == NULL )
4443
66.9k
    {
4444
66.9k
        TidyParserMemory memory = TY_(popMemory)( doc );
4445
66.9k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
4446
66.9k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4447
66.9k
        pre = memory.original_node;
4448
66.9k
        state = memory.reentry_state;
4449
66.9k
        DEBUG_LOG_GET_OLD_MODE;
4450
66.9k
        mode = memory.mode;
4451
66.9k
        DEBUG_LOG_CHANGE_MODE;
4452
66.9k
    }
4453
66.0k
    else
4454
66.0k
    {
4455
66.0k
        DEBUG_LOG_ENTER_WITH_NODE(pre);
4456
66.0k
        if (pre->tag->model & CM_EMPTY)
4457
0
        {
4458
0
            DEBUG_LOG_EXIT;
4459
0
            return NULL;
4460
0
        }
4461
66.0k
    }
4462
4463
133k
    TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */
4464
4465
214k
    while ( state != STATE_COMPLETE )
4466
210k
    {
4467
210k
        if ( state == STATE_INITIAL )
4468
148k
            node = TY_(GetToken)(doc, Preformatted);
4469
        
4470
210k
        switch ( state )
4471
210k
        {
4472
148k
            case STATE_INITIAL:
4473
148k
            {
4474
148k
                if ( node == NULL )
4475
4.15k
                {
4476
4.15k
                    state = STATE_COMPLETE;
4477
4.15k
                    continue;
4478
4.15k
                }
4479
                
4480
143k
                if ( node->type == EndTag &&
4481
12.1k
                     (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
4482
1.69k
                {
4483
1.69k
                    if (nodeIsBODY(node) || nodeIsHTML(node))
4484
868
                    {
4485
868
                        TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4486
868
                        TY_(FreeNode)(doc, node);
4487
868
                        continue;
4488
868
                    }
4489
831
                    if (node->tag == pre->tag)
4490
296
                    {
4491
296
                        TY_(FreeNode)(doc, node);
4492
296
                    }
4493
535
                    else
4494
535
                    {
4495
535
                        TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE );
4496
535
                        TY_(UngetToken)( doc );
4497
535
                    }
4498
831
                    pre->closed = yes;
4499
831
                    TrimSpaces(doc, pre);
4500
831
                    DEBUG_LOG_EXIT;
4501
831
                    return NULL;
4502
1.69k
                }
4503
4504
142k
                if (TY_(nodeIsText)(node))
4505
3.20k
                {
4506
3.20k
                    TY_(InsertNodeAtEnd)(pre, node);
4507
3.20k
                    continue;
4508
3.20k
                }
4509
4510
                /* deal with comments etc. */
4511
139k
                if (InsertMisc(pre, node))
4512
135
                    continue;
4513
4514
138k
                if (node->tag == NULL)
4515
4.10k
                {
4516
4.10k
                    TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4517
4.10k
                    TY_(FreeNode)(doc, node);
4518
4.10k
                    continue;
4519
4.10k
                }
4520
4521
                /* strip unexpected tags */
4522
134k
                if ( !PreContent(doc, node) )
4523
123k
                {
4524
                    /* fix for http://tidy.sf.net/bug/772205 */
4525
123k
                    if (node->type == EndTag)
4526
9.25k
                    {
4527
                        /* http://tidy.sf.net/issue/1590220 */
4528
9.25k
                       if ( doc->lexer->exiled
4529
9.16k
                           && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
4530
8.90k
                       {
4531
8.90k
                          TY_(UngetToken)(doc);
4532
8.90k
                          TrimSpaces(doc, pre);
4533
8.90k
                           DEBUG_LOG_EXIT;
4534
8.90k
                          return NULL;
4535
8.90k
                       }
4536
4537
355
                       TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4538
355
                       TY_(FreeNode)(doc, node);
4539
355
                       continue;
4540
9.25k
                    }
4541
                    /* http://tidy.sf.net/issue/1590220 */
4542
114k
                    else if (TY_(nodeHasCM)(node, CM_TABLE|CM_ROW)
4543
110k
                             || nodeIsTABLE(node) )
4544
48.6k
                    {
4545
48.6k
                        if (!doc->lexer->exiled)
4546
                            /* No missing close warning if exiled. */
4547
1.36k
                            TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE);
4548
4549
48.6k
                        TY_(UngetToken)(doc);
4550
48.6k
                        DEBUG_LOG_EXIT;
4551
48.6k
                        return NULL;
4552
48.6k
                    }
4553
4554
                    /*
4555
                      This is basically what Tidy 04 August 2000 did and far more accurate
4556
                      with respect to browser behaivour than the code commented out above.
4557
                      Tidy could try to propagate the <pre> into each disallowed child where
4558
                      <pre> is allowed in order to replicate some browsers behaivour, but
4559
                      there are a lot of exceptions, e.g. Internet Explorer does not propagate
4560
                      <pre> into table cells while Mozilla does. Opera 6 never propagates
4561
                      <pre> into blocklevel elements while Opera 7 behaves much like Mozilla.
4562
4563
                      Tidy behaves thus mostly like Opera 6 except for nested <pre> elements
4564
                      which are handled like Mozilla takes them (Opera6 closes all <pre> after
4565
                      the first </pre>).
4566
4567
                      There are similar issues like replacing <p> in <pre> with <br>, for
4568
                      example
4569
4570
                        <pre>...<p>...</pre>                 (Input)
4571
                        <pre>...<br>...</pre>                (Tidy)
4572
                        <pre>...<br>...</pre>                (Opera 7 and Internet Explorer)
4573
                        <pre>...<br><br>...</pre>            (Opera 6 and Mozilla)
4574
4575
                        <pre>...<p>...</p>...</pre>          (Input)
4576
                        <pre>...<br>......</pre>             (Tidy, BUG!)
4577
                        <pre>...<br>...<br>...</pre>         (Internet Explorer)
4578
                        <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
4579
                        <pre>...<br>...<br><br>...</pre>     (Opera 7)
4580
4581
                      or something similar, they could also be closing the <pre> and propagate
4582
                      the <pre> into the newly opened <p>.
4583
4584
                      Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
4585
                      disallowed in <pre>, Tidy neither detects this nor does it perform any
4586
                      cleanup operation. Tidy should at least issue a warning if it encounters
4587
                      such constructs.
4588
4589
                      Todo: discarding </p> is abviously a bug, it should be replaced by <br>.
4590
                    */
4591
65.3k
                    TY_(InsertNodeAfterElement)(pre, node);
4592
65.3k
                    TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE);
4593
                    
4594
65.3k
                    {
4595
65.3k
                        TidyParserMemory memory = {0};
4596
65.3k
                        memory.identity = TY_(ParsePre);
4597
65.3k
                        memory.original_node = pre;
4598
65.3k
                        memory.reentry_node = node;
4599
65.3k
                        memory.reentry_state = STATE_RENTRY_ACTION;
4600
65.3k
                        TY_(pushMemory)( doc, memory );
4601
65.3k
                        DEBUG_LOG_EXIT_WITH_NODE(node);
4602
65.3k
                        return node;
4603
123k
                    }
4604
123k
                }
4605
4606
11.5k
                if ( nodeIsP(node) )
4607
6.17k
                {
4608
6.17k
                    if (node->type == StartTag)
4609
5.83k
                    {
4610
5.83k
                        TY_(Report)(doc, pre, node, USING_BR_INPLACE_OF);
4611
4612
                        /* trim white space before <p> in <pre>*/
4613
5.83k
                        TrimSpaces(doc, pre);
4614
4615
                        /* coerce both <p> and </p> to <br> */
4616
5.83k
                        TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
4617
5.83k
                        TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
4618
5.83k
                        TY_(InsertNodeAtEnd)( pre, node );
4619
5.83k
                    }
4620
343
                    else
4621
343
                    {
4622
343
                        TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4623
343
                        TY_(FreeNode)( doc, node);
4624
343
                    }
4625
6.17k
                    continue;
4626
6.17k
                }
4627
4628
5.36k
                if ( TY_(nodeIsElement)(node) )
4629
5.12k
                {
4630
                    /* trim white space before <br> */
4631
5.12k
                    if ( nodeIsBR(node) )
4632
131
                        TrimSpaces(doc, pre);
4633
4634
5.12k
                    TY_(InsertNodeAtEnd)(pre, node);
4635
                    
4636
5.12k
                    {
4637
5.12k
                        TidyParserMemory memory = {0};
4638
5.12k
                        memory.identity = TY_(ParsePre);
4639
5.12k
                        memory.original_node = pre;
4640
5.12k
                        memory.reentry_node = node;
4641
5.12k
                        memory.reentry_state = STATE_INITIAL;
4642
5.12k
                        TY_(pushMemory)( doc, memory );
4643
5.12k
                        DEBUG_LOG_EXIT_WITH_NODE(node);
4644
5.12k
                        return node;
4645
5.12k
                    }
4646
5.12k
                }
4647
4648
                /* discard unexpected tags */
4649
242
                TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4650
242
                TY_(FreeNode)( doc, node);
4651
242
            } break;
4652
                
4653
61.9k
            case STATE_RENTRY_ACTION:
4654
61.9k
            {
4655
61.9k
                Node* newnode = TY_(InferredTag)(doc, TidyTag_PRE);
4656
61.9k
                TY_(Report)(doc, pre, newnode, INSERTING_TAG);
4657
61.9k
                pre = newnode;
4658
61.9k
                TY_(InsertNodeAfterElement)(node, pre);
4659
61.9k
                state = STATE_INITIAL;
4660
61.9k
                continue;
4661
5.36k
            } break;
4662
            
4663
0
            default:
4664
0
                break;
4665
4666
210k
        } /* switch */
4667
210k
    } /* while */
4668
4669
4.15k
    TY_(Report)(doc, pre, node, MISSING_ENDTAG_FOR);
4670
4.15k
    DEBUG_LOG_EXIT;
4671
4.15k
    return NULL;
4672
133k
}
4673
4674
4675
/** MARK: TY_(ParseRow)
4676
 *  Parses the `row` tag.
4677
 *
4678
 *  This is a non-recursing parser. It uses the document's parser memory stack
4679
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4680
 *  This parser is also re-enterable, so that post-processing can occur after
4681
 *  such dispatching.
4682
 */
4683
Node* TY_(ParseRow)( TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode) )
4684
36.4k
{
4685
36.4k
    Lexer* lexer = doc->lexer;
4686
36.4k
    Node *node = NULL;
4687
36.4k
    Bool exclude_state = no;
4688
36.4k
    DEBUG_LOG_COUNTERS;
4689
4690
36.4k
    enum parserState {
4691
36.4k
        STATE_INITIAL,                /* This is the initial state for every parser. */
4692
36.4k
        STATE_POST_NOT_ENDTAG,        /* To-do after re-entering after !EndTag checks. */
4693
36.4k
        STATE_POST_TD_TH,             /* To-do after re-entering after TD/TH checks. */
4694
36.4k
        STATE_COMPLETE,               /* Done with the switch. */
4695
36.4k
    } state = STATE_INITIAL;
4696
4697
36.4k
    if ( row == NULL )
4698
16.2k
    {
4699
16.2k
        TidyParserMemory memory = TY_(popMemory)( doc );
4700
16.2k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
4701
16.2k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4702
16.2k
        row = memory.original_node;
4703
16.2k
        state = memory.reentry_state;
4704
16.2k
        exclude_state = memory.register_1;
4705
16.2k
        DEBUG_LOG_GET_OLD_MODE;
4706
16.2k
        mode = memory.mode;
4707
16.2k
        DEBUG_LOG_CHANGE_MODE;
4708
16.2k
    }
4709
20.1k
    else
4710
20.1k
    {
4711
20.1k
        DEBUG_LOG_ENTER_WITH_NODE(row);
4712
4713
20.1k
        if (row->tag->model & CM_EMPTY)
4714
0
            return NULL;
4715
20.1k
    }
4716
4717
130k
    while ( state != STATE_COMPLETE )
4718
120k
    {
4719
120k
        if ( state == STATE_INITIAL )
4720
104k
        {
4721
104k
            node = TY_(GetToken)( doc, IgnoreWhitespace );
4722
104k
            DEBUG_LOG_GOT_TOKEN(node);
4723
104k
        }
4724
    
4725
120k
        switch (state)
4726
120k
        {
4727
104k
            case STATE_INITIAL:
4728
104k
            {
4729
104k
                if ( node == NULL)
4730
9.41k
                {
4731
9.41k
                    state = STATE_COMPLETE;
4732
9.41k
                    continue;
4733
9.41k
                }
4734
                
4735
95.1k
                if (node->tag == row->tag)
4736
9.16k
                {
4737
9.16k
                    if (node->type == EndTag)
4738
446
                    {
4739
446
                        TY_(FreeNode)( doc, node);
4740
446
                        row->closed = yes;
4741
446
                        FixEmptyRow( doc, row);
4742
446
                        DEBUG_LOG_EXIT;
4743
446
                        return NULL;
4744
446
                    }
4745
4746
                    /* New row start implies end of current row */
4747
8.71k
                    TY_(UngetToken)( doc );
4748
8.71k
                    FixEmptyRow( doc, row);
4749
8.71k
                    DEBUG_LOG_EXIT;
4750
8.71k
                    return NULL;
4751
9.16k
                }
4752
4753
                /*
4754
                  if this is the end tag for an ancestor element
4755
                  then infer end tag for this element
4756
                */
4757
85.9k
                if ( node->type == EndTag )
4758
8.82k
                {
4759
8.82k
                    if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node))
4760
4.55k
                         && DescendantOf(row, TagId(node)) )
4761
244
                    {
4762
244
                        TY_(UngetToken)( doc );
4763
244
                        DEBUG_LOG_EXIT;
4764
244
                        return NULL;
4765
244
                    }
4766
4767
8.58k
                    if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
4768
1.70k
                    {
4769
1.70k
                        if ( nodeIsFORM(node) )
4770
431
                            BadForm( doc );
4771
4772
1.70k
                        TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4773
1.70k
                        TY_(FreeNode)( doc, node);
4774
1.70k
                        continue;
4775
1.70k
                    }
4776
4777
6.88k
                    if ( nodeIsTD(node) || nodeIsTH(node) )
4778
583
                    {
4779
583
                        TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4780
583
                        TY_(FreeNode)( doc, node);
4781
583
                        continue;
4782
583
                    }
4783
6.88k
                }
4784
4785
                /* deal with comments etc. */
4786
83.4k
                if (InsertMisc(row, node))
4787
15.2k
                    continue;
4788
4789
                /* discard unknown tags */
4790
68.1k
                if (node->tag == NULL && node->type != TextNode)
4791
10.8k
                {
4792
10.8k
                    TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4793
10.8k
                    TY_(FreeNode)( doc, node);
4794
10.8k
                    continue;
4795
10.8k
                }
4796
4797
                /* discard unexpected <table> element */
4798
57.3k
                if ( nodeIsTABLE(node) )
4799
327
                {
4800
327
                    TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4801
327
                    TY_(FreeNode)( doc, node);
4802
327
                    continue;
4803
327
                }
4804
4805
                /* THEAD, TFOOT or TBODY */
4806
57.0k
                if ( TY_(nodeHasCM)(node, CM_ROWGRP) )
4807
1.09k
                {
4808
1.09k
                    TY_(UngetToken)( doc );
4809
1.09k
                    DEBUG_LOG_EXIT;
4810
1.09k
                    return NULL;
4811
1.09k
                }
4812
4813
55.9k
                if (node->type == EndTag)
4814
4.17k
                {
4815
4.17k
                    TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4816
4.17k
                    TY_(FreeNode)( doc, node);
4817
4.17k
                    continue;
4818
4.17k
                }
4819
4820
                /*
4821
                  if text or inline or block move before table
4822
                  if head content move to head
4823
                */
4824
4825
51.7k
                if (node->type != EndTag)
4826
51.7k
                {
4827
51.7k
                    if ( nodeIsFORM(node) )
4828
730
                    {
4829
730
                        TY_(UngetToken)( doc );
4830
730
                        node = TY_(InferredTag)(doc, TidyTag_TD);
4831
730
                        TY_(Report)(doc, row, node, MISSING_STARTTAG);
4832
730
                    }
4833
51.0k
                    else if ( TY_(nodeIsText)(node)
4834
28.1k
                              || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) )
4835
28.6k
                    {
4836
28.6k
                        MoveBeforeTable( doc, row, node );
4837
28.6k
                        TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN);
4838
28.6k
                        lexer->exiled = yes;
4839
28.6k
                        exclude_state = lexer->excludeBlocks;
4840
28.6k
                        lexer->excludeBlocks = no;
4841
4842
28.6k
                        if (node->type != TextNode)
4843
5.79k
                        {
4844
5.79k
                            TidyParserMemory memory = {0};
4845
5.79k
                            memory.identity = TY_(ParseRow);
4846
5.79k
                            memory.original_node = row;
4847
5.79k
                            memory.reentry_node = node;
4848
5.79k
                            memory.reentry_state = STATE_POST_NOT_ENDTAG;
4849
5.79k
                            memory.register_1 = exclude_state;
4850
5.79k
                            TY_(pushMemory)( doc, memory );
4851
5.79k
                            DEBUG_LOG_EXIT_WITH_NODE(node);
4852
5.79k
                            return node;
4853
5.79k
                        }
4854
                        
4855
22.8k
                        lexer->exiled = no;
4856
22.8k
                        lexer->excludeBlocks = exclude_state;
4857
22.8k
                        continue;
4858
28.6k
                    }
4859
22.3k
                    else if (node->tag->model & CM_HEAD)
4860
670
                    {
4861
670
                        TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN);
4862
670
                        MoveToHead( doc, row, node);
4863
670
                        continue;
4864
670
                    }
4865
51.7k
                }
4866
4867
22.4k
                if ( !(nodeIsTD(node) || nodeIsTH(node)) )
4868
11.7k
                {
4869
11.7k
                    TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN);
4870
11.7k
                    TY_(FreeNode)( doc, node);
4871
11.7k
                    continue;
4872
11.7k
                }
4873
4874
                /* node should be <TD> or <TH> */
4875
10.7k
                TY_(InsertNodeAtEnd)(row, node);
4876
10.7k
                exclude_state = lexer->excludeBlocks;
4877
10.7k
                lexer->excludeBlocks = no;
4878
10.7k
                {
4879
10.7k
                    TidyParserMemory memory = {0};
4880
10.7k
                    memory.identity = TY_(ParseRow);
4881
10.7k
                    memory.original_node = row;
4882
10.7k
                    memory.reentry_node = node;
4883
10.7k
                    memory.reentry_state = STATE_POST_TD_TH;
4884
10.7k
                    memory.register_1 = exclude_state;
4885
10.7k
                    TY_(pushMemory)( doc, memory );
4886
10.7k
                    DEBUG_LOG_EXIT_WITH_NODE(node);
4887
10.7k
                    return node;
4888
22.4k
                }
4889
22.4k
            } break;
4890
                
4891
                
4892
5.59k
            case STATE_POST_NOT_ENDTAG:
4893
5.59k
            {
4894
5.59k
                lexer->exiled = no;
4895
5.59k
                lexer->excludeBlocks = exclude_state; /* capture this in stack. */
4896
5.59k
                state = STATE_INITIAL;
4897
5.59k
                continue;
4898
22.4k
            } break;
4899
                
4900
                
4901
10.6k
            case STATE_POST_TD_TH:
4902
10.6k
            {
4903
10.6k
                lexer->excludeBlocks = exclude_state; /* capture this in stack. */
4904
4905
                /* pop inline stack */
4906
16.0k
                while ( lexer->istacksize > lexer->istackbase )
4907
5.39k
                    TY_(PopInline)( doc, NULL );
4908
                
4909
10.6k
                state = STATE_INITIAL;
4910
10.6k
                continue;
4911
22.4k
            } break;
4912
                
4913
                
4914
0
            default:
4915
0
                break;
4916
                
4917
120k
        } /* switch */
4918
120k
    } /* while */
4919
9.41k
    DEBUG_LOG_EXIT;
4920
9.41k
    return NULL;
4921
36.4k
}
4922
4923
4924
/** MARK: TY_(ParseRowGroup)
4925
 *  Parses the `rowgroup` tag.
4926
 *
4927
 *  This is a non-recursing parser. It uses the document's parser memory stack
4928
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4929
 *  This parser is also re-enterable, so that post-processing can occur after
4930
 *  such dispatching.
4931
 */
4932
Node* TY_(ParseRowGroup)( TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode) )
4933
26.7k
{
4934
26.7k
    Lexer* lexer = doc->lexer;
4935
26.7k
    Node *node = NULL;
4936
26.7k
    Node *parent = NULL;
4937
26.7k
    DEBUG_LOG_COUNTERS;
4938
4939
26.7k
    enum parserState {
4940
26.7k
        STATE_INITIAL,                /* This is the initial state for every parser. */
4941
26.7k
        STATE_POST_NOT_TEXTNODE,      /* To-do after re-entering after checks. */
4942
26.7k
        STATE_COMPLETE,               /* Done with the switch. */
4943
26.7k
    } state = STATE_INITIAL;
4944
4945
26.7k
    if ( rowgroup == NULL )
4946
13.3k
    {
4947
13.3k
        TidyParserMemory memory = TY_(popMemory)( doc );
4948
13.3k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
4949
13.3k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4950
13.3k
        rowgroup = memory.original_node;
4951
13.3k
        state = memory.reentry_state;
4952
13.3k
        DEBUG_LOG_GET_OLD_MODE;
4953
13.3k
        mode = memory.mode;
4954
13.3k
        DEBUG_LOG_CHANGE_MODE;
4955
13.3k
    }
4956
13.3k
    else
4957
13.3k
    {
4958
13.3k
        DEBUG_LOG_ENTER_WITH_NODE(rowgroup);
4959
13.3k
        if (rowgroup->tag->model & CM_EMPTY)
4960
0
        {
4961
0
            DEBUG_LOG_EXIT;
4962
0
            return NULL;
4963
0
        }
4964
13.3k
    }
4965
4966
53.6k
    while ( state != STATE_COMPLETE )
4967
47.3k
    {
4968
47.3k
        if ( state == STATE_INITIAL )
4969
33.0k
            node = TY_(GetToken)(doc, IgnoreWhitespace);
4970
        
4971
47.3k
        switch (state)
4972
47.3k
        {
4973
33.0k
            case STATE_INITIAL:
4974
33.0k
            {
4975
33.0k
                TidyParserMemory memory = {0};
4976
4977
33.0k
                if (node == NULL)
4978
6.26k
                {
4979
6.26k
                    state = STATE_COMPLETE;
4980
6.26k
                    continue;
4981
6.26k
                }
4982
                
4983
26.8k
                if (node->tag == rowgroup->tag)
4984
5.00k
                {
4985
5.00k
                    if (node->type == EndTag)
4986
82
                    {
4987
82
                        rowgroup->closed = yes;
4988
82
                        TY_(FreeNode)( doc, node);
4989
82
                        DEBUG_LOG_EXIT;
4990
82
                        return NULL;
4991
82
                    }
4992
4993
4.92k
                    TY_(UngetToken)( doc );
4994
4.92k
                    DEBUG_LOG_EXIT;
4995
4.92k
                    return NULL;
4996
5.00k
                }
4997
4998
                /* if </table> infer end tag */
4999
21.7k
                if ( nodeIsTABLE(node) && node->type == EndTag )
5000
115
                {
5001
115
                    TY_(UngetToken)( doc );
5002
115
                    DEBUG_LOG_EXIT;
5003
115
                    return NULL;
5004
115
                }
5005
5006
                /* deal with comments etc. */
5007
21.6k
                if (InsertMisc(rowgroup, node))
5008
490
                    continue;
5009
5010
                /* discard unknown tags */
5011
21.1k
                if (node->tag == NULL && node->type != TextNode)
5012
1.29k
                {
5013
1.29k
                    TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
5014
1.29k
                    TY_(FreeNode)( doc, node);
5015
1.29k
                    continue;
5016
1.29k
                }
5017
5018
                /*
5019
                  if TD or TH then infer <TR>
5020
                  if text or inline or block move before table
5021
                  if head content move to head
5022
                */
5023
5024
19.9k
                if (node->type != EndTag)
5025
17.6k
                {
5026
17.6k
                    if ( nodeIsTD(node) || nodeIsTH(node) )
5027
848
                    {
5028
848
                        TY_(UngetToken)( doc );
5029
848
                        node = TY_(InferredTag)(doc, TidyTag_TR);
5030
848
                        TY_(Report)(doc, rowgroup, node, MISSING_STARTTAG);
5031
848
                    }
5032
16.8k
                    else if ( TY_(nodeIsText)(node)
5033
14.1k
                              || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
5034
14.3k
                    {
5035
14.3k
                        MoveBeforeTable( doc, rowgroup, node );
5036
14.3k
                        TY_(Report)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
5037
14.3k
                        lexer->exiled = yes;
5038
5039
14.3k
                        if (node->type != TextNode)
5040
11.6k
                        {
5041
11.6k
                            memory.identity = TY_(ParseRowGroup);
5042
11.6k
                            memory.original_node = rowgroup;
5043
11.6k
                            memory.reentry_node = node;
5044
11.6k
                            memory.reentry_state = STATE_POST_NOT_TEXTNODE;
5045
11.6k
                            TY_(pushMemory)( doc, memory );
5046
11.6k
                            DEBUG_LOG_EXIT_WITH_NODE(node);
5047
11.6k
                            return node;
5048
11.6k
                        }
5049
                        
5050
2.64k
                        state = STATE_POST_NOT_TEXTNODE;
5051
2.64k
                        continue;
5052
14.3k
                    }
5053
2.47k
                    else if (node->tag->model & CM_HEAD)
5054
78
                    {
5055
78
                        TY_(Report)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
5056
78
                        MoveToHead(doc, rowgroup, node);
5057
78
                        continue;
5058
78
                    }
5059
17.6k
                }
5060
5061
                /*
5062
                  if this is the end tag for ancestor element
5063
                  then infer end tag for this element
5064
                */
5065
5.49k
                if (node->type == EndTag)
5066
2.24k
                {
5067
2.24k
                    if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
5068
558
                    {
5069
558
                        if ( nodeIsFORM(node) )
5070
209
                            BadForm( doc );
5071
5072
558
                        TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
5073
558
                        TY_(FreeNode)( doc, node);
5074
558
                        continue;
5075
558
                    }
5076
5077
1.69k
                    if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) )
5078
897
                    {
5079
897
                        TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
5080
897
                        TY_(FreeNode)( doc, node);
5081
897
                        continue;
5082
897
                    }
5083
5084
793
                    for ( parent = rowgroup->parent;
5085
9.73k
                          parent != NULL;
5086
8.94k
                          parent = parent->parent )
5087
9.33k
                    {
5088
9.33k
                        if (node->tag == parent->tag)
5089
389
                        {
5090
389
                            TY_(UngetToken)( doc );
5091
389
                            DEBUG_LOG_EXIT;
5092
389
                            return NULL;
5093
389
                        }
5094
9.33k
                    }
5095
793
                }
5096
5097
                /*
5098
                  if THEAD, TFOOT or TBODY then implied end tag
5099
5100
                */
5101
3.65k
                if (node->tag->model & CM_ROWGRP)
5102
1.68k
                {
5103
1.68k
                    if (node->type != EndTag)
5104
1.57k
                    {
5105
1.57k
                        TY_(UngetToken)( doc );
5106
1.57k
                        DEBUG_LOG_EXIT;
5107
1.57k
                        return NULL;
5108
1.57k
                    }
5109
1.68k
                }
5110
5111
2.08k
                if (node->type == EndTag)
5112
404
                {
5113
404
                    TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
5114
404
                    TY_(FreeNode)( doc, node);
5115
404
                    continue;
5116
404
                }
5117
5118
1.67k
                if ( !nodeIsTR(node) )
5119
434
                {
5120
434
                    node = TY_(InferredTag)(doc, TidyTag_TR);
5121
434
                    TY_(Report)(doc, rowgroup, node, MISSING_STARTTAG);
5122
434
                    TY_(UngetToken)( doc );
5123
434
                }
5124
5125
               /* node should be <TR> */
5126
1.67k
                TY_(InsertNodeAtEnd)(rowgroup, node);
5127
1.67k
                memory.identity = TY_(ParseRowGroup);
5128
1.67k
                memory.original_node = rowgroup;
5129
1.67k
                memory.reentry_node = node;
5130
1.67k
                memory.reentry_state = STATE_INITIAL;
5131
1.67k
                TY_(pushMemory)( doc, memory );
5132
1.67k
                DEBUG_LOG_EXIT_WITH_NODE(node);
5133
1.67k
                return node;
5134
2.08k
            } break;
5135
                
5136
                
5137
14.3k
            case STATE_POST_NOT_TEXTNODE:
5138
14.3k
            {
5139
14.3k
                lexer->exiled = no;
5140
14.3k
                state = STATE_INITIAL;
5141
14.3k
                continue;
5142
2.08k
            } break;
5143
5144
                
5145
0
            default:
5146
0
                break;
5147
47.3k
        } /* switch */
5148
47.3k
    } /* while */
5149
6.26k
    DEBUG_LOG_EXIT;
5150
6.26k
    return NULL;
5151
26.7k
}
5152
5153
5154
/** MARK: TY_(ParseScript)
5155
 *  Parses the `script` tag.
5156
 *
5157
 *  @todo This isn't quite right for CDATA content as it recognises tags
5158
 *  within the content and parses them accordingly. This will unfortunately
5159
 *  screw up scripts which include:
5160
 *    < + letter
5161
 *    < + !
5162
 *    < + ?
5163
 *    < + / + letter
5164
 *
5165
 *  This is a non-recursing parser. It uses the document's parser memory stack
5166
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5167
 *  This parser is also re-enterable, so that post-processing can occur after
5168
 *  such dispatching.
5169
 */
5170
Node* TY_(ParseScript)( TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode) )
5171
4.61k
{
5172
4.61k
    Node *node = NULL;
5173
#if defined(ENABLE_DEBUG_LOG)
5174
    static int depth_parser = 0;
5175
    static int count_parser = 0;
5176
#endif
5177
    
5178
4.61k
    DEBUG_LOG_ENTER_WITH_NODE(script);
5179
    
5180
4.61k
    doc->lexer->parent = script;
5181
4.61k
    node = TY_(GetToken)(doc, CdataContent);
5182
4.61k
    doc->lexer->parent = NULL;
5183
5184
4.61k
    if (node)
5185
4.54k
    {
5186
4.54k
        TY_(InsertNodeAtEnd)(script, node);
5187
4.54k
    }
5188
67
    else
5189
67
    {
5190
        /* handle e.g. a document like "<script>" */
5191
67
        TY_(Report)(doc, script, NULL, MISSING_ENDTAG_FOR);
5192
67
        DEBUG_LOG_EXIT;
5193
67
        return NULL;
5194
67
    }
5195
5196
4.54k
    node = TY_(GetToken)(doc, IgnoreWhitespace);
5197
4.54k
    DEBUG_LOG_GOT_TOKEN(node);
5198
5199
4.54k
    if (!(node && node->type == EndTag && node->tag &&
5200
3.00k
        node->tag->id == script->tag->id))
5201
2.61k
    {
5202
2.61k
        TY_(Report)(doc, script, node, MISSING_ENDTAG_FOR);
5203
5204
2.61k
        if (node)
5205
2.04k
            TY_(UngetToken)(doc);
5206
2.61k
    }
5207
1.93k
    else
5208
1.93k
    {
5209
1.93k
        TY_(FreeNode)(doc, node);
5210
1.93k
    }
5211
4.54k
    DEBUG_LOG_EXIT;
5212
4.54k
    return NULL;
5213
4.61k
}
5214
5215
5216
/** MARK: TY_(ParseSelect)
5217
 *  Parses the `select` tag.
5218
 *
5219
 *  This is a non-recursing parser. It uses the document's parser memory stack
5220
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5221
 *  This parser is also re-enterable, so that post-processing can occur after
5222
 *  such dispatching.
5223
 */
5224
Node* TY_(ParseSelect)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) )
5225
3.52k
{
5226
3.52k
    Lexer* lexer = doc->lexer;
5227
3.52k
    Node *node;
5228
3.52k
    DEBUG_LOG_COUNTERS;
5229
5230
3.52k
    if ( field == NULL )
5231
2.58k
    {
5232
2.58k
        TidyParserMemory memory = TY_(popMemory)( doc );
5233
2.58k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
5234
2.58k
        DEBUG_LOG_REENTER_WITH_NODE(node);
5235
2.58k
        field = memory.original_node;
5236
2.58k
        DEBUG_LOG_GET_OLD_MODE;
5237
2.58k
        mode = memory.mode;
5238
2.58k
        DEBUG_LOG_CHANGE_MODE;
5239
2.58k
    }
5240
945
    else
5241
945
    {
5242
945
        DEBUG_LOG_ENTER_WITH_NODE(field);
5243
945
    }
5244
    
5245
3.52k
    lexer->insert = NULL;  /* defer implicit inline start tags */
5246
5247
19.5k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
5248
18.8k
    {
5249
18.8k
        if (node->tag == field->tag && node->type == EndTag)
5250
251
        {
5251
251
            TY_(FreeNode)( doc, node);
5252
251
            field->closed = yes;
5253
251
            TrimSpaces(doc, field);
5254
5255
251
            DEBUG_LOG_EXIT;
5256
251
            return NULL;
5257
251
        }
5258
5259
        /* deal with comments etc. */
5260
18.6k
        if (InsertMisc(field, node))
5261
1.45k
            continue;
5262
5263
17.1k
        if ( node->type == StartTag &&
5264
14.6k
             ( nodeIsOPTION(node)   ||
5265
14.6k
               nodeIsOPTGROUP(node) ||
5266
14.6k
               nodeIsDATALIST(node) ||
5267
14.6k
               nodeIsSCRIPT(node))
5268
17.1k
           )
5269
2.58k
        {
5270
2.58k
            TidyParserMemory memory = {0};
5271
2.58k
            memory.identity = TY_(ParseSelect);
5272
2.58k
            memory.original_node = field;
5273
2.58k
            memory.reentry_node = node;
5274
5275
2.58k
            TY_(InsertNodeAtEnd)(field, node);
5276
2.58k
            TY_(pushMemory)( doc, memory );
5277
2.58k
            DEBUG_LOG_EXIT_WITH_NODE(node);
5278
2.58k
            return node;
5279
2.58k
        }
5280
5281
        /* discard unexpected tags */
5282
14.5k
        TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED);
5283
14.5k
        TY_(FreeNode)( doc, node);
5284
14.5k
    }
5285
5286
694
    TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR);
5287
5288
694
    DEBUG_LOG_EXIT;
5289
694
    return NULL;
5290
3.52k
}
5291
5292
5293
/** MARK: TY_(ParseTableTag)
5294
 *  Parses the `table` tag.
5295
 *
5296
 *  This is a non-recursing parser. It uses the document's parser memory stack
5297
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5298
 *  This parser is also re-enterable, so that post-processing can occur after
5299
 *  such dispatching.
5300
 */
5301
Node* TY_(ParseTableTag)( TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode) )
5302
37.7k
{
5303
37.7k
    Lexer* lexer = doc->lexer;
5304
37.7k
    Node *node, *parent;
5305
37.7k
    uint istackbase;
5306
37.7k
    DEBUG_LOG_COUNTERS;
5307
5308
37.7k
    if ( table == NULL )
5309
20.9k
    {
5310
20.9k
        TidyParserMemory memory = TY_(popMemory)( doc );
5311
20.9k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
5312
20.9k
        DEBUG_LOG_REENTER_WITH_NODE(node);
5313
20.9k
        table = memory.original_node;
5314
20.9k
        lexer->exiled = memory.register_1;
5315
20.9k
        DEBUG_LOG_GET_OLD_MODE;
5316
20.9k
        mode = memory.mode;
5317
20.9k
        DEBUG_LOG_CHANGE_MODE;
5318
20.9k
    }
5319
16.7k
    else
5320
16.7k
    {
5321
16.7k
        DEBUG_LOG_ENTER_WITH_NODE(table);
5322
16.7k
        TY_(DeferDup)( doc );
5323
16.7k
    }
5324
5325
37.7k
    istackbase = lexer->istackbase;
5326
37.7k
    lexer->istackbase = lexer->istacksize;
5327
5328
44.5k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
5329
33.4k
    {
5330
33.4k
        DEBUG_LOG_GOT_TOKEN(node);
5331
33.4k
        if (node->tag == table->tag )
5332
4.23k
        {
5333
4.23k
            if (node->type == EndTag)
5334
129
            {
5335
129
                TY_(FreeNode)(doc, node);
5336
129
            }
5337
4.10k
            else
5338
4.10k
            {
5339
                /* Issue #498 - If a <table> in a <table>
5340
                 * just close the current table, and issue a
5341
                 * warning. The previous action was to discard
5342
                 * this second <table>
5343
                 */
5344
4.10k
                TY_(UngetToken)(doc);
5345
4.10k
                TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN);
5346
4.10k
            }
5347
4.23k
            lexer->istackbase = istackbase;
5348
4.23k
            table->closed = yes;
5349
5350
4.23k
            DEBUG_LOG_EXIT;
5351
4.23k
            return NULL;
5352
4.23k
        }
5353
5354
        /* deal with comments etc. */
5355
29.2k
        if (InsertMisc(table, node))
5356
763
            continue;
5357
5358
        /* discard unknown tags */
5359
28.4k
        if (node->tag == NULL && node->type != TextNode)
5360
1.88k
        {
5361
1.88k
            TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
5362
1.88k
            TY_(FreeNode)( doc, node);
5363
1.88k
            continue;
5364
1.88k
        }
5365
5366
        /* if TD or TH or text or inline or block then infer <TR> */
5367
5368
26.5k
        if (node->type != EndTag)
5369
24.3k
        {
5370
24.3k
            if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
5371
7.33k
            {
5372
7.33k
                TY_(UngetToken)( doc );
5373
7.33k
                node = TY_(InferredTag)(doc, TidyTag_TR);
5374
7.33k
                TY_(Report)(doc, table, node, MISSING_STARTTAG);
5375
7.33k
            }
5376
16.9k
            else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) )
5377
5.77k
            {
5378
5.77k
                TY_(InsertNodeBeforeElement)(table, node);
5379
5.77k
                TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN);
5380
5.77k
                lexer->exiled = yes;
5381
5382
5.77k
                if (node->type != TextNode)
5383
2.76k
                {
5384
2.76k
                    TidyParserMemory memory = {0};
5385
2.76k
                    memory.identity = TY_(ParseTableTag);
5386
2.76k
                    memory.original_node = table;
5387
2.76k
                    memory.reentry_node = node;
5388
2.76k
                    memory.register_1 = no; /* later, lexer->exiled = no */
5389
2.76k
                    memory.mode = IgnoreWhitespace;
5390
2.76k
                    TY_(pushMemory)( doc, memory );
5391
2.76k
                    DEBUG_LOG_EXIT_WITH_NODE(node);
5392
2.76k
                    return node;
5393
2.76k
                }
5394
5395
3.01k
                lexer->exiled = no;
5396
3.01k
                continue;
5397
5.77k
            }
5398
11.2k
            else if (node->tag->model & CM_HEAD)
5399
73
            {
5400
73
                MoveToHead(doc, table, node);
5401
73
                continue;
5402
73
            }
5403
24.3k
        }
5404
5405
        /*
5406
          if this is the end tag for an ancestor element
5407
          then infer end tag for this element
5408
        */
5409
20.7k
        if (node->type == EndTag)
5410
2.26k
        {
5411
2.26k
            if ( nodeIsFORM(node) )
5412
154
            {
5413
154
                BadForm( doc );
5414
154
                TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
5415
154
                TY_(FreeNode)( doc, node);
5416
154
                continue;
5417
154
            }
5418
5419
            /* best to discard unexpected block/inline end tags */
5420
2.11k
            if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) ||
5421
1.84k
                 TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
5422
1.00k
            {
5423
1.00k
                TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
5424
1.00k
                TY_(FreeNode)( doc, node);
5425
1.00k
                continue;
5426
1.00k
            }
5427
5428
1.10k
            for ( parent = table->parent;
5429
6.91k
                  parent != NULL;
5430
5.80k
                  parent = parent->parent )
5431
6.37k
            {
5432
6.37k
                if (node->tag == parent->tag)
5433
564
                {
5434
564
                    TY_(Report)(doc, table, node, MISSING_ENDTAG_BEFORE );
5435
564
                    TY_(UngetToken)( doc );
5436
564
                    lexer->istackbase = istackbase;
5437
5438
564
                    DEBUG_LOG_EXIT;
5439
564
                    return NULL;
5440
564
                }
5441
6.37k
            }
5442
1.10k
        }
5443
5444
19.0k
        if (!(node->tag->model & CM_TABLE))
5445
730
        {
5446
730
            TY_(UngetToken)( doc );
5447
730
            TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN);
5448
730
            lexer->istackbase = istackbase;
5449
5450
730
            DEBUG_LOG_EXIT;
5451
730
            return NULL;
5452
730
        }
5453
5454
18.2k
        if (TY_(nodeIsElement)(node))
5455
18.2k
        {
5456
18.2k
            TidyParserMemory memory = {0};
5457
18.2k
            TY_(InsertNodeAtEnd)(table, node);
5458
18.2k
            memory.identity = TY_(ParseTableTag);
5459
18.2k
            memory.original_node = table;
5460
18.2k
            memory.reentry_node = node;
5461
18.2k
            memory.register_1 = lexer->exiled;
5462
18.2k
            TY_(pushMemory)( doc, memory );
5463
18.2k
            DEBUG_LOG_EXIT_WITH_NODE(node);
5464
18.2k
            return node;
5465
18.2k
        }
5466
5467
        /* discard unexpected text nodes and end tags */
5468
0
        TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
5469
0
        TY_(FreeNode)( doc, node);
5470
0
    }
5471
5472
11.1k
    TY_(Report)(doc, table, node, MISSING_ENDTAG_FOR);
5473
11.1k
    lexer->istackbase = istackbase;
5474
5475
11.1k
    DEBUG_LOG_EXIT;
5476
11.1k
    return NULL;
5477
37.7k
}
5478
5479
5480
/** MARK: TY_(ParseText)
5481
 *  Parses the `option` and `textarea` tags.
5482
 *
5483
 *  This is a non-recursing parser. It uses the document's parser memory stack
5484
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5485
 *  This parser is also re-enterable, so that post-processing can occur after
5486
 *  such dispatching.
5487
 */
5488
Node* TY_(ParseText)( TidyDocImpl* doc, Node *field, GetTokenMode mode )
5489
5.72k
{
5490
5.72k
    Lexer* lexer = doc->lexer;
5491
5.72k
    Node *node;
5492
5.72k
    DEBUG_LOG_COUNTERS;
5493
    
5494
5.72k
    DEBUG_LOG_ENTER_WITH_NODE(field);
5495
5496
5.72k
    lexer->insert = NULL;  /* defer implicit inline start tags */
5497
5498
5.72k
    DEBUG_LOG_GET_OLD_MODE;
5499
5.72k
    if ( nodeIsTEXTAREA(field) )
5500
621
        mode = Preformatted;
5501
5.10k
    else
5502
5.10k
        mode = MixedContent;  /* kludge for font tags */
5503
5.72k
    DEBUG_LOG_CHANGE_MODE;
5504
5505
10.5k
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
5506
10.0k
    {
5507
10.0k
        if (node->tag == field->tag && node->type == EndTag)
5508
40
        {
5509
40
            TY_(FreeNode)( doc, node);
5510
40
            field->closed = yes;
5511
40
            TrimSpaces(doc, field);
5512
40
            DEBUG_LOG_EXIT;
5513
40
            return NULL;
5514
40
        }
5515
5516
        /* deal with comments etc. */
5517
9.97k
        if (InsertMisc(field, node))
5518
1.29k
            continue;
5519
5520
8.67k
        if (TY_(nodeIsText)(node))
5521
2.63k
        {
5522
            /* only called for 1st child */
5523
2.63k
            if (field->content == NULL && !(mode & Preformatted))
5524
1.12k
                TrimSpaces(doc, field);
5525
5526
2.63k
            if (node->start >= node->end)
5527
2
            {
5528
2
                TY_(FreeNode)( doc, node);
5529
2
                continue;
5530
2
            }
5531
5532
2.63k
            TY_(InsertNodeAtEnd)(field, node);
5533
2.63k
            continue;
5534
2.63k
        }
5535
5536
        /* for textarea should all cases of < and & be escaped? */
5537
5538
        /* discard inline tags e.g. font */
5539
6.04k
        if (   node->tag
5540
5.48k
            && node->tag->model & CM_INLINE
5541
1.43k
            && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */
5542
895
        {
5543
895
            TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED);
5544
895
            TY_(FreeNode)( doc, node);
5545
895
            continue;
5546
895
        }
5547
5548
        /* terminate element on other tags */
5549
5.14k
        if (!(field->tag->model & CM_OPT))
5550
588
            TY_(Report)(doc, field, node, MISSING_ENDTAG_BEFORE);
5551
5552
5.14k
        TY_(UngetToken)( doc );
5553
5.14k
        TrimSpaces(doc, field);
5554
5.14k
        DEBUG_LOG_EXIT;
5555
5.14k
        return NULL;
5556
6.04k
    }
5557
5558
537
    if (!(field->tag->model & CM_OPT))
5559
33
        TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR);
5560
537
    DEBUG_LOG_EXIT;
5561
537
    return NULL;
5562
5.72k
}
5563
5564
    
5565
/** MARK: TY_(ParseTitle)
5566
 *  Parses the `title` tag.
5567
 *
5568
 *  This is a non-recursing parser. It uses the document's parser memory stack
5569
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5570
 *  This parser is also re-enterable, so that post-processing can occur after
5571
 *  such dispatching.
5572
 */
5573
Node* TY_(ParseTitle)( TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode) )
5574
1.64k
{
5575
1.64k
    Node *node;
5576
16.7k
    while ((node = TY_(GetToken)(doc, MixedContent)) != NULL)
5577
16.4k
    {
5578
16.4k
        if (node->tag == title->tag && node->type == StartTag
5579
375
            && cfgBool(doc, TidyCoerceEndTags) )
5580
375
        {
5581
375
            TY_(Report)(doc, title, node, COERCE_TO_ENDTAG);
5582
375
            node->type = EndTag;
5583
375
            TY_(UngetToken)( doc );
5584
375
            continue;
5585
375
        }
5586
16.0k
        else if (node->tag == title->tag && node->type == EndTag)
5587
385
        {
5588
385
            TY_(FreeNode)( doc, node);
5589
385
            title->closed = yes;
5590
385
            TrimSpaces(doc, title);
5591
385
            return NULL;
5592
385
        }
5593
5594
15.6k
        if (TY_(nodeIsText)(node))
5595
7.05k
        {
5596
            /* only called for 1st child */
5597
7.05k
            if (title->content == NULL)
5598
540
                TrimInitialSpace(doc, title, node);
5599
5600
7.05k
            if (node->start >= node->end)
5601
83
            {
5602
83
                TY_(FreeNode)( doc, node);
5603
83
                continue;
5604
83
            }
5605
5606
6.97k
            TY_(InsertNodeAtEnd)(title, node);
5607
6.97k
            continue;
5608
7.05k
        }
5609
5610
        /* deal with comments etc. */
5611
8.60k
        if (InsertMisc(title, node))
5612
283
            continue;
5613
5614
        /* discard unknown tags */
5615
8.32k
        if (node->tag == NULL)
5616
7.37k
        {
5617
7.37k
            TY_(Report)(doc, title, node, DISCARDING_UNEXPECTED);
5618
7.37k
            TY_(FreeNode)( doc, node);
5619
7.37k
            continue;
5620
7.37k
        }
5621
5622
        /* pushback unexpected tokens */
5623
942
        TY_(Report)(doc, title, node, MISSING_ENDTAG_BEFORE);
5624
942
        TY_(UngetToken)( doc );
5625
942
        TrimSpaces(doc, title);
5626
942
        return NULL;
5627
8.32k
    }
5628
5629
319
    TY_(Report)(doc, title, node, MISSING_ENDTAG_FOR);
5630
319
    return NULL;
5631
1.64k
}
5632
5633
5634
/** MARK: ParseXMLElement
5635
 *  Parses the given XML element.
5636
 */
5637
static Node* ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode)
5638
0
{
5639
0
    Lexer* lexer = doc->lexer;
5640
0
    Node *node;
5641
5642
0
    if ( element == NULL )
5643
0
    {
5644
0
        TidyParserMemory memory = TY_(popMemory)( doc );
5645
0
        element = memory.original_node;
5646
0
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
5647
0
        mode = memory.reentry_mode;
5648
0
        TY_(InsertNodeAtEnd)(element, node); /* The only re-entry action needed. */
5649
0
    }
5650
0
    else
5651
0
    {
5652
        /* if node is pre or has xml:space="preserve" then do so */
5653
0
        if ( TY_(XMLPreserveWhiteSpace)(doc, element) )
5654
0
            mode = Preformatted;
5655
5656
        /* deal with comments etc. */
5657
0
        InsertMisc( &doc->root, element);
5658
        
5659
        /* we shouldn't have plain text at this point. */
5660
0
        if (TY_(nodeIsText)(element))
5661
0
        {
5662
0
            TY_(Report)(doc, &doc->root, element, DISCARDING_UNEXPECTED);
5663
0
            TY_(FreeNode)( doc, element);
5664
0
            return NULL;
5665
0
        }
5666
0
    }
5667
0
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
5668
0
    {
5669
0
        if (node->type == EndTag &&
5670
0
           node->element && element->element &&
5671
0
           TY_(tmbstrcmp)(node->element, element->element) == 0)
5672
0
        {
5673
0
            TY_(FreeNode)( doc, node);
5674
0
            element->closed = yes;
5675
0
            break;
5676
0
        }
5677
5678
        /* discard unexpected end tags */
5679
0
        if (node->type == EndTag)
5680
0
        {
5681
0
            if (element)
5682
0
                TY_(Report)(doc, element, node, UNEXPECTED_ENDTAG_IN);
5683
0
            else
5684
0
                TY_(Report)(doc, element, node, UNEXPECTED_ENDTAG_ERR);
5685
5686
0
            TY_(FreeNode)( doc, node);
5687
0
            continue;
5688
0
        }
5689
5690
        /* parse content on seeing start tag */
5691
0
        if (node->type == StartTag)
5692
0
        {
5693
0
            TidyParserMemory memory = {0};
5694
0
            memory.identity = ParseXMLElement;
5695
0
            memory.original_node = element;
5696
0
            memory.reentry_node = node;
5697
0
            memory.reentry_mode = mode;
5698
0
            TY_(pushMemory)( doc, memory );
5699
0
            return node;
5700
0
        }
5701
5702
0
        TY_(InsertNodeAtEnd)(element, node);
5703
0
    } /* while */
5704
5705
    /*
5706
     if first child is text then trim initial space and
5707
     delete text node if it is empty.
5708
    */
5709
5710
0
    node = element->content;
5711
5712
0
    if (TY_(nodeIsText)(node) && mode != Preformatted)
5713
0
    {
5714
0
        if ( lexer->lexbuf[node->start] == ' ' )
5715
0
        {
5716
0
            node->start++;
5717
5718
0
            if (node->start >= node->end)
5719
0
                TY_(DiscardElement)( doc, node );
5720
0
        }
5721
0
    }
5722
5723
    /*
5724
     if last child is text then trim final space and
5725
     delete the text node if it is empty
5726
    */
5727
5728
0
    node = element->last;
5729
5730
0
    if (TY_(nodeIsText)(node) && mode != Preformatted)
5731
0
    {
5732
0
        if ( lexer->lexbuf[node->end - 1] == ' ' )
5733
0
        {
5734
0
            node->end--;
5735
5736
0
            if (node->start >= node->end)
5737
0
                TY_(DiscardElement)( doc, node );
5738
0
        }
5739
0
    }
5740
0
    return NULL;
5741
0
}
5742
5743
5744
/***************************************************************************//*
5745
 ** MARK: - Post-Parse Operations
5746
 ***************************************************************************/
5747
5748
5749
/**
5750
 *  Performs checking of all attributes recursively starting at `node`.
5751
 */
5752
static void AttributeChecks(TidyDocImpl* doc, Node* node)
5753
1.74M
{
5754
1.74M
    Node *next;
5755
5756
4.39M
    while (node)
5757
2.64M
    {
5758
2.64M
        next = node->next;
5759
5760
2.64M
        if (TY_(nodeIsElement)(node))
5761
2.07M
        {
5762
2.07M
            if (node->tag && node->tag->chkattrs) /* [i_a]2 fix crash after adding SVG support with alt/unknown tag subtree insertion there */
5763
48.8k
                node->tag->chkattrs(doc, node);
5764
2.02M
            else
5765
2.02M
                TY_(CheckAttributes)(doc, node);
5766
2.07M
        }
5767
5768
2.64M
        if (node->content)
5769
1.73M
            AttributeChecks(doc, node->content);
5770
5771
2.64M
        assert( next != node ); /* http://tidy.sf.net/issue/1603538 */
5772
2.64M
        node = next;
5773
2.64M
    }
5774
1.74M
}
5775
5776
5777
/**
5778
 *  Encloses naked text in certain elements within `p` tags.
5779
 *
5780
 *  <form>, <blockquote>, and <noscript> do not allow #PCDATA in
5781
 *  HTML 4.01 Strict (%block; model instead of %flow;).
5782
 */
5783
static void EncloseBlockText(TidyDocImpl* doc, Node* node)
5784
0
{
5785
0
    Node *next;
5786
0
    Node *block;
5787
5788
0
    while (node)
5789
0
    {
5790
0
        next = node->next;
5791
5792
0
        if (node->content)
5793
0
            EncloseBlockText(doc, node->content);
5794
5795
0
        if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) ||
5796
0
              nodeIsBLOCKQUOTE(node))
5797
0
            || !node->content)
5798
0
        {
5799
0
            node = next;
5800
0
            continue;
5801
0
        }
5802
5803
0
        block = node->content;
5804
5805
0
        if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) ||
5806
0
            (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block)))
5807
0
        {
5808
0
            Node* p = TY_(InferredTag)(doc, TidyTag_P);
5809
0
            TY_(InsertNodeBeforeElement)(block, p);
5810
0
            while (block &&
5811
0
                   (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block)))
5812
0
            {
5813
0
                Node* tempNext = block->next;
5814
0
                TY_(RemoveNode)(block);
5815
0
                TY_(InsertNodeAtEnd)(p, block);
5816
0
                block = tempNext;
5817
0
            }
5818
0
            TrimSpaces(doc, p);
5819
0
            continue;
5820
0
        }
5821
5822
0
        node = next;
5823
0
    }
5824
0
}
5825
5826
5827
/**
5828
 *  Encloses all naked body text within `p` tags.
5829
 */
5830
static void EncloseBodyText(TidyDocImpl* doc)
5831
0
{
5832
0
    Node* node;
5833
0
    Node* body = TY_(FindBody)(doc);
5834
5835
0
    if (!body)
5836
0
        return;
5837
5838
0
    node = body->content;
5839
5840
0
    while (node)
5841
0
    {
5842
0
        if ((TY_(nodeIsText)(node) && !TY_(IsBlank)(doc->lexer, node)) ||
5843
0
            (TY_(nodeIsElement)(node) && nodeCMIsOnlyInline(node)))
5844
0
        {
5845
0
            Node* p = TY_(InferredTag)(doc, TidyTag_P);
5846
0
            TY_(InsertNodeBeforeElement)(node, p);
5847
0
            while (node && (!TY_(nodeIsElement)(node) || nodeCMIsOnlyInline(node)))
5848
0
            {
5849
0
                Node* next = node->next;
5850
0
                TY_(RemoveNode)(node);
5851
0
                TY_(InsertNodeAtEnd)(p, node);
5852
0
                node = next;
5853
0
            }
5854
0
            TrimSpaces(doc, p);
5855
0
            continue;
5856
0
        }
5857
0
        node = node->next;
5858
0
    }
5859
0
}
5860
5861
5862
/**
5863
 *  Replaces elements that are obsolete with appropriate substitute tags.
5864
 */
5865
static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node)
5866
1.74M
{
5867
1.74M
    Node *next;
5868
5869
4.39M
    while (node)
5870
2.64M
    {
5871
2.64M
        next = node->next;
5872
5873
        /* if (nodeIsDIR(node) || nodeIsMENU(node)) */
5874
        /* HTML5 - <menu ... > is no longer obsolete */
5875
2.64M
        if (nodeIsDIR(node))
5876
573
            TY_(CoerceNode)(doc, node, TidyTag_UL, yes, yes);
5877
5878
2.64M
        if (nodeIsXMP(node) || nodeIsLISTING(node) ||
5879
2.63M
            (node->tag && node->tag->id == TidyTag_PLAINTEXT))
5880
13.9k
            TY_(CoerceNode)(doc, node, TidyTag_PRE, yes, yes);
5881
5882
2.64M
        if (node->content)
5883
1.73M
            ReplaceObsoleteElements(doc, node->content);
5884
5885
2.64M
        node = next;
5886
2.64M
    }
5887
1.74M
}
5888
5889
5890
/***************************************************************************//*
5891
 ** MARK: - Internal API Implementation
5892
 ***************************************************************************/
5893
5894
5895
/** MARK: TY_(CheckNodeIntegrity)
5896
 *  Is used to perform a node integrity check after parsing an HTML or XML
5897
 *  document.
5898
 *  @note Actual performance of this check can be disabled by defining the
5899
 *  macro NO_NODE_INTEGRITY_CHECK.
5900
 */
5901
Bool TY_(CheckNodeIntegrity)(Node *node)
5902
4.39M
{
5903
4.39M
#ifndef NO_NODE_INTEGRITY_CHECK
5904
4.39M
    Node *child;
5905
5906
4.39M
    if (node->prev)
5907
1.36M
    {
5908
1.36M
        if (node->prev->next != node)
5909
0
            return no;
5910
1.36M
    }
5911
5912
4.39M
    if (node->next)
5913
1.36M
    {
5914
1.36M
        if (node->next == node || node->next->prev != node)
5915
0
            return no;
5916
1.36M
    }
5917
5918
4.39M
    if (node->parent)
5919
4.35M
    {
5920
4.35M
        if (node->prev == NULL && node->parent->content != node)
5921
0
            return no;
5922
5923
4.35M
        if (node->next == NULL && node->parent->last != node)
5924
0
            return no;
5925
4.35M
    }
5926
5927
8.75M
    for (child = node->content; child; child = child->next)
5928
4.35M
        if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) )
5929
0
            return no;
5930
5931
4.39M
#endif
5932
4.39M
    return yes;
5933
4.39M
}
5934
5935
5936
/** MARK: TY_(CoerceNode)
5937
 *  Transforms a given node to another element, for example, from a <p>
5938
 *  to a <br>.
5939
 */
5940
void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
5941
23.1k
{
5942
23.1k
    const Dict* tag = TY_(LookupTagDef)(tid);
5943
23.1k
    Node* tmp = TY_(InferredTag)(doc, tag->id);
5944
5945
23.1k
    if (obsolete)
5946
14.5k
        TY_(Report)(doc, node, tmp, OBSOLETE_ELEMENT);
5947
8.59k
    else if (unexpected)
5948
0
        TY_(Report)(doc, node, tmp, REPLACING_UNEX_ELEMENT);
5949
8.59k
    else
5950
8.59k
        TY_(Report)(doc, node, tmp, REPLACING_ELEMENT);
5951
5952
23.1k
    TidyDocFree(doc, tmp->element);
5953
23.1k
    TidyDocFree(doc, tmp);
5954
5955
23.1k
    node->was = node->tag;
5956
23.1k
    node->tag = tag;
5957
23.1k
    node->type = StartTag;
5958
23.1k
    node->implicit = yes;
5959
23.1k
    TidyDocFree(doc, node->element);
5960
23.1k
    node->element = TY_(tmbstrdup)(doc->allocator, tag->name);
5961
23.1k
}
5962
5963
5964
/** MARK: TY_(DiscardElement)
5965
 *  Remove node from markup tree and discard it.
5966
 */
5967
Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element )
5968
416k
{
5969
416k
    Node *next = NULL;
5970
5971
416k
    if (element)
5972
416k
    {
5973
416k
        next = element->next;
5974
416k
        TY_(RemoveNode)(element);
5975
416k
        TY_(FreeNode)( doc, element);
5976
416k
    }
5977
5978
416k
    return next;
5979
416k
}
5980
5981
5982
/** MARK: TY_(DropEmptyElements)
5983
 *  Trims a tree of empty elements recursively, returning the next node.
5984
 */
5985
Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node)
5986
1.74M
{
5987
1.74M
    Node* next;
5988
5989
4.39M
    while (node)
5990
2.64M
    {
5991
2.64M
        next = node->next;
5992
5993
2.64M
        if (node->content)
5994
1.73M
            TY_(DropEmptyElements)(doc, node->content);
5995
5996
2.64M
        if (!TY_(nodeIsElement)(node) &&
5997
569k
            !(TY_(nodeIsText)(node) && !(node->start < node->end)))
5998
557k
        {
5999
557k
            node = next;
6000
557k
            continue;
6001
557k
        }
6002
6003
2.08M
        next = TY_(TrimEmptyElement)(doc, node);
6004
2.08M
        node = next;
6005
2.08M
    }
6006
6007
1.74M
    return node;
6008
1.74M
}
6009
6010
6011
/** MARK: TY_(InsertNodeAtStart)
6012
 *  Insert node into markup tree as the first element of content of element.
6013
 */
6014
void TY_(InsertNodeAtStart)(Node *element, Node *node)
6015
19.8k
{
6016
19.8k
    node->parent = element;
6017
6018
19.8k
    if (element->content == NULL)
6019
1.54k
        element->last = node;
6020
18.3k
    else
6021
18.3k
        element->content->prev = node;
6022
6023
19.8k
    node->next = element->content;
6024
19.8k
    node->prev = NULL;
6025
19.8k
    element->content = node;
6026
19.8k
}
6027
6028
6029
/** MARK: TY_(InsertNodeAtEnd)
6030
 *  Insert node into markup tree as the last element of content of element.
6031
 */
6032
void TY_(InsertNodeAtEnd)(Node *element, Node *node)
6033
2.48M
{
6034
2.48M
    node->parent = element;
6035
2.48M
    node->prev = element ? element->last : NULL;
6036
6037
2.48M
    if (element && element->last != NULL)
6038
730k
        element->last->next = node;
6039
1.75M
    else
6040
1.75M
        if (element)
6041
1.75M
            element->content = node;
6042
6043
2.48M
    if (element)
6044
2.48M
        element->last = node;
6045
2.48M
}
6046
6047
6048
/** MARK: TY_(InsertNodeBeforeElement)
6049
 *  Insert node into markup tree before element.
6050
 */
6051
void TY_(InsertNodeBeforeElement)(Node *element, Node *node)
6052
77.7k
{
6053
77.7k
    Node *parent;
6054
6055
77.7k
    parent = element ? element->parent : NULL;
6056
77.7k
    node->parent = parent;
6057
77.7k
    node->next = element;
6058
77.7k
    node->prev = element ? element->prev : NULL;
6059
77.7k
    if (element)
6060
68.7k
        element->prev = node;
6061
6062
77.7k
    if (node->prev)
6063
42.2k
        node->prev->next = node;
6064
6065
77.7k
    if (parent && parent->content == element)
6066
26.2k
        parent->content = node;
6067
77.7k
}
6068
6069
6070
/** MARK: TY_(InsertNodeAfterElement)
6071
 *  Insert node into markup tree after element.
6072
 */
6073
void TY_(InsertNodeAfterElement)(Node *element, Node *node)
6074
132k
{
6075
132k
    Node *parent;
6076
6077
132k
    parent = element->parent;
6078
132k
    node->parent = parent;
6079
6080
    /* AQ - 13 Jan 2000 fix for parent == NULL */
6081
132k
    if (parent != NULL && parent->last == element)
6082
14.4k
        parent->last = node;
6083
118k
    else
6084
118k
    {
6085
118k
        node->next = element->next;
6086
        /* AQ - 13 Jan 2000 fix for node->next == NULL */
6087
118k
        if (node->next != NULL)
6088
117k
            node->next->prev = node;
6089
118k
    }
6090
6091
132k
    element->next = node;
6092
132k
    node->prev = element;
6093
132k
}
6094
6095
6096
/** MARK: TY_(IsBlank)
6097
 *  Indicates whether or not a text node is blank, meaning that it consists
6098
 *  of nothing, or a single space.
6099
 */
6100
Bool TY_(IsBlank)(Lexer *lexer, Node *node)
6101
414
{
6102
414
    Bool isBlank = TY_(nodeIsText)(node);
6103
414
    if ( isBlank )
6104
0
        isBlank = ( node->end == node->start ||       /* Zero length */
6105
0
                   ( node->end == node->start+1      /* or one blank. */
6106
0
                    && lexer->lexbuf[node->start] == ' ' ) );
6107
    
6108
414
    return isBlank;
6109
414
}
6110
6111
6112
/** MARK: TY_(IsJavaScript)
6113
 *  Indicates whether or not a node is declared as containing javascript
6114
 *  code.
6115
 */
6116
Bool TY_(IsJavaScript)(Node *node)
6117
7.76k
{
6118
7.76k
    Bool result = no;
6119
7.76k
    AttVal *attr;
6120
6121
7.76k
    if (node->attributes == NULL)
6122
5.96k
        return yes;
6123
6124
3.72k
    for (attr = node->attributes; attr; attr = attr->next)
6125
2.38k
    {
6126
2.38k
        if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr))
6127
1.19k
             && AttrContains(attr, "javascript") )
6128
465
        {
6129
465
            result = yes;
6130
465
            break;
6131
465
        }
6132
2.38k
    }
6133
6134
1.79k
    return result;
6135
7.76k
}
6136
6137
6138
/** MARK: TY_(IsNewNode)
6139
 *  Used to check if a node uses CM_NEW, which determines how attributes
6140
 *  without values should be printed. This was introduced to deal with
6141
 *  user-defined tags e.g. ColdFusion.
6142
 */
6143
Bool TY_(IsNewNode)(Node *node)
6144
0
{
6145
0
    if (node && node->tag)
6146
0
    {
6147
0
        return (node->tag->model & CM_NEW);
6148
0
    }
6149
0
    return yes;
6150
0
}
6151
6152
6153
/** MARK: TY_(RemoveNode)
6154
 *  Extract a node and its children from a markup tree
6155
 */
6156
Node *TY_(RemoveNode)(Node *node)
6157
447k
{
6158
447k
    if (node->prev)
6159
175k
        node->prev->next = node->next;
6160
6161
447k
    if (node->next)
6162
208k
        node->next->prev = node->prev;
6163
6164
447k
    if (node->parent)
6165
440k
    {
6166
440k
        if (node->parent->content == node)
6167
264k
            node->parent->content = node->next;
6168
6169
440k
        if (node->parent->last == node)
6170
232k
            node->parent->last = node->prev;
6171
440k
    }
6172
6173
447k
    node->parent = node->prev = node->next = NULL;
6174
447k
    return node;
6175
447k
}
6176
6177
6178
/** MARK: TY_(TrimEmptyElement)
6179
 *  Trims a single, empty element, returning the next node.
6180
 */
6181
Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element )
6182
2.08M
{
6183
2.08M
    if ( CanPrune(doc, element) )
6184
413k
    {
6185
413k
        if (element->type != TextNode)
6186
401k
        {
6187
401k
            doc->footnotes |= FN_TRIM_EMPTY_ELEMENT;
6188
401k
            TY_(Report)(doc, element, NULL, TRIM_EMPTY_ELEMENT);
6189
401k
        }
6190
6191
413k
        return TY_(DiscardElement)(doc, element);
6192
413k
    }
6193
1.67M
    return element->next;
6194
2.08M
}
6195
6196
6197
/** MARK: TY_(XMLPreserveWhiteSpace)
6198
 *  Indicates whether or not whitespace is to be preserved in XHTML/XML
6199
 *  documents.
6200
 */
6201
Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element)
6202
0
{
6203
0
    AttVal *attribute;
6204
6205
    /* search attributes for xml:space */
6206
0
    for (attribute = element->attributes; attribute; attribute = attribute->next)
6207
0
    {
6208
0
        if (attrIsXML_SPACE(attribute))
6209
0
        {
6210
0
            if (AttrValueIs(attribute, "preserve"))
6211
0
                return yes;
6212
6213
0
            return no;
6214
0
        }
6215
0
    }
6216
6217
0
    if (element->element == NULL)
6218
0
        return no;
6219
        
6220
    /* kludge for html docs without explicit xml:space attribute */
6221
0
    if (nodeIsPRE(element)    ||
6222
0
        nodeIsSCRIPT(element) ||
6223
0
        nodeIsSTYLE(element)  ||
6224
0
        TY_(FindParser)(doc, element) == TY_(ParsePre))
6225
0
        return yes;
6226
6227
    /* kludge for XSL docs */
6228
0
    if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 )
6229
0
        return yes;
6230
6231
0
    return no;
6232
0
}
6233
6234
6235
/***************************************************************************//*
6236
 ** MARK: - Internal API Implementation - Main Parsers
6237
 ***************************************************************************/
6238
6239
6240
/** MARK: TY_(ParseDocument)
6241
 *  Parses an HTML document after lexing. It begins by properly configuring
6242
 *  the overall HTML structure, and subsequently processes all remaining
6243
 *  nodes.
6244
 */
6245
void TY_(ParseDocument)(TidyDocImpl* doc)
6246
16.9k
{
6247
16.9k
    Node *node, *html, *doctype = NULL;
6248
6249
50.7k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
6250
48.7k
    {
6251
48.7k
        if (node->type == XmlDecl)
6252
717
        {
6253
717
            doc->xmlDetected = yes;
6254
6255
717
            if (TY_(FindXmlDecl)(doc) && doc->root.content)
6256
632
            {
6257
632
                TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6258
632
                TY_(FreeNode)(doc, node);
6259
632
                continue;
6260
632
            }
6261
85
            if (node->line > 1 || node->column != 1)
6262
41
            {
6263
41
                TY_(Report)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL);
6264
41
            }
6265
85
        }
6266
6267
        /* deal with comments etc. */
6268
48.0k
        if (InsertMisc( &doc->root, node ))
6269
30.9k
            continue;
6270
6271
17.1k
        if (node->type == DocTypeTag)
6272
1.91k
        {
6273
1.91k
            if (doctype == NULL)
6274
1.27k
            {
6275
1.27k
                TY_(InsertNodeAtEnd)( &doc->root, node);
6276
1.27k
                doctype = node;
6277
1.27k
            }
6278
641
            else
6279
641
            {
6280
641
                TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6281
641
                TY_(FreeNode)( doc, node);
6282
641
            }
6283
1.91k
            continue;
6284
1.91k
        }
6285
6286
15.2k
        if (node->type == EndTag)
6287
327
        {
6288
327
            TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6289
327
            TY_(FreeNode)( doc, node);
6290
327
            continue;
6291
327
        }
6292
6293
14.9k
        if (node->type == StartTag && nodeIsHTML(node))
6294
46
        {
6295
46
            AttVal *xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS);
6296
6297
46
            if (AttrValueIs(xmlns, XHTML_NAMESPACE))
6298
0
            {
6299
0
                Bool htmlOut = cfgBool( doc, TidyHtmlOut );
6300
0
                doc->lexer->isvoyager = yes;                  /* Unless plain HTML */
6301
0
                TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/
6302
0
                TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut );   /* will be XHTML. */
6303
6304
                /* adjust other config options, just as in config.c */
6305
0
                if ( !htmlOut )
6306
0
                {
6307
0
                    TY_(SetOptionBool)( doc, TidyUpperCaseTags, no );
6308
0
                    TY_(SetOptionInt)( doc, TidyUpperCaseAttrs, no );
6309
0
                }
6310
0
            }
6311
46
        }
6312
6313
14.9k
        if ( node->type != StartTag || !nodeIsHTML(node) )
6314
14.8k
        {
6315
14.8k
            TY_(UngetToken)( doc );
6316
14.8k
            html = TY_(InferredTag)(doc, TidyTag_HTML);
6317
14.8k
        }
6318
46
        else
6319
46
            html = node;
6320
6321
        /*\
6322
         *  #72, avoid MISSING_DOCTYPE if show-body-only.
6323
         *  #191, also if --doctype omit, that is TidyDoctypeOmit
6324
         *  #342, adjust tags to html4-- if not 'auto' or 'html5'
6325
        \*/
6326
14.9k
        if (!TY_(FindDocType)(doc))
6327
14.2k
        {
6328
14.2k
            ulong dtmode = cfg( doc, TidyDoctypeMode );
6329
14.2k
            if ((dtmode != TidyDoctypeOmit) && !showingBodyOnly(doc))
6330
14.2k
                TY_(Report)(doc, NULL, NULL, MISSING_DOCTYPE);
6331
14.2k
            if ((dtmode != TidyDoctypeAuto) && (dtmode != TidyDoctypeHtml5))
6332
0
            {
6333
                /*\
6334
                 *  Issue #342 - if not doctype 'auto', or 'html5'
6335
                 *  then reset mode htm4-- parsing
6336
                \*/
6337
0
                TY_(AdjustTags)(doc); /* Dynamically modify the tags table to html4-- mode */
6338
0
            }
6339
14.2k
        }
6340
14.9k
        TY_(InsertNodeAtEnd)( &doc->root, html);
6341
14.9k
        ParseHTMLWithNode( doc, html );
6342
14.9k
        break;
6343
15.2k
    }
6344
6345
    /* do this before any more document fixes */
6346
16.9k
    if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 )
6347
0
        TY_(AccessibilityChecks)( doc );
6348
6349
16.9k
    if (!TY_(FindHTML)(doc))
6350
2.02k
    {
6351
        /* a later check should complain if <body> is empty */
6352
2.02k
        html = TY_(InferredTag)(doc, TidyTag_HTML);
6353
2.02k
        TY_(InsertNodeAtEnd)( &doc->root, html);
6354
2.02k
        ParseHTMLWithNode( doc, html );
6355
2.02k
    }
6356
6357
16.9k
    node = TY_(FindTITLE)(doc);
6358
16.9k
    if (!node)
6359
16.8k
    {
6360
16.8k
        Node* head = TY_(FindHEAD)(doc);
6361
        /* #72, avoid MISSING_TITLE_ELEMENT if show-body-only (but allow InsertNodeAtEnd to avoid new warning) */
6362
33.7k
        if (!showingBodyOnly(doc))
6363
16.8k
        {
6364
16.8k
            TY_(Report)(doc, head, NULL, MISSING_TITLE_ELEMENT);
6365
16.8k
        }
6366
16.8k
        TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE));
6367
16.8k
    }
6368
102
    else if (!node->content && !showingBodyOnly(doc))
6369
64
    {
6370
        /* Is #839 - warn node is blank in HTML5 */
6371
64
        if (TY_(IsHTML5Mode)(doc))
6372
55
        {
6373
55
            TY_(Report)(doc, node, NULL, BLANK_TITLE_ELEMENT);
6374
55
        }
6375
64
    }
6376
6377
16.9k
    AttributeChecks(doc, &doc->root);
6378
16.9k
    ReplaceObsoleteElements(doc, &doc->root);
6379
16.9k
    TY_(DropEmptyElements)(doc, &doc->root);
6380
16.9k
    CleanSpaces(doc, &doc->root);
6381
6382
16.9k
    if (cfgBool(doc, TidyEncloseBodyText))
6383
0
        EncloseBodyText(doc);
6384
16.9k
    if (cfgBool(doc, TidyEncloseBlockText))
6385
0
        EncloseBlockText(doc, &doc->root);
6386
16.9k
}
6387
6388
6389
/** MARK: TY_(ParseXMLDocument)
6390
 *  Parses the document using Tidy's XML parser.
6391
 */
6392
void TY_(ParseXMLDocument)(TidyDocImpl* doc)
6393
0
{
6394
0
    Node *node, *doctype = NULL;
6395
6396
0
    TY_(SetOptionBool)( doc, TidyXmlTags, yes );
6397
6398
0
    doc->xmlDetected = yes;
6399
6400
0
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
6401
0
    {
6402
        /* discard unexpected end tags */
6403
0
        if (node->type == EndTag)
6404
0
        {
6405
0
            TY_(Report)(doc, NULL, node, UNEXPECTED_ENDTAG);
6406
0
            TY_(FreeNode)( doc, node);
6407
0
            continue;
6408
0
        }
6409
6410
         /* deal with comments etc. */
6411
0
        if (InsertMisc( &doc->root, node))
6412
0
            continue;
6413
6414
0
        if (node->type == DocTypeTag)
6415
0
        {
6416
0
            if (doctype == NULL)
6417
0
            {
6418
0
                TY_(InsertNodeAtEnd)( &doc->root, node);
6419
0
                doctype = node;
6420
0
            }
6421
0
            else
6422
0
            {
6423
0
                TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6424
0
                TY_(FreeNode)( doc, node);
6425
0
            }
6426
0
            continue;
6427
0
        }
6428
6429
0
        if (node->type == StartEndTag)
6430
0
        {
6431
0
            TY_(InsertNodeAtEnd)( &doc->root, node);
6432
0
            continue;
6433
0
        }
6434
6435
       /* if start tag then parse element's content */
6436
0
        if (node->type == StartTag)
6437
0
        {
6438
0
            TY_(InsertNodeAtEnd)( &doc->root, node );
6439
0
            ParseHTMLWithNode( doc, node );
6440
0
            continue;
6441
0
        }
6442
6443
0
        TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6444
0
        TY_(FreeNode)( doc, node);
6445
0
    }
6446
6447
    /* ensure presence of initial <?xml version="1.0"?> */
6448
0
    if ( cfgBool(doc, TidyXmlDecl) )
6449
0
        TY_(FixXmlDecl)( doc );
6450
0
}