Coverage Report

Created: 2026-02-14 06:44

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tidy-html5/src/parser.c
Line
Count
Source
1
/* parser.c -- HTML Parser
2
3
  (c) 1998-2007 (W3C) MIT, ERCIM, Keio University
4
  See tidy.h for the copyright notice.
5
6
*/
7
8
#include "tidy-int.h"
9
#include "lexer.h"
10
#include "parser.h"
11
#include "message.h"
12
#include "clean.h"
13
#include "tags.h"
14
#include "tmbstr.h"
15
#include "sprtf.h"
16
17
18
/****************************************************************************//*
19
 ** MARK: - Configuration Options
20
 ***************************************************************************/
21
22
23
/**
24
 *  Issue #72  - Need to know to avoid error-reporting - no warning only if
25
 *               --show-body-only yes.
26
 *  Issue #132 - Likewise avoid warning if showing body only.
27
 */
28
15.2k
/* The whole expansion is parenthesized so the ternary cannot absorb
   neighbouring operators (e.g. `showingBodyOnly(doc) && x`). */
#define showingBodyOnly(doc) ((cfgAutoBool(doc,TidyBodyOnly) == TidyYesState) ? yes : no)
29
30
31
/****************************************************************************//*
32
 ** MARK: - Forward Declarations
33
 ***************************************************************************/
34
35
36
static Node* ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode);
37
38
39
/****************************************************************************//*
40
 ** MARK: - Node Operations
41
 ***************************************************************************/
42
43
44
/**
45
 *  Generalised search for duplicate elements.
46
 *  Issue #166 - repeated <main> element.
47
 */
48
static Bool findNodeWithId( Node *node, TidyTagId tid )
49
264
{
50
264
    Node *content;
51
616
    while (node)
52
352
    {
53
352
        if (TagIsId(node,tid))
54
0
            return yes;
55
        /*\
56
         *   Issue #459 - Under certain circumstances, with many node this use of
57
         *   'for (content = node->content; content; content = content->content)'
58
         *   would produce a **forever** circle, or at least a very extended loop...
59
         *   It is sufficient to test the content, if it exists,
60
         *   to quickly iterate all nodes. Now all nodes are tested only once.
61
        \*/
62
352
        content = node->content;
63
352
        if (content)
64
176
        {
65
176
            if ( findNodeWithId(content,tid) )
66
0
                return yes;
67
176
        }
68
352
        node = node->next;
69
352
    }
70
264
    return no;
71
264
}
72
73
74
/**
75
 *  Perform a global search for an element.
76
 *  Issue #166 - repeated <main> element
77
 */
78
static Bool findNodeById( TidyDocImpl* doc, TidyTagId tid )
79
88
{
80
88
    Node *node = (doc ? doc->root.content : NULL);
81
88
    return findNodeWithId( node,tid );
82
88
}
83
84
85
/**
86
 *  Inserts node into element at an appropriate location based
87
 *  on the type of node being inserted.
88
 */
89
static Bool InsertMisc(Node *element, Node *node)
90
855k
{
91
855k
    if (node->type == CommentTag ||
92
854k
        node->type == ProcInsTag ||
93
842k
        node->type == CDATATag ||
94
842k
        node->type == SectionTag ||
95
839k
        node->type == AspTag ||
96
839k
        node->type == JsteTag ||
97
838k
        node->type == PhpTag )
98
16.4k
    {
99
16.4k
        TY_(InsertNodeAtEnd)(element, node);
100
16.4k
        return yes;
101
16.4k
    }
102
103
838k
    if ( node->type == XmlDecl )
104
302
    {
105
302
        Node* root = element;
106
1.97k
        while ( root && root->parent )
107
1.67k
            root = root->parent;
108
302
        if ( root && !(root->content && root->content->type == XmlDecl))
109
97
        {
110
97
          TY_(InsertNodeAtStart)( root, node );
111
97
          return yes;
112
97
        }
113
302
    }
114
115
    /* Declared empty tags seem to be slipping through
116
    ** the cracks.  This is an experiment to figure out
117
    ** a decent place to pick them up.
118
    */
119
838k
    if ( node->tag &&
120
756k
         TY_(nodeIsElement)(node) &&
121
708k
         TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN &&
122
0
         (node->tag->versions & VERS_PROPRIETARY) != 0 )
123
0
    {
124
0
        TY_(InsertNodeAtEnd)(element, node);
125
0
        return yes;
126
0
    }
127
128
838k
    return no;
129
838k
}
130
131
132
/**
133
 *  Insert "node" into markup tree in place of "element"
134
 *  which is moved to become the child of the node
135
 */
136
static void InsertNodeAsParent(Node *element, Node *node)
137
329
{
138
329
    node->content = element;
139
329
    node->last = element;
140
329
    node->parent = element->parent;
141
329
    element->parent = node;
142
143
329
    if (node->parent->content == element)
144
155
        node->parent->content = node;
145
146
329
    if (node->parent->last == element)
147
157
        node->parent->last = node;
148
149
329
    node->prev = element->prev;
150
329
    element->prev = NULL;
151
152
329
    if (node->prev)
153
174
        node->prev->next = node;
154
155
329
    node->next = element->next;
156
329
    element->next = NULL;
157
158
329
    if (node->next)
159
172
        node->next->prev = node;
160
329
}
161
162
163
/**
164
 *  Unexpected content in table row is moved to just before the table in
165
 *  in accordance with Netscape and IE. This code assumes that node hasn't
166
 *  been inserted into the row.
167
 */
168
static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row,
169
                            Node *node )
170
15.3k
{
171
15.3k
    Node *table;
172
173
    /* first find the table element */
174
30.6M
    for (table = row->parent; table; table = table->parent)
175
30.6M
    {
176
30.6M
        if ( nodeIsTABLE(table) )
177
9.40k
        {
178
9.40k
            TY_(InsertNodeBeforeElement)( table, node );
179
9.40k
            return;
180
9.40k
        }
181
30.6M
    }
182
    /* No table element */
183
5.90k
    TY_(InsertNodeBeforeElement)( row->parent, node );
184
5.90k
}
185
186
187
/**
188
 *  Moves given node to end of body element.
189
 */
190
static void MoveNodeToBody( TidyDocImpl* doc, Node* node )
191
686
{
192
686
    Node* body = TY_(FindBody)( doc );
193
686
    if ( body )
194
663
    {
195
663
        TY_(RemoveNode)( node );
196
663
        TY_(InsertNodeAtEnd)( body, node );
197
663
    }
198
686
}
199
200
201
/**
202
 *  Move node to the head, where element is used as starting
203
 *  point in hunt for head. Normally called during parsing.
204
 */
205
static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node )
206
1.03k
{
207
1.03k
    Node *head = NULL;
208
209
1.03k
    TY_(RemoveNode)( node );  /* make sure that node is isolated */
210
211
1.03k
    if ( TY_(nodeIsElement)(node) )
212
888
    {
213
888
        TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN );
214
215
888
        head = TY_(FindHEAD)(doc);
216
888
        assert(head != NULL);
217
218
888
        TY_(InsertNodeAtEnd)(head, node);
219
220
888
        if ( node->tag->parser )
221
888
        {
222
            /* Only one of the existing test cases as of 2021-08-14 invoke
223
               MoveToHead, and it doesn't go deeper than one level. The
224
               parser() call is supposed to return a node if additional
225
               parsing is needed. Keep this in mind if we start to get bug
226
               reports.
227
             */
228
888
            Parser* parser = node->tag->parser;
229
888
            parser( doc, node, IgnoreWhitespace );
230
888
        }
231
888
    }
232
142
    else
233
142
    {
234
142
        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
235
142
        TY_(FreeNode)( doc, node );
236
142
    }
237
1.03k
}
238
239
240
/***************************************************************************//*
241
 ** MARK: - Decision Making
242
 ***************************************************************************/
243
244
245
/**
246
 *  Indicates whether or not element can be pruned based on content,
247
 *  user settings, etc.
248
 */
249
static Bool CanPrune( TidyDocImpl* doc, Node *element )
250
438k
{
251
438k
    if ( !cfgBool(doc, TidyDropEmptyElems) )
252
0
        return no;
253
254
438k
    if ( TY_(nodeIsText)(element) )
255
573
        return yes;
256
257
437k
    if ( element->content )
258
305k
        return no;
259
260
132k
    if ( element->tag == NULL )
261
187
        return no;
262
263
131k
    if ( element->tag->model & CM_BLOCK && element->attributes != NULL )
264
4.51k
        return no;
265
266
127k
    if ( nodeIsA(element) && element->attributes != NULL )
267
865
        return no;
268
269
126k
    if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) )
270
0
        return no;
271
272
126k
    if ( element->tag->model & CM_ROW )
273
1.45k
        return no;
274
275
125k
    if ( element->tag->model & CM_EMPTY )
276
7.95k
        return no;
277
278
117k
    if ( nodeIsAPPLET(element) )
279
0
        return no;
280
281
117k
    if ( nodeIsOBJECT(element) )
282
296
        return no;
283
284
116k
    if ( nodeIsSCRIPT(element) && attrGetSRC(element) )
285
0
        return no;
286
287
116k
    if ( nodeIsTITLE(element) )
288
469
        return no;
289
290
    /* #433359 - fix by Randy Waki 12 Mar 01 */
291
116k
    if ( nodeIsIFRAME(element) )
292
9
        return no;
293
294
    /* fix for bug 770297 */
295
116k
    if (nodeIsTEXTAREA(element))
296
1.14k
        return no;
297
298
    /* fix for ISSUE #7 https://github.com/w3c/tidy-html5/issues/7 */
299
115k
    if (nodeIsCANVAS(element))
300
0
        return no;
301
    
302
115k
    if (nodeIsPROGRESS(element))
303
0
        return no;
304
305
115k
    if ( attrGetID(element) || attrGetNAME(element) )
306
555
        return no;
307
308
    /* fix for bug 695408; a better fix would look for unknown and    */
309
    /* known proprietary attributes that make the element significant */
310
114k
    if (attrGetDATAFLD(element))
311
0
        return no;
312
313
    /* fix for bug 723772, don't trim new-...-tags */
314
114k
    if (element->tag->id == TidyTag_UNKNOWN)
315
0
        return no;
316
317
114k
    if (nodeIsBODY(element))
318
1.74k
        return no;
319
320
112k
    if (nodeIsCOLGROUP(element))
321
1.10k
        return no;
322
323
    /* HTML5 - do NOT drop empty option if it has attributes */
324
111k
    if ( nodeIsOPTION(element) && element->attributes != NULL )
325
1
        return no;
326
327
    /* fix for #103 - don't drop empty dd tags lest document not validate */
328
111k
    if (nodeIsDD(element))
329
734
        return no;
330
331
111k
    return yes;
332
111k
}
333
334
335
/**
336
 *  Indicates whether or not node is a descendant of a tag of the given tid.
337
 */
338
static Bool DescendantOf( Node *element, TidyTagId tid )
339
18.9k
{
340
18.9k
    Node *parent;
341
18.9k
    for ( parent = element->parent;
342
6.46M
         parent != NULL;
343
6.44M
         parent = parent->parent )
344
6.44M
    {
345
6.44M
        if ( TagIsId(parent, tid) )
346
3.75k
            return yes;
347
6.44M
    }
348
15.2k
    return no;
349
18.9k
}
350
351
352
/**
353
 *  Indicates whether or not node is a descendant of a pre tag.
354
 */
355
static Bool IsPreDescendant(Node* node)
356
272k
{
357
272k
    Node *parent = node->parent;
358
359
539M
    while (parent)
360
538M
    {
361
538M
        if (parent->tag && parent->tag->parser == TY_(ParsePre))
362
4.72k
            return yes;
363
364
538M
        parent = parent->parent;
365
538M
    }
366
367
267k
    return no;
368
272k
}
369
370
371
/**
372
 *  Indicates whether or not the only content model for the given node
373
 *  is CM_INLINE.
374
 */
375
static Bool nodeCMIsOnlyInline( Node* node )
376
0
{
377
0
    return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK );
378
0
}
379
380
381
/**
382
 *  Indicates whether or not the content of the given node is acceptable
383
 *  content for pre elements
384
 */
385
static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node )
386
11.0k
{
387
    /* p is coerced to br's, Text OK too */
388
11.0k
    if ( nodeIsP(node) || TY_(nodeIsText)(node) )
389
611
        return yes;
390
391
10.4k
    if ( node->tag == NULL ||
392
10.4k
         nodeIsPARAM(node) ||
393
10.4k
         !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) )
394
9.37k
        return no;
395
396
1.05k
    return yes;
397
10.4k
}
398
399
400
/**
401
 *  Indicates whether or not leading whitespace should be cleaned.
402
 */
403
static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node)
404
30.9k
{
405
30.9k
    if (!TY_(nodeIsText)(node))
406
0
        return no;
407
408
30.9k
    if (node->parent->type == DocTypeTag)
409
0
        return no;
410
411
30.9k
    if (IsPreDescendant(node))
412
1.07k
        return no;
413
414
29.8k
    if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
415
497
        return no;
416
    
417
    /* #523, prevent blank spaces after script if the next item is script.
418
     * This is actually more generalized as, if the preceding element is
419
     * a body level script, then indicate that we want to clean leading
420
     * whitespace.
421
     */
422
29.3k
    if ( node->prev && nodeIsSCRIPT(node->prev) && nodeIsBODY(node->prev->parent) )
423
132
        return yes;
424
425
    /* <p>...<br> <em>...</em>...</p> */
426
29.2k
    if (nodeIsBR(node->prev))
427
10
        return yes;
428
429
    /* <p> ...</p> */
430
29.2k
    if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE))
431
4.39k
        return yes;
432
433
    /* <h4>...</h4> <em>...</em> */
434
24.8k
    if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) &&
435
15.6k
        TY_(nodeIsElement)(node->prev))
436
1.31k
        return yes;
437
438
    /* <p><span> ...</span></p> */
439
23.5k
    if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE))
440
1.25k
        return yes;
441
442
22.3k
    return no;
443
23.5k
}
444
445
446
/**
447
 *  Indicates whether or not trailing whitespace should be cleaned.
448
 */
449
static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node)
450
30.9k
{
451
30.9k
    Node* next;
452
453
30.9k
    if (!TY_(nodeIsText)(node))
454
0
        return no;
455
456
30.9k
    if (node->parent->type == DocTypeTag)
457
0
        return no;
458
459
30.9k
    if (IsPreDescendant(node))
460
1.07k
        return no;
461
462
29.8k
    if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript))
463
497
        return no;
464
465
    /* #523, prevent blank spaces after script if the next item is script.
466
     * This is actually more generalized as, if the next element is
467
     * a body level script, then indicate that we want to clean trailing
468
     * whitespace.
469
     */
470
29.3k
    if ( node->next && nodeIsSCRIPT(node->next) && nodeIsBODY(node->next->parent) )
471
21
        return yes;
472
473
29.3k
    next = node->next;
474
475
    /* <p>... </p> */
476
29.3k
    if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE))
477
3.33k
        return yes;
478
479
    /* <div><small>... </small><h3>...</h3></div> */
480
26.0k
    if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE))
481
2.00k
        return yes;
482
483
24.0k
    if (!next)
484
3.87k
        return no;
485
486
20.1k
    if (nodeIsBR(next))
487
22
        return yes;
488
489
20.1k
    if (TY_(nodeHasCM)(next, CM_INLINE))
490
3.37k
        return no;
491
492
    /* <a href='/'>...</a> <p>...</p> */
493
16.7k
    if (next->type == StartTag)
494
2.20k
        return yes;
495
496
    /* <strong>...</strong> <hr /> */
497
14.5k
    if (next->type == StartEndTag)
498
6
        return yes;
499
500
    /* evil adjacent text nodes, Tidy should not generate these :-( */
501
14.5k
    if (TY_(nodeIsText)(next) && next->start < next->end
502
9.70k
        && TY_(IsWhite)(doc->lexer->lexbuf[next->start]))
503
2.52k
        return yes;
504
505
12.0k
    return no;
506
14.5k
}
507
508
509
/***************************************************************************//*
510
 ** MARK: - Information Accumulation
511
 ***************************************************************************/
512
513
514
/**
515
 *  Errors in positioning of form start or end tags
516
 *  generally require human intervention to fix.
517
 *  Issue #166 - repeated <main> element also uses this flag
518
 *  to indicate duplicates, discarded.
519
 */
520
static void BadForm( TidyDocImpl* doc )
521
445
{
522
445
    doc->badForm |= flg_BadForm;
523
445
}
524
525
526
/***************************************************************************//*
527
 ** MARK: - Fixes and Touchup
528
 ***************************************************************************/
529
530
531
/**
532
 *  Adds style information as a class in the document or a property
533
 *  of the node to prevent indentation of inferred UL tags.
534
 */
535
static void AddClassNoIndent( TidyDocImpl* doc, Node *node )
536
334
{
537
334
    ctmbstr sprop =
538
334
    "padding-left: 2ex; margin-left: 0ex"
539
334
    "; margin-top: 0ex; margin-bottom: 0ex";
540
334
    if ( !cfgBool(doc, TidyDecorateInferredUL) )
541
334
        return;
542
0
    if ( cfgBool(doc, TidyMakeClean) )
543
0
        TY_(AddStyleAsClass)( doc, node, sprop );
544
0
    else
545
0
        TY_(AddStyleProperty)( doc, node, sprop );
546
0
}
547
548
549
/**
550
 *  Cleans whitespace from text nodes, and drops such nodes if emptied
551
 *  completely as a result.
552
 */
553
static void CleanSpaces(TidyDocImpl* doc, Node* node)
554
352
{
555
352
    Stack *stack = TY_(newStack)(doc, 16);
556
352
    Node *next;
557
    
558
373k
    while (node)
559
373k
    {
560
373k
        next = node->next;
561
562
373k
        if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node))
563
7.26k
            while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
564
170
                ++(node->start);
565
566
373k
        if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node))
567
14.0k
            while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
568
3.93k
                --(node->end);
569
570
373k
        if (TY_(nodeIsText)(node) && !(node->start < node->end))
571
1.35k
        {
572
1.35k
            TY_(RemoveNode)(node);
573
1.35k
            TY_(FreeNode)(doc, node);
574
1.35k
            node = next ? next : TY_(pop)(stack);
575
1.35k
            continue;
576
1.35k
        }
577
578
371k
        if (node->content)
579
305k
        {
580
305k
            TY_(push)(stack, next);
581
305k
            node = node->content;
582
305k
            continue;
583
305k
        }
584
585
66.2k
        node = next ? next : TY_(pop)(stack);
586
66.2k
    }
587
352
    TY_(freeStack)(stack);
588
352
}
589
590
591
/**
592
 *  If a table row is empty then insert an empty cell. This practice is
593
 *  consistent with browser behavior and avoids potential problems with
594
 *  row spanning cells.
595
 */
596
static void FixEmptyRow(TidyDocImpl* doc, Node *row)
597
1.52k
{
598
1.52k
    Node *cell;
599
600
1.52k
    if (row->content == NULL)
601
1.27k
    {
602
1.27k
        cell = TY_(InferredTag)(doc, TidyTag_TD);
603
1.27k
        TY_(InsertNodeAtEnd)(row, cell);
604
1.27k
        TY_(Report)(doc, row, cell, MISSING_STARTTAG);
605
1.27k
    }
606
1.52k
}
607
608
609
/**
610
 *  The doctype has been found after other tags,
611
 *  and needs moving to before the html element
612
 */
613
static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype )
614
1.13k
{
615
1.13k
    Node* existing = TY_(FindDocType)( doc );
616
1.13k
    if ( existing )
617
2
    {
618
2
        TY_(Report)(doc, element, doctype, DISCARDING_UNEXPECTED );
619
2
        TY_(FreeNode)( doc, doctype );
620
2
    }
621
1.13k
    else
622
1.13k
    {
623
1.13k
        TY_(Report)(doc, element, doctype, DOCTYPE_AFTER_TAGS );
624
2.26k
        while ( !nodeIsHTML(element) )
625
1.13k
            element = element->parent;
626
1.13k
        TY_(InsertNodeBeforeElement)( element, doctype );
627
1.13k
    }
628
1.13k
}
629
630
631
/**
632
 *  This maps
633
 *     <p>hello<em> world</em>
634
 *  to
635
 *     <p>hello <em>world</em>
636
 *
637
 *  Trims initial space, by moving it before the
638
 *  start tag, or if this element is the first in
639
 *  parent's content, then by discarding the space
640
 */
641
static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text )
642
12.6k
{
643
12.6k
    Lexer* lexer = doc->lexer;
644
12.6k
    Node *prev, *node;
645
646
12.6k
    if ( TY_(nodeIsText)(text) &&
647
12.6k
         lexer->lexbuf[text->start] == ' ' &&
648
1.03k
         text->start < text->end )
649
1.03k
    {
650
1.03k
        if ( (element->tag->model & CM_INLINE) &&
651
670
             !(element->tag->model & CM_FIELD) )
652
310
        {
653
310
            prev = element->prev;
654
655
310
            if (TY_(nodeIsText)(prev))
656
63
            {
657
63
                if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ')
658
41
                    lexer->lexbuf[(prev->end)++] = ' ';
659
660
63
                ++(element->start);
661
63
            }
662
247
            else /* create new node */
663
247
            {
664
247
                node = TY_(NewNode)(lexer->allocator, lexer);
665
247
                node->start = (element->start)++;
666
247
                node->end = element->start;
667
247
                lexer->lexbuf[node->start] = ' ';
668
247
                TY_(InsertNodeBeforeElement)(element ,node);
669
247
                DEBUG_LOG(SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n",
670
247
                    (element->element ? element->element : "unknown")));
671
247
            }
672
310
        }
673
674
        /* discard the space in current node */
675
1.03k
        ++(text->start);
676
1.03k
    }
677
12.6k
}
678
679
680
/**
681
 *  This maps
682
 *     <em>hello </em><strong>world</strong>
683
 *  to
684
 *     <em>hello</em> <strong>world</strong>
685
 *
686
 *  If last child of element is a text node
687
 *  then trim trailing white space character
688
 *  moving it to after element's end tag.
689
 */
690
static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last )
691
10.3k
{
692
10.3k
    Lexer* lexer = doc->lexer;
693
10.3k
    byte c;
694
695
10.3k
    if (TY_(nodeIsText)(last))
696
10.3k
    {
697
10.3k
        if (last->end > last->start)
698
10.3k
        {
699
10.3k
            c = (byte) lexer->lexbuf[ last->end - 1 ];
700
701
10.3k
            if ( c == ' ' )
702
459
            {
703
459
                last->end -= 1;
704
459
                if ( (element->tag->model & CM_INLINE) &&
705
445
                     !(element->tag->model & CM_FIELD) )
706
107
                    lexer->insertspace = yes;
707
459
            }
708
10.3k
        }
709
10.3k
    }
710
10.3k
}
711
712
713
/**
714
 *  Move initial and trailing space out.
715
 *  This routine maps:
716
 *     hello<em> world</em>
717
 *  to
718
 *     hello <em>world</em>
719
 *  and
720
 *     <em>hello </em><strong>world</strong>
721
 *  to
722
 *     <em>hello</em> <strong>world</strong>
723
 */
724
static void TrimSpaces( TidyDocImpl* doc, Node *element)
725
210k
{
726
210k
    Node* text = element->content;
727
728
210k
    if (nodeIsPRE(element) || IsPreDescendant(element))
729
3.23k
        return;
730
731
207k
    if (TY_(nodeIsText)(text))
732
12.5k
        TrimInitialSpace(doc, element, text);
733
734
207k
    text = element->last;
735
736
207k
    if (TY_(nodeIsText)(text))
737
10.3k
        TrimTrailingSpace(doc, element, text);
738
207k
}
739
740
741
/***************************************************************************//*
742
 ** MARK: - Parsers Support
743
 ***************************************************************************/
744
745
746
/**
747
 *  Structure used by FindDescendant_cb.
748
 */
749
struct MatchingDescendantData
750
{
751
    Node *found_node;
752
    Bool *passed_marker_node;
753
754
    /* input: */
755
    TidyTagId matching_tagId;
756
    Node *node_to_find;
757
    Node *marker_node;
758
};
759
760
761
/**
762
 *  The main engine for FindMatchingDescendant.
763
 */
764
static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate)
765
5.19k
{
766
5.19k
    struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate;
767
768
5.19k
    if (TagId(node) == cb_data->matching_tagId)
769
630
    {
770
        /* make sure we match up 'unknown' tags exactly! */
771
630
        if (cb_data->matching_tagId != TidyTag_UNKNOWN ||
772
165
            (node->element != NULL &&
773
116
            cb_data->node_to_find != NULL &&
774
116
            cb_data->node_to_find->element != NULL &&
775
116
            0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element)))
776
467
        {
777
467
            cb_data->found_node = node;
778
467
            return ExitTraversal;
779
467
        }
780
630
    }
781
782
4.73k
    if (cb_data->passed_marker_node && node == cb_data->marker_node)
783
0
        *cb_data->passed_marker_node = yes;
784
785
4.73k
    return VisitParent;
786
5.19k
}
787
788
789
/**
790
 *  Search the parent chain (from `parent` upwards up to the root) for a node
791
 *  matching the given 'node'.
792
 *
793
 *  When the search passes beyond the `marker_node` (which is assumed to sit
794
 *  in the parent chain), this will be flagged by setting the boolean
795
 *  referenced by `is_parent_of_marker` to `yes`.
796
 *
797
 *  'is_parent_of_marker' and 'marker_node' are optional parameters and may
798
 *  be NULL.
799
 */
800
static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker )
801
705
{
802
705
    struct MatchingDescendantData cb_data = { 0 };
803
705
    cb_data.matching_tagId = TagId(node);
804
705
    cb_data.node_to_find = node;
805
705
    cb_data.marker_node = marker_node;
806
807
705
    assert(node);
808
809
705
    if (is_parent_of_marker)
810
705
        *is_parent_of_marker = no;
811
812
705
    TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data);
813
705
    return cb_data.found_node;
814
705
}
815
816
817
/**
818
 *   Finds the last list item for the given list, providing it in the
819
 *   in-out parameter. Returns yes or no if the item was the last list
820
 *   item.
821
 */
822
static Bool FindLastLI( Node *list, Node **lastli )
823
64.4k
{
824
64.4k
    Node *node;
825
826
64.4k
    *lastli = NULL;
827
64.5k
    for ( node = list->content; node ; node = node->next )
828
130
        if ( nodeIsLI(node) && node->type == StartTag )
829
2
            *lastli=node;
830
64.4k
    return *lastli ? yes:no;
831
64.4k
}
832
833
834
/***************************************************************************//*
835
 ** MARK: - Parser Stack
836
 ***************************************************************************/
837
838
839
/**
840
 *  Allocates and initializes the parser's stack.
841
 */
842
void TY_(InitParserStack)( TidyDocImpl* doc )
843
402
{
844
402
    enum { default_size = 32 };
845
402
    TidyParserMemory *content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * default_size );
846
847
402
    doc->stack.content = content;
848
402
    doc->stack.size = default_size;
849
402
    doc->stack.top = -1;
850
402
}
851
852
853
/**
854
 *  Frees the parser's stack when done.
855
 */
856
void TY_(FreeParserStack)( TidyDocImpl* doc )
857
402
{
858
402
    TidyFree( doc->allocator, doc->stack.content );
859
860
402
    doc->stack.content = NULL;
861
402
    doc->stack.size = 0;
862
402
    doc->stack.top = -1;
863
402
}
864
865
866
/**
867
 *  Increase the stack size.
868
 */
869
static void growParserStack( TidyDocImpl* doc )
870
584
{
871
584
    TidyParserMemory *content;
872
584
    content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * doc->stack.size * 2 );
873
874
584
    memcpy( content, doc->stack.content, sizeof(TidyParserMemory) * (doc->stack.top + 1) );
875
584
    TidyFree(doc->allocator, doc->stack.content);
876
877
584
    doc->stack.content = content;
878
584
    doc->stack.size = doc->stack.size * 2;
879
584
}
880
881
882
/**
883
 *  Indicates whether or not the stack is empty.
884
 */
885
Bool TY_(isEmptyParserStack)( TidyDocImpl* doc )
886
1.00M
{
887
1.00M
    return doc->stack.top < 0;
888
1.00M
}
889
890
891
/**
892
 *  Peek at the parser memory.
893
 */
894
TidyParserMemory TY_(peekMemory)( TidyDocImpl* doc )
895
0
{
896
0
    return doc->stack.content[doc->stack.top];
897
0
}
898
899
900
/**
901
 *  Peek at the parser memory "identity" field. This is just a convenience
902
 *  to avoid having to create a new struct instance in the caller.
903
 */
904
Parser* TY_(peekMemoryIdentity)( TidyDocImpl* doc )
905
501k
{
906
501k
    return doc->stack.content[doc->stack.top].identity;
907
501k
}
908
909
910
/**
911
 *  Peek at the parser memory "mode" field. This is just a convenience
912
 *  to avoid having to create a new struct instance in the caller.
913
 */
914
GetTokenMode TY_(peekMemoryMode)( TidyDocImpl* doc )
915
1.10k
{
916
1.10k
    return doc->stack.content[doc->stack.top].mode;
917
1.10k
}
918
919
920
/**
921
 *  Pop out a parser memory.
922
 */
923
TidyParserMemory TY_(popMemory)( TidyDocImpl* doc )
924
501k
{
925
501k
    if ( !TY_(isEmptyParserStack)( doc ) )
926
501k
    {
927
501k
        TidyParserMemory data = doc->stack.content[doc->stack.top];
928
501k
        DEBUG_LOG(SPRTF("\n"
929
501k
                        "<--POP  original: %s @ %p\n"
930
501k
                        "         reentry: %s @ %p\n"
931
501k
                        "     stack depth: %lu @ %p\n"
932
501k
                        "            mode: %u\n"
933
501k
                        "      register 1: %i\n"
934
501k
                        "      register 2: %i\n\n",
935
501k
                        data.original_node ? data.original_node->element : "none", data.original_node,
936
501k
                        data.reentry_node ? data.reentry_node->element : "none", data.reentry_node,
937
501k
                        doc->stack.top, &doc->stack.content[doc->stack.top],
938
501k
                        data.mode,
939
501k
                        data.register_1,
940
501k
                        data.register_2
941
501k
                        ));
942
501k
        doc->stack.top = doc->stack.top - 1;
943
501k
        return data;
944
501k
    }
945
0
    TidyParserMemory blank = { NULL };
946
0
    return blank;
947
501k
}
948
949
950
/**
951
 * Push the parser memory to the stack.
952
 */
953
void TY_(pushMemory)( TidyDocImpl* doc, TidyParserMemory data )
954
549k
{
955
549k
    if ( doc->stack.top == doc->stack.size - 1 )
956
584
        growParserStack( doc );
957
958
549k
    doc->stack.top++;
959
    
960
549k
    doc->stack.content[doc->stack.top] = data;
961
549k
    DEBUG_LOG(SPRTF("\n"
962
549k
                    "-->PUSH original: %s @ %p\n"
963
549k
                    "         reentry: %s @ %p\n"
964
549k
                    "     stack depth: %lu @ %p\n"
965
549k
                    "            mode: %u\n"
966
549k
                    "      register 1: %i\n"
967
549k
                    "      register 2: %i\n\n",
968
549k
                    data.original_node ? data.original_node->element : "none", data.original_node,
969
549k
                    data.reentry_node ? data.reentry_node->element : "none", data.reentry_node,
970
549k
                    doc->stack.top, &doc->stack.content[doc->stack.top],
971
549k
                    data.mode,
972
549k
                    data.register_1,
973
549k
                    data.register_2
974
549k
                    ));
975
549k
}
976
977
978
/***************************************************************************//*
979
 ** MARK: Convenience Logging Macros
980
 ***************************************************************************/
981
982
983
/*
 * Tracing helpers used by the parsers. They expand to SPRTF() calls when
 * ENABLE_DEBUG_LOG is defined and to nothing otherwise. The logging variants
 * expect a local variable named `mode` to be in scope, and the counters
 * declared by DEBUG_LOG_COUNTERS to have been declared in the same function.
 *
 * DEBUG_LOG_NODE_NAME never hands a NULL pointer to a `%s` conversion: a
 * NULL node, or a node without an element name, is logged as "(null)".
 */
#if defined(ENABLE_DEBUG_LOG)
#  define DEBUG_LOG_NODE_NAME(NODE) \
     (((NODE) != NULL && (NODE)->element != NULL) ? (NODE)->element : "(null)")
#  define DEBUG_LOG_COUNTERS \
     static int depth_parser = 0;\
     static int count_parser = 0;\
     int old_mode = IgnoreWhitespace;
#  define DEBUG_LOG_GET_OLD_MODE old_mode = mode;
#  define DEBUG_LOG_REENTER_WITH_NODE(NODE) SPRTF("\n>>>Re-Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, __LINE__, DEBUG_LOG_NODE_NAME(NODE), mode, ++depth_parser, ++count_parser);
#  define DEBUG_LOG_ENTER_WITH_NODE(NODE) SPRTF("\n>>>Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, __LINE__, DEBUG_LOG_NODE_NAME(NODE), mode, ++depth_parser, ++count_parser);
#  define DEBUG_LOG_CHANGE_MODE SPRTF("+++%s-%u Changing mode to %u (was %u)\n", __FUNCTION__, __LINE__, mode, old_mode);
#  define DEBUG_LOG_GOT_TOKEN(NODE) SPRTF("---%s-%u got token '%s' with mode '%u'.\n", __FUNCTION__, __LINE__, DEBUG_LOG_NODE_NAME(NODE), mode);
#  define DEBUG_LOG_EXIT_WITH_NODE(NODE) SPRTF("<<<Exit %s-%u with a node to parse: '%s', depth: %d\n", __FUNCTION__, __LINE__, DEBUG_LOG_NODE_NAME(NODE), depth_parser--);
#  define DEBUG_LOG_EXIT SPRTF("<<<Exit %s-%u, depth: %d\n", __FUNCTION__, __LINE__, depth_parser--);
#else
#  define DEBUG_LOG_COUNTERS
#  define DEBUG_LOG_GET_OLD_MODE
#  define DEBUG_LOG_REENTER_WITH_NODE(NODE)
#  define DEBUG_LOG_ENTER_WITH_NODE(NODE)
#  define DEBUG_LOG_CHANGE_MODE
#  define DEBUG_LOG_GOT_TOKEN(NODE)
#  define DEBUG_LOG_EXIT_WITH_NODE(NODE)
#  define DEBUG_LOG_EXIT
#endif
1005
1006
1007
/***************************************************************************//*
1008
 ** MARK: - Parser Search and Instantiation
1009
 ***************************************************************************/
1010
1011
1012
/**
1013
 *  Retrieves the correct parser for the given node, accounting for various
1014
 *  conditions, and readies the lexer for parsing that node.
1015
 */
1016
static Parser* GetParserForNode( TidyDocImpl* doc, Node *node )
1017
553k
{
1018
553k
    Lexer* lexer = doc->lexer;
1019
1020
553k
    if ( cfgBool( doc, TidyXmlTags ) )
1021
27.4k
        return ParseXMLElement;
1022
    
1023
    /* [i_a]2 prevent crash for active content (php, asp) docs */
1024
526k
    if (!node || node->tag == NULL)
1025
4.45k
        return NULL;
1026
1027
    /*
1028
       Fix by GLP 2000-12-21.  Need to reset insertspace if this is both
1029
       a non-inline and empty tag (base, link, meta, isindex, hr, area).
1030
    */
1031
521k
    if (node->tag->model & CM_EMPTY)
1032
7.44k
    {
1033
7.44k
        lexer->waswhite = no;
1034
7.44k
        if (node->tag->parser == NULL)
1035
0
            return NULL;
1036
7.44k
    }
1037
514k
    else if (!(node->tag->model & CM_INLINE))
1038
208k
        lexer->insertspace = no;
1039
1040
521k
    if (node->tag->parser == NULL)
1041
0
        return NULL;
1042
1043
521k
    if (node->type == StartEndTag)
1044
5.72k
        return NULL;
1045
1046
    /* [i_a]2 added this - not sure why - CHECKME: */
1047
515k
    lexer->parent = node;
1048
1049
515k
    return (node->tag->parser);
1050
521k
}
1051
1052
1053
/**
1054
 *  This parser controller initiates the parsing process with the document's
1055
 *  root starting with the provided node, which should be the HTML node after
1056
 *  the pre-HTML stuff is handled at a higher level.
1057
 *
1058
 *  This controller is responsible for calling each of the individual parsers,
1059
 *  based on the tokens it pulls from the lexer, or the tokens passed back via
1060
 *  the parserMemory stack from each of the parsers. Having a main, central
1061
 *  looping dispatcher in this fashion allows the prevention of recursion.
1062
 */
1063
void ParseHTMLWithNode( TidyDocImpl* doc, Node* node )
1064
413
{
1065
413
    GetTokenMode mode = IgnoreWhitespace;
1066
413
    Parser* parser = GetParserForNode( doc, node );
1067
413
    Bool something_to_do = yes;
1068
1069
    /*
1070
     This main loop is only extinguished when all of the parser tokens are
1071
     consumed. Ideally, EVERY parser will return nodes to this loop for
1072
     dispatch to the appropriate parser, but some of the recursive parsers
1073
     still consume some tokens on their own.
1074
     */
1075
1.05M
    while (something_to_do)
1076
1.05M
    {
1077
1.05M
        node = parser ? parser( doc, node, mode ) : NULL;
1078
        
1079
        /*
1080
         We have a node, so anything deferred was already pushed to the stack
1081
         to be dealt with later.
1082
         */
1083
1.05M
        if ( node )
1084
548k
        {
1085
548k
            parser = GetParserForNode( doc, node );
1086
548k
            continue;
1087
548k
        }
1088
1089
        /*
1090
         We weren't given a node, which means this particular leaf is bottomed
1091
         out. We'll re-enter the parsers using information from the stack.
1092
         */
1093
504k
        if ( !TY_(isEmptyParserStack)(doc))
1094
501k
        {
1095
501k
            parser = TY_(peekMemoryIdentity)(doc);
1096
501k
            if (parser)
1097
499k
            {
1098
499k
                continue;
1099
499k
            }
1100
1.10k
            else
1101
1.10k
            {
1102
                /* No parser means we're only passing back a parsing mode. */
1103
1.10k
                mode = TY_(peekMemoryMode)( doc );
1104
1.10k
                TY_(popMemory)( doc );
1105
1.10k
            }
1106
501k
        }
1107
        
1108
        /*
1109
         At this point, there's nothing being returned from parsers, and
1110
         nothing on the stack, so we can draw a new node from the lexer.
1111
         */
1112
4.63k
        node = TY_(GetToken)( doc, mode );
1113
4.63k
        DEBUG_LOG_GOT_TOKEN(node);
1114
1115
4.63k
        if (node)
1116
4.22k
            parser = GetParserForNode( doc, node );
1117
413
        else
1118
413
            something_to_do = no;
1119
4.63k
    }
1120
413
}
1121
1122
1123
/***************************************************************************//*
1124
 ** MARK: - Parsers
1125
 ***************************************************************************/
1126
1127
1128
/** MARK: TY_(ParseBlock)
1129
 *  `element` is a node created by the lexer upon seeing the start tag, or
1130
 *  by the parser when the start tag is inferred
1131
 *
1132
 *  This is a non-recursing parser. It uses the document's parser memory stack
1133
 *  to send subsequent nodes back to the controller for dispatching to parsers.
1134
 *  This parser is also re-enterable, so that post-processing can occur after
1135
 *  such dispatching.
1136
 */
1137
Node* TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
1138
35.0k
{
1139
35.0k
    Lexer* lexer = doc->lexer;
1140
35.0k
    Node *node = NULL;
1141
35.0k
    Bool checkstack = yes;
1142
35.0k
    uint istackbase = 0;
1143
35.0k
    DEBUG_LOG_COUNTERS;
1144
    
1145
35.0k
    if ( element == NULL )
1146
18.6k
    {
1147
18.6k
        TidyParserMemory memory = TY_(popMemory)( doc );
1148
18.6k
        node = memory.reentry_node; /* Throwaway, because the loop overwrites this immediately. */
1149
18.6k
        DEBUG_LOG_REENTER_WITH_NODE(node);
1150
18.6k
        element = memory.original_node;
1151
18.6k
        DEBUG_LOG_GET_OLD_MODE;
1152
18.6k
        mode = memory.reentry_mode;
1153
18.6k
        DEBUG_LOG_CHANGE_MODE;
1154
18.6k
    }
1155
16.4k
    else
1156
16.4k
    {
1157
16.4k
        DEBUG_LOG_ENTER_WITH_NODE(element);
1158
1159
16.4k
        if ( element->tag->model & CM_EMPTY )
1160
0
        {
1161
0
            DEBUG_LOG_EXIT;
1162
0
            return NULL;
1163
0
        }
1164
1165
16.4k
        if ( nodeIsDIV(element) && nodeIsDL(element->parent) && TY_(IsHTML5Mode)(doc) )
1166
229
        {
1167
229
            DEBUG_LOG_EXIT;
1168
229
            return TY_(ParseDefList)(doc, element, mode); /* @warning: possible recursion! */
1169
229
        }
1170
        
1171
16.1k
        if ( nodeIsFORM(element) && DescendantOf(element, TidyTag_FORM) )
1172
548
        {
1173
548
            TY_(Report)(doc, element, NULL, ILLEGAL_NESTING );
1174
548
        }
1175
1176
        /*
1177
         InlineDup() asks the lexer to insert inline emphasis tags
1178
         currently pushed on the istack, but take care to avoid
1179
         propagating inline emphasis inside OBJECT or APPLET.
1180
         For these elements a fresh inline stack context is created
1181
         and disposed of upon reaching the end of the element.
1182
         They thus behave like table cells in this respect.
1183
        */
1184
16.1k
        if (element->tag->model & CM_OBJECT)
1185
2.50k
        {
1186
2.50k
            istackbase = lexer->istackbase;
1187
2.50k
            lexer->istackbase = lexer->istacksize;
1188
2.50k
        }
1189
1190
16.1k
        if (!(element->tag->model & CM_MIXED))
1191
15.0k
        {
1192
15.0k
            TY_(InlineDup)( doc, NULL );
1193
15.0k
        }
1194
1195
        /*\
1196
         *  Issue #212 - If it is likely that it may be necessary
1197
         *  to move a leading space into a text node before this
1198
         *  element, then keep the mode MixedContent to keep any
1199
         *  leading space
1200
        \*/
1201
16.1k
        if ( !(element->tag->model & CM_INLINE) ||
1202
7.73k
              (element->tag->model & CM_FIELD ) )
1203
8.45k
        {
1204
8.45k
            DEBUG_LOG_GET_OLD_MODE;
1205
8.45k
            mode = IgnoreWhitespace;
1206
8.45k
            DEBUG_LOG_CHANGE_MODE;
1207
8.45k
        }
1208
7.73k
        else if (mode == IgnoreWhitespace)
1209
7.73k
        {
1210
            /* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace'
1211
               when such a leading space may need to be inserted before this element to
1212
               preserve the browser view */
1213
7.73k
            DEBUG_LOG_GET_OLD_MODE;
1214
7.73k
            mode = MixedContent;
1215
7.73k
            DEBUG_LOG_CHANGE_MODE;
1216
7.73k
        }
1217
16.1k
    } /* Re-Entering */
1218
    
1219
    /*
1220
     Main Loop
1221
     */
1222
    
1223
49.3k
    while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL)
1224
43.1k
    {
1225
43.1k
        DEBUG_LOG_GOT_TOKEN(node);
1226
        /* end tag for this element */
1227
43.1k
        if (node->type == EndTag && node->tag &&
1228
6.44k
            (node->tag == element->tag || element->was == node->tag))
1229
3.07k
        {
1230
3.07k
            TY_(FreeNode)( doc, node );
1231
1232
3.07k
            if (element->tag->model & CM_OBJECT)
1233
1.38k
            {
1234
                /* pop inline stack */
1235
3.52k
                while (lexer->istacksize > lexer->istackbase)
1236
2.14k
                    TY_(PopInline)( doc, NULL );
1237
1.38k
                lexer->istackbase = istackbase;
1238
1.38k
            }
1239
1240
3.07k
            element->closed = yes;
1241
3.07k
            TrimSpaces( doc, element );
1242
3.07k
            DEBUG_LOG_EXIT;
1243
3.07k
            return NULL;
1244
3.07k
        }
1245
1246
40.0k
        if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) )
1247
639
        {
1248
639
            if ( TY_(nodeIsElement)(node) )
1249
400
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1250
639
            TY_(FreeNode)( doc, node );
1251
639
            continue;
1252
639
        }
1253
1254
1255
39.4k
        if (node->type == EndTag)
1256
3.41k
        {
1257
3.41k
            if (node->tag == NULL)
1258
286
            {
1259
286
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1260
286
                TY_(FreeNode)( doc, node );
1261
286
                continue;
1262
286
            }
1263
3.12k
            else if ( nodeIsBR(node) )
1264
0
            {
1265
0
                node->type = StartTag;
1266
0
            }
1267
3.12k
            else if ( nodeIsP(node) )
1268
192
            {
1269
                /* Cannot have a block inside a paragraph, so no checking
1270
                   for an ancestor is necessary -- but we _can_ have
1271
                   paragraphs inside a block, so change it to an implicit
1272
                   empty paragraph, to be dealt with according to the user's
1273
                   options
1274
                */
1275
192
                node->type = StartEndTag;
1276
192
                node->implicit = yes;
1277
192
            }
1278
2.93k
            else if (DescendantOf( element, node->tag->id ))
1279
544
            {
1280
                /*
1281
                  if this is the end tag for an ancestor element
1282
                  then infer end tag for this element
1283
                */
1284
544
                TY_(UngetToken)( doc );
1285
544
                break;
1286
544
            }
1287
2.39k
            else
1288
2.39k
            {
1289
                /* special case </tr> etc. for stuff moved in front of table */
1290
2.39k
                if ( lexer->exiled
1291
849
                     && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
1292
545
                {
1293
545
                    TY_(UngetToken)( doc );
1294
545
                    TrimSpaces( doc, element );
1295
545
                    DEBUG_LOG_EXIT;
1296
545
                    return NULL;
1297
545
                }
1298
2.39k
            }
1299
3.41k
        }
1300
1301
        /* mixed content model permits text */
1302
38.0k
        if (TY_(nodeIsText)(node))
1303
5.26k
        {
1304
5.26k
            if ( checkstack )
1305
3.11k
            {
1306
3.11k
                checkstack = no;
1307
3.11k
                if (!(element->tag->model & CM_MIXED))
1308
2.74k
                {
1309
2.74k
                    if ( TY_(InlineDup)(doc, node) > 0 )
1310
647
                        continue;
1311
2.74k
                }
1312
3.11k
            }
1313
1314
4.62k
            TY_(InsertNodeAtEnd)(element, node);
1315
4.62k
            DEBUG_LOG_GET_OLD_MODE
1316
4.62k
            mode = MixedContent;
1317
4.62k
            DEBUG_LOG_CHANGE_MODE;
1318
            /*
1319
              HTML4 strict doesn't allow mixed content for
1320
              elements with %block; as their content model
1321
            */
1322
            /*
1323
              But only body, map, blockquote, form and
1324
              noscript have content model %block;
1325
            */
1326
4.62k
            if ( nodeIsBODY(element)       ||
1327
4.62k
                 nodeIsMAP(element)        ||
1328
4.62k
                 nodeIsBLOCKQUOTE(element) ||
1329
4.62k
                 nodeIsFORM(element)       ||
1330
4.34k
                 nodeIsNOSCRIPT(element) )
1331
273
                TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
1332
4.62k
            continue;
1333
5.26k
        }
1334
1335
32.8k
        if ( InsertMisc(element, node) )
1336
610
            continue;
1337
1338
        /* allow PARAM elements? */
1339
32.1k
        if ( nodeIsPARAM(node) )
1340
584
        {
1341
584
            if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) )
1342
584
            {
1343
584
                TY_(InsertNodeAtEnd)(element, node);
1344
584
                continue;
1345
584
            }
1346
1347
            /* otherwise discard it */
1348
0
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1349
0
            TY_(FreeNode)( doc, node );
1350
0
            continue;
1351
584
        }
1352
1353
        /* allow AREA elements? */
1354
31.6k
        if ( nodeIsAREA(node) )
1355
0
        {
1356
0
            if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) )
1357
0
            {
1358
0
                TY_(InsertNodeAtEnd)(element, node);
1359
0
                continue;
1360
0
            }
1361
1362
            /* otherwise discard it */
1363
0
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1364
0
            TY_(FreeNode)( doc, node );
1365
0
            continue;
1366
0
        }
1367
1368
        /* ignore unknown start/end tags */
1369
31.6k
        if ( node->tag == NULL )
1370
3.78k
        {
1371
3.78k
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1372
3.78k
            TY_(FreeNode)( doc, node );
1373
3.78k
            continue;
1374
3.78k
        }
1375
1376
        /*
1377
          Allow CM_INLINE elements here.
1378
1379
          Allow CM_BLOCK elements here unless
1380
          lexer->excludeBlocks is yes.
1381
1382
          LI and DD are special cased.
1383
1384
          Otherwise infer end tag for this element.
1385
        */
1386
1387
27.8k
        if ( !TY_(nodeHasCM)(node, CM_INLINE) )
1388
13.9k
        {
1389
13.9k
            if ( !TY_(nodeIsElement)(node) )
1390
991
            {
1391
991
                if ( nodeIsFORM(node) )
1392
258
                    BadForm( doc );
1393
1394
991
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1395
991
                TY_(FreeNode)( doc, node );
1396
991
                continue;
1397
991
            }
1398
            
1399
            /* #427671 - Fix by Randy Waki - 10 Aug 00 */
1400
            /*
1401
             If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION
1402
             start tag, discard the start tag and let the subsequent content get
1403
             parsed as content of the enclosing LI.  This seems to mimic IE and
1404
             Netscape, and avoids an infinite loop: without this check,
1405
             ParseBlock (which is parsing the LI's content) and ParseList (which
1406
             is parsing the LI's parent's content) repeatedly defer to each
1407
             other to parse the illegal start tag, each time inferring a missing
1408
             </li> or <li> respectively.
1409
1410
             NOTE: This check is a bit fragile.  It specifically checks for the
1411
             four tags that happen to weave their way through the current series
1412
             of tests performed by ParseBlock and ParseList to trigger the
1413
             infinite loop.
1414
            */
1415
12.9k
            if ( nodeIsLI(element) )
1416
1.90k
            {
1417
1.90k
                if ( nodeIsFRAME(node)    ||
1418
1.90k
                     nodeIsFRAMESET(node) ||
1419
1.90k
                     nodeIsOPTGROUP(node) ||
1420
1.45k
                     nodeIsOPTION(node) )
1421
451
                {
1422
451
                    TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1423
451
                    TY_(FreeNode)( doc, node );  /* DSR - 27Apr02 avoid memory leak */
1424
451
                    continue;
1425
451
                }
1426
1.90k
            }
1427
1428
12.5k
            if ( nodeIsTD(element) || nodeIsTH(element) )
1429
806
            {
1430
                /* if parent is a table cell, avoid inferring the end of the cell */
1431
1432
806
                if ( TY_(nodeHasCM)(node, CM_HEAD) )
1433
61
                {
1434
61
                    MoveToHead( doc, element, node );
1435
61
                    continue;
1436
61
                }
1437
1438
745
                if ( TY_(nodeHasCM)(node, CM_LIST) )
1439
13
                {
1440
13
                    TY_(UngetToken)( doc );
1441
13
                    node = TY_(InferredTag)(doc, TidyTag_UL);
1442
13
                    AddClassNoIndent(doc, node);
1443
13
                    lexer->excludeBlocks = yes;
1444
13
                }
1445
732
                else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1446
15
                {
1447
15
                    TY_(UngetToken)( doc );
1448
15
                    node = TY_(InferredTag)(doc, TidyTag_DL);
1449
15
                    lexer->excludeBlocks = yes;
1450
15
                }
1451
1452
                /* infer end of current table cell */
1453
745
                if ( !TY_(nodeHasCM)(node, CM_BLOCK) )
1454
503
                {
1455
503
                    TY_(UngetToken)( doc );
1456
503
                    TrimSpaces( doc, element );
1457
503
                    DEBUG_LOG_EXIT;
1458
503
                    return NULL;
1459
503
                }
1460
745
            }
1461
11.7k
            else if ( TY_(nodeHasCM)(node, CM_BLOCK) )
1462
7.35k
            {
1463
7.35k
                if ( lexer->excludeBlocks )
1464
1.20k
                {
1465
1.20k
                    if ( !TY_(nodeHasCM)(element, CM_OPT) )
1466
624
                        TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
1467
1468
1.20k
                    TY_(UngetToken)( doc );
1469
1470
1.20k
                    if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1471
6
                        lexer->istackbase = istackbase;
1472
1473
1.20k
                    TrimSpaces( doc, element );
1474
1.20k
                    DEBUG_LOG_EXIT;
1475
1.20k
                    return NULL;
1476
1.20k
                }
1477
7.35k
            }
1478
4.36k
            else if ( ! nodeIsTEMPLATE( element ) )/* things like list items */
1479
4.36k
            {
1480
4.36k
                if (node->tag->model & CM_HEAD)
1481
12
                {
1482
12
                    MoveToHead( doc, element, node );
1483
12
                    continue;
1484
12
                }
1485
1486
                /*
1487
                 special case where a form start tag
1488
                 occurs in a tr and is followed by td or th
1489
                */
1490
1491
4.35k
                if ( nodeIsFORM(element) &&
1492
4.35k
                     nodeIsTD(element->parent) &&
1493
0
                     element->parent->implicit )
1494
0
                {
1495
0
                    if ( nodeIsTD(node) )
1496
0
                    {
1497
0
                        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1498
0
                        TY_(FreeNode)( doc, node );
1499
0
                        continue;
1500
0
                    }
1501
1502
0
                    if ( nodeIsTH(node) )
1503
0
                    {
1504
0
                        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1505
0
                        TY_(FreeNode)( doc, node );
1506
0
                        node = element->parent;
1507
0
                        TidyDocFree(doc, node->element);
1508
0
                        node->element = TY_(tmbstrdup)(doc->allocator, "th");
1509
0
                        node->tag = TY_(LookupTagDef)( TidyTag_TH );
1510
0
                        continue;
1511
0
                    }
1512
0
                }
1513
1514
4.35k
                if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit )
1515
2.60k
                    TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
1516
1517
                /* #521, warn on missing optional end-tags if not omitting them. */
1518
4.35k
                if ( cfgBool( doc, TidyOmitOptionalTags ) == no && TY_(nodeHasCM)(element, CM_OPT) )
1519
1.60k
                    TY_(Report)(doc, element, node, MISSING_ENDTAG_OPTIONAL );
1520
1521
1522
4.35k
                TY_(UngetToken)( doc );
1523
1524
4.35k
                if ( TY_(nodeHasCM)(node, CM_LIST) )
1525
290
                {
1526
290
                    if ( element->parent && element->parent->tag &&
1527
289
                         element->parent->tag->parser == TY_(ParseList) )
1528
286
                    {
1529
286
                        TrimSpaces( doc, element );
1530
286
                        DEBUG_LOG_EXIT;
1531
286
                        return NULL;
1532
286
                    }
1533
1534
4
                    node = TY_(InferredTag)(doc, TidyTag_UL);
1535
4
                    AddClassNoIndent(doc, node);
1536
4
                }
1537
4.06k
                else if ( TY_(nodeHasCM)(node, CM_DEFLIST) )
1538
512
                {
1539
512
                    if ( nodeIsDL(element->parent) )
1540
17
                    {
1541
17
                        TrimSpaces( doc, element );
1542
17
                        DEBUG_LOG_EXIT;
1543
17
                        return NULL;
1544
17
                    }
1545
1546
495
                    node = TY_(InferredTag)(doc, TidyTag_DL);
1547
495
                }
1548
3.54k
                else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) )
1549
2.12k
                {
1550
                    /* http://tidy.sf.net/issue/1316307 */
1551
                    /* In exiled mode, return so table processing can
1552
                       continue. */
1553
2.12k
                    if (lexer->exiled)
1554
1.32k
                    {
1555
1.32k
                        DEBUG_LOG_EXIT;
1556
1.32k
                        return NULL;
1557
1.32k
                    }
1558
803
                    node = TY_(InferredTag)(doc, TidyTag_TABLE);
1559
803
                }
1560
1.42k
                else if ( TY_(nodeHasCM)(element, CM_OBJECT) )
1561
65
                {
1562
                    /* pop inline stack */
1563
899
                    while ( lexer->istacksize > lexer->istackbase )
1564
834
                        TY_(PopInline)( doc, NULL );
1565
65
                    lexer->istackbase = istackbase;
1566
65
                    TrimSpaces( doc, element );
1567
65
                    DEBUG_LOG_EXIT;
1568
65
                    return NULL;
1569
1570
65
                }
1571
1.36k
                else
1572
1.36k
                {
1573
1.36k
                    TrimSpaces( doc, element );
1574
1.36k
                    DEBUG_LOG_EXIT;
1575
1.36k
                    return NULL;
1576
1.36k
                }
1577
4.35k
            }
1578
12.5k
        }
1579
1580
        /*\
1581
         *  Issue #307 - an <A> tag to ends any open <A> element
1582
         *  Like #427827 - fixed by Randy Waki and Bjoern Hoehrmann 23 Aug 00
1583
         *  in ParseInline(), fix copied HERE to ParseBlock()
1584
         *  href: http://www.w3.org/TR/html-markup/a.html
1585
         *  The interactive element a must not appear as a descendant of the a element.
1586
        \*/
1587
21.5k
        if ( nodeIsA(node) && !node->implicit &&
1588
2.94k
             (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
1589
1.27k
        {
1590
1.27k
            if (node->type != EndTag && node->attributes == NULL
1591
912
                && cfgBool(doc, TidyCoerceEndTags) )
1592
912
            {
1593
912
                node->type = EndTag;
1594
912
                TY_(Report)(doc, element, node, COERCE_TO_ENDTAG);
1595
912
                TY_(UngetToken)( doc );
1596
912
                continue;
1597
912
            }
1598
1599
361
            if (nodeIsA(element))
1600
46
            {
1601
46
                TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
1602
46
                TY_(UngetToken)( doc );
1603
46
            }
1604
315
            else
1605
315
            {
1606
                /* Issue #597 - if we not 'UngetToken' then it is being discarded.
1607
                   Add message, and 'FreeNode' - thanks @ralfjunker */
1608
315
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
1609
315
                TY_(FreeNode)(doc, node);
1610
315
            }
1611
1612
361
            if (!(mode & Preformatted))
1613
361
                TrimSpaces(doc, element);
1614
1615
361
            DEBUG_LOG_EXIT;
1616
361
            return NULL;
1617
1.27k
        }
1618
1619
        /* parse known element */
1620
20.2k
        if (TY_(nodeIsElement)(node))
1621
19.4k
        {
1622
19.4k
            if (node->tag->model & CM_INLINE)
1623
11.7k
            {
1624
11.7k
                if (checkstack && !node->implicit)
1625
5.97k
                {
1626
5.97k
                    checkstack = no;
1627
1628
5.97k
                    if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */
1629
5.31k
                    {
1630
5.31k
                        if ( TY_(InlineDup)(doc, node) > 0 )
1631
124
                            continue;
1632
5.31k
                    }
1633
5.97k
                }
1634
1635
11.6k
                DEBUG_LOG_GET_OLD_MODE;
1636
11.6k
                mode = MixedContent;
1637
11.6k
                DEBUG_LOG_CHANGE_MODE;
1638
11.6k
            }
1639
7.69k
            else
1640
7.69k
            {
1641
7.69k
                checkstack = yes;
1642
7.69k
                DEBUG_LOG_GET_OLD_MODE;
1643
7.69k
                mode = IgnoreWhitespace;
1644
7.69k
                DEBUG_LOG_CHANGE_MODE;
1645
7.69k
            }
1646
1647
            /* trim white space before <br> */
1648
19.3k
            if ( nodeIsBR(node) )
1649
4
                TrimSpaces( doc, element );
1650
1651
19.3k
            TY_(InsertNodeAtEnd)(element, node);
1652
1653
19.3k
            if (node->implicit)
1654
6.06k
                TY_(Report)(doc, element, node, INSERTING_TAG );
1655
1656
            /* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an
1657
               effort has been made above to set a 'MixedContent' mode in some cases?
1658
               WHY IS THE 'mode' VARIABLE NOT USED HERE???? */
1659
1660
19.3k
            {
1661
19.3k
                TidyParserMemory memory = {0};
1662
19.3k
                memory.identity = TY_(ParseBlock);
1663
19.3k
                memory.reentry_node = node;
1664
19.3k
                memory.reentry_mode = mode;
1665
19.3k
                memory.original_node = element;
1666
19.3k
                TY_(pushMemory)(doc, memory);
1667
19.3k
                DEBUG_LOG_EXIT_WITH_NODE(node);
1668
19.3k
            }
1669
19.3k
            return node;
1670
19.4k
        }
1671
1672
        /* discard unexpected tags */
1673
854
        if (node->type == EndTag)
1674
854
            TY_(PopInline)( doc, node );  /* if inline end tag */
1675
1676
854
        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
1677
854
        TY_(FreeNode)( doc, node );
1678
854
        continue;
1679
20.2k
    }
1680
1681
6.77k
    if (!(element->tag->model & CM_OPT))
1682
3.98k
        TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR);
1683
1684
6.77k
    if (element->tag->model & CM_OBJECT)
1685
913
    {
1686
        /* pop inline stack */
1687
1.34k
        while ( lexer->istacksize > lexer->istackbase )
1688
433
            TY_(PopInline)( doc, NULL );
1689
913
        lexer->istackbase = istackbase;
1690
913
    }
1691
1692
6.77k
    TrimSpaces( doc, element );
1693
1694
6.77k
    DEBUG_LOG_EXIT;
1695
6.77k
    return NULL;
1696
34.8k
}
1697
1698
1699
/** MARK: TY_(ParseBody)
1700
 *  Parses the `body` tag.
1701
 *
1702
 *  This is a non-recursing parser. It uses the document's parser memory stack
1703
 *  to send subsequent nodes back to the controller for dispatching to parsers.
1704
 *  This parser is also re-enterable, so that post-processing can occur after
1705
 *  such dispatching.
1706
 */
1707
Node* TY_(ParseBody)( TidyDocImpl* doc, Node *body, GetTokenMode mode )
1708
33.7k
{
1709
33.7k
    Lexer* lexer = doc->lexer;
1710
33.7k
    Node *node = NULL;
1711
33.7k
    Bool checkstack = no;
1712
33.7k
    Bool iswhitenode = no;
1713
33.7k
    DEBUG_LOG_COUNTERS;
1714
1715
33.7k
    mode = IgnoreWhitespace;
1716
33.7k
    checkstack = yes;
1717
1718
    /*
1719
     If we're re-entering, then we need to setup from a previous state,
1720
     instead of starting fresh. We can pull what we need from the document's
1721
     stack.
1722
     */
1723
33.7k
    if ( body == NULL )
1724
12.3k
    {
1725
12.3k
        TidyParserMemory memory = TY_(popMemory)( doc );
1726
12.3k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
1727
12.3k
        DEBUG_LOG_REENTER_WITH_NODE(node);
1728
12.3k
        body = memory.original_node;
1729
12.3k
        checkstack = memory.register_1;
1730
12.3k
        iswhitenode = memory.register_2;
1731
12.3k
        DEBUG_LOG_GET_OLD_MODE;
1732
12.3k
        mode = memory.mode;
1733
12.3k
        DEBUG_LOG_CHANGE_MODE;
1734
12.3k
    }
1735
21.3k
    else
1736
21.3k
    {
1737
21.3k
        DEBUG_LOG_ENTER_WITH_NODE(body);
1738
21.3k
        TY_(BumpObject)( doc, body->parent );
1739
21.3k
    }
1740
    
1741
55.6k
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
1742
51.8k
    {
1743
51.8k
        DEBUG_LOG_GOT_TOKEN(node);
1744
        /* find and discard multiple <body> elements */
1745
51.8k
        if (node->tag == body->tag && node->type == StartTag)
1746
1.04k
        {
1747
1.04k
            TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
1748
1.04k
            TY_(FreeNode)(doc, node);
1749
1.04k
            continue;
1750
1.04k
        }
1751
1752
        /* #538536 Extra endtags not detected */
1753
50.8k
        if ( nodeIsHTML(node) )
1754
351
        {
1755
351
            if (TY_(nodeIsElement)(node) || lexer->seenEndHtml)
1756
351
                TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
1757
0
            else
1758
0
                lexer->seenEndHtml = 1;
1759
1760
351
            TY_(FreeNode)( doc, node);
1761
351
            continue;
1762
351
        }
1763
1764
50.4k
        if ( lexer->seenEndBody &&
1765
2.67k
             ( node->type == StartTag ||
1766
1.28k
               node->type == EndTag   ||
1767
464
               node->type == StartEndTag ) )
1768
2.21k
        {
1769
2.21k
            TY_(Report)(doc, body, node, CONTENT_AFTER_BODY );
1770
2.21k
        }
1771
1772
50.4k
        if ( node->tag == body->tag && node->type == EndTag )
1773
643
        {
1774
643
            body->closed = yes;
1775
643
            TrimSpaces(doc, body);
1776
643
            TY_(FreeNode)( doc, node);
1777
643
            lexer->seenEndBody = 1;
1778
643
            DEBUG_LOG_GET_OLD_MODE;
1779
643
            mode = IgnoreWhitespace;
1780
643
            DEBUG_LOG_CHANGE_MODE;
1781
1782
643
            if ( nodeIsNOFRAMES(body->parent) )
1783
480
                break;
1784
1785
163
            continue;
1786
643
        }
1787
1788
49.8k
        if ( nodeIsNOFRAMES(node) )
1789
1.82k
        {
1790
1.82k
            if (node->type == StartTag)
1791
1.66k
            {
1792
1.66k
                TidyParserMemory memory = {0};
1793
1794
1.66k
                TY_(InsertNodeAtEnd)(body, node);
1795
                
1796
1.66k
                memory.identity = TY_(ParseBody);
1797
1.66k
                memory.original_node = body;
1798
1.66k
                memory.reentry_node = node;
1799
1.66k
                memory.register_1 = checkstack;
1800
1.66k
                memory.register_2 = iswhitenode;
1801
1.66k
                memory.mode = mode;
1802
1.66k
                TY_(pushMemory)( doc, memory );
1803
1.66k
                DEBUG_LOG_EXIT_WITH_NODE(node);
1804
1.66k
                return node;
1805
1.66k
            }
1806
1807
163
            if (node->type == EndTag && nodeIsNOFRAMES(body->parent) )
1808
163
            {
1809
163
                TrimSpaces(doc, body);
1810
163
                TY_(UngetToken)( doc );
1811
163
                break;
1812
163
            }
1813
163
        }
1814
1815
48.0k
        if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
1816
2.91k
             && nodeIsNOFRAMES(body->parent) )
1817
1.13k
        {
1818
1.13k
            TrimSpaces(doc, body);
1819
1.13k
            TY_(UngetToken)( doc );
1820
1.13k
            break;
1821
1.13k
        }
1822
1823
46.8k
        iswhitenode = no;
1824
1825
46.8k
        if ( TY_(nodeIsText)(node) &&
1826
7.31k
             node->end <= node->start + 1 &&
1827
1.80k
             lexer->lexbuf[node->start] == ' ' )
1828
731
            iswhitenode = yes;
1829
1830
        /* deal with comments etc. */
1831
46.8k
        if (InsertMisc(body, node))
1832
2.77k
            continue;
1833
1834
        /* mixed content model permits text */
1835
44.1k
        if (TY_(nodeIsText)(node))
1836
7.31k
        {
1837
7.31k
            if (iswhitenode && mode == IgnoreWhitespace)
1838
0
            {
1839
0
                TY_(FreeNode)( doc, node);
1840
0
                continue;
1841
0
            }
1842
1843
            /* HTML 2 and HTML4 strict don't allow text here */
1844
7.31k
            TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20));
1845
1846
7.31k
            if (checkstack)
1847
3.30k
            {
1848
3.30k
                checkstack = no;
1849
1850
3.30k
                if ( TY_(InlineDup)(doc, node) > 0 )
1851
562
                    continue;
1852
3.30k
            }
1853
1854
6.75k
            TY_(InsertNodeAtEnd)(body, node);
1855
6.75k
            DEBUG_LOG_GET_OLD_MODE;
1856
6.75k
            mode = MixedContent;
1857
6.75k
            DEBUG_LOG_CHANGE_MODE;
1858
6.75k
            continue;
1859
7.31k
        }
1860
1861
36.8k
        if (node->type == DocTypeTag)
1862
850
        {
1863
850
            InsertDocType(doc, body, node);
1864
850
            continue;
1865
850
        }
1866
        /* discard unknown  and PARAM tags */
1867
35.9k
        if ( node->tag == NULL || nodeIsPARAM(node) )
1868
7.20k
        {
1869
7.20k
            TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
1870
7.20k
            TY_(FreeNode)( doc, node);
1871
7.20k
            continue;
1872
7.20k
        }
1873
1874
        /*
1875
          Netscape allows LI and DD directly in BODY
1876
          We infer UL or DL respectively and use this
1877
          Bool to exclude block-level elements so as
1878
          to match Netscape's observed behaviour.
1879
        */
1880
28.7k
        lexer->excludeBlocks = no;
1881
1882
28.7k
        if ((( nodeIsINPUT(node) ||
1883
28.7k
             (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE))
1884
28.7k
           ) && !TY_(IsHTML5Mode)(doc)) || nodeIsLI(node) )
1885
2.67k
        {
1886
            /* avoid this error message being issued twice */
1887
2.67k
            if (!(node->tag->model & CM_HEAD))
1888
2.40k
                TY_(Report)(doc, body, node, TAG_NOT_ALLOWED_IN);
1889
1890
2.67k
            if (node->tag->model & CM_HTML)
1891
378
            {
1892
                /* copy body attributes if current body was inferred */
1893
378
                if ( nodeIsBODY(node) && body->implicit
1894
14
                     && body->attributes == NULL )
1895
14
                {
1896
14
                    body->attributes = node->attributes;
1897
14
                    node->attributes = NULL;
1898
14
                }
1899
1900
378
                TY_(FreeNode)( doc, node);
1901
378
                continue;
1902
378
            }
1903
1904
2.29k
            if (node->tag->model & CM_HEAD)
1905
268
            {
1906
268
                MoveToHead(doc, body, node);
1907
268
                continue;
1908
268
            }
1909
1910
2.03k
            if (node->tag->model & CM_LIST)
1911
317
            {
1912
317
                TY_(UngetToken)( doc );
1913
317
                node = TY_(InferredTag)(doc, TidyTag_UL);
1914
317
                AddClassNoIndent(doc, node);
1915
317
                lexer->excludeBlocks = yes;
1916
317
            }
1917
1.71k
            else if (node->tag->model & CM_DEFLIST)
1918
52
            {
1919
52
                TY_(UngetToken)( doc );
1920
52
                node = TY_(InferredTag)(doc, TidyTag_DL);
1921
52
                lexer->excludeBlocks = yes;
1922
52
            }
1923
1.66k
            else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
1924
321
            {
1925
                /* http://tidy.sf.net/issue/2855621 */
1926
321
                if (node->type != EndTag) {
1927
320
                    TY_(UngetToken)( doc );
1928
320
                    node = TY_(InferredTag)(doc, TidyTag_TABLE);
1929
320
                }
1930
321
                lexer->excludeBlocks = yes;
1931
321
            }
1932
1.34k
            else if ( nodeIsINPUT(node) )
1933
8
            {
1934
8
                TY_(UngetToken)( doc );
1935
8
                node = TY_(InferredTag)(doc, TidyTag_FORM);
1936
8
                lexer->excludeBlocks = yes;
1937
8
            }
1938
1.33k
            else
1939
1.33k
            {
1940
1.33k
                if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) )
1941
1.33k
                {
1942
1.33k
                    TY_(UngetToken)( doc );
1943
1.33k
                    DEBUG_LOG_EXIT;
1944
1.33k
                    return NULL;
1945
1.33k
                }
1946
1947
                /* ignore </td> </th> <option> etc. */
1948
0
                TY_(FreeNode)( doc, node );
1949
0
                continue;
1950
1.33k
            }
1951
2.03k
        }
1952
1953
26.7k
        if (node->type == EndTag)
1954
1.21k
        {
1955
1.21k
            if ( nodeIsBR(node) )
1956
1
            {
1957
1
                node->type = StartTag;
1958
1
            }
1959
1.21k
            else if ( nodeIsP(node) )
1960
8
            {
1961
8
                node->type = StartEndTag;
1962
8
                node->implicit = yes;
1963
8
            }
1964
1.20k
            else if ( TY_(nodeHasCM)(node, CM_INLINE) )
1965
581
                TY_(PopInline)( doc, node );
1966
1.21k
        }
1967
1968
26.7k
        if (TY_(nodeIsElement)(node))
1969
25.5k
        {
1970
25.5k
            if (nodeIsMAIN(node))
1971
88
            {
1972
                /*\ Issue #166 - repeated <main> element
1973
                 *  How to efficiently search for a previous main element?
1974
                \*/
1975
88
                if ( findNodeById(doc, TidyTag_MAIN) )
1976
0
                {
1977
0
                    doc->badForm |= flg_BadMain; /* this is an ERROR in format */
1978
0
                    TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
1979
0
                    TY_(FreeNode)( doc, node);
1980
0
                    continue;
1981
0
                }
1982
88
            }
1983
            /* Issue #20 - merging from Ger Hobbelt fork put back CM_MIXED, which had been
1984
               removed to fix this issue - reverting to fix 880221e
1985
             */
1986
25.5k
            if ( TY_(nodeHasCM)(node, CM_INLINE) )
1987
4.97k
            {
1988
                /* HTML4 strict doesn't allow inline content here */
1989
                /* but HTML2 does allow img elements as children of body */
1990
4.97k
                if ( nodeIsIMG(node) )
1991
479
                    TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT);
1992
4.49k
                else
1993
4.49k
                    TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20));
1994
1995
4.97k
                if (checkstack && !node->implicit)
1996
398
                {
1997
398
                    checkstack = no;
1998
1999
398
                    if ( TY_(InlineDup)(doc, node) > 0 )
2000
358
                        continue;
2001
398
                }
2002
                
2003
4.61k
                DEBUG_LOG_GET_OLD_MODE;
2004
4.61k
                mode = MixedContent;
2005
4.61k
                DEBUG_LOG_CHANGE_MODE;
2006
4.61k
            }
2007
20.5k
            else
2008
20.5k
            {
2009
20.5k
                checkstack = yes;
2010
20.5k
                DEBUG_LOG_GET_OLD_MODE;
2011
20.5k
                mode = IgnoreWhitespace;
2012
20.5k
                DEBUG_LOG_CHANGE_MODE;
2013
20.5k
            }
2014
2015
25.2k
            if (node->implicit)
2016
1.84k
            {
2017
1.84k
                TY_(Report)(doc, body, node, INSERTING_TAG);
2018
1.84k
            }
2019
2020
25.2k
            TY_(InsertNodeAtEnd)(body, node);
2021
            
2022
25.2k
            {
2023
25.2k
                TidyParserMemory memory = {0};
2024
25.2k
                memory.identity = TY_(ParseBody);
2025
25.2k
                memory.original_node = body;
2026
25.2k
                memory.reentry_node = node;
2027
25.2k
                memory.register_1 = checkstack;
2028
25.2k
                memory.register_2 = iswhitenode;
2029
25.2k
                memory.mode = mode;
2030
25.2k
                TY_(pushMemory)( doc, memory );
2031
25.2k
            }
2032
25.2k
            DEBUG_LOG_EXIT_WITH_NODE(node);
2033
25.2k
            return node;
2034
25.5k
        }
2035
2036
        /* discard unexpected tags */
2037
1.20k
        TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
2038
1.20k
        TY_(FreeNode)( doc, node);
2039
1.20k
    }
2040
5.51k
    DEBUG_LOG_EXIT;
2041
5.51k
    return NULL;
2042
33.7k
}
2043
2044
2045
/** MARK: TY_(ParseColGroup)
2046
 *  Parses the `colgroup` tag.
2047
 *
2048
 *  This is a non-recursing parser. It uses the document's parser memory stack
2049
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2050
 *  This parser is also re-enterable, so that post-processing can occur after
2051
 *  such dispatching.
2052
 */
2053
Node* TY_(ParseColGroup)( TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode) )
2054
1.57k
{
2055
1.57k
    Node *node, *parent;
2056
1.57k
    DEBUG_LOG_COUNTERS;
2057
2058
    /*
2059
     If we're re-entering, then we need to setup from a previous state,
2060
     instead of starting fresh. We can pull what we need from the document's
2061
     stack.
2062
     */
2063
1.57k
    if ( colgroup == NULL )
2064
247
    {
2065
247
        TidyParserMemory memory = TY_(popMemory)( doc );
2066
247
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
2067
247
        DEBUG_LOG_REENTER_WITH_NODE(node);
2068
247
        colgroup = memory.original_node;
2069
247
        DEBUG_LOG_GET_OLD_MODE;
2070
247
        mode = memory.mode;
2071
247
        DEBUG_LOG_CHANGE_MODE;
2072
247
    }
2073
1.32k
    else
2074
1.32k
    {
2075
1.32k
        DEBUG_LOG_ENTER_WITH_NODE(colgroup);
2076
1.32k
        if (colgroup->tag->model & CM_EMPTY)
2077
0
            return NULL;
2078
1.32k
    }
2079
2080
1.80k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2081
1.66k
    {
2082
1.66k
        DEBUG_LOG_GOT_TOKEN(node);
2083
2084
1.66k
        if (node->tag == colgroup->tag && node->type == EndTag)
2085
1
        {
2086
1
            TY_(FreeNode)( doc, node);
2087
1
            colgroup->closed = yes;
2088
1
            return NULL;
2089
1
        }
2090
2091
        /*
2092
          if this is the end tag for an ancestor element
2093
          then infer end tag for this element
2094
        */
2095
1.66k
        if (node->type == EndTag)
2096
433
        {
2097
433
            if ( nodeIsFORM(node) )
2098
30
            {
2099
30
                BadForm( doc );
2100
30
                TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2101
30
                TY_(FreeNode)( doc, node);
2102
30
                continue;
2103
30
            }
2104
2105
403
            for ( parent = colgroup->parent;
2106
1.71k
                  parent != NULL;
2107
1.31k
                  parent = parent->parent )
2108
1.71k
            {
2109
1.71k
                if (node->tag == parent->tag)
2110
403
                {
2111
403
                    TY_(UngetToken)( doc );
2112
403
                    DEBUG_LOG_EXIT;
2113
403
                    return NULL;
2114
403
                }
2115
1.71k
            }
2116
403
        }
2117
2118
1.23k
        if (TY_(nodeIsText)(node))
2119
216
        {
2120
216
            TY_(UngetToken)( doc );
2121
216
            DEBUG_LOG_EXIT;
2122
216
            return NULL;
2123
216
        }
2124
2125
        /* deal with comments etc. */
2126
1.01k
        if (InsertMisc(colgroup, node))
2127
3
            continue;
2128
2129
        /* discard unknown tags */
2130
1.01k
        if (node->tag == NULL)
2131
200
        {
2132
200
            TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2133
200
            TY_(FreeNode)( doc, node);
2134
200
            continue;
2135
200
        }
2136
2137
816
        if ( !nodeIsCOL(node) )
2138
569
        {
2139
569
            TY_(UngetToken)( doc );
2140
569
            DEBUG_LOG_EXIT;
2141
569
            return NULL;
2142
569
        }
2143
2144
247
        if (node->type == EndTag)
2145
0
        {
2146
0
            TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED);
2147
0
            TY_(FreeNode)( doc, node);
2148
0
            continue;
2149
0
        }
2150
2151
        /* node should be <COL> */
2152
247
        TY_(InsertNodeAtEnd)(colgroup, node);
2153
        
2154
247
        {
2155
247
            TidyParserMemory memory = {0};
2156
247
            memory.identity = TY_(ParseColGroup);
2157
247
            memory.original_node = colgroup;
2158
247
            memory.reentry_node = node;
2159
247
            memory.mode = mode;
2160
247
            TY_(pushMemory)( doc, memory );
2161
247
            DEBUG_LOG_EXIT_WITH_NODE(node);
2162
247
        }
2163
247
        DEBUG_LOG_EXIT;
2164
247
        return node;
2165
247
    }
2166
135
    DEBUG_LOG_EXIT;
2167
135
    return NULL;
2168
1.57k
}
2169
2170
2171
/** MARK: TY_(ParseDatalist)
2172
 *  Parses the `datalist` tag.
2173
 *
2174
 *  This is a non-recursing parser. It uses the document's parser memory stack
2175
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2176
 *  This parser is also re-enterable, so that post-processing can occur after
2177
 *  such dispatching.
2178
*/
2179
Node* TY_(ParseDatalist)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) )
2180
401
{
2181
401
    Lexer* lexer = doc->lexer;
2182
401
    Node *node;
2183
401
    DEBUG_LOG_COUNTERS;
2184
2185
401
    if ( field == NULL )
2186
204
    {
2187
204
        TidyParserMemory memory = TY_(popMemory)( doc );
2188
204
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
2189
204
        DEBUG_LOG_REENTER_WITH_NODE(node);
2190
204
        field = memory.original_node;
2191
204
        DEBUG_LOG_GET_OLD_MODE;
2192
204
        mode = memory.mode;
2193
204
        DEBUG_LOG_CHANGE_MODE;
2194
204
    }
2195
197
    else
2196
197
    {
2197
197
        DEBUG_LOG_ENTER_WITH_NODE(field);
2198
197
    }
2199
    
2200
401
    lexer->insert = NULL;  /* defer implicit inline start tags */
2201
2202
424
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2203
407
    {
2204
407
        if (node->tag == field->tag && node->type == EndTag)
2205
180
        {
2206
180
            TY_(FreeNode)( doc, node);
2207
180
            field->closed = yes;
2208
180
            TrimSpaces(doc, field);
2209
2210
180
            DEBUG_LOG_EXIT;
2211
180
            return NULL;
2212
180
        }
2213
2214
        /* deal with comments etc. */
2215
227
        if (InsertMisc(field, node))
2216
0
            continue;
2217
2218
227
        if ( node->type == StartTag &&
2219
211
             ( nodeIsOPTION(node)   ||
2220
211
               nodeIsOPTGROUP(node) ||
2221
211
               nodeIsDATALIST(node) ||
2222
211
               nodeIsSCRIPT(node))
2223
227
           )
2224
204
        {
2225
204
            TidyParserMemory memory = {0};
2226
204
            memory.identity = TY_(ParseDatalist);
2227
204
            memory.original_node = field;
2228
204
            memory.reentry_node = node;
2229
204
            memory.reentry_mode = IgnoreWhitespace;
2230
2231
204
            TY_(InsertNodeAtEnd)(field, node);
2232
204
            TY_(pushMemory)(doc, memory);
2233
204
            DEBUG_LOG_EXIT_WITH_NODE(node);
2234
204
            return node;
2235
204
        }
2236
2237
        /* discard unexpected tags */
2238
23
        TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED);
2239
23
        TY_(FreeNode)( doc, node);
2240
23
    }
2241
2242
17
    TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR);
2243
2244
17
    DEBUG_LOG_EXIT;
2245
17
    return NULL;
2246
401
}
2247
2248
2249
/** MARK: TY_(ParseDefList)
2250
 *  Parses the `dl` tag.
2251
 *
2252
 *  This is a non-recursing parser. It uses the document's parser memory stack
2253
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2254
 *  This parser is also re-enterable, so that post-processing can occur after
2255
 *  such dispatching.
2256
*/
2257
Node* TY_(ParseDefList)( TidyDocImpl* doc, Node *list, GetTokenMode mode )
2258
18.1k
{
2259
18.1k
    Lexer* lexer = doc->lexer;
2260
18.1k
    Node *node = NULL;
2261
18.1k
    Node *parent = NULL;
2262
18.1k
    DEBUG_LOG_COUNTERS;
2263
2264
18.1k
    enum parserState {
2265
18.1k
        STATE_INITIAL,                /* This is the initial state for every parser. */
2266
18.1k
        STATE_POST_NODEISCENTER,      /* To-do after re-entering after checks. */
2267
18.1k
        STATE_COMPLETE,               /* Done with the switch. */
2268
18.1k
    } state = STATE_INITIAL;
2269
2270
18.1k
    if ( list == NULL )
2271
12.7k
    {
2272
12.7k
        TidyParserMemory memory = TY_(popMemory)( doc );
2273
12.7k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
2274
12.7k
        DEBUG_LOG_REENTER_WITH_NODE(node);
2275
12.7k
        list = memory.original_node;
2276
12.7k
        state = memory.reentry_state;
2277
12.7k
        DEBUG_LOG_GET_OLD_MODE;
2278
12.7k
        mode = memory.mode;
2279
12.7k
        DEBUG_LOG_CHANGE_MODE;
2280
12.7k
    }
2281
5.34k
    else
2282
5.34k
    {
2283
5.34k
        DEBUG_LOG_ENTER_WITH_NODE(list);
2284
5.34k
    }
2285
2286
18.1k
    if (list->tag->model & CM_EMPTY)
2287
0
        return NULL;
2288
2289
18.1k
    lexer->insert = NULL;  /* defer implicit inline start tags */
2290
2291
26.7k
    while ( state != STATE_COMPLETE )
2292
24.1k
    {
2293
24.1k
        if ( state == STATE_INITIAL )
2294
20.6k
            node = TY_(GetToken)( doc, IgnoreWhitespace);
2295
        
2296
24.1k
        switch ( state)
2297
24.1k
        {
2298
20.6k
            case STATE_INITIAL:
2299
20.6k
            {
2300
20.6k
                if ( node == NULL)
2301
2.59k
                {
2302
2.59k
                    state = STATE_COMPLETE;
2303
2.59k
                    continue;
2304
2.59k
                }
2305
2306
18.0k
                if (node->tag == list->tag && node->type == EndTag)
2307
0
                {
2308
0
                    TY_(FreeNode)( doc, node);
2309
0
                    list->closed = yes;
2310
0
                    DEBUG_LOG_EXIT;
2311
0
                    return NULL;
2312
0
                }
2313
2314
                /* deal with comments etc. */
2315
18.0k
                if (InsertMisc(list, node))
2316
696
                    continue;
2317
2318
17.3k
                if (TY_(nodeIsText)(node))
2319
3.22k
                {
2320
3.22k
                    TY_(UngetToken)( doc );
2321
3.22k
                    node = TY_(InferredTag)(doc, TidyTag_DT);
2322
3.22k
                    TY_(Report)(doc, list, node, MISSING_STARTTAG);
2323
3.22k
                }
2324
2325
17.3k
                if (node->tag == NULL)
2326
1.68k
                {
2327
1.68k
                    TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
2328
1.68k
                    TY_(FreeNode)( doc, node);
2329
1.68k
                    continue;
2330
1.68k
                }
2331
2332
                /*
2333
                  if this is the end tag for an ancestor element
2334
                  then infer end tag for this element
2335
                */
2336
15.6k
                if (node->type == EndTag)
2337
126
                {
2338
126
                    Bool discardIt = no;
2339
126
                    if ( nodeIsFORM(node) )
2340
0
                    {
2341
0
                        BadForm( doc );
2342
0
                        TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
2343
0
                        TY_(FreeNode)( doc, node );
2344
0
                        continue;
2345
0
                    }
2346
2347
126
                    for (parent = list->parent;
2348
1.01k
                            parent != NULL; parent = parent->parent)
2349
1.01k
                    {
2350
                       /* Do not match across BODY to avoid infinite loop
2351
                          between ParseBody and this parser,
2352
                          See http://tidy.sf.net/bug/1098012. */
2353
1.01k
                        if (nodeIsBODY(parent))
2354
89
                        {
2355
89
                            discardIt = yes;
2356
89
                            break;
2357
89
                        }
2358
926
                        if (node->tag == parent->tag)
2359
36
                        {
2360
36
                            TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE);
2361
36
                            TY_(UngetToken)( doc );
2362
2363
36
                            DEBUG_LOG_EXIT;
2364
36
                            return NULL;
2365
36
                        }
2366
926
                    }
2367
90
                    if (discardIt)
2368
89
                    {
2369
89
                        TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
2370
89
                        TY_(FreeNode)( doc, node);
2371
89
                        continue;
2372
89
                    }
2373
90
                }
2374
2375
                /* center in a dt or a dl breaks the dl list in two */
2376
15.4k
                if ( nodeIsCENTER(node) )
2377
3.51k
                {
2378
3.51k
                    if (list->content)
2379
3.51k
                        TY_(InsertNodeAfterElement)(list, node);
2380
2
                    else /* trim empty dl list */
2381
2
                    {
2382
2
                        TY_(InsertNodeBeforeElement)(list, node);
2383
2
                    }
2384
2385
                    /* #426885 - fix by Glenn Carroll 19 Apr 00, and
2386
                                 Gary Dechaines 11 Aug 00 */
2387
                    /* ParseTag can destroy node, if it finds that
2388
                     * this <center> is followed immediately by </center>.
2389
                     * It's awkward but necessary to determine if this
2390
                     * has happened.
2391
                     */
2392
3.51k
                    parent = node->parent;
2393
2394
                    /* and parse contents of center */
2395
3.51k
                    lexer->excludeBlocks = no;
2396
2397
3.51k
                    {
2398
3.51k
                        TidyParserMemory memory = {0};
2399
3.51k
                        memory.identity = TY_(ParseDefList);
2400
3.51k
                        memory.original_node = list;
2401
3.51k
                        memory.reentry_node = node;
2402
3.51k
                        memory.reentry_state = STATE_POST_NODEISCENTER;
2403
3.51k
                        TY_(pushMemory)( doc, memory );
2404
3.51k
                        DEBUG_LOG_EXIT_WITH_NODE(node);
2405
3.51k
                        return node;
2406
3.51k
                    }
2407
3.51k
                }
2408
2409
11.9k
                if ( !( nodeIsDT(node) || nodeIsDD(node) || ( nodeIsDIV(node) && TY_(IsHTML5Mode)(doc) ) ) )
2410
6.12k
                {
2411
6.12k
                    TY_(UngetToken)( doc );
2412
2413
6.12k
                    if (!(node->tag->model & (CM_BLOCK | CM_INLINE)))
2414
1.49k
                    {
2415
1.49k
                        TY_(Report)(doc, list, node, TAG_NOT_ALLOWED_IN);
2416
1.49k
                        DEBUG_LOG_EXIT;
2417
1.49k
                        return NULL;
2418
1.49k
                    }
2419
2420
                    /* if DD appeared directly in BODY then exclude blocks */
2421
4.63k
                    if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks)
2422
1.18k
                    {
2423
1.18k
                        DEBUG_LOG_EXIT;
2424
1.18k
                        return NULL;
2425
1.18k
                    }
2426
2427
3.45k
                    node = TY_(InferredTag)(doc, TidyTag_DD);
2428
3.45k
                    TY_(Report)(doc, list, node, MISSING_STARTTAG);
2429
3.45k
                }
2430
2431
9.30k
                if (node->type == EndTag)
2432
0
                {
2433
0
                    TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
2434
0
                    TY_(FreeNode)( doc, node);
2435
0
                    continue;
2436
0
                }
2437
2438
                /* node should be <DT> or <DD> or <DIV>*/
2439
9.30k
                TY_(InsertNodeAtEnd)(list, node);
2440
9.30k
                {
2441
9.30k
                    TidyParserMemory memory = {0};
2442
9.30k
                    memory.identity = TY_(ParseDefList);
2443
9.30k
                    memory.original_node = list;
2444
9.30k
                    memory.reentry_node = node;
2445
9.30k
                    memory.reentry_state = STATE_INITIAL;
2446
9.30k
                    TY_(pushMemory)( doc, memory );
2447
9.30k
                    DEBUG_LOG_EXIT;
2448
9.30k
                    return node;
2449
9.30k
                }
2450
9.30k
            } break;
2451
2452
2453
3.51k
            case STATE_POST_NODEISCENTER:
2454
3.51k
            {
2455
3.51k
                lexer->excludeBlocks = yes;
2456
2457
                /* now create a new dl element,
2458
                 * unless node has been blown away because the
2459
                 * center was empty, as above.
2460
                 */
2461
3.51k
                if (parent && parent->last == node)
2462
0
                {
2463
0
                    list = TY_(InferredTag)(doc, TidyTag_DL);
2464
0
                    TY_(InsertNodeAfterElement)(node, list);
2465
0
                }
2466
3.51k
                state = STATE_INITIAL;
2467
3.51k
                continue;
2468
9.30k
            } break;
2469
2470
2471
0
            default:
2472
0
                break;
2473
24.1k
        } /* switch */
2474
24.1k
    } /* while */
2475
2476
2.59k
    TY_(Report)(doc, list, node, MISSING_ENDTAG_FOR);
2477
2.59k
    DEBUG_LOG_EXIT;
2478
2.59k
    return NULL;
2479
18.1k
}
2480
2481
2482
/** MARK: TY_(ParseEmpty)
2483
 *  Parse empty element nodes.
2484
 *
2485
 *  This is a non-recursing parser. It uses the document's parser memory stack
2486
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2487
 *  This parser is also re-enterable, so that post-processing can occur after
2488
 *  such dispatching.
2489
  */
2490
Node* TY_(ParseEmpty)( TidyDocImpl* doc, Node *element, GetTokenMode mode )
2491
7.46k
{
2492
7.46k
    Lexer* lexer = doc->lexer;
2493
7.46k
    if ( lexer->isvoyager )
2494
1.99k
    {
2495
1.99k
        Node *node = TY_(GetToken)( doc, mode);
2496
1.99k
        if ( node )
2497
1.84k
        {
2498
1.84k
            if ( !(node->type == EndTag && node->tag == element->tag) )
2499
1.66k
            {
2500
                /* TY_(Report)(doc, element, node, ELEMENT_NOT_EMPTY); */
2501
1.66k
                TY_(UngetToken)( doc );
2502
1.66k
            }
2503
186
            else
2504
186
            {
2505
186
                TY_(FreeNode)( doc, node );
2506
186
            }
2507
1.84k
        }
2508
1.99k
    }
2509
7.46k
    return NULL;
2510
7.46k
}
2511
2512
2513
/** MARK: TY_(ParseFrameSet)
2514
 *  Parses the `frameset` tag.
2515
 *
2516
 *  This is a non-recursing parser. It uses the document's parser memory stack
2517
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2518
 *  This parser is also re-enterable, so that post-processing can occur after
2519
 *  such dispatching.
2520
 */
2521
Node* TY_(ParseFrameSet)( TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode) )
2522
23.2k
{
2523
23.2k
    Lexer* lexer = doc->lexer;
2524
23.2k
    Node *node;
2525
23.2k
    DEBUG_LOG_COUNTERS;
2526
2527
    /*
2528
     If we're re-entering, then we need to setup from a previous state,
2529
     instead of starting fresh. We can pull what we need from the document's
2530
     stack.
2531
     */
2532
23.2k
    if ( frameset == NULL )
2533
10.8k
    {
2534
10.8k
        TidyParserMemory memory = TY_(popMemory)( doc );
2535
10.8k
        node = memory.reentry_node; /* Throwaway, because we replace it entering the loop. */
2536
10.8k
        DEBUG_LOG_REENTER_WITH_NODE(node);
2537
10.8k
        frameset = memory.original_node;
2538
10.8k
        DEBUG_LOG_GET_OLD_MODE;
2539
10.8k
        mode = memory.mode;
2540
10.8k
        DEBUG_LOG_CHANGE_MODE;
2541
10.8k
    }
2542
12.4k
    else
2543
12.4k
    {
2544
12.4k
        DEBUG_LOG_ENTER_WITH_NODE(frameset);
2545
12.4k
        if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
2546
12.4k
        {
2547
12.4k
            doc->badAccess |= BA_USING_FRAMES;
2548
12.4k
        }
2549
12.4k
    }
2550
2551
38.2k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2552
28.4k
    {
2553
28.4k
        if (node->tag == frameset->tag && node->type == EndTag)
2554
2.51k
        {
2555
2.51k
            TY_(FreeNode)( doc, node);
2556
2.51k
            frameset->closed = yes;
2557
2.51k
            TrimSpaces(doc, frameset);
2558
2.51k
            DEBUG_LOG_EXIT;
2559
2.51k
            return NULL;
2560
2.51k
        }
2561
2562
        /* deal with comments etc. */
2563
25.9k
        if (InsertMisc(frameset, node))
2564
269
            continue;
2565
2566
25.6k
        if (node->tag == NULL)
2567
6.23k
        {
2568
6.23k
            TY_(Report)(doc, frameset, node, DISCARDING_UNEXPECTED);
2569
6.23k
            TY_(FreeNode)( doc, node);
2570
6.23k
            continue;
2571
6.23k
        }
2572
2573
19.4k
        if (TY_(nodeIsElement)(node))
2574
19.0k
        {
2575
19.0k
            if (node->tag && node->tag->model & CM_HEAD)
2576
665
            {
2577
665
                MoveToHead(doc, frameset, node);
2578
665
                continue;
2579
665
            }
2580
19.0k
        }
2581
2582
18.7k
        if ( nodeIsBODY(node) )
2583
1.42k
        {
2584
1.42k
            TY_(UngetToken)( doc );
2585
1.42k
            node = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
2586
1.42k
            TY_(Report)(doc, frameset, node, INSERTING_TAG);
2587
1.42k
        }
2588
2589
18.7k
        if (node->type == StartTag && (node->tag && node->tag->model & CM_FRAMES))
2590
10.9k
        {
2591
10.9k
            TY_(InsertNodeAtEnd)(frameset, node);
2592
10.9k
            lexer->excludeBlocks = no;
2593
            
2594
            /*
2595
             * We don't really have to do anything when re-entering, except
2596
             * setting up the state when we left. No post-processing means
2597
             * this stays simple.
2598
             */
2599
10.9k
            TidyParserMemory memory = {0};
2600
10.9k
            memory.identity = TY_(ParseFrameSet);
2601
10.9k
            memory.original_node = frameset;
2602
10.9k
            memory.reentry_node = node;
2603
10.9k
            memory.mode = MixedContent;
2604
10.9k
            TY_(pushMemory)( doc, memory );
2605
10.9k
            DEBUG_LOG_EXIT_WITH_NODE(node);
2606
10.9k
            return node;
2607
10.9k
        }
2608
7.84k
        else if (node->type == StartEndTag && (node->tag && node->tag->model & CM_FRAMES))
2609
599
        {
2610
599
            TY_(InsertNodeAtEnd)(frameset, node);
2611
599
            continue;
2612
599
        }
2613
2614
        /* discard unexpected tags */
2615
        /* WAI [6.5.1.4] link is being discarded outside of NOFRAME */
2616
7.25k
        if ( nodeIsA(node) )
2617
390
           doc->badAccess |= BA_INVALID_LINK_NOFRAMES;
2618
2619
7.25k
        TY_(Report)(doc, frameset, node, DISCARDING_UNEXPECTED);
2620
7.25k
        TY_(FreeNode)( doc, node);
2621
7.25k
    }
2622
2623
9.74k
    TY_(Report)(doc, frameset, node, MISSING_ENDTAG_FOR);
2624
9.74k
    DEBUG_LOG_EXIT;
2625
9.74k
    return NULL;
2626
23.2k
}
2627
2628
2629
/** MARK: TY_(ParseHead)
2630
 *  Parses the `head` tag.
2631
 *
2632
 *  This is a non-recursing parser. It uses the document's parser memory stack
2633
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2634
 *  This parser is also re-enterable, so that post-processing can occur after
2635
 *  such dispatching.
2636
 */
2637
Node* TY_(ParseHead)( TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode) )
2638
17.5k
{
2639
17.5k
    Lexer* lexer = doc->lexer;
2640
17.5k
    Node *node;
2641
17.5k
    int HasTitle = 0;
2642
17.5k
    int HasBase = 0;
2643
17.5k
    DEBUG_LOG_COUNTERS;
2644
2645
17.5k
    if ( head == NULL )
2646
511
    {
2647
511
        TidyParserMemory memory = TY_(popMemory)( doc );
2648
511
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
2649
511
        DEBUG_LOG_REENTER_WITH_NODE(node);
2650
511
        head = memory.original_node;
2651
511
        HasTitle = memory.register_1;
2652
511
        HasBase = memory.register_2;
2653
511
        DEBUG_LOG_GET_OLD_MODE;
2654
511
        mode = memory.mode;
2655
511
        DEBUG_LOG_CHANGE_MODE;
2656
511
    }
2657
17.0k
    else
2658
17.0k
    {
2659
17.0k
        DEBUG_LOG_ENTER_WITH_NODE(head);
2660
17.0k
    }
2661
    
2662
20.2k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
2663
20.1k
    {
2664
20.1k
        if (node->tag == head->tag && node->type == EndTag)
2665
136
        {
2666
136
            TY_(FreeNode)( doc, node);
2667
136
            head->closed = yes;
2668
136
            break;
2669
136
        }
2670
2671
        /* find and discard multiple <head> elements */
2672
        /* find and discard <html> in <head> elements */
2673
20.0k
        if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag)
2674
962
        {
2675
962
            TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
2676
962
            TY_(FreeNode)(doc, node);
2677
962
            continue;
2678
962
        }
2679
2680
19.0k
        if (TY_(nodeIsText)(node))
2681
1.06k
        {
2682
            /*\ Issue #132 - avoid warning for missing body tag,
2683
             *  if configured to --omit-otpional-tags yes
2684
             *  Issue #314 - and if --show-body-only
2685
            \*/
2686
1.06k
            if (!cfgBool( doc, TidyOmitOptionalTags ) &&
2687
973
                !showingBodyOnly(doc) )
2688
973
            {
2689
973
                TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN);
2690
973
            }
2691
1.06k
            TY_(UngetToken)( doc );
2692
1.06k
            break;
2693
1.06k
        }
2694
2695
18.0k
        if (node->type == ProcInsTag && node->element &&
2696
0
            TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0)
2697
0
        {
2698
0
            TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN);
2699
0
            TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node);
2700
0
            continue;
2701
0
        }
2702
2703
        /* deal with comments etc. */
2704
18.0k
        if (InsertMisc(head, node))
2705
0
            continue;
2706
2707
18.0k
        if (node->type == DocTypeTag)
2708
283
        {
2709
283
            InsertDocType(doc, head, node);
2710
283
            continue;
2711
283
        }
2712
2713
        /* discard unknown tags */
2714
17.7k
        if (node->tag == NULL)
2715
1.40k
        {
2716
1.40k
            TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
2717
1.40k
            TY_(FreeNode)( doc, node);
2718
1.40k
            continue;
2719
1.40k
        }
2720
2721
        /*
2722
         if it doesn't belong in the head then
2723
         treat as implicit end of head and deal
2724
         with as part of the body
2725
        */
2726
16.3k
        if (!(node->tag->model & CM_HEAD))
2727
15.8k
        {
2728
            /* #545067 Implicit closing of head broken - warn only for XHTML input */
2729
15.8k
            if ( lexer->isvoyager )
2730
2
                TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN );
2731
15.8k
            TY_(UngetToken)( doc );
2732
15.8k
            break;
2733
15.8k
        }
2734
2735
516
        if (TY_(nodeIsElement)(node))
2736
511
        {
2737
511
            if ( nodeIsTITLE(node) )
2738
12
            {
2739
12
                ++HasTitle;
2740
2741
12
                if (HasTitle > 1)
2742
11
                    TY_(Report)(doc, head, node,
2743
11
                                     head ?
2744
11
                                     TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
2745
12
            }
2746
499
            else if ( nodeIsBASE(node) )
2747
0
            {
2748
0
                ++HasBase;
2749
2750
0
                if (HasBase > 1)
2751
0
                    TY_(Report)(doc, head, node,
2752
0
                                     head ?
2753
0
                                     TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS);
2754
0
            }
2755
2756
511
            TY_(InsertNodeAtEnd)(head, node);
2757
2758
511
            {
2759
511
                TidyParserMemory memory = {0};
2760
511
                memory.identity = TY_(ParseHead);
2761
511
                memory.original_node = head;
2762
511
                memory.reentry_node = node;
2763
511
                memory.register_1 = HasTitle;
2764
511
                memory.register_2 = HasBase;
2765
511
                TY_(pushMemory)( doc, memory );
2766
511
                DEBUG_LOG_EXIT_WITH_NODE(node);
2767
511
                return node;
2768
511
            }
2769
511
        }
2770
2771
        /* discard unexpected text nodes and end tags */
2772
5
        TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
2773
5
        TY_(FreeNode)( doc, node);
2774
5
    }
2775
17.0k
    DEBUG_LOG_EXIT;
2776
17.0k
    return NULL;
2777
17.5k
}
2778
2779
2780
/** MARK: TY_(ParseHTML)
2781
 *  Parses the `html` tag. At this point, other root-level stuff (doctype,
2782
 *  comments) are already set up, and here we handle all of the complexities
2783
 *  of things such as frameset documents, etc.
2784
 *
2785
 *  This is a non-recursing parser. It uses the document's parser memory stack
2786
 *  to send subsequent nodes back to the controller for dispatching to parsers.
2787
 *  This parser is also re-enterable, so that post-processing can occur after
2788
 *  such dispatching.
2789
 */
2790
Node* TY_(ParseHTML)( TidyDocImpl *doc, Node *html, GetTokenMode mode )
2791
39.5k
{
2792
39.5k
    Node *node = NULL;
2793
39.5k
    Node *head = NULL;
2794
39.5k
    Node *frameset = NULL;
2795
39.5k
    Node *noframes = NULL;
2796
39.5k
    DEBUG_LOG_COUNTERS;
2797
2798
39.5k
    enum parserState {
2799
39.5k
        STATE_INITIAL,                /* This is the initial state for every parser. */
2800
39.5k
        STATE_COMPLETE,               /* Complete! */
2801
39.5k
        STATE_PRE_BODY,               /* In this state, we'll consider frames vs. body. */
2802
39.5k
        STATE_PARSE_BODY,             /* In this state, we can parse the body. */
2803
39.5k
        STATE_PARSE_HEAD,             /* In this state, we will setup head for parsing. */
2804
39.5k
        STATE_PARSE_HEAD_REENTER,     /* Resume here after parsing head. */
2805
39.5k
        STATE_PARSE_NOFRAMES,         /* In this state, we can parse noframes content. */
2806
39.5k
        STATE_PARSE_NOFRAMES_REENTER, /* In this state, we can restore more state. */
2807
39.5k
        STATE_PARSE_FRAMESET,         /* In this state, we will parse frameset content. */
2808
39.5k
        STATE_PARSE_FRAMESET_REENTER, /* We need to cleanup some things after parsing frameset. */
2809
39.5k
    } state = STATE_INITIAL;
2810
2811
39.5k
    TY_(SetOptionBool)( doc, TidyXmlTags, no );
2812
2813
39.5k
    if ( html == NULL )
2814
22.6k
    {
2815
22.6k
        TidyParserMemory memory = TY_(popMemory)( doc );
2816
22.6k
        node = memory.reentry_node;
2817
22.6k
        DEBUG_LOG_REENTER_WITH_NODE(node);
2818
22.6k
        html = memory.original_node;
2819
22.6k
        state = memory.reentry_state;
2820
22.6k
        DEBUG_LOG_GET_OLD_MODE;
2821
22.6k
        mode = memory.reentry_mode;
2822
22.6k
        DEBUG_LOG_CHANGE_MODE;
2823
22.6k
    }
2824
16.9k
    else
2825
16.9k
    {
2826
16.9k
        DEBUG_LOG_ENTER_WITH_NODE(html);
2827
16.9k
    }
2828
2829
    /*
2830
     This main loop pulls tokens from the lexer until we're out of tokens,
2831
     or until there's no more work to do.
2832
     */
2833
105k
    while ( state != STATE_COMPLETE )
2834
104k
    {
2835
104k
        if ( state == STATE_INITIAL || state == STATE_PRE_BODY )
2836
43.2k
        {
2837
43.2k
            node = TY_(GetToken)( doc, IgnoreWhitespace );
2838
43.2k
            DEBUG_LOG_GOT_TOKEN(node);
2839
43.2k
        }
2840
2841
104k
        switch ( state )
2842
104k
        {
2843
            /**************************************************************
2844
             This case is all about finding a head tag and dealing with
2845
             cases were we don't, so that we can move on to parsing a head
2846
             tag.
2847
             **************************************************************/
2848
17.0k
            case STATE_INITIAL:
2849
17.0k
            {
2850
                /*
2851
                 The only way we can possibly be here is if the lexer
2852
                 had nothing to give us. Thus we'll create our own
2853
                 head, and set the signal to start parsing it.
2854
                 */
2855
17.0k
                if (node == NULL)
2856
442
                {
2857
442
                    node = TY_(InferredTag)(doc, TidyTag_HEAD);
2858
442
                    state = STATE_PARSE_HEAD;
2859
442
                    continue;
2860
442
                }
2861
2862
                /* We found exactly what we expected: head. */
2863
16.6k
                if ( nodeIsHEAD(node) )
2864
2
                {
2865
2
                    state = STATE_PARSE_HEAD;
2866
2
                    continue;
2867
2
                }
2868
2869
                /* We did not expect to find an html closing tag here! */
2870
16.6k
                if (html && (node->tag == html->tag) && (node->type == EndTag))
2871
0
                {
2872
0
                    TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2873
0
                    TY_(FreeNode)( doc, node);
2874
0
                    continue;
2875
0
                }
2876
2877
                /* Find and discard multiple <html> elements. */
2878
16.6k
                if (html && (node->tag == html->tag) && (node->type == StartTag))
2879
91
                {
2880
91
                    TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2881
91
                    TY_(FreeNode)(doc, node);
2882
91
                    continue;
2883
91
                }
2884
2885
                /* Deal with comments, etc. */
2886
16.5k
                if (InsertMisc(html, node))
2887
84
                    continue;
2888
2889
                /*
2890
                 At this point, we didn't find a head tag, so put the
2891
                 token back and create our own head tag, so we can
2892
                 move on.
2893
                 */
2894
16.4k
                TY_(UngetToken)( doc );
2895
16.4k
                node = TY_(InferredTag)(doc, TidyTag_HEAD);
2896
16.4k
                state = STATE_PARSE_HEAD;
2897
16.4k
                continue;
2898
16.5k
            } break;
2899
2900
2901
            /**************************************************************
2902
             This case determines whether we're dealing with body or
2903
             frameset + noframes, and sets things up accordingly.
2904
             **************************************************************/
2905
26.1k
            case STATE_PRE_BODY:
2906
26.1k
            {
2907
26.1k
                if (node == NULL )
2908
836
                {
2909
836
                    if (frameset == NULL) /* Implied body. */
2910
9
                    {
2911
9
                        node = TY_(InferredTag)(doc, TidyTag_BODY);
2912
9
                        state = STATE_PARSE_BODY;
2913
827
                    } else {
2914
827
                        state = STATE_COMPLETE;
2915
827
                    }
2916
2917
836
                    continue;
2918
836
                }
2919
2920
                /* Robustly handle html tags. */
2921
25.3k
                if (node->tag == html->tag)
2922
22
                {
2923
22
                    if (node->type != StartTag && frameset == NULL)
2924
16
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2925
2926
22
                    TY_(FreeNode)( doc, node);
2927
22
                    continue;
2928
22
                }
2929
2930
                /* Deal with comments, etc. */
2931
25.2k
                if (InsertMisc(html, node))
2932
3.22k
                    continue;
2933
2934
                /* If frameset document, coerce <body> to <noframes> */
2935
22.0k
                if ( nodeIsBODY(node) )
2936
2.04k
                {
2937
2.04k
                    if (node->type != StartTag)
2938
11
                    {
2939
11
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2940
11
                        TY_(FreeNode)( doc, node);
2941
11
                        continue;
2942
11
                    }
2943
2944
2.02k
                    if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
2945
2.02k
                    {
2946
2.02k
                        if (frameset != NULL)
2947
2.01k
                        {
2948
2.01k
                            TY_(UngetToken)( doc );
2949
2950
2.01k
                            if (noframes == NULL)
2951
2.01k
                            {
2952
2.01k
                                noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
2953
2.01k
                                TY_(InsertNodeAtEnd)(frameset, noframes);
2954
2.01k
                                TY_(Report)(doc, html, noframes, INSERTING_TAG);
2955
2.01k
                            }
2956
2
                            else
2957
2
                            {
2958
2
                                if (noframes->type == StartEndTag)
2959
0
                                    noframes->type = StartTag;
2960
2
                            }
2961
2962
2.01k
                            state = STATE_PARSE_NOFRAMES;
2963
2.01k
                            continue;
2964
2.01k
                        }
2965
2.02k
                    }
2966
2967
17
                    TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
2968
17
                    state = STATE_PARSE_BODY;
2969
17
                    continue;
2970
2.02k
                }
2971
2972
                /* Flag an error if we see more than one frameset. */
2973
20.0k
                if ( nodeIsFRAMESET(node) )
2974
3.63k
                {
2975
3.63k
                    if (node->type != StartTag)
2976
24
                    {
2977
24
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2978
24
                        TY_(FreeNode)( doc, node);
2979
24
                        continue;
2980
24
                    }
2981
2982
3.60k
                    if (frameset != NULL)
2983
312
                        TY_(Report)(doc, html, node, DUPLICATE_FRAMESET);
2984
3.29k
                    else
2985
3.29k
                        frameset = node;
2986
2987
3.60k
                    state = STATE_PARSE_FRAMESET;
2988
3.60k
                    continue;
2989
3.63k
                }
2990
2991
                /* If not a frameset document coerce <noframes> to <body>. */
2992
16.3k
                if ( nodeIsNOFRAMES(node) )
2993
707
                {
2994
707
                    if (node->type != StartTag)
2995
0
                    {
2996
0
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
2997
0
                        TY_(FreeNode)( doc, node);
2998
0
                        continue;
2999
0
                    }
3000
3001
707
                    if (frameset == NULL)
3002
13
                    {
3003
13
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
3004
13
                        TY_(FreeNode)( doc, node);
3005
13
                        node = TY_(InferredTag)(doc, TidyTag_BODY);
3006
13
                        state = STATE_PARSE_BODY;
3007
13
                        continue;
3008
13
                    }
3009
3010
694
                    if (noframes == NULL)
3011
688
                    {
3012
688
                        noframes = node;
3013
688
                        TY_(InsertNodeAtEnd)(frameset, noframes);
3014
688
                        state = STATE_PARSE_NOFRAMES;
3015
688
                    }
3016
6
                    else
3017
6
                    {
3018
6
                        TY_(FreeNode)( doc, node);
3019
6
                    }
3020
3021
694
                    continue;
3022
707
                }
3023
3024
                /* Deal with some other element that we're not expecting. */
3025
15.6k
                if (TY_(nodeIsElement)(node))
3026
13.5k
                {
3027
13.5k
                    if (node->tag && node->tag->model & CM_HEAD)
3028
12
                    {
3029
12
                        MoveToHead(doc, html, node);
3030
12
                        continue;
3031
12
                    }
3032
3033
                    /* Discard illegal frame element following a frameset. */
3034
13.5k
                    if ( frameset != NULL && nodeIsFRAME(node) )
3035
239
                    {
3036
239
                        TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED);
3037
239
                        TY_(FreeNode)(doc, node);
3038
239
                        continue;
3039
239
                    }
3040
13.5k
                }
3041
3042
15.4k
                TY_(UngetToken)( doc );
3043
3044
                /* Insert other content into noframes element. */
3045
15.4k
                if (frameset)
3046
1.84k
                {
3047
1.84k
                    if (noframes == NULL)
3048
1.32k
                    {
3049
1.32k
                        noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES);
3050
1.32k
                        TY_(InsertNodeAtEnd)(frameset, noframes);
3051
1.32k
                    }
3052
524
                    else
3053
524
                    {
3054
524
                        TY_(Report)(doc, html, node, NOFRAMES_CONTENT);
3055
524
                        if (noframes->type == StartEndTag)
3056
221
                            noframes->type = StartTag;
3057
524
                    }
3058
3059
1.84k
                    TY_(ConstrainVersion)(doc, VERS_FRAMESET);
3060
1.84k
                    state = STATE_PARSE_NOFRAMES;
3061
1.84k
                    continue;
3062
1.84k
                }
3063
3064
13.5k
                node = TY_(InferredTag)(doc, TidyTag_BODY);
3065
3066
                /* Issue #132 - disable inserting BODY tag warning
3067
                 BUT only if NOT --show-body-only yes */
3068
27.1k
                if (!showingBodyOnly(doc))
3069
13.5k
                    TY_(Report)(doc, html, node, INSERTING_TAG );
3070
3071
13.5k
                TY_(ConstrainVersion)(doc, ~VERS_FRAMESET);
3072
13.5k
                state = STATE_PARSE_BODY;
3073
13.5k
                continue;
3074
15.4k
            } break;
3075
3076
3077
            /**************************************************************
3078
             In this case, we're ready to parse the head, and move on to
3079
             look for the body or body alternative.
3080
             **************************************************************/
3081
16.9k
            case STATE_PARSE_HEAD:
3082
16.9k
            {
3083
16.9k
                TidyParserMemory memory = {0};
3084
16.9k
                memory.identity = TY_(ParseHTML);
3085
16.9k
                memory.mode = mode;
3086
16.9k
                memory.original_node = html;
3087
16.9k
                memory.reentry_node = node;
3088
16.9k
                memory.reentry_mode = mode;
3089
16.9k
                memory.reentry_state = STATE_PARSE_HEAD_REENTER;
3090
16.9k
                TY_(InsertNodeAtEnd)(html, node);
3091
16.9k
                TY_(pushMemory)( doc, memory );
3092
16.9k
                DEBUG_LOG_EXIT_WITH_NODE(node);
3093
16.9k
                return node;
3094
15.4k
            } break;
3095
3096
16.9k
            case STATE_PARSE_HEAD_REENTER:
3097
16.9k
            {
3098
16.9k
                head = node;
3099
16.9k
                state = STATE_PRE_BODY;
3100
16.9k
            } break;
3101
3102
3103
            /**************************************************************
3104
             In this case, we can finally parse a body.
3105
             **************************************************************/
3106
13.6k
            case STATE_PARSE_BODY:
3107
13.6k
            {
3108
13.6k
                TidyParserMemory memory = {0};
3109
13.6k
                memory.identity = NULL; /* we don't need to reenter */
3110
13.6k
                memory.mode = mode;
3111
13.6k
                memory.original_node = html;
3112
13.6k
                memory.reentry_node = NULL;
3113
13.6k
                memory.reentry_mode = mode;
3114
13.6k
                memory.reentry_state = STATE_COMPLETE;
3115
13.6k
                TY_(InsertNodeAtEnd)(html, node);
3116
13.6k
                TY_(pushMemory)( doc, memory );
3117
13.6k
                DEBUG_LOG_EXIT_WITH_NODE(node);
3118
13.6k
                return node;
3119
15.4k
            } break;
3120
3121
3122
            /**************************************************************
3123
             In this case, we will parse noframes. If necessary, the
3124
             node is already inserted in the proper spot.
3125
             **************************************************************/
3126
4.54k
            case STATE_PARSE_NOFRAMES:
3127
4.54k
            {
3128
4.54k
                TidyParserMemory memory = {0};
3129
4.54k
                memory.identity = TY_(ParseHTML);
3130
4.54k
                memory.mode = mode;
3131
4.54k
                memory.original_node = html;
3132
4.54k
                memory.reentry_node = frameset;
3133
4.54k
                memory.reentry_mode = mode;
3134
4.54k
                memory.reentry_state = STATE_PARSE_NOFRAMES_REENTER;
3135
4.54k
                TY_(pushMemory)( doc, memory );
3136
4.54k
                DEBUG_LOG_EXIT_WITH_NODE(node);
3137
4.54k
                return noframes;
3138
15.4k
            } break;
3139
3140
2.22k
            case STATE_PARSE_NOFRAMES_REENTER:
3141
2.22k
            {
3142
2.22k
                frameset = node;
3143
2.22k
                state = STATE_PRE_BODY;
3144
2.22k
            } break;
3145
3146
3147
            /**************************************************************
3148
             In this case, we parse the frameset, and look for noframes
3149
             content to merge later if necessary.
3150
             **************************************************************/
3151
3.60k
            case STATE_PARSE_FRAMESET:
3152
3.60k
            {
3153
3.60k
                TidyParserMemory memory = {0};
3154
3.60k
                memory.identity = TY_(ParseHTML);
3155
3.60k
                memory.mode = mode;
3156
3.60k
                memory.original_node = html;
3157
3.60k
                memory.reentry_node = frameset;
3158
3.60k
                memory.reentry_mode = mode;
3159
3.60k
                memory.reentry_state = STATE_PARSE_FRAMESET_REENTER;
3160
3.60k
                TY_(InsertNodeAtEnd)(html, node);
3161
3.60k
                TY_(pushMemory)( doc, memory );
3162
3.60k
                DEBUG_LOG_EXIT_WITH_NODE(node);
3163
3.60k
                return node;
3164
15.4k
            } break;
3165
3166
3.46k
            case (STATE_PARSE_FRAMESET_REENTER):
3167
3.46k
            {
3168
3.46k
                frameset = node;
3169
                /*
3170
                 See if it includes a noframes element so that
3171
                 we can merge subsequent noframes elements.
3172
                 */
3173
4.18k
                for (node = frameset->content; node; node = node->next)
3174
724
                {
3175
724
                    if ( nodeIsNOFRAMES(node) )
3176
567
                        noframes = node;
3177
724
                }
3178
3.46k
                state = STATE_PRE_BODY;
3179
3.46k
            } break;
3180
3181
3182
            /**************************************************************
3183
             We really shouldn't get here, but if we do, finish nicely.
3184
             **************************************************************/
3185
0
            default:
3186
0
            {
3187
0
                state = STATE_COMPLETE;
3188
0
            }
3189
104k
        } /* switch */
3190
104k
    } /* while */
3191
3192
827
    DEBUG_LOG_EXIT;
3193
827
    return NULL;
3194
39.5k
}
3195
3196
3197
/** MARK: TY_(ParseInline)
3198
 *  Parse inline element nodes.
3199
 *
3200
 *  This is a non-recursing parser. It uses the document's parser memory stack
3201
 *  to send subsequent nodes back to the controller for dispatching to parsers.
3202
 *  This parser is also re-enterable, so that post-processing can occur after
3203
 *  such dispatching.
3204
*/
3205
Node* TY_(ParseInline)( TidyDocImpl *doc, Node *element, GetTokenMode mode )
3206
591k
{
3207
591k
    Lexer* lexer = doc->lexer;
3208
591k
    Node *node = NULL;
3209
591k
    Node *parent = NULL;
3210
591k
    DEBUG_LOG_COUNTERS;
3211
    
3212
591k
    if ( element == NULL )
3213
288k
    {
3214
288k
        TidyParserMemory memory = TY_(popMemory)( doc );
3215
288k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
3216
288k
        DEBUG_LOG_REENTER_WITH_NODE(node);
3217
288k
        element = memory.original_node;
3218
288k
        DEBUG_LOG_GET_OLD_MODE;
3219
288k
        mode = memory.reentry_mode;
3220
288k
        DEBUG_LOG_CHANGE_MODE;
3221
288k
    }
3222
302k
    else
3223
302k
    {
3224
302k
        DEBUG_LOG_ENTER_WITH_NODE(element);
3225
3226
302k
        if (element->tag->model & CM_EMPTY)
3227
0
        {
3228
0
            DEBUG_LOG_EXIT;
3229
0
            return NULL;
3230
0
        }
3231
3232
        /*
3233
         ParseInline is used for some block level elements like H1 to H6
3234
         For such elements we need to insert inline emphasis tags currently
3235
         on the inline stack. For Inline elements, we normally push them
3236
         onto the inline stack provided they aren't implicit or OBJECT/APPLET.
3237
         This test is carried out in PushInline and PopInline, see istack.c
3238
3239
         InlineDup(...) is not called for elements with a CM_MIXED (inline and
3240
         block) content model, e.g. <del> or <ins>, otherwise constructs like
3241
3242
           <p>111<a name='foo'>222<del>333</del>444</a>555</p>
3243
           <p>111<span>222<del>333</del>444</span>555</p>
3244
           <p>111<em>222<del>333</del>444</em>555</p>
3245
3246
         will get corrupted.
3247
        */
3248
302k
        if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) &&
3249
10.0k
            !TY_(nodeHasCM)(element, CM_MIXED))
3250
10.0k
            TY_(InlineDup)(doc, NULL);
3251
292k
        else if (TY_(nodeHasCM)(element, CM_INLINE))
3252
292k
            TY_(PushInline)(doc, element);
3253
3254
302k
        if ( nodeIsNOBR(element) )
3255
0
            doc->badLayout |= USING_NOBR;
3256
302k
        else if ( nodeIsFONT(element) )
3257
275k
            doc->badLayout |= USING_FONT;
3258
3259
        /* Inline elements may or may not be within a preformatted element */
3260
302k
        if (mode != Preformatted)
3261
302k
        {
3262
302k
            DEBUG_LOG_GET_OLD_MODE;
3263
302k
            mode = MixedContent;
3264
302k
            DEBUG_LOG_CHANGE_MODE;
3265
302k
        }
3266
302k
    }
3267
    
3268
630k
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
3269
495k
    {
3270
        /* end tag for current element */
3271
495k
        if (node->tag == element->tag && node->type == EndTag)
3272
5.42k
        {
3273
5.42k
            if (element->tag->model & CM_INLINE)
3274
5.29k
                TY_(PopInline)( doc, node );
3275
3276
5.42k
            TY_(FreeNode)( doc, node );
3277
3278
5.42k
            if (!(mode & Preformatted))
3279
5.42k
                TrimSpaces(doc, element);
3280
3281
            /*
3282
             if a font element wraps an anchor and nothing else
3283
             then move the font element inside the anchor since
3284
             otherwise it won't alter the anchor text color
3285
            */
3286
5.42k
            if ( nodeIsFONT(element) &&
3287
1.94k
                 element->content && element->content == element->last )
3288
1.14k
            {
3289
1.14k
                Node *child = element->content;
3290
3291
1.14k
                if ( nodeIsA(child) )
3292
15
                {
3293
15
                    child->parent = element->parent;
3294
15
                    child->next = element->next;
3295
15
                    child->prev = element->prev;
3296
3297
15
                    element->next = NULL;
3298
15
                    element->prev = NULL;
3299
15
                    element->parent = child;
3300
3301
15
                    element->content = child->content;
3302
15
                    element->last = child->last;
3303
15
                    child->content = element;
3304
3305
15
                    TY_(FixNodeLinks)(child);
3306
15
                    TY_(FixNodeLinks)(element);
3307
15
                }
3308
1.14k
            }
3309
3310
5.42k
            element->closed = yes;
3311
5.42k
            TrimSpaces( doc, element );
3312
3313
5.42k
            DEBUG_LOG_EXIT;
3314
5.42k
            return NULL;
3315
5.42k
        }
3316
3317
        /* <u>...<u>  map 2nd <u> to </u> if 1st is explicit */
3318
        /* (see additional conditions below) */
3319
        /* otherwise emphasis nesting is probably unintentional */
3320
        /* big, small, sub, sup have cumulative effect to leave them alone */
3321
490k
        if ( node->type == StartTag
3322
422k
             && node->tag == element->tag
3323
279k
             && TY_(IsPushed)( doc, node )
3324
275k
             && !node->implicit
3325
26.6k
             && !element->implicit
3326
25.4k
             && node->tag && (node->tag->model & CM_INLINE)
3327
490k
             && !nodeIsA(node)
3328
490k
             && !nodeIsFONT(node)
3329
490k
             && !nodeIsBIG(node)
3330
490k
             && !nodeIsSMALL(node)
3331
490k
             && !nodeIsSUB(node)
3332
490k
             && !nodeIsSUP(node)
3333
490k
             && !nodeIsQ(node)
3334
490k
             && !nodeIsSPAN(node)
3335
1.04k
             && cfgBool(doc, TidyCoerceEndTags)
3336
490k
           )
3337
1.04k
        {
3338
            /* proceeds only if "node" does not have any attribute and
3339
               follows a text node not finishing with a space */
3340
1.04k
            if (element->content != NULL && node->attributes == NULL
3341
85
                && TY_(nodeIsText)(element->last)
3342
52
                && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) )
3343
52
            {
3344
52
                TY_(Report)(doc, element, node, COERCE_TO_ENDTAG);
3345
52
                node->type = EndTag;
3346
52
                TY_(UngetToken)(doc);
3347
52
                continue;
3348
52
            }
3349
3350
996
            if (node->attributes == NULL || element->attributes == NULL)
3351
842
                TY_(Report)(doc, element, node, NESTED_EMPHASIS);
3352
996
        }
3353
488k
        else if ( TY_(IsPushed)(doc, node) && node->type == StartTag &&
3354
282k
                  nodeIsQ(node) )
3355
1.70k
        {
3356
            /*\
3357
             * Issue #215 - such nested quotes are NOT a problem if HTML5, so
3358
             * only issue this warning if NOT HTML5 mode.
3359
            \*/
3360
1.70k
            if (TY_(HTMLVersion)(doc) != HT50)
3361
1.67k
            {
3362
1.67k
                TY_(Report)(doc, element, node, NESTED_QUOTATION);
3363
1.67k
            }
3364
1.70k
        }
3365
3366
489k
        if ( TY_(nodeIsText)(node) )
3367
16.8k
        {
3368
            /* only called for 1st child */
3369
16.8k
            if ( element->content == NULL && !(mode & Preformatted) )
3370
10.4k
                TrimSpaces( doc, element );
3371
3372
16.8k
            if ( node->start >= node->end )
3373
0
            {
3374
0
                TY_(FreeNode)( doc, node );
3375
0
                continue;
3376
0
            }
3377
3378
16.8k
            TY_(InsertNodeAtEnd)(element, node);
3379
16.8k
            continue;
3380
16.8k
        }
3381
3382
        /* mixed content model so allow text */
3383
473k
        if (InsertMisc(element, node))
3384
3.06k
            continue;
3385
3386
        /* deal with HTML tags */
3387
470k
        if ( nodeIsHTML(node) )
3388
959
        {
3389
959
            if ( TY_(nodeIsElement)(node) )
3390
959
            {
3391
959
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED );
3392
959
                TY_(FreeNode)( doc, node );
3393
959
                continue;
3394
959
            }
3395
3396
            /* otherwise infer end of inline element */
3397
0
            TY_(UngetToken)( doc );
3398
3399
0
            if (!(mode & Preformatted))
3400
0
                TrimSpaces(doc, element);
3401
3402
0
            DEBUG_LOG_EXIT;
3403
0
            return NULL;
3404
959
        }
3405
3406
        /* within <dt> or <pre> map <p> to <br> */
3407
469k
        if ( nodeIsP(node) &&
3408
9.35k
             node->type == StartTag &&
3409
9.21k
             ( (mode & Preformatted) ||
3410
9.21k
               nodeIsDT(element) ||
3411
8.39k
               DescendantOf(element, TidyTag_DT )
3412
9.21k
             )
3413
469k
           )
3414
856
        {
3415
856
            node->tag = TY_(LookupTagDef)( TidyTag_BR );
3416
856
            TidyDocFree(doc, node->element);
3417
856
            node->element = TY_(tmbstrdup)(doc->allocator, "br");
3418
856
            TrimSpaces(doc, element);
3419
856
            TY_(InsertNodeAtEnd)(element, node);
3420
856
            continue;
3421
856
        }
3422
3423
        /* <p> allowed within <address> in HTML 4.01 Transitional */
3424
468k
        if ( nodeIsP(node) &&
3425
8.50k
             node->type == StartTag &&
3426
8.35k
             nodeIsADDRESS(element) )
3427
0
        {
3428
0
            TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT );
3429
0
            TY_(InsertNodeAtEnd)(element, node);
3430
0
            (*node->tag->parser)( doc, node, mode );
3431
0
            continue;
3432
0
        }
3433
3434
        /* ignore unknown and PARAM tags */
3435
468k
        if ( node->tag == NULL || nodeIsPARAM(node) )
3436
11.8k
        {
3437
11.8k
            TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3438
11.8k
            TY_(FreeNode)( doc, node );
3439
11.8k
            continue;
3440
11.8k
        }
3441
3442
456k
        if ( nodeIsBR(node) && node->type == EndTag )
3443
429
            node->type = StartTag;
3444
3445
456k
        if ( node->type == EndTag )
3446
41.5k
        {
3447
           /* coerce </br> to <br> */
3448
41.5k
           if ( nodeIsBR(node) )
3449
0
                node->type = StartTag;
3450
41.5k
           else if ( nodeIsP(node) )
3451
115
           {
3452
               /* coerce unmatched </p> to <br><br> */
3453
115
                if ( !DescendantOf(element, TidyTag_P) )
3454
8
                {
3455
8
                    TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
3456
8
                    TrimSpaces( doc, element );
3457
8
                    TY_(InsertNodeAtEnd)( element, node );
3458
8
                    node = TY_(InferredTag)(doc, TidyTag_BR);
3459
8
                    TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */
3460
8
                    continue;
3461
8
                }
3462
115
           }
3463
41.4k
           else if ( TY_(nodeHasCM)(node, CM_INLINE)
3464
41.4k
                     && !nodeIsA(node)
3465
5.69k
                     && !TY_(nodeHasCM)(node, CM_OBJECT)
3466
1.67k
                     && TY_(nodeHasCM)(element, CM_INLINE) )
3467
1.66k
            {
3468
                /* allow any inline end tag to end current element */
3469
3470
                /* http://tidy.sf.net/issue/1426419 */
3471
                /* but, like the browser, retain an earlier inline element.
3472
                   This is implemented by setting the lexer into a mode
3473
                   where it gets tokens from the inline stack rather than
3474
                   from the input stream. Check if the scenario fits. */
3475
1.66k
                if ( !nodeIsA(element)
3476
1.22k
                     && (node->tag != element->tag)
3477
1.22k
                     && TY_(IsPushed)( doc, node )
3478
1.00k
                     && TY_(IsPushed)( doc, element ) )
3479
1.00k
                {
3480
                    /* we have something like
3481
                       <b>bold <i>bold and italic</b> italics</i> */
3482
1.00k
                    if ( TY_(SwitchInline)( doc, element, node ) )
3483
760
                    {
3484
760
                        TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG);
3485
760
                        TY_(UngetToken)( doc ); /* put this back */
3486
760
                        TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */
3487
760
                        if (!(mode & Preformatted))
3488
760
                            TrimSpaces( doc, element );
3489
3490
760
                        DEBUG_LOG_EXIT;
3491
760
                        return NULL; /* close <i>, but will re-open it, after </b> */
3492
760
                    }
3493
1.00k
                }
3494
901
                TY_(PopInline)( doc, element );
3495
3496
901
                if ( !nodeIsA(element) )
3497
466
                {
3498
466
                    if ( nodeIsA(node) && node->tag != element->tag )
3499
0
                    {
3500
0
                       TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
3501
0
                       TY_(UngetToken)( doc );
3502
0
                    }
3503
466
                    else
3504
466
                    {
3505
466
                        TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG);
3506
466
                        TY_(FreeNode)( doc, node);
3507
466
                    }
3508
3509
466
                    if (!(mode & Preformatted))
3510
466
                        TrimSpaces(doc, element);
3511
3512
466
                    DEBUG_LOG_EXIT;
3513
466
                    return NULL;
3514
466
                }
3515
3516
                /* if parent is <a> then discard unexpected inline end tag */
3517
435
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3518
435
                TY_(FreeNode)( doc, node);
3519
435
                continue;
3520
901
            }  /* special case </tr> etc. for stuff moved in front of table */
3521
39.8k
            else if ( lexer->exiled
3522
34.4k
                     && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
3523
33.3k
            {
3524
33.3k
                TY_(UngetToken)( doc );
3525
33.3k
                TrimSpaces(doc, element);
3526
3527
33.3k
                DEBUG_LOG_EXIT;
3528
33.3k
                return NULL;
3529
33.3k
            }
3530
41.5k
        }
3531
3532
        /* allow any header tag to end current header */
3533
421k
        if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) )
3534
727
        {
3535
3536
727
            if ( node->tag == element->tag )
3537
427
            {
3538
427
                TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG );
3539
427
                TY_(FreeNode)( doc, node);
3540
427
            }
3541
300
            else
3542
300
            {
3543
300
                TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE );
3544
300
                TY_(UngetToken)( doc );
3545
300
            }
3546
3547
727
            if (!(mode & Preformatted))
3548
727
                TrimSpaces(doc, element);
3549
3550
727
            DEBUG_LOG_EXIT;
3551
727
            return NULL;
3552
727
        }
3553
3554
        /*
3555
           an <A> tag ends any open <A> element
3556
           but <A href=...> is mapped to </A><A href=...>
3557
        */
3558
        /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
3559
        /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */
3560
420k
        if ( nodeIsA(node) && !node->implicit &&
3561
7.82k
             (nodeIsA(element) || DescendantOf(element, TidyTag_A)) )
3562
5.30k
        {
3563
            /* coerce <a> to </a> unless it has some attributes */
3564
            /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */
3565
            /* other fixes by Dave Raggett */
3566
            /* if (node->attributes == NULL) */
3567
5.30k
            if (node->type != EndTag && node->attributes == NULL
3568
3.16k
                && cfgBool(doc, TidyCoerceEndTags) )
3569
3.16k
            {
3570
3.16k
                node->type = EndTag;
3571
3.16k
                TY_(Report)(doc, element, node, COERCE_TO_ENDTAG);
3572
                /* TY_(PopInline)( doc, node ); */
3573
3.16k
                TY_(UngetToken)( doc );
3574
3.16k
                continue;
3575
3.16k
            }
3576
3577
2.13k
            TY_(UngetToken)( doc );
3578
2.13k
            TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
3579
            /* TY_(PopInline)( doc, element ); */
3580
3581
2.13k
            if (!(mode & Preformatted))
3582
2.13k
                TrimSpaces(doc, element);
3583
3584
2.13k
            DEBUG_LOG_EXIT;
3585
2.13k
            return NULL;
3586
5.30k
        }
3587
3588
415k
        if (element->tag->model & CM_HEADING)
3589
2.63k
        {
3590
2.63k
            if ( nodeIsCENTER(node) || nodeIsDIV(node) )
3591
511
            {
3592
511
                if (!TY_(nodeIsElement)(node))
3593
152
                {
3594
152
                    TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3595
152
                    TY_(FreeNode)( doc, node);
3596
152
                    continue;
3597
152
                }
3598
3599
359
                TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN);
3600
3601
                /* insert center as parent if heading is empty */
3602
359
                if (element->content == NULL)
3603
329
                {
3604
329
                    InsertNodeAsParent(element, node);
3605
329
                    continue;
3606
329
                }
3607
3608
                /* split heading and make center parent of 2nd part */
3609
30
                TY_(InsertNodeAfterElement)(element, node);
3610
3611
30
                if (!(mode & Preformatted))
3612
30
                    TrimSpaces(doc, element);
3613
3614
30
                element = TY_(CloneNode)( doc, element );
3615
30
                TY_(InsertNodeAtEnd)(node, element);
3616
30
                continue;
3617
359
            }
3618
3619
2.12k
            if ( nodeIsHR(node) )
3620
566
            {
3621
566
                if ( !TY_(nodeIsElement)(node) )
3622
0
                {
3623
0
                    TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3624
0
                    TY_(FreeNode)( doc, node);
3625
0
                    continue;
3626
0
                }
3627
3628
566
                TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN);
3629
3630
                /* insert hr before heading if heading is empty */
3631
566
                if (element->content == NULL)
3632
256
                {
3633
256
                    TY_(InsertNodeBeforeElement)(element, node);
3634
256
                    continue;
3635
256
                }
3636
3637
                /* split heading and insert hr before 2nd part */
3638
310
                TY_(InsertNodeAfterElement)(element, node);
3639
3640
310
                if (!(mode & Preformatted))
3641
310
                    TrimSpaces(doc, element);
3642
3643
310
                element = TY_(CloneNode)( doc, element );
3644
310
                TY_(InsertNodeAfterElement)(node, element);
3645
310
                continue;
3646
566
            }
3647
2.12k
        }
3648
3649
414k
        if ( nodeIsDT(element) )
3650
6.00k
        {
3651
6.00k
            if ( nodeIsHR(node) )
3652
521
            {
3653
521
                Node *dd;
3654
521
                if ( !TY_(nodeIsElement)(node) )
3655
0
                {
3656
0
                    TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3657
0
                    TY_(FreeNode)( doc, node);
3658
0
                    continue;
3659
0
                }
3660
3661
521
                TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN);
3662
521
                dd = TY_(InferredTag)(doc, TidyTag_DD);
3663
3664
                /* insert hr within dd before dt if dt is empty */
3665
521
                if (element->content == NULL)
3666
0
                {
3667
0
                    TY_(InsertNodeBeforeElement)(element, dd);
3668
0
                    TY_(InsertNodeAtEnd)(dd, node);
3669
0
                    continue;
3670
0
                }
3671
3672
                /* split dt and insert hr within dd before 2nd part */
3673
521
                TY_(InsertNodeAfterElement)(element, dd);
3674
521
                TY_(InsertNodeAtEnd)(dd, node);
3675
3676
521
                if (!(mode & Preformatted))
3677
521
                    TrimSpaces(doc, element);
3678
3679
521
                element = TY_(CloneNode)( doc, element );
3680
521
                TY_(InsertNodeAfterElement)(dd, element);
3681
521
                continue;
3682
521
            }
3683
6.00k
        }
3684
3685
3686
        /*
3687
          if this is the end tag for an ancestor element
3688
          then infer end tag for this element
3689
        */
3690
413k
        if (node->type == EndTag)
3691
5.47k
        {
3692
5.47k
            for (parent = element->parent;
3693
515k
                    parent != NULL; parent = parent->parent)
3694
514k
            {
3695
514k
                if (node->tag == parent->tag)
3696
4.61k
                {
3697
4.61k
                    if (!(element->tag->model & CM_OPT) && !element->implicit)
3698
1.28k
                        TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
3699
3700
4.61k
                    if( TY_(IsPushedLast)( doc, element, node ) )
3701
0
                        TY_(PopInline)( doc, element );
3702
4.61k
                    TY_(UngetToken)( doc );
3703
3704
4.61k
                    if (!(mode & Preformatted))
3705
4.61k
                        TrimSpaces(doc, element);
3706
3707
4.61k
                    DEBUG_LOG_EXIT;
3708
4.61k
                    return NULL;
3709
4.61k
                }
3710
514k
            }
3711
5.47k
        }
3712
3713
        /*\
3714
         *  block level tags end this element
3715
         *  Issue #333 - There seems an exception if the element is a 'span',
3716
         *  and the node just collected is a 'meta'. The 'meta' can not have
3717
         *  CM_INLINE added, nor can the 'span' have CM_MIXED added without
3718
         *  big consequences.
3719
         *  There may be other exceptions to be added...
3720
        \*/
3721
409k
        if (!(node->tag->model & CM_INLINE) &&
3722
120k
            !(element->tag->model & CM_MIXED) &&
3723
120k
            !(nodeIsSPAN(element) && nodeIsMETA(node)) )
3724
120k
        {
3725
120k
            if ( !TY_(nodeIsElement)(node) )
3726
602
            {
3727
602
                TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3728
602
                TY_(FreeNode)( doc, node);
3729
602
                continue;
3730
602
            }
3731
            /* HTML5 */
3732
120k
            if (nodeIsDATALIST(element)) {
3733
0
                TY_(ConstrainVersion)( doc, ~VERS_HTML5 );
3734
0
            } else
3735
120k
            if (!(element->tag->model & CM_OPT))
3736
113k
                TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE);
3737
3738
120k
            if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK))
3739
1
            {
3740
1
                MoveToHead(doc, element, node);
3741
1
                continue;
3742
1
            }
3743
3744
            /*
3745
               prevent anchors from propagating into block tags
3746
               except for headings h1 to h6
3747
            */
3748
120k
            if ( nodeIsA(element) )
3749
1.51k
            {
3750
1.51k
                if (node->tag && !(node->tag->model & CM_HEADING))
3751
979
                    TY_(PopInline)( doc, element );
3752
535
                else if (!(element->content))
3753
463
                {
3754
463
                    TY_(DiscardElement)( doc, element );
3755
463
                    TY_(UngetToken)( doc );
3756
3757
463
                    DEBUG_LOG_EXIT;
3758
463
                    return NULL;
3759
463
                }
3760
1.51k
            }
3761
3762
119k
            TY_(UngetToken)( doc );
3763
3764
119k
            if (!(mode & Preformatted))
3765
119k
                TrimSpaces(doc, element);
3766
3767
119k
            DEBUG_LOG_EXIT;
3768
119k
            return NULL;
3769
120k
        }
3770
3771
        /* parse inline element */
3772
288k
        if (TY_(nodeIsElement)(node))
3773
288k
        {
3774
288k
            if (node->implicit)
3775
256k
                TY_(Report)(doc, element, node, INSERTING_TAG);
3776
3777
            /* trim white space before <br> */
3778
288k
            if ( nodeIsBR(node) )
3779
1.91k
                TrimSpaces(doc, element);
3780
3781
288k
            TY_(InsertNodeAtEnd)(element, node);
3782
            
3783
288k
            {
3784
288k
                TidyParserMemory memory = {0};
3785
288k
                memory.identity = TY_(ParseInline);
3786
288k
                memory.original_node = element;
3787
288k
                memory.reentry_node = node;
3788
288k
                memory.mode = mode;
3789
288k
                memory.reentry_mode = mode;
3790
288k
                TY_(pushMemory)( doc, memory );
3791
288k
                DEBUG_LOG_EXIT_WITH_NODE(node);
3792
288k
                return node;
3793
288k
            }
3794
288k
        }
3795
3796
        /* discard unexpected tags */
3797
249
        TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED);
3798
249
        TY_(FreeNode)( doc, node );
3799
249
        continue;
3800
288k
    }
3801
3802
135k
    if (!(element->tag->model & CM_OPT))
3803
134k
        TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR);
3804
3805
135k
    DEBUG_LOG_EXIT;
3806
135k
    return NULL;
3807
591k
}
3808
3809
3810
/** MARK: TY_(ParseList)
3811
 *  Parses list tags.
3812
 *
3813
 *  This is a non-recursing parser. It uses the document's parser memory stack
3814
 *  to send subsequent nodes back to the controller for dispatching to parsers.
3815
 *  This parser is also re-enterable, so that post-processing can occur after
3816
 *  such dispatching.
3817
*/
3818
Node* TY_(ParseList)( TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode) )
3819
129k
{
3820
129k
    Lexer* lexer = doc->lexer;
3821
129k
    Node *node = NULL;
3822
129k
    Node *parent = NULL;
3823
129k
    Node *lastli = NULL;;
3824
129k
    Bool wasblock = no;
3825
129k
    Bool nodeisOL = nodeIsOL(list);
3826
129k
    DEBUG_LOG_COUNTERS;
3827
3828
129k
    if ( list == NULL )
3829
60.9k
    {
3830
60.9k
        TidyParserMemory memory = TY_(popMemory)( doc );
3831
60.9k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
3832
60.9k
        DEBUG_LOG_REENTER_WITH_NODE(node);
3833
60.9k
        list = memory.original_node;
3834
60.9k
        DEBUG_LOG_GET_OLD_MODE;
3835
60.9k
        mode = memory.mode;
3836
60.9k
        DEBUG_LOG_CHANGE_MODE;
3837
60.9k
    }
3838
68.2k
    else
3839
68.2k
    {
3840
68.2k
        DEBUG_LOG_ENTER_WITH_NODE(list);
3841
3842
68.2k
        if (list->tag->model & CM_EMPTY)
3843
0
        {
3844
0
            DEBUG_LOG_EXIT;
3845
0
            return NULL;
3846
0
        }
3847
68.2k
    }
3848
    
3849
129k
    lexer->insert = NULL;  /* defer implicit inline start tags */
3850
3851
134k
    while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL)
3852
82.2k
    {
3853
82.2k
        Bool foundLI = no;
3854
82.2k
        if (node->tag == list->tag && node->type == EndTag)
3855
0
        {
3856
0
            TY_(FreeNode)( doc, node);
3857
0
            list->closed = yes;
3858
0
            DEBUG_LOG_EXIT;
3859
0
            return NULL;
3860
0
        }
3861
3862
        /* deal with comments etc. */
3863
82.2k
        if (InsertMisc(list, node))
3864
143
            continue;
3865
3866
82.0k
        if (node->type != TextNode && node->tag == NULL)
3867
4.87k
        {
3868
4.87k
            TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
3869
4.87k
            TY_(FreeNode)( doc, node);
3870
4.87k
            continue;
3871
4.87k
        }
3872
77.1k
        if (lexer && (node->type == TextNode))
3873
4.15k
        {
3874
4.15k
            uint ch, ix = node->start;
3875
            /* Issue #572 - Skip whitespace. */
3876
4.15k
            while (ix < node->end && (ch = (lexer->lexbuf[ix] & 0xff))
3877
4.03k
                && (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n'))
3878
0
                ++ix;
3879
4.15k
            if (ix >= node->end)
3880
0
            {
3881
                /* Issue #572 - Discard if ALL whitespace. */
3882
0
                TY_(FreeNode)(doc, node);
3883
0
                continue;
3884
0
            }
3885
4.15k
        }
3886
3887
3888
        /*
3889
          if this is the end tag for an ancestor element
3890
          then infer end tag for this element
3891
        */
3892
77.1k
        if (node->type == EndTag)
3893
218
        {
3894
218
            if ( nodeIsFORM(node) )
3895
0
            {
3896
0
                BadForm( doc );
3897
0
                TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
3898
0
                TY_(FreeNode)( doc, node );
3899
0
                continue;
3900
0
            }
3901
3902
218
            if (TY_(nodeHasCM)(node,CM_INLINE))
3903
143
            {
3904
143
                TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
3905
143
                TY_(PopInline)( doc, node );
3906
143
                TY_(FreeNode)( doc, node);
3907
143
                continue;
3908
143
            }
3909
3910
75
            for ( parent = list->parent;
3911
888
                  parent != NULL; parent = parent->parent )
3912
888
            {
3913
               /* Do not match across BODY to avoid infinite loop
3914
                  between ParseBody and this parser,
3915
                  See http://tidy.sf.net/bug/1053626. */
3916
888
                if (nodeIsBODY(parent))
3917
39
                    break;
3918
849
                if (node->tag == parent->tag)
3919
36
                {
3920
36
                    TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE);
3921
36
                    TY_(UngetToken)( doc );
3922
36
                    DEBUG_LOG_EXIT;
3923
36
                    return NULL;
3924
36
                }
3925
849
            }
3926
3927
39
            TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED);
3928
39
            TY_(FreeNode)( doc, node);
3929
39
            continue;
3930
75
        }
3931
3932
76.9k
        if ( !nodeIsLI(node) && nodeisOL )
3933
64.2k
        {
3934
            /* Issue #572 - A <ol><li> can have nested <ol> elements */
3935
64.2k
            foundLI = FindLastLI(list, &lastli); /* find last <li> */
3936
64.2k
        }
3937
3938
76.9k
        if ( nodeIsLI(node) || (TY_(IsHTML5Mode)(doc) && !foundLI) )
3939
75.7k
        {
3940
            /* node is <LI> OR
3941
               Issue #396 - A <ul> can have Zero or more <li> elements
3942
             */
3943
75.7k
            TY_(InsertNodeAtEnd)(list,node);
3944
75.7k
        }
3945
1.23k
        else
3946
1.23k
        {
3947
1.23k
            TY_(UngetToken)( doc );
3948
3949
1.23k
            if (TY_(nodeHasCM)(node,CM_BLOCK) && lexer->excludeBlocks)
3950
310
            {
3951
310
                TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE);
3952
310
                DEBUG_LOG_EXIT;
3953
310
                return NULL;
3954
310
            }
3955
            /* http://tidy.sf.net/issue/1316307 */
3956
            /* In exiled mode, return so table processing can continue. */
3957
921
            else if ( lexer->exiled
3958
684
                      && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW)
3959
684
                          || nodeIsTABLE(node)) )
3960
243
            {
3961
243
                DEBUG_LOG_EXIT;
3962
243
                return NULL;
3963
243
            }
3964
            /* http://tidy.sf.net/issue/836462
3965
               If "list" is an unordered list, insert the next tag within
3966
               the last <li> to preserve the numbering to match the visual
3967
               rendering of most browsers. */
3968
678
            if ( nodeIsOL(list) && FindLastLI(list, &lastli) )
3969
2
            {
3970
                /* Create a node for error reporting */
3971
2
                node = TY_(InferredTag)(doc, TidyTag_LI);
3972
2
                TY_(Report)(doc, list, node, MISSING_STARTTAG );
3973
2
                TY_(FreeNode)( doc, node);
3974
2
                node = lastli;
3975
2
            }
3976
676
            else
3977
676
            {
3978
                /* Add an inferred <li> */
3979
676
                wasblock = TY_(nodeHasCM)(node,CM_BLOCK);
3980
676
                node = TY_(InferredTag)(doc, TidyTag_LI);
3981
                /* Add "display: inline" to avoid a blank line after <li> with
3982
                   Internet Explorer. See http://tidy.sf.net/issue/836462 */
3983
676
                TY_(AddStyleProperty)( doc, node,
3984
676
                                       wasblock
3985
676
                                       ? "list-style: none; display: inline"
3986
676
                                       : "list-style: none"
3987
676
                                       );
3988
676
                TY_(Report)(doc, list, node, MISSING_STARTTAG );
3989
676
                TY_(InsertNodeAtEnd)(list,node);
3990
676
            }
3991
678
        }
3992
3993
76.4k
        {
3994
76.4k
            TidyParserMemory memory = {0};
3995
76.4k
            memory.identity = TY_(ParseList);
3996
76.4k
            memory.original_node = list;
3997
76.4k
            memory.reentry_node = node;
3998
76.4k
            memory.mode = IgnoreWhitespace;
3999
76.4k
            TY_(pushMemory)( doc, memory );
4000
76.4k
            DEBUG_LOG_EXIT_WITH_NODE(node);
4001
76.4k
            return node;
4002
76.9k
        }
4003
76.9k
    }
4004
4005
52.2k
    TY_(Report)(doc, list, node, MISSING_ENDTAG_FOR);
4006
52.2k
    DEBUG_LOG_EXIT;
4007
52.2k
    return NULL;
4008
129k
}
4009
4010
4011
/** MARK: TY_(ParseNamespace)
4012
 *  Act as a generic XML (sub)tree parser: collect each node and add it
4013
 *  to the DOM, without any further validation. It's useful for tags that
4014
 *  have XML-like content, such as `svg` and `math`.
4015
 *
4016
 *  @note Perhaps this is poorly named, as we're not parsing the namespace
4017
 *    of a particular tag, but a tag with XML-like content.
4018
 *
4019
 *  @todo Add schema- or other-hierarchy-definition-based validation
4020
 *    of the subtree here.
4021
 *
4022
 *  This is a non-recursing parser. It uses the document's parser memory stack
4023
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4024
 *  This parser is also re-enterable, so that post-processing can occur after
4025
 *  such dispatching.
4026
*/
4027
Node* TY_(ParseNamespace)( TidyDocImpl* doc, Node *basenode, GetTokenMode mode )
4028
628
{
4029
628
    Lexer* lexer = doc->lexer;
4030
628
    Node *node;
4031
628
    Node *parent = basenode;
4032
628
    uint istackbase;
4033
628
    AttVal* av; /* #130 MathML attr and entity fix! */
4034
4035
    /* a la <table>: defer popping elements off the inline stack */
4036
628
    TY_(DeferDup)( doc );
4037
628
    istackbase = lexer->istackbase;
4038
628
    lexer->istackbase = lexer->istacksize;
4039
4040
628
    mode = OtherNamespace; /* Preformatted; IgnoreWhitespace; */
4041
4042
3.58k
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
4043
2.96k
    {
4044
        /*
4045
        fix check to skip action in InsertMisc for regular/empty
4046
        nodes, which we don't want here...
4047
4048
        The way we do it here is by checking and processing everything
4049
        and only what remains goes into InsertMisc()
4050
        */
4051
4052
        /* is this a close tag? And does it match the current parent node? */
4053
2.96k
        if (node->type == EndTag)
4054
705
        {
4055
            /*
4056
            to prevent end tags flowing from one 'alternate namespace' we
4057
            check this in two phases: first we check if the tag is a
4058
            descendant of the current node, and when it is, we check whether
4059
            it is the end tag for a node /within/ or /outside/ the basenode.
4060
            */
4061
705
            Bool outside;
4062
705
            Node *mp = FindMatchingDescendant(parent, node, basenode, &outside);
4063
4064
705
            if (mp != NULL)
4065
467
            {
4066
                /*
4067
                when mp != parent as we might expect,
4068
                infer end tags until we 'hit' the matched
4069
                parent or the basenode
4070
                */
4071
467
                Node *n;
4072
4073
467
                for (n = parent;
4074
1.26k
                     n != NULL && n != basenode->parent && n != mp;
4075
799
                     n = n->parent)
4076
799
                {
4077
                    /* n->implicit = yes; */
4078
799
                    n->closed = yes;
4079
799
                    TY_(Report)(doc, n->parent, n, MISSING_ENDTAG_BEFORE);
4080
799
                }
4081
4082
                /* Issue #369 - Since 'assert' is DEBUG only, and there are
4083
                   simple cases where these can be fired, removing them
4084
                   pending feedback from the original author!
4085
                   assert(outside == no ? n == mp : 1);
4086
                   assert(outside == yes ? n == basenode->parent : 1);
4087
                   =================================================== */
4088
4089
467
                if (outside == no)
4090
467
                {
4091
                    /* EndTag for a node within the basenode subtree. Roll on... */
4092
467
                    if (n)
4093
467
                        n->closed = yes;
4094
467
                    TY_(FreeNode)(doc, node);
4095
4096
467
                    node = n;
4097
467
                    parent = node ? node->parent : NULL;
4098
467
                }
4099
0
                else
4100
0
                {
4101
                    /* EndTag for a node outside the basenode subtree: let the caller handle that. */
4102
0
                    TY_(UngetToken)( doc );
4103
0
                    node = basenode;
4104
0
                    parent = node->parent;
4105
0
                }
4106
4107
                /* when we've arrived at the end-node for the base node, it's quitting time */
4108
467
                if (node == basenode)
4109
11
                {
4110
11
                    lexer->istackbase = istackbase;
4111
11
                    assert(basenode && basenode->closed == yes);
4112
11
                    return NULL;
4113
11
                }
4114
467
            }
4115
238
            else
4116
238
            {
4117
                /* unmatched close tag: report an error and discard */
4118
                /* TY_(Report)(doc, parent, node, NON_MATCHING_ENDTAG); Issue #308 - Seems wrong warning! */
4119
238
                TY_(Report)(doc, parent, node, DISCARDING_UNEXPECTED);
4120
238
                assert(parent);
4121
                /* assert(parent->tag != node->tag); Issue #308 - Seems would always be true! */
4122
238
                TY_(FreeNode)( doc, node); /* Issue #308 - Discard unexpected end tag memory */
4123
238
            }
4124
705
        }
4125
2.26k
        else if (node->type == StartTag)
4126
1.61k
        {
4127
            /* #130 MathML attr and entity fix!
4128
               Clear the dictionary link on every attribute, in case any of them 'accidentally' matches a known attribute. */
4129
1.73k
            for ( av = node->attributes; av; av = av->next )
4130
119
            {
4131
119
                av->dict = 0; /* does something need to be freed? */
4132
119
            }
4133
            /* add another child to the current parent */
4134
1.61k
            TY_(InsertNodeAtEnd)(parent, node);
4135
1.61k
            parent = node;
4136
1.61k
        }
4137
645
        else
4138
645
        {
4139
            /* #130 MathML attr and entity fix!
4140
               Clear the dictionary link on every attribute, in case any of them 'accidentally' matches a known attribute. */
4141
647
            for ( av = node->attributes; av; av = av->next )
4142
2
            {
4143
2
                av->dict = 0; /* does something need to be freed? */
4144
2
            }
4145
645
            TY_(InsertNodeAtEnd)(parent, node);
4146
645
        }
4147
2.96k
    }
4148
4149
617
    TY_(Report)(doc, basenode->parent, basenode, MISSING_ENDTAG_FOR);
4150
617
    return NULL;
4151
628
}
4152
4153
4154
/** MARK: TY_(ParseNoFrames)
4155
 *  Parses the `noframes` tag.
4156
 *
4157
 *  This is a non-recursing parser. It uses the document's parser memory stack
4158
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4159
 *  This parser is also re-enterable, so that post-processing can occur after
4160
 *  such dispatching.
4161
 */
4162
Node* TY_(ParseNoFrames)( TidyDocImpl* doc, Node *noframes, GetTokenMode mode )
4163
14.1k
{
4164
14.1k
    Lexer* lexer = doc->lexer;
4165
14.1k
    Node *node = NULL;
4166
14.1k
    Bool body_seen = no;
4167
14.1k
    DEBUG_LOG_COUNTERS;
4168
4169
14.1k
    enum parserState {
4170
14.1k
        STATE_INITIAL,                /* This is the initial state for every parser. */
4171
14.1k
        STATE_POST_NODEISBODY,        /* To-do after re-entering after checks. */
4172
14.1k
        STATE_COMPLETE,               /* Done with the switch. */
4173
14.1k
    } state = STATE_INITIAL;
4174
4175
    /*
4176
     If we're re-entering, then we need to setup from a previous state,
4177
     instead of starting fresh. We can pull what we need from the document's
4178
     stack.
4179
     */
4180
14.1k
    if ( noframes == NULL )
4181
6.20k
    {
4182
6.20k
        TidyParserMemory memory = TY_(popMemory)( doc );
4183
6.20k
        node = memory.reentry_node; /* Throwaway, because we replace it entering the loop anyway.*/
4184
6.20k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4185
6.20k
        noframes = memory.original_node;
4186
6.20k
        state = memory.reentry_state;
4187
6.20k
        body_seen = memory.register_1;
4188
6.20k
        DEBUG_LOG_GET_OLD_MODE;
4189
6.20k
        mode = memory.mode;
4190
6.20k
        DEBUG_LOG_CHANGE_MODE;
4191
6.20k
    }
4192
7.94k
    else
4193
7.94k
    {
4194
7.94k
        DEBUG_LOG_ENTER_WITH_NODE(noframes);
4195
7.94k
        if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 )
4196
7.94k
        {
4197
7.94k
            doc->badAccess |=  BA_USING_NOFRAMES;
4198
7.94k
        }
4199
7.94k
    }
4200
4201
14.1k
    mode = IgnoreWhitespace;
4202
4203
24.2k
    while ( state != STATE_COMPLETE )
4204
20.6k
    {
4205
20.6k
        if ( state == STATE_INITIAL )
4206
18.7k
        {
4207
18.7k
            node = TY_(GetToken)(doc, mode);
4208
18.7k
            DEBUG_LOG_GOT_TOKEN(node);
4209
18.7k
        }
4210
        
4211
20.6k
        switch ( state )
4212
20.6k
        {
4213
18.7k
            case STATE_INITIAL:
4214
18.7k
            {
4215
18.7k
                if ( node == NULL )
4216
3.52k
                {
4217
3.52k
                    state = STATE_COMPLETE;
4218
3.52k
                    continue;
4219
3.52k
                }
4220
                
4221
15.2k
                if ( node->tag == noframes->tag && node->type == EndTag )
4222
165
                {
4223
165
                    TY_(FreeNode)( doc, node);
4224
165
                    noframes->closed = yes;
4225
165
                    TrimSpaces(doc, noframes);
4226
165
                    DEBUG_LOG_EXIT;
4227
165
                    return NULL;
4228
165
                }
4229
4230
15.0k
                if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) )
4231
1.72k
                {
4232
1.72k
                    TrimSpaces(doc, noframes);
4233
1.72k
                    if (node->type == EndTag)
4234
276
                    {
4235
276
                        TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED);
4236
276
                        TY_(FreeNode)( doc, node);       /* Throw it away */
4237
276
                    }
4238
1.44k
                    else
4239
1.44k
                    {
4240
1.44k
                        TY_(Report)(doc, noframes, node, MISSING_ENDTAG_BEFORE);
4241
1.44k
                        TY_(UngetToken)( doc );
4242
1.44k
                    }
4243
1.72k
                    DEBUG_LOG_EXIT;
4244
1.72k
                    return NULL;
4245
1.72k
                }
4246
4247
13.3k
                if ( nodeIsHTML(node) )
4248
22
                {
4249
22
                    if (TY_(nodeIsElement)(node))
4250
22
                        TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED);
4251
4252
22
                    TY_(FreeNode)( doc, node);
4253
22
                    continue;
4254
22
                }
4255
4256
                /* deal with comments etc. */
4257
13.3k
                if (InsertMisc(noframes, node))
4258
352
                    continue;
4259
4260
12.9k
                if ( nodeIsBODY(node) && node->type == StartTag )
4261
3.88k
                {
4262
3.88k
                    TidyParserMemory memory = {0};
4263
3.88k
                    memory.identity = TY_(ParseNoFrames);
4264
3.88k
                    memory.original_node = noframes;
4265
3.88k
                    memory.reentry_node = node;
4266
3.88k
                    memory.reentry_state = STATE_POST_NODEISBODY;
4267
3.88k
                    memory.register_1 = lexer->seenEndBody;
4268
3.88k
                    memory.mode = IgnoreWhitespace;
4269
4270
3.88k
                    TY_(InsertNodeAtEnd)(noframes, node);
4271
3.88k
                    TY_(pushMemory)( doc, memory );
4272
3.88k
                    DEBUG_LOG_EXIT_WITH_NODE(node);
4273
3.88k
                    return node;
4274
3.88k
                }
4275
4276
                /* implicit body element inferred */
4277
9.07k
                if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag))
4278
4.88k
                {
4279
4.88k
                    Node *body = TY_(FindBody)( doc );
4280
4.88k
                    if ( body || lexer->seenEndBody )
4281
2.80k
                    {
4282
2.80k
                        if ( body == NULL )
4283
30
                        {
4284
30
                            TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED);
4285
30
                            TY_(FreeNode)( doc, node);
4286
30
                            continue;
4287
30
                        }
4288
2.77k
                        if ( TY_(nodeIsText)(node) )
4289
532
                        {
4290
532
                            TY_(UngetToken)( doc );
4291
532
                            node = TY_(InferredTag)(doc, TidyTag_P);
4292
532
                            TY_(Report)(doc, noframes, node, CONTENT_AFTER_BODY );
4293
532
                        }
4294
2.77k
                        TY_(InsertNodeAtEnd)( body, node );
4295
2.77k
                    }
4296
2.07k
                    else
4297
2.07k
                    {
4298
2.07k
                        TY_(UngetToken)( doc );
4299
2.07k
                        node = TY_(InferredTag)(doc, TidyTag_BODY);
4300
2.07k
                        if ( cfgBool(doc, TidyXmlOut) )
4301
1.73k
                            TY_(Report)(doc, noframes, node, INSERTING_TAG);
4302
2.07k
                        TY_(InsertNodeAtEnd)( noframes, node );
4303
2.07k
                    }
4304
4305
4.85k
                    {
4306
4.85k
                        TidyParserMemory memory = {0};
4307
4.85k
                        memory.identity = TY_(ParseNoFrames);
4308
4.85k
                        memory.original_node = noframes;
4309
4.85k
                        memory.reentry_node = node;
4310
4.85k
                        memory.mode = IgnoreWhitespace; /*MixedContent*/
4311
4.85k
                        memory.reentry_state = STATE_INITIAL;
4312
4.85k
                        TY_(pushMemory)( doc, memory );
4313
4.85k
                        DEBUG_LOG_EXIT_WITH_NODE(node);
4314
4.85k
                        return node;
4315
4.88k
                    }
4316
4.88k
                }
4317
4318
                /* discard unexpected end tags */
4319
4.18k
                TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED);
4320
4.18k
                TY_(FreeNode)( doc, node);
4321
4.18k
            } break;
4322
                
4323
                
4324
1.94k
            case STATE_POST_NODEISBODY:
4325
1.94k
            {
4326
                /* fix for bug http://tidy.sf.net/bug/887259 */
4327
1.94k
                if (body_seen && TY_(FindBody)(doc) != node)
4328
686
                {
4329
686
                    TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no);
4330
686
                    MoveNodeToBody(doc, node);
4331
686
                }
4332
1.94k
                state = STATE_INITIAL;
4333
1.94k
                continue;
4334
4335
9.07k
            } break;
4336
                
4337
                
4338
0
            default:
4339
0
                break;
4340
20.6k
        } /* switch */
4341
20.6k
    } /* while */
4342
4343
3.52k
    TY_(Report)(doc, noframes, node, MISSING_ENDTAG_FOR);
4344
3.52k
    DEBUG_LOG_EXIT;
4345
3.52k
    return NULL;
4346
14.1k
}
4347
4348
4349
/** MARK: TY_(ParseOptGroup)
4350
 *  Parses the `optgroup` tag.
4351
 *
4352
 *  This is a non-recursing parser. It uses the document's parser memory stack
4353
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4354
 *  This parser is also re-enterable, so that post-processing can occur after
4355
 *  such dispatching.
4356
 */
4357
Node* TY_(ParseOptGroup)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) )
4358
16.8k
{
4359
16.8k
    Lexer* lexer = doc->lexer;
4360
16.8k
    Node *node;
4361
16.8k
    DEBUG_LOG_COUNTERS;
4362
    
4363
16.8k
    if ( field == NULL )
4364
8.39k
    {
4365
8.39k
        TidyParserMemory memory = TY_(popMemory)( doc );
4366
8.39k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
4367
8.39k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4368
8.39k
        field = memory.original_node;
4369
8.39k
        DEBUG_LOG_GET_OLD_MODE;
4370
8.39k
        mode = memory.mode;
4371
8.39k
        DEBUG_LOG_CHANGE_MODE;
4372
8.39k
    }
4373
8.42k
    else
4374
8.42k
    {
4375
8.42k
        DEBUG_LOG_ENTER_WITH_NODE(field);
4376
8.42k
    }
4377
    
4378
16.8k
    lexer->insert = NULL;  /* defer implicit inline start tags */
4379
4380
17.9k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
4381
9.67k
    {
4382
9.67k
        if (node->tag == field->tag && node->type == EndTag)
4383
157
        {
4384
157
            TY_(FreeNode)( doc, node);
4385
157
            field->closed = yes;
4386
157
            TrimSpaces(doc, field);
4387
157
            DEBUG_LOG_EXIT;
4388
157
            return NULL;
4389
157
        }
4390
4391
        /* deal with comments etc. */
4392
9.51k
        if (InsertMisc(field, node))
4393
0
            continue;
4394
4395
9.51k
        if ( node->type == StartTag &&
4396
9.15k
             (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) )
4397
8.39k
        {
4398
8.39k
            TidyParserMemory memory = {0};
4399
4400
8.39k
            if ( nodeIsOPTGROUP(node) )
4401
8.39k
                TY_(Report)(doc, field, node, CANT_BE_NESTED);
4402
4403
8.39k
            TY_(InsertNodeAtEnd)(field, node);
4404
4405
8.39k
            memory.identity = TY_(ParseOptGroup);
4406
8.39k
            memory.original_node = field;
4407
8.39k
            memory.reentry_node = node;
4408
8.39k
            TY_(pushMemory)( doc, memory );
4409
8.39k
            DEBUG_LOG_EXIT_WITH_NODE(node);
4410
8.39k
            return node;
4411
8.39k
        }
4412
4413
        /* discard unexpected tags */
4414
1.12k
        TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED );
4415
1.12k
        TY_(FreeNode)( doc, node);
4416
1.12k
    }
4417
8.26k
    DEBUG_LOG_EXIT;
4418
8.26k
    return NULL;
4419
16.8k
}
4420
4421
4422
/** MARK: TY_(ParsePre)
4423
 *  Parses the `pre` tag.
4424
 *
4425
 *  This is a non-recursing parser. It uses the document's parser memory stack
4426
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4427
 *  This parser is also re-enterable, so that post-processing can occur after
4428
 *  such dispatching.
4429
 */
4430
Node* TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) )
4431
11.2k
{
4432
11.2k
    Node *node = NULL;
4433
11.2k
    DEBUG_LOG_COUNTERS;
4434
4435
11.2k
    enum parserState {
4436
11.2k
        STATE_INITIAL,                /* This is the initial state for every parser. */
4437
11.2k
        STATE_RENTRY_ACTION,          /* To-do after re-entering after checks. */
4438
11.2k
        STATE_COMPLETE,               /* Done with the switch. */
4439
11.2k
    } state = STATE_INITIAL;
4440
4441
4442
11.2k
    if ( pre == NULL )
4443
7.52k
    {
4444
7.52k
        TidyParserMemory memory = TY_(popMemory)( doc );
4445
7.52k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
4446
7.52k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4447
7.52k
        pre = memory.original_node;
4448
7.52k
        state = memory.reentry_state;
4449
7.52k
        DEBUG_LOG_GET_OLD_MODE;
4450
7.52k
        mode = memory.mode;
4451
7.52k
        DEBUG_LOG_CHANGE_MODE;
4452
7.52k
    }
4453
3.68k
    else
4454
3.68k
    {
4455
3.68k
        DEBUG_LOG_ENTER_WITH_NODE(pre);
4456
3.68k
        if (pre->tag->model & CM_EMPTY)
4457
0
        {
4458
0
            DEBUG_LOG_EXIT;
4459
0
            return NULL;
4460
0
        }
4461
3.68k
    }
4462
4463
11.2k
    TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */
4464
4465
24.4k
    while ( state != STATE_COMPLETE )
4466
23.5k
    {
4467
23.5k
        if ( state == STATE_INITIAL )
4468
17.0k
            node = TY_(GetToken)(doc, Preformatted);
4469
        
4470
23.5k
        switch ( state )
4471
23.5k
        {
4472
17.0k
            case STATE_INITIAL:
4473
17.0k
            {
4474
17.0k
                if ( node == NULL )
4475
861
                {
4476
861
                    state = STATE_COMPLETE;
4477
861
                    continue;
4478
861
                }
4479
                
4480
16.1k
                if ( node->type == EndTag &&
4481
250
                     (node->tag == pre->tag || DescendantOf(pre, TagId(node))) )
4482
69
                {
4483
69
                    if (nodeIsBODY(node) || nodeIsHTML(node))
4484
3
                    {
4485
3
                        TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4486
3
                        TY_(FreeNode)(doc, node);
4487
3
                        continue;
4488
3
                    }
4489
66
                    if (node->tag == pre->tag)
4490
6
                    {
4491
6
                        TY_(FreeNode)(doc, node);
4492
6
                    }
4493
60
                    else
4494
60
                    {
4495
60
                        TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE );
4496
60
                        TY_(UngetToken)( doc );
4497
60
                    }
4498
66
                    pre->closed = yes;
4499
66
                    TrimSpaces(doc, pre);
4500
66
                    DEBUG_LOG_EXIT;
4501
66
                    return NULL;
4502
69
                }
4503
4504
16.1k
                if (TY_(nodeIsText)(node))
4505
891
                {
4506
891
                    TY_(InsertNodeAtEnd)(pre, node);
4507
891
                    continue;
4508
891
                }
4509
4510
                /* deal with comments etc. */
4511
15.2k
                if (InsertMisc(pre, node))
4512
745
                    continue;
4513
4514
14.4k
                if (node->tag == NULL)
4515
3.43k
                {
4516
3.43k
                    TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4517
3.43k
                    TY_(FreeNode)(doc, node);
4518
3.43k
                    continue;
4519
3.43k
                }
4520
4521
                /* strip unexpected tags */
4522
11.0k
                if ( !PreContent(doc, node) )
4523
9.37k
                {
4524
                    /* fix for http://tidy.sf.net/bug/772205 */
4525
9.37k
                    if (node->type == EndTag)
4526
120
                    {
4527
                        /* http://tidy.sf.net/issue/1590220 */
4528
120
                       if ( doc->lexer->exiled
4529
6
                           && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) )
4530
1
                       {
4531
1
                          TY_(UngetToken)(doc);
4532
1
                          TrimSpaces(doc, pre);
4533
1
                           DEBUG_LOG_EXIT;
4534
1
                          return NULL;
4535
1
                       }
4536
4537
119
                       TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4538
119
                       TY_(FreeNode)(doc, node);
4539
119
                       continue;
4540
120
                    }
4541
                    /* http://tidy.sf.net/issue/1590220 */
4542
9.25k
                    else if (TY_(nodeHasCM)(node, CM_TABLE|CM_ROW)
4543
6.50k
                             || nodeIsTABLE(node) )
4544
2.74k
                    {
4545
2.74k
                        if (!doc->lexer->exiled)
4546
                            /* No missing close warning if exiled. */
4547
83
                            TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE);
4548
4549
2.74k
                        TY_(UngetToken)(doc);
4550
2.74k
                        DEBUG_LOG_EXIT;
4551
2.74k
                        return NULL;
4552
2.74k
                    }
4553
4554
                    /*
4555
                      This is basically what Tidy 04 August 2000 did and far more accurate
4556
                      with respect to browser behaviour than the code commented out above.
4557
                      Tidy could try to propagate the <pre> into each disallowed child where
4558
                      <pre> is allowed in order to replicate some browsers' behaviour, but
4559
                      there are a lot of exceptions, e.g. Internet Explorer does not propagate
4560
                      <pre> into table cells while Mozilla does. Opera 6 never propagates
4561
                      <pre> into blocklevel elements while Opera 7 behaves much like Mozilla.
4562
4563
                      Tidy behaves thus mostly like Opera 6 except for nested <pre> elements
4564
                      which are handled like Mozilla takes them (Opera6 closes all <pre> after
4565
                      the first </pre>).
4566
4567
                      There are similar issues like replacing <p> in <pre> with <br>, for
4568
                      example
4569
4570
                        <pre>...<p>...</pre>                 (Input)
4571
                        <pre>...<br>...</pre>                (Tidy)
4572
                        <pre>...<br>...</pre>                (Opera 7 and Internet Explorer)
4573
                        <pre>...<br><br>...</pre>            (Opera 6 and Mozilla)
4574
4575
                        <pre>...<p>...</p>...</pre>          (Input)
4576
                        <pre>...<br>......</pre>             (Tidy, BUG!)
4577
                        <pre>...<br>...<br>...</pre>         (Internet Explorer)
4578
                        <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6)
4579
                        <pre>...<br>...<br><br>...</pre>     (Opera 7)
4580
4581
                      or something similar, they could also be closing the <pre> and propagate
4582
                      the <pre> into the newly opened <p>.
4583
4584
                      Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are
4585
                      disallowed in <pre>, Tidy neither detects this nor does it perform any
4586
                      cleanup operation. Tidy should at least issue a warning if it encounters
4587
                      such constructs.
4588
4589
                      Todo: discarding </p> is obviously a bug; it should be replaced by <br>.
4590
                    */
4591
6.50k
                    TY_(InsertNodeAfterElement)(pre, node);
4592
6.50k
                    TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE);
4593
                    
4594
6.50k
                    {
4595
6.50k
                        TidyParserMemory memory = {0};
4596
6.50k
                        memory.identity = TY_(ParsePre);
4597
6.50k
                        memory.original_node = pre;
4598
6.50k
                        memory.reentry_node = node;
4599
6.50k
                        memory.reentry_state = STATE_RENTRY_ACTION;
4600
6.50k
                        TY_(pushMemory)( doc, memory );
4601
6.50k
                        DEBUG_LOG_EXIT_WITH_NODE(node);
4602
6.50k
                        return node;
4603
9.37k
                    }
4604
9.37k
                }
4605
4606
1.66k
                if ( nodeIsP(node) )
4607
611
                {
4608
611
                    if (node->type == StartTag)
4609
584
                    {
4610
584
                        TY_(Report)(doc, pre, node, USING_BR_INPLACE_OF);
4611
4612
                        /* trim white space before <p> in <pre>*/
4613
584
                        TrimSpaces(doc, pre);
4614
4615
                        /* coerce both <p> and </p> to <br> */
4616
584
                        TY_(CoerceNode)(doc, node, TidyTag_BR, no, no);
4617
584
                        TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */
4618
584
                        TY_(InsertNodeAtEnd)( pre, node );
4619
584
                    }
4620
27
                    else
4621
27
                    {
4622
27
                        TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4623
27
                        TY_(FreeNode)( doc, node);
4624
27
                    }
4625
611
                    continue;
4626
611
                }
4627
4628
1.05k
                if ( TY_(nodeIsElement)(node) )
4629
1.02k
                {
4630
                    /* trim white space before <br> */
4631
1.02k
                    if ( nodeIsBR(node) )
4632
0
                        TrimSpaces(doc, pre);
4633
4634
1.02k
                    TY_(InsertNodeAtEnd)(pre, node);
4635
                    
4636
1.02k
                    {
4637
1.02k
                        TidyParserMemory memory = {0};
4638
1.02k
                        memory.identity = TY_(ParsePre);
4639
1.02k
                        memory.original_node = pre;
4640
1.02k
                        memory.reentry_node = node;
4641
1.02k
                        memory.reentry_state = STATE_INITIAL;
4642
1.02k
                        TY_(pushMemory)( doc, memory );
4643
1.02k
                        DEBUG_LOG_EXIT_WITH_NODE(node);
4644
1.02k
                        return node;
4645
1.02k
                    }
4646
1.02k
                }
4647
4648
                /* discard unexpected tags */
4649
31
                TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED);
4650
31
                TY_(FreeNode)( doc, node);
4651
31
            } break;
4652
                
4653
6.50k
            case STATE_RENTRY_ACTION:
4654
6.50k
            {
4655
6.50k
                Node* newnode = TY_(InferredTag)(doc, TidyTag_PRE);
4656
6.50k
                TY_(Report)(doc, pre, newnode, INSERTING_TAG);
4657
6.50k
                pre = newnode;
4658
6.50k
                TY_(InsertNodeAfterElement)(node, pre);
4659
6.50k
                state = STATE_INITIAL;
4660
6.50k
                continue;
4661
1.05k
            } break;
4662
            
4663
0
            default:
4664
0
                break;
4665
4666
23.5k
        } /* switch */
4667
23.5k
    } /* while */
4668
4669
861
    TY_(Report)(doc, pre, node, MISSING_ENDTAG_FOR);
4670
861
    DEBUG_LOG_EXIT;
4671
861
    return NULL;
4672
11.2k
}
4673
4674
4675
/** MARK: TY_(ParseRow)
4676
 *  Parses a table row (the `tr` tag).
4677
 *
4678
 *  This is a non-recursing parser. It uses the document's parser memory stack
4679
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4680
 *  This parser is also re-enterable, so that post-processing can occur after
4681
 *  such dispatching.
4682
 */
4683
Node* TY_(ParseRow)( TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode) )
4684
10.3k
{
4685
10.3k
    Lexer* lexer = doc->lexer;
4686
10.3k
    Node *node = NULL;
4687
10.3k
    Bool exclude_state = no;
4688
10.3k
    DEBUG_LOG_COUNTERS;
4689
4690
10.3k
    enum parserState {
4691
10.3k
        STATE_INITIAL,                /* This is the initial state for every parser. */
4692
10.3k
        STATE_POST_NOT_ENDTAG,        /* To-do after re-entering after !EndTag checks. */
4693
10.3k
        STATE_POST_TD_TH,             /* To-do after re-entering after TD/TH checks. */
4694
10.3k
        STATE_COMPLETE,               /* Done with the switch. */
4695
10.3k
    } state = STATE_INITIAL;
4696
4697
10.3k
    if ( row == NULL )
4698
6.03k
    {
4699
6.03k
        TidyParserMemory memory = TY_(popMemory)( doc );
4700
6.03k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
4701
6.03k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4702
6.03k
        row = memory.original_node;
4703
6.03k
        state = memory.reentry_state;
4704
6.03k
        exclude_state = memory.register_1;
4705
6.03k
        DEBUG_LOG_GET_OLD_MODE;
4706
6.03k
        mode = memory.mode;
4707
6.03k
        DEBUG_LOG_CHANGE_MODE;
4708
6.03k
    }
4709
4.34k
    else
4710
4.34k
    {
4711
4.34k
        DEBUG_LOG_ENTER_WITH_NODE(row);
4712
4713
4.34k
        if (row->tag->model & CM_EMPTY)
4714
0
            return NULL;
4715
4.34k
    }
4716
4717
25.0k
    while ( state != STATE_COMPLETE )
4718
22.6k
    {
4719
22.6k
        if ( state == STATE_INITIAL )
4720
16.6k
        {
4721
16.6k
            node = TY_(GetToken)( doc, IgnoreWhitespace );
4722
16.6k
            DEBUG_LOG_GOT_TOKEN(node);
4723
16.6k
        }
4724
    
4725
22.6k
        switch (state)
4726
22.6k
        {
4727
16.6k
            case STATE_INITIAL:
4728
16.6k
            {
4729
16.6k
                if ( node == NULL)
4730
2.35k
                {
4731
2.35k
                    state = STATE_COMPLETE;
4732
2.35k
                    continue;
4733
2.35k
                }
4734
                
4735
14.3k
                if (node->tag == row->tag)
4736
1.52k
                {
4737
1.52k
                    if (node->type == EndTag)
4738
509
                    {
4739
509
                        TY_(FreeNode)( doc, node);
4740
509
                        row->closed = yes;
4741
509
                        FixEmptyRow( doc, row);
4742
509
                        DEBUG_LOG_EXIT;
4743
509
                        return NULL;
4744
509
                    }
4745
4746
                    /* New row start implies end of current row */
4747
1.01k
                    TY_(UngetToken)( doc );
4748
1.01k
                    FixEmptyRow( doc, row);
4749
1.01k
                    DEBUG_LOG_EXIT;
4750
1.01k
                    return NULL;
4751
1.52k
                }
4752
4753
                /*
4754
                  if this is the end tag for an ancestor element
4755
                  then infer end tag for this element
4756
                */
4757
12.7k
                if ( node->type == EndTag )
4758
764
                {
4759
764
                    if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node))
4760
202
                         && DescendantOf(row, TagId(node)) )
4761
202
                    {
4762
202
                        TY_(UngetToken)( doc );
4763
202
                        DEBUG_LOG_EXIT;
4764
202
                        return NULL;
4765
202
                    }
4766
4767
562
                    if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
4768
408
                    {
4769
408
                        if ( nodeIsFORM(node) )
4770
0
                            BadForm( doc );
4771
4772
408
                        TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4773
408
                        TY_(FreeNode)( doc, node);
4774
408
                        continue;
4775
408
                    }
4776
4777
154
                    if ( nodeIsTD(node) || nodeIsTH(node) )
4778
119
                    {
4779
119
                        TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4780
119
                        TY_(FreeNode)( doc, node);
4781
119
                        continue;
4782
119
                    }
4783
154
                }
4784
4785
                /* deal with comments etc. */
4786
12.0k
                if (InsertMisc(row, node))
4787
11
                    continue;
4788
4789
                /* discard unknown tags */
4790
12.0k
                if (node->tag == NULL && node->type != TextNode)
4791
1.57k
                {
4792
1.57k
                    TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4793
1.57k
                    TY_(FreeNode)( doc, node);
4794
1.57k
                    continue;
4795
1.57k
                }
4796
4797
                /* discard unexpected <table> element */
4798
10.4k
                if ( nodeIsTABLE(node) )
4799
41
                {
4800
41
                    TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4801
41
                    TY_(FreeNode)( doc, node);
4802
41
                    continue;
4803
41
                }
4804
4805
                /* THEAD, TFOOT or TBODY */
4806
10.4k
                if ( TY_(nodeHasCM)(node, CM_ROWGRP) )
4807
265
                {
4808
265
                    TY_(UngetToken)( doc );
4809
265
                    DEBUG_LOG_EXIT;
4810
265
                    return NULL;
4811
265
                }
4812
4813
10.1k
                if (node->type == EndTag)
4814
12
                {
4815
12
                    TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED);
4816
12
                    TY_(FreeNode)( doc, node);
4817
12
                    continue;
4818
12
                }
4819
4820
                /*
4821
                  if text or inline or block move before table
4822
                  if head content move to head
4823
                */
4824
4825
10.1k
                if (node->type != EndTag)
4826
10.1k
                {
4827
10.1k
                    if ( nodeIsFORM(node) )
4828
0
                    {
4829
0
                        TY_(UngetToken)( doc );
4830
0
                        node = TY_(InferredTag)(doc, TidyTag_TD);
4831
0
                        TY_(Report)(doc, row, node, MISSING_STARTTAG);
4832
0
                    }
4833
10.1k
                    else if ( TY_(nodeIsText)(node)
4834
8.32k
                              || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) )
4835
6.86k
                    {
4836
6.86k
                        MoveBeforeTable( doc, row, node );
4837
6.86k
                        TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN);
4838
6.86k
                        lexer->exiled = yes;
4839
6.86k
                        exclude_state = lexer->excludeBlocks;
4840
6.86k
                        lexer->excludeBlocks = no;
4841
4842
6.86k
                        if (node->type != TextNode)
4843
5.03k
                        {
4844
5.03k
                            TidyParserMemory memory = {0};
4845
5.03k
                            memory.identity = TY_(ParseRow);
4846
5.03k
                            memory.original_node = row;
4847
5.03k
                            memory.reentry_node = node;
4848
5.03k
                            memory.reentry_state = STATE_POST_NOT_ENDTAG;
4849
5.03k
                            memory.register_1 = exclude_state;
4850
5.03k
                            TY_(pushMemory)( doc, memory );
4851
5.03k
                            DEBUG_LOG_EXIT_WITH_NODE(node);
4852
5.03k
                            return node;
4853
5.03k
                        }
4854
                        
4855
1.82k
                        lexer->exiled = no;
4856
1.82k
                        lexer->excludeBlocks = exclude_state;
4857
1.82k
                        continue;
4858
6.86k
                    }
4859
3.29k
                    else if (node->tag->model & CM_HEAD)
4860
7
                    {
4861
7
                        TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN);
4862
7
                        MoveToHead( doc, row, node);
4863
7
                        continue;
4864
7
                    }
4865
10.1k
                }
4866
4867
3.28k
                if ( !(nodeIsTD(node) || nodeIsTH(node)) )
4868
2.29k
                {
4869
2.29k
                    TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN);
4870
2.29k
                    TY_(FreeNode)( doc, node);
4871
2.29k
                    continue;
4872
2.29k
                }
4873
4874
                /* node should be <TD> or <TH> */
4875
993
                TY_(InsertNodeAtEnd)(row, node);
4876
993
                exclude_state = lexer->excludeBlocks;
4877
993
                lexer->excludeBlocks = no;
4878
993
                {
4879
993
                    TidyParserMemory memory = {0};
4880
993
                    memory.identity = TY_(ParseRow);
4881
993
                    memory.original_node = row;
4882
993
                    memory.reentry_node = node;
4883
993
                    memory.reentry_state = STATE_POST_TD_TH;
4884
993
                    memory.register_1 = exclude_state;
4885
993
                    TY_(pushMemory)( doc, memory );
4886
993
                    DEBUG_LOG_EXIT_WITH_NODE(node);
4887
993
                    return node;
4888
3.28k
                }
4889
3.28k
            } break;
4890
                
4891
                
4892
5.03k
            case STATE_POST_NOT_ENDTAG:
4893
5.03k
            {
4894
5.03k
                lexer->exiled = no;
4895
5.03k
                lexer->excludeBlocks = exclude_state; /* capture this in stack. */
4896
5.03k
                state = STATE_INITIAL;
4897
5.03k
                continue;
4898
3.28k
            } break;
4899
                
4900
                
4901
993
            case STATE_POST_TD_TH:
4902
993
            {
4903
993
                lexer->excludeBlocks = exclude_state; /* capture this in stack. */
4904
4905
                /* pop inline stack */
4906
5.75k
                while ( lexer->istacksize > lexer->istackbase )
4907
4.76k
                    TY_(PopInline)( doc, NULL );
4908
                
4909
993
                state = STATE_INITIAL;
4910
993
                continue;
4911
3.28k
            } break;
4912
                
4913
                
4914
0
            default:
4915
0
                break;
4916
                
4917
22.6k
        } /* switch */
4918
22.6k
    } /* while */
4919
2.35k
    DEBUG_LOG_EXIT;
4920
2.35k
    return NULL;
4921
10.3k
}
4922
4923
4924
/** MARK: TY_(ParseRowGroup)
4925
 *  Parses the `rowgroup` tag.
4926
 *
4927
 *  This is a non-recursing parser. It uses the document's parser memory stack
4928
 *  to send subsequent nodes back to the controller for dispatching to parsers.
4929
 *  This parser is also re-enterable, so that post-processing can occur after
4930
 *  such dispatching.
4931
 */
4932
Node* TY_(ParseRowGroup)( TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode) )
4933
19.1k
{
4934
19.1k
    Lexer* lexer = doc->lexer;
4935
19.1k
    Node *node = NULL;
4936
19.1k
    Node *parent = NULL;
4937
19.1k
    DEBUG_LOG_COUNTERS;
4938
4939
19.1k
    enum parserState {
4940
19.1k
        STATE_INITIAL,                /* This is the initial state for every parser. */
4941
19.1k
        STATE_POST_NOT_TEXTNODE,      /* To-do after re-entering after checks. */
4942
19.1k
        STATE_COMPLETE,               /* Done with the switch. */
4943
19.1k
    } state = STATE_INITIAL;
4944
4945
19.1k
    if ( rowgroup == NULL )
4946
8.51k
    {
4947
8.51k
        TidyParserMemory memory = TY_(popMemory)( doc );
4948
8.51k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
4949
8.51k
        DEBUG_LOG_REENTER_WITH_NODE(node);
4950
8.51k
        rowgroup = memory.original_node;
4951
8.51k
        state = memory.reentry_state;
4952
8.51k
        DEBUG_LOG_GET_OLD_MODE;
4953
8.51k
        mode = memory.mode;
4954
8.51k
        DEBUG_LOG_CHANGE_MODE;
4955
8.51k
    }
4956
10.5k
    else
4957
10.5k
    {
4958
10.5k
        DEBUG_LOG_ENTER_WITH_NODE(rowgroup);
4959
10.5k
        if (rowgroup->tag->model & CM_EMPTY)
4960
0
        {
4961
0
            DEBUG_LOG_EXIT;
4962
0
            return NULL;
4963
0
        }
4964
10.5k
    }
4965
4966
35.9k
    while ( state != STATE_COMPLETE )
4967
29.3k
    {
4968
29.3k
        if ( state == STATE_INITIAL )
4969
20.9k
            node = TY_(GetToken)(doc, IgnoreWhitespace);
4970
        
4971
29.3k
        switch (state)
4972
29.3k
        {
4973
20.9k
            case STATE_INITIAL:
4974
20.9k
            {
4975
20.9k
                TidyParserMemory memory = {0};
4976
4977
20.9k
                if (node == NULL)
4978
6.54k
                {
4979
6.54k
                    state = STATE_COMPLETE;
4980
6.54k
                    continue;
4981
6.54k
                }
4982
                
4983
14.3k
                if (node->tag == rowgroup->tag)
4984
3.81k
                {
4985
3.81k
                    if (node->type == EndTag)
4986
0
                    {
4987
0
                        rowgroup->closed = yes;
4988
0
                        TY_(FreeNode)( doc, node);
4989
0
                        DEBUG_LOG_EXIT;
4990
0
                        return NULL;
4991
0
                    }
4992
4993
3.81k
                    TY_(UngetToken)( doc );
4994
3.81k
                    DEBUG_LOG_EXIT;
4995
3.81k
                    return NULL;
4996
3.81k
                }
4997
4998
                /* if </table> infer end tag */
4999
10.5k
                if ( nodeIsTABLE(node) && node->type == EndTag )
5000
230
                {
5001
230
                    TY_(UngetToken)( doc );
5002
230
                    DEBUG_LOG_EXIT;
5003
230
                    return NULL;
5004
230
                }
5005
5006
                /* deal with comments etc. */
5007
10.3k
                if (InsertMisc(rowgroup, node))
5008
123
                    continue;
5009
5010
                /* discard unknown tags */
5011
10.2k
                if (node->tag == NULL && node->type != TextNode)
5012
754
                {
5013
754
                    TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
5014
754
                    TY_(FreeNode)( doc, node);
5015
754
                    continue;
5016
754
                }
5017
5018
                /*
5019
                  if TD or TH then infer <TR>
5020
                  if text or inline or block move before table
5021
                  if head content move to head
5022
                */
5023
5024
9.46k
                if (node->type != EndTag)
5025
9.12k
                {
5026
9.12k
                    if ( nodeIsTD(node) || nodeIsTH(node) )
5027
231
                    {
5028
231
                        TY_(UngetToken)( doc );
5029
231
                        node = TY_(InferredTag)(doc, TidyTag_TR);
5030
231
                        TY_(Report)(doc, rowgroup, node, MISSING_STARTTAG);
5031
231
                    }
5032
8.89k
                    else if ( TY_(nodeIsText)(node)
5033
8.28k
                              || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
5034
8.44k
                    {
5035
8.44k
                        MoveBeforeTable( doc, rowgroup, node );
5036
8.44k
                        TY_(Report)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
5037
8.44k
                        lexer->exiled = yes;
5038
5039
8.44k
                        if (node->type != TextNode)
5040
7.83k
                        {
5041
7.83k
                            memory.identity = TY_(ParseRowGroup);
5042
7.83k
                            memory.original_node = rowgroup;
5043
7.83k
                            memory.reentry_node = node;
5044
7.83k
                            memory.reentry_state = STATE_POST_NOT_TEXTNODE;
5045
7.83k
                            TY_(pushMemory)( doc, memory );
5046
7.83k
                            DEBUG_LOG_EXIT_WITH_NODE(node);
5047
7.83k
                            return node;
5048
7.83k
                        }
5049
                        
5050
608
                        state = STATE_POST_NOT_TEXTNODE;
5051
608
                        continue;
5052
8.44k
                    }
5053
450
                    else if (node->tag->model & CM_HEAD)
5054
0
                    {
5055
0
                        TY_(Report)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN);
5056
0
                        MoveToHead(doc, rowgroup, node);
5057
0
                        continue;
5058
0
                    }
5059
9.12k
                }
5060
5061
                /*
5062
                  if this is the end tag for ancestor element
5063
                  then infer end tag for this element
5064
                */
5065
1.02k
                if (node->type == EndTag)
5066
340
                {
5067
340
                    if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
5068
37
                    {
5069
37
                        if ( nodeIsFORM(node) )
5070
4
                            BadForm( doc );
5071
5072
37
                        TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
5073
37
                        TY_(FreeNode)( doc, node);
5074
37
                        continue;
5075
37
                    }
5076
5077
303
                    if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) )
5078
303
                    {
5079
303
                        TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
5080
303
                        TY_(FreeNode)( doc, node);
5081
303
                        continue;
5082
303
                    }
5083
5084
0
                    for ( parent = rowgroup->parent;
5085
0
                          parent != NULL;
5086
0
                          parent = parent->parent )
5087
0
                    {
5088
0
                        if (node->tag == parent->tag)
5089
0
                        {
5090
0
                            TY_(UngetToken)( doc );
5091
0
                            DEBUG_LOG_EXIT;
5092
0
                            return NULL;
5093
0
                        }
5094
0
                    }
5095
0
                }
5096
5097
                /*
5098
                  if THEAD, TFOOT or TBODY then implied end tag
5099
5100
                */
5101
681
                if (node->tag->model & CM_ROWGRP)
5102
0
                {
5103
0
                    if (node->type != EndTag)
5104
0
                    {
5105
0
                        TY_(UngetToken)( doc );
5106
0
                        DEBUG_LOG_EXIT;
5107
0
                        return NULL;
5108
0
                    }
5109
0
                }
5110
5111
681
                if (node->type == EndTag)
5112
0
                {
5113
0
                    TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED);
5114
0
                    TY_(FreeNode)( doc, node);
5115
0
                    continue;
5116
0
                }
5117
5118
681
                if ( !nodeIsTR(node) )
5119
286
                {
5120
286
                    node = TY_(InferredTag)(doc, TidyTag_TR);
5121
286
                    TY_(Report)(doc, rowgroup, node, MISSING_STARTTAG);
5122
286
                    TY_(UngetToken)( doc );
5123
286
                }
5124
5125
               /* node should be <TR> */
5126
681
                TY_(InsertNodeAtEnd)(rowgroup, node);
5127
681
                memory.identity = TY_(ParseRowGroup);
5128
681
                memory.original_node = rowgroup;
5129
681
                memory.reentry_node = node;
5130
681
                memory.reentry_state = STATE_INITIAL;
5131
681
                TY_(pushMemory)( doc, memory );
5132
681
                DEBUG_LOG_EXIT_WITH_NODE(node);
5133
681
                return node;
5134
681
            } break;
5135
                
5136
                
5137
8.44k
            case STATE_POST_NOT_TEXTNODE:
5138
8.44k
            {
5139
8.44k
                lexer->exiled = no;
5140
8.44k
                state = STATE_INITIAL;
5141
8.44k
                continue;
5142
681
            } break;
5143
5144
                
5145
0
            default:
5146
0
                break;
5147
29.3k
        } /* switch */
5148
29.3k
    } /* while */
5149
6.54k
    DEBUG_LOG_EXIT;
5150
6.54k
    return NULL;
5151
19.1k
}
5152
5153
5154
/** MARK: TY_(ParseScript)
5155
 *  Parses the `script` tag.
5156
 *
5157
 *  @todo This isn't quite right for CDATA content as it recognises tags
5158
 *  within the content and parses them accordingly. This will unfortunately
5159
 *  screw up scripts which include:
5160
 *    < + letter
5161
 *    < + !
5162
 *    < + ?
5163
 *    < + / + letter
5164
 *
5165
 *  This is a non-recursing parser. It uses the document's parser memory stack
5166
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5167
 *  This parser is also re-enterable, so that post-processing can occur after
5168
 *  such dispatching.
5169
 */
5170
Node* TY_(ParseScript)( TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode) )
5171
2.02k
{
5172
2.02k
    Node *node = NULL;
5173
#if defined(ENABLE_DEBUG_LOG)
5174
    static int depth_parser = 0;
5175
    static int count_parser = 0;
5176
#endif
5177
    
5178
2.02k
    DEBUG_LOG_ENTER_WITH_NODE(script);
5179
    
5180
2.02k
    doc->lexer->parent = script;
5181
2.02k
    node = TY_(GetToken)(doc, CdataContent);
5182
2.02k
    doc->lexer->parent = NULL;
5183
5184
2.02k
    if (node)
5185
2.02k
    {
5186
2.02k
        TY_(InsertNodeAtEnd)(script, node);
5187
2.02k
    }
5188
0
    else
5189
0
    {
5190
        /* handle e.g. a document like "<script>" */
5191
0
        TY_(Report)(doc, script, NULL, MISSING_ENDTAG_FOR);
5192
0
        DEBUG_LOG_EXIT;
5193
0
        return NULL;
5194
0
    }
5195
5196
2.02k
    node = TY_(GetToken)(doc, IgnoreWhitespace);
5197
2.02k
    DEBUG_LOG_GOT_TOKEN(node);
5198
5199
2.02k
    if (!(node && node->type == EndTag && node->tag &&
5200
587
        node->tag->id == script->tag->id))
5201
1.69k
    {
5202
1.69k
        TY_(Report)(doc, script, node, MISSING_ENDTAG_FOR);
5203
5204
1.69k
        if (node)
5205
1.01k
            TY_(UngetToken)(doc);
5206
1.69k
    }
5207
321
    else
5208
321
    {
5209
321
        TY_(FreeNode)(doc, node);
5210
321
    }
5211
2.02k
    DEBUG_LOG_EXIT;
5212
2.02k
    return NULL;
5213
2.02k
}
5214
5215
5216
/** MARK: TY_(ParseSelect)
5217
 *  Parses the `select` tag.
5218
 *
5219
 *  This is a non-recursing parser. It uses the document's parser memory stack
5220
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5221
 *  This parser is also re-enterable, so that post-processing can occur after
5222
 *  such dispatching.
5223
 */
5224
Node* TY_(ParseSelect)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) )
5225
275
{
5226
275
    Lexer* lexer = doc->lexer;
5227
275
    Node *node;
5228
275
    DEBUG_LOG_COUNTERS;
5229
5230
275
    if ( field == NULL )
5231
130
    {
5232
130
        TidyParserMemory memory = TY_(popMemory)( doc );
5233
130
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
5234
130
        DEBUG_LOG_REENTER_WITH_NODE(node);
5235
130
        field = memory.original_node;
5236
130
        DEBUG_LOG_GET_OLD_MODE;
5237
130
        mode = memory.mode;
5238
130
        DEBUG_LOG_CHANGE_MODE;
5239
130
    }
5240
145
    else
5241
145
    {
5242
145
        DEBUG_LOG_ENTER_WITH_NODE(field);
5243
145
    }
5244
    
5245
275
    lexer->insert = NULL;  /* defer implicit inline start tags */
5246
5247
794
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
5248
782
    {
5249
782
        if (node->tag == field->tag && node->type == EndTag)
5250
133
        {
5251
133
            TY_(FreeNode)( doc, node);
5252
133
            field->closed = yes;
5253
133
            TrimSpaces(doc, field);
5254
5255
133
            DEBUG_LOG_EXIT;
5256
133
            return NULL;
5257
133
        }
5258
5259
        /* deal with comments etc. */
5260
649
        if (InsertMisc(field, node))
5261
0
            continue;
5262
5263
649
        if ( node->type == StartTag &&
5264
343
             ( nodeIsOPTION(node)   ||
5265
343
               nodeIsOPTGROUP(node) ||
5266
343
               nodeIsDATALIST(node) ||
5267
343
               nodeIsSCRIPT(node))
5268
649
           )
5269
130
        {
5270
130
            TidyParserMemory memory = {0};
5271
130
            memory.identity = TY_(ParseSelect);
5272
130
            memory.original_node = field;
5273
130
            memory.reentry_node = node;
5274
5275
130
            TY_(InsertNodeAtEnd)(field, node);
5276
130
            TY_(pushMemory)( doc, memory );
5277
130
            DEBUG_LOG_EXIT_WITH_NODE(node);
5278
130
            return node;
5279
130
        }
5280
5281
        /* discard unexpected tags */
5282
519
        TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED);
5283
519
        TY_(FreeNode)( doc, node);
5284
519
    }
5285
5286
12
    TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR);
5287
5288
12
    DEBUG_LOG_EXIT;
5289
12
    return NULL;
5290
275
}
5291
5292
5293
/** MARK: TY_(ParseTableTag)
5294
 *  Parses the `table` tag.
5295
 *
5296
 *  This is a non-recursing parser. It uses the document's parser memory stack
5297
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5298
 *  This parser is also re-enterable, so that post-processing can occur after
5299
 *  such dispatching.
5300
 */
5301
Node* TY_(ParseTableTag)( TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode) )
5302
15.5k
{
5303
15.5k
    Lexer* lexer = doc->lexer;
5304
15.5k
    Node *node, *parent;
5305
15.5k
    uint istackbase;
5306
15.5k
    DEBUG_LOG_COUNTERS;
5307
5308
15.5k
    if ( table == NULL )
5309
8.75k
    {
5310
8.75k
        TidyParserMemory memory = TY_(popMemory)( doc );
5311
8.75k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
5312
8.75k
        DEBUG_LOG_REENTER_WITH_NODE(node);
5313
8.75k
        table = memory.original_node;
5314
8.75k
        lexer->exiled = memory.register_1;
5315
8.75k
        DEBUG_LOG_GET_OLD_MODE;
5316
8.75k
        mode = memory.mode;
5317
8.75k
        DEBUG_LOG_CHANGE_MODE;
5318
8.75k
    }
5319
6.82k
    else
5320
6.82k
    {
5321
6.82k
        DEBUG_LOG_ENTER_WITH_NODE(table);
5322
6.82k
        TY_(DeferDup)( doc );
5323
6.82k
    }
5324
5325
15.5k
    istackbase = lexer->istackbase;
5326
15.5k
    lexer->istackbase = lexer->istacksize;
5327
5328
26.9k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
5329
21.2k
    {
5330
21.2k
        DEBUG_LOG_GOT_TOKEN(node);
5331
21.2k
        if (node->tag == table->tag )
5332
694
        {
5333
694
            if (node->type == EndTag)
5334
104
            {
5335
104
                TY_(FreeNode)(doc, node);
5336
104
            }
5337
590
            else
5338
590
            {
5339
                /* Issue #498 - If a <table> in a <table>
5340
                 * just close the current table, and issue a
5341
                 * warning. The previous action was to discard
5342
                 * this second <table>
5343
                 */
5344
590
                TY_(UngetToken)(doc);
5345
590
                TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN);
5346
590
            }
5347
694
            lexer->istackbase = istackbase;
5348
694
            table->closed = yes;
5349
5350
694
            DEBUG_LOG_EXIT;
5351
694
            return NULL;
5352
694
        }
5353
5354
        /* deal with comments etc. */
5355
20.5k
        if (InsertMisc(table, node))
5356
3.37k
            continue;
5357
5358
        /* discard unknown tags */
5359
17.1k
        if (node->tag == NULL && node->type != TextNode)
5360
2.32k
        {
5361
2.32k
            TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
5362
2.32k
            TY_(FreeNode)( doc, node);
5363
2.32k
            continue;
5364
2.32k
        }
5365
5366
        /* if TD or TH or text or inline or block then infer <TR> */
5367
5368
14.8k
        if (node->type != EndTag)
5369
14.2k
        {
5370
14.2k
            if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) )
5371
61
            {
5372
61
                TY_(UngetToken)( doc );
5373
61
                node = TY_(InferredTag)(doc, TidyTag_TR);
5374
61
                TY_(Report)(doc, table, node, MISSING_STARTTAG);
5375
61
            }
5376
14.1k
            else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) )
5377
6.03k
            {
5378
6.03k
                TY_(InsertNodeBeforeElement)(table, node);
5379
6.03k
                TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN);
5380
6.03k
                lexer->exiled = yes;
5381
5382
6.03k
                if (node->type != TextNode)
5383
576
                {
5384
576
                    TidyParserMemory memory = {0};
5385
576
                    memory.identity = TY_(ParseTableTag);
5386
576
                    memory.original_node = table;
5387
576
                    memory.reentry_node = node;
5388
576
                    memory.register_1 = no; /* later, lexer->exiled = no */
5389
576
                    memory.mode = IgnoreWhitespace;
5390
576
                    TY_(pushMemory)( doc, memory );
5391
576
                    DEBUG_LOG_EXIT_WITH_NODE(node);
5392
576
                    return node;
5393
576
                }
5394
5395
5.45k
                lexer->exiled = no;
5396
5.45k
                continue;
5397
6.03k
            }
5398
8.15k
            else if (node->tag->model & CM_HEAD)
5399
4
            {
5400
4
                MoveToHead(doc, table, node);
5401
4
                continue;
5402
4
            }
5403
14.2k
        }
5404
5405
        /*
5406
          if this is the end tag for an ancestor element
5407
          then infer end tag for this element
5408
        */
5409
8.81k
        if (node->type == EndTag)
5410
599
        {
5411
599
            if ( nodeIsFORM(node) )
5412
153
            {
5413
153
                BadForm( doc );
5414
153
                TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
5415
153
                TY_(FreeNode)( doc, node);
5416
153
                continue;
5417
153
            }
5418
5419
            /* best to discard unexpected block/inline end tags */
5420
446
            if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) ||
5421
430
                 TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) )
5422
28
            {
5423
28
                TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
5424
28
                TY_(FreeNode)( doc, node);
5425
28
                continue;
5426
28
            }
5427
5428
418
            for ( parent = table->parent;
5429
973
                  parent != NULL;
5430
555
                  parent = parent->parent )
5431
694
            {
5432
694
                if (node->tag == parent->tag)
5433
139
                {
5434
139
                    TY_(Report)(doc, table, node, MISSING_ENDTAG_BEFORE );
5435
139
                    TY_(UngetToken)( doc );
5436
139
                    lexer->istackbase = istackbase;
5437
5438
139
                    DEBUG_LOG_EXIT;
5439
139
                    return NULL;
5440
139
                }
5441
694
            }
5442
418
        }
5443
5444
8.49k
        if (!(node->tag->model & CM_TABLE))
5445
310
        {
5446
310
            TY_(UngetToken)( doc );
5447
310
            TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN);
5448
310
            lexer->istackbase = istackbase;
5449
5450
310
            DEBUG_LOG_EXIT;
5451
310
            return NULL;
5452
310
        }
5453
5454
8.18k
        if (TY_(nodeIsElement)(node))
5455
8.18k
        {
5456
8.18k
            TidyParserMemory memory = {0};
5457
8.18k
            TY_(InsertNodeAtEnd)(table, node);
5458
8.18k
            memory.identity = TY_(ParseTableTag);
5459
8.18k
            memory.original_node = table;
5460
8.18k
            memory.reentry_node = node;
5461
8.18k
            memory.register_1 = lexer->exiled;
5462
8.18k
            TY_(pushMemory)( doc, memory );
5463
8.18k
            DEBUG_LOG_EXIT_WITH_NODE(node);
5464
8.18k
            return node;
5465
8.18k
        }
5466
5467
        /* discard unexpected text nodes and end tags */
5468
0
        TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED);
5469
0
        TY_(FreeNode)( doc, node);
5470
0
    }
5471
5472
5.67k
    TY_(Report)(doc, table, node, MISSING_ENDTAG_FOR);
5473
5.67k
    lexer->istackbase = istackbase;
5474
5475
5.67k
    DEBUG_LOG_EXIT;
5476
5.67k
    return NULL;
5477
15.5k
}
5478
5479
5480
/** MARK: TY_(ParseText)
5481
 *  Parses the `option` and `textarea` tags.
5482
 *
5483
 *  This is a non-recursing parser. It uses the document's parser memory stack
5484
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5485
 *  This parser is also re-enterable, so that post-processing can occur after
5486
 *  such dispatching.
5487
 */
5488
Node* TY_(ParseText)( TidyDocImpl* doc, Node *field, GetTokenMode mode )
5489
2.48k
{
5490
2.48k
    Lexer* lexer = doc->lexer;
5491
2.48k
    Node *node;
5492
2.48k
    DEBUG_LOG_COUNTERS;
5493
    
5494
2.48k
    DEBUG_LOG_ENTER_WITH_NODE(field);
5495
5496
2.48k
    lexer->insert = NULL;  /* defer implicit inline start tags */
5497
5498
2.48k
    DEBUG_LOG_GET_OLD_MODE;
5499
2.48k
    if ( nodeIsTEXTAREA(field) )
5500
2.33k
        mode = Preformatted;
5501
151
    else
5502
151
        mode = MixedContent;  /* kludge for font tags */
5503
2.48k
    DEBUG_LOG_CHANGE_MODE;
5504
5505
4.90k
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
5506
4.69k
    {
5507
4.69k
        if (node->tag == field->tag && node->type == EndTag)
5508
0
        {
5509
0
            TY_(FreeNode)( doc, node);
5510
0
            field->closed = yes;
5511
0
            TrimSpaces(doc, field);
5512
0
            DEBUG_LOG_EXIT;
5513
0
            return NULL;
5514
0
        }
5515
5516
        /* deal with comments etc. */
5517
4.69k
        if (InsertMisc(field, node))
5518
342
            continue;
5519
5520
4.35k
        if (TY_(nodeIsText)(node))
5521
1.77k
        {
5522
            /* only called for 1st child */
5523
1.77k
            if (field->content == NULL && !(mode & Preformatted))
5524
133
                TrimSpaces(doc, field);
5525
5526
1.77k
            if (node->start >= node->end)
5527
0
            {
5528
0
                TY_(FreeNode)( doc, node);
5529
0
                continue;
5530
0
            }
5531
5532
1.77k
            TY_(InsertNodeAtEnd)(field, node);
5533
1.77k
            continue;
5534
1.77k
        }
5535
5536
        /* for textarea should all cases of < and & be escaped? */
5537
5538
        /* discard inline tags e.g. font */
5539
2.57k
        if (   node->tag
5540
2.26k
            && node->tag->model & CM_INLINE
5541
2.17k
            && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */
5542
304
        {
5543
304
            TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED);
5544
304
            TY_(FreeNode)( doc, node);
5545
304
            continue;
5546
304
        }
5547
5548
        /* terminate element on other tags */
5549
2.27k
        if (!(field->tag->model & CM_OPT))
5550
2.24k
            TY_(Report)(doc, field, node, MISSING_ENDTAG_BEFORE);
5551
5552
2.27k
        TY_(UngetToken)( doc );
5553
2.27k
        TrimSpaces(doc, field);
5554
2.27k
        DEBUG_LOG_EXIT;
5555
2.27k
        return NULL;
5556
2.57k
    }
5557
5558
214
    if (!(field->tag->model & CM_OPT))
5559
88
        TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR);
5560
214
    DEBUG_LOG_EXIT;
5561
214
    return NULL;
5562
2.48k
}
5563
5564
    
5565
/** MARK: TY_(ParseTitle)
5566
 *  Parses the `title` tag.
5567
 *
5568
 *  This is a non-recursing parser. It uses the document's parser memory stack
5569
 *  to send subsequent nodes back to the controller for dispatching to parsers.
5570
 *  This parser is also re-enterable, so that post-processing can occur after
5571
 *  such dispatching.
5572
 */
5573
Node* TY_(ParseTitle)( TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode) )
5574
203
{
5575
203
    Node *node;
5576
779
    while ((node = TY_(GetToken)(doc, MixedContent)) != NULL)
5577
754
    {
5578
754
        if (node->tag == title->tag && node->type == StartTag
5579
101
            && cfgBool(doc, TidyCoerceEndTags) )
5580
101
        {
5581
101
            TY_(Report)(doc, title, node, COERCE_TO_ENDTAG);
5582
101
            node->type = EndTag;
5583
101
            TY_(UngetToken)( doc );
5584
101
            continue;
5585
101
        }
5586
653
        else if (node->tag == title->tag && node->type == EndTag)
5587
156
        {
5588
156
            TY_(FreeNode)( doc, node);
5589
156
            title->closed = yes;
5590
156
            TrimSpaces(doc, title);
5591
156
            return NULL;
5592
156
        }
5593
5594
497
        if (TY_(nodeIsText)(node))
5595
219
        {
5596
            /* only called for 1st child */
5597
219
            if (title->content == NULL)
5598
127
                TrimInitialSpace(doc, title, node);
5599
5600
219
            if (node->start >= node->end)
5601
50
            {
5602
50
                TY_(FreeNode)( doc, node);
5603
50
                continue;
5604
50
            }
5605
5606
169
            TY_(InsertNodeAtEnd)(title, node);
5607
169
            continue;
5608
219
        }
5609
5610
        /* deal with comments etc. */
5611
278
        if (InsertMisc(title, node))
5612
33
            continue;
5613
5614
        /* discard unknown tags */
5615
245
        if (node->tag == NULL)
5616
223
        {
5617
223
            TY_(Report)(doc, title, node, DISCARDING_UNEXPECTED);
5618
223
            TY_(FreeNode)( doc, node);
5619
223
            continue;
5620
223
        }
5621
5622
        /* pushback unexpected tokens */
5623
22
        TY_(Report)(doc, title, node, MISSING_ENDTAG_BEFORE);
5624
22
        TY_(UngetToken)( doc );
5625
22
        TrimSpaces(doc, title);
5626
22
        return NULL;
5627
245
    }
5628
5629
25
    TY_(Report)(doc, title, node, MISSING_ENDTAG_FOR);
5630
25
    return NULL;
5631
203
}
5632
5633
5634
/** MARK: ParseXMLElement
5635
 *  Parses the given XML element.
5636
 */
5637
static Node* ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode)
5638
54.4k
{
5639
54.4k
    Lexer* lexer = doc->lexer;
5640
54.4k
    Node *node;
5641
5642
54.4k
    if ( element == NULL )
5643
27.0k
    {
5644
27.0k
        TidyParserMemory memory = TY_(popMemory)( doc );
5645
27.0k
        element = memory.original_node;
5646
27.0k
        node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */
5647
27.0k
        mode = memory.reentry_mode;
5648
27.0k
        TY_(InsertNodeAtEnd)(element, node); /* The only re-entry action needed. */
5649
27.0k
    }
5650
27.4k
    else
5651
27.4k
    {
5652
        /* if node is pre or has xml:space="preserve" then do so */
5653
27.4k
        if ( TY_(XMLPreserveWhiteSpace)(doc, element) )
5654
543
            mode = Preformatted;
5655
5656
        /* deal with comments etc. */
5657
27.4k
        InsertMisc( &doc->root, element);
5658
        
5659
        /* we shouldn't have plain text at this point. */
5660
27.4k
        if (TY_(nodeIsText)(element))
5661
144
        {
5662
144
            TY_(Report)(doc, &doc->root, element, DISCARDING_UNEXPECTED);
5663
144
            TY_(FreeNode)( doc, element);
5664
144
            return NULL;
5665
144
        }
5666
27.4k
    }
5667
66.4k
    while ((node = TY_(GetToken)(doc, mode)) != NULL)
5668
40.1k
    {
5669
40.1k
        if (node->type == EndTag &&
5670
1.73k
           node->element && element->element &&
5671
1.73k
           TY_(tmbstrcmp)(node->element, element->element) == 0)
5672
940
        {
5673
940
            TY_(FreeNode)( doc, node);
5674
940
            element->closed = yes;
5675
940
            break;
5676
940
        }
5677
5678
        /* discard unexpected end tags */
5679
39.1k
        if (node->type == EndTag)
5680
795
        {
5681
795
            if (element)
5682
795
                TY_(Report)(doc, element, node, UNEXPECTED_ENDTAG_IN);
5683
0
            else
5684
0
                TY_(Report)(doc, element, node, UNEXPECTED_ENDTAG_ERR);
5685
5686
795
            TY_(FreeNode)( doc, node);
5687
795
            continue;
5688
795
        }
5689
5690
        /* parse content on seeing start tag */
5691
38.3k
        if (node->type == StartTag)
5692
27.0k
        {
5693
27.0k
            TidyParserMemory memory = {0};
5694
27.0k
            memory.identity = ParseXMLElement;
5695
27.0k
            memory.original_node = element;
5696
27.0k
            memory.reentry_node = node;
5697
27.0k
            memory.reentry_mode = mode;
5698
27.0k
            TY_(pushMemory)( doc, memory );
5699
27.0k
            return node;
5700
27.0k
        }
5701
5702
11.3k
        TY_(InsertNodeAtEnd)(element, node);
5703
11.3k
    } /* while */
5704
5705
    /*
5706
     if first child is text then trim initial space and
5707
     delete text node if it is empty.
5708
    */
5709
5710
27.2k
    node = element->content;
5711
5712
27.2k
    if (TY_(nodeIsText)(node) && mode != Preformatted)
5713
5.39k
    {
5714
5.39k
        if ( lexer->lexbuf[node->start] == ' ' )
5715
980
        {
5716
980
            node->start++;
5717
5718
980
            if (node->start >= node->end)
5719
972
                TY_(DiscardElement)( doc, node );
5720
980
        }
5721
5.39k
    }
5722
5723
    /*
5724
     if last child is text then trim final space and
5725
     delete the text node if it is empty
5726
    */
5727
5728
27.2k
    node = element->last;
5729
5730
27.2k
    if (TY_(nodeIsText)(node) && mode != Preformatted)
5731
664
    {
5732
664
        if ( lexer->lexbuf[node->end - 1] == ' ' )
5733
287
        {
5734
287
            node->end--;
5735
5736
287
            if (node->start >= node->end)
5737
8
                TY_(DiscardElement)( doc, node );
5738
287
        }
5739
664
    }
5740
27.2k
    return NULL;
5741
54.3k
}
5742
5743
5744
/***************************************************************************//*
5745
 ** MARK: - Post-Parse Operations
5746
 ***************************************************************************/
5747
5748
5749
/**
5750
 *  Performs checking of all attributes recursively starting at `node`.
5751
 */
5752
static void AttributeChecks(TidyDocImpl* doc, Node* node)
5753
379k
{
5754
379k
    Node *next;
5755
5756
864k
    while (node)
5757
484k
    {
5758
484k
        next = node->next;
5759
5760
484k
        if (TY_(nodeIsElement)(node))
5761
437k
        {
5762
437k
            if (node->tag && node->tag->chkattrs) /* [i_a]2 fix crash after adding SVG support with alt/unknown tag subtree insertion there */
5763
19.4k
                node->tag->chkattrs(doc, node);
5764
418k
            else
5765
418k
                TY_(CheckAttributes)(doc, node);
5766
437k
        }
5767
5768
484k
        if (node->content)
5769
378k
            AttributeChecks(doc, node->content);
5770
5771
484k
        assert( next != node ); /* http://tidy.sf.net/issue/1603538 */
5772
484k
        node = next;
5773
484k
    }
5774
379k
}
5775
5776
5777
/**
5778
 *  Encloses naked text in certain elements within `p` tags.
5779
 *
5780
 *  <form>, <blockquote>, and <noscript> do not allow #PCDATA in
5781
 *  HTML 4.01 Strict (%block; model instead of %flow;).
5782
 */
5783
static void EncloseBlockText(TidyDocImpl* doc, Node* node)
5784
0
{
5785
0
    Node *next;
5786
0
    Node *block;
5787
5788
0
    while (node)
5789
0
    {
5790
0
        next = node->next;
5791
5792
0
        if (node->content)
5793
0
            EncloseBlockText(doc, node->content);
5794
5795
0
        if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) ||
5796
0
              nodeIsBLOCKQUOTE(node))
5797
0
            || !node->content)
5798
0
        {
5799
0
            node = next;
5800
0
            continue;
5801
0
        }
5802
5803
0
        block = node->content;
5804
5805
0
        if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) ||
5806
0
            (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block)))
5807
0
        {
5808
0
            Node* p = TY_(InferredTag)(doc, TidyTag_P);
5809
0
            TY_(InsertNodeBeforeElement)(block, p);
5810
0
            while (block &&
5811
0
                   (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block)))
5812
0
            {
5813
0
                Node* tempNext = block->next;
5814
0
                TY_(RemoveNode)(block);
5815
0
                TY_(InsertNodeAtEnd)(p, block);
5816
0
                block = tempNext;
5817
0
            }
5818
0
            TrimSpaces(doc, p);
5819
0
            continue;
5820
0
        }
5821
5822
0
        node = next;
5823
0
    }
5824
0
}
5825
5826
5827
/**
5828
 *  Encloses all naked body text within `p` tags.
5829
 */
5830
static void EncloseBodyText(TidyDocImpl* doc)
5831
0
{
5832
0
    Node* node;
5833
0
    Node* body = TY_(FindBody)(doc);
5834
5835
0
    if (!body)
5836
0
        return;
5837
5838
0
    node = body->content;
5839
5840
0
    while (node)
5841
0
    {
5842
0
        if ((TY_(nodeIsText)(node) && !TY_(IsBlank)(doc->lexer, node)) ||
5843
0
            (TY_(nodeIsElement)(node) && nodeCMIsOnlyInline(node)))
5844
0
        {
5845
0
            Node* p = TY_(InferredTag)(doc, TidyTag_P);
5846
0
            TY_(InsertNodeBeforeElement)(node, p);
5847
0
            while (node && (!TY_(nodeIsElement)(node) || nodeCMIsOnlyInline(node)))
5848
0
            {
5849
0
                Node* next = node->next;
5850
0
                TY_(RemoveNode)(node);
5851
0
                TY_(InsertNodeAtEnd)(p, node);
5852
0
                node = next;
5853
0
            }
5854
0
            TrimSpaces(doc, p);
5855
0
            continue;
5856
0
        }
5857
0
        node = node->next;
5858
0
    }
5859
0
}
5860
5861
5862
/**
5863
 *  Replaces elements that are obsolete with appropriate substitute tags.
5864
 */
5865
static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node)
5866
379k
{
5867
379k
    Node *next;
5868
5869
864k
    while (node)
5870
484k
    {
5871
484k
        next = node->next;
5872
5873
        /* if (nodeIsDIR(node) || nodeIsMENU(node)) */
5874
        /* HTML5 - <menu ... > is no longer obsolete */
5875
484k
        if (nodeIsDIR(node))
5876
33
            TY_(CoerceNode)(doc, node, TidyTag_UL, yes, yes);
5877
5878
484k
        if (nodeIsXMP(node) || nodeIsLISTING(node) ||
5879
484k
            (node->tag && node->tag->id == TidyTag_PLAINTEXT))
5880
253
            TY_(CoerceNode)(doc, node, TidyTag_PRE, yes, yes);
5881
5882
484k
        if (node->content)
5883
378k
            ReplaceObsoleteElements(doc, node->content);
5884
5885
484k
        node = next;
5886
484k
    }
5887
379k
}
5888
5889
5890
/***************************************************************************//*
5891
 ** MARK: - Internal API Implementation
5892
 ***************************************************************************/
5893
5894
5895
/** MARK: TY_(CheckNodeIntegrity)
5896
 *  Is used to perform a node integrity check after parsing an HTML or XML
5897
 *  document.
5898
 *  @note Actual performance of this check can be disabled by defining the
5899
 *  macro NO_NODE_INTEGRITY_CHECK.
5900
 */
5901
Bool TY_(CheckNodeIntegrity)(Node *node)
5902
408k
{
5903
408k
#ifndef NO_NODE_INTEGRITY_CHECK
5904
408k
    Node *child;
5905
5906
408k
    if (node->prev)
5907
76.3k
    {
5908
76.3k
        if (node->prev->next != node)
5909
0
            return no;
5910
76.3k
    }
5911
5912
408k
    if (node->next)
5913
76.3k
    {
5914
76.3k
        if (node->next == node || node->next->prev != node)
5915
0
            return no;
5916
76.3k
    }
5917
5918
408k
    if (node->parent)
5919
408k
    {
5920
408k
        if (node->prev == NULL && node->parent->content != node)
5921
0
            return no;
5922
5923
408k
        if (node->next == NULL && node->parent->last != node)
5924
0
            return no;
5925
408k
    }
5926
5927
816k
    for (child = node->content; child; child = child->next)
5928
408k
        if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) )
5929
0
            return no;
5930
5931
408k
#endif
5932
408k
    return yes;
5933
408k
}
5934
5935
5936
/** MARK: TY_(CoerceNode)
5937
 *  Transforms a given node to another element, for example, from a <p>
5938
 *  to a <br>.
5939
 */
5940
void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected)
5941
1.56k
{
5942
1.56k
    const Dict* tag = TY_(LookupTagDef)(tid);
5943
1.56k
    Node* tmp = TY_(InferredTag)(doc, tag->id);
5944
5945
1.56k
    if (obsolete)
5946
286
        TY_(Report)(doc, node, tmp, OBSOLETE_ELEMENT);
5947
1.27k
    else if (unexpected)
5948
0
        TY_(Report)(doc, node, tmp, REPLACING_UNEX_ELEMENT);
5949
1.27k
    else
5950
1.27k
        TY_(Report)(doc, node, tmp, REPLACING_ELEMENT);
5951
5952
1.56k
    TidyDocFree(doc, tmp->element);
5953
1.56k
    TidyDocFree(doc, tmp);
5954
5955
1.56k
    node->was = node->tag;
5956
1.56k
    node->tag = tag;
5957
1.56k
    node->type = StartTag;
5958
1.56k
    node->implicit = yes;
5959
1.56k
    TidyDocFree(doc, node->element);
5960
1.56k
    node->element = TY_(tmbstrdup)(doc->allocator, tag->name);
5961
1.56k
}
5962
5963
5964
/** MARK: TY_(DiscardElement)
5965
 *  Remove node from markup tree and discard it.
5966
 */
5967
Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element )
5968
113k
{
5969
113k
    Node *next = NULL;
5970
5971
113k
    if (element)
5972
113k
    {
5973
113k
        next = element->next;
5974
113k
        TY_(RemoveNode)(element);
5975
113k
        TY_(FreeNode)( doc, element);
5976
113k
    }
5977
5978
113k
    return next;
5979
113k
}
5980
5981
5982
/** MARK: TY_(DropEmptyElements)
5983
 *  Trims a tree of empty elements recursively, returning the next node.
5984
 */
5985
Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node)
5986
379k
{
5987
379k
    Node* next;
5988
5989
864k
    while (node)
5990
484k
    {
5991
484k
        next = node->next;
5992
5993
484k
        if (node->content)
5994
378k
            TY_(DropEmptyElements)(doc, node->content);
5995
5996
484k
        if (!TY_(nodeIsElement)(node) &&
5997
47.4k
            !(TY_(nodeIsText)(node) && !(node->start < node->end)))
5998
46.8k
        {
5999
46.8k
            node = next;
6000
46.8k
            continue;
6001
46.8k
        }
6002
6003
438k
        next = TY_(TrimEmptyElement)(doc, node);
6004
438k
        node = next;
6005
438k
    }
6006
6007
379k
    return node;
6008
379k
}
6009
6010
6011
/** MARK: TY_(InsertNodeAtStart)
6012
 *  Insert node into markup tree as the first element of content of element.
6013
 */
6014
void TY_(InsertNodeAtStart)(Node *element, Node *node)
6015
452
{
6016
452
    node->parent = element;
6017
6018
452
    if (element->content == NULL)
6019
57
        element->last = node;
6020
395
    else
6021
395
        element->content->prev = node;
6022
6023
452
    node->next = element->content;
6024
452
    node->prev = NULL;
6025
452
    element->content = node;
6026
452
}
6027
6028
6029
/** MARK: TY_(InsertNodeAtEnd)
6030
 *  Insert node into markup tree as the last element of content of element.
6031
 */
6032
void TY_(InsertNodeAtEnd)(Node *element, Node *node)
6033
595k
{
6034
595k
    node->parent = element;
6035
595k
    node->prev = element ? element->last : NULL;
6036
6037
595k
    if (element && element->last != NULL)
6038
110k
        element->last->next = node;
6039
484k
    else
6040
484k
        if (element)
6041
484k
            element->content = node;
6042
6043
595k
    if (element)
6044
595k
        element->last = node;
6045
595k
}
6046
6047
6048
/** MARK: TY_(InsertNodeBeforeElement)
6049
 *  Insert node into markup tree before element.
6050
 */
6051
void TY_(InsertNodeBeforeElement)(Node *element, Node *node)
6052
22.9k
{
6053
22.9k
    Node *parent;
6054
6055
22.9k
    parent = element ? element->parent : NULL;
6056
22.9k
    node->parent = parent;
6057
22.9k
    node->next = element;
6058
22.9k
    node->prev = element ? element->prev : NULL;
6059
22.9k
    if (element)
6060
22.7k
        element->prev = node;
6061
6062
22.9k
    if (node->prev)
6063
14.1k
        node->prev->next = node;
6064
6065
22.9k
    if (parent && parent->content == element)
6066
8.38k
        parent->content = node;
6067
22.9k
}
6068
6069
6070
/** MARK: TY_(InsertNodeAfterElement)
6071
 *  Insert node into markup tree after element.
6072
 */
6073
void TY_(InsertNodeAfterElement)(Node *element, Node *node)
6074
18.2k
{
6075
18.2k
    Node *parent;
6076
6077
18.2k
    parent = element->parent;
6078
18.2k
    node->parent = parent;
6079
6080
    /* AQ - 13 Jan 2000 fix for parent == NULL */
6081
18.2k
    if (parent != NULL && parent->last == element)
6082
3.33k
        parent->last = node;
6083
14.8k
    else
6084
14.8k
    {
6085
14.8k
        node->next = element->next;
6086
        /* AQ - 13 Jan 2000 fix for node->next == NULL */
6087
14.8k
        if (node->next != NULL)
6088
12.5k
            node->next->prev = node;
6089
14.8k
    }
6090
6091
18.2k
    element->next = node;
6092
18.2k
    node->prev = element;
6093
18.2k
}
6094
6095
6096
/** MARK: TY_(IsBlank)
6097
 *  Indicates whether or not a text node is blank, meaning that it consists
6098
 *  of nothing, or a single space.
6099
 */
6100
Bool TY_(IsBlank)(Lexer *lexer, Node *node)
6101
175
{
6102
175
    Bool isBlank = TY_(nodeIsText)(node);
6103
175
    if ( isBlank )
6104
0
        isBlank = ( node->end == node->start ||       /* Zero length */
6105
0
                   ( node->end == node->start+1      /* or one blank. */
6106
0
                    && lexer->lexbuf[node->start] == ' ' ) );
6107
    
6108
175
    return isBlank;
6109
175
}
6110
6111
6112
/** MARK: TY_(IsJavaScript)
6113
 *  Indicates whether or not a node is declared as containing javascript
6114
 *  code.
6115
 */
6116
Bool TY_(IsJavaScript)(Node *node)
6117
100
{
6118
100
    Bool result = no;
6119
100
    AttVal *attr;
6120
6121
100
    if (node->attributes == NULL)
6122
65
        return yes;
6123
6124
102
    for (attr = node->attributes; attr; attr = attr->next)
6125
67
    {
6126
67
        if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr))
6127
31
             && AttrContains(attr, "javascript") )
6128
0
        {
6129
0
            result = yes;
6130
0
            break;
6131
0
        }
6132
67
    }
6133
6134
35
    return result;
6135
100
}
6136
6137
6138
/** MARK: TY_(IsNewNode)
6139
 *  Used to check if a node uses CM_NEW, which determines how attributes
6140
 *  without values should be printed. This was introduced to deal with
6141
 *  user-defined tags e.g. ColdFusion.
6142
 */
6143
Bool TY_(IsNewNode)(Node *node)
6144
0
{
6145
0
    if (node && node->tag)
6146
0
    {
6147
0
        return (node->tag->model & CM_NEW);
6148
0
    }
6149
0
    return yes;
6150
0
}
6151
6152
6153
/** MARK: TY_(RemoveNode)
6154
 *  Extract a node and its children from a markup tree
6155
 */
6156
Node *TY_(RemoveNode)(Node *node)
6157
116k
{
6158
116k
    if (node->prev)
6159
20.9k
        node->prev->next = node->next;
6160
6161
116k
    if (node->next)
6162
39.4k
        node->next->prev = node->prev;
6163
6164
116k
    if (node->parent)
6165
115k
    {
6166
115k
        if (node->parent->content == node)
6167
94.5k
            node->parent->content = node->next;
6168
6169
115k
        if (node->parent->last == node)
6170
76.0k
            node->parent->last = node->prev;
6171
115k
    }
6172
6173
116k
    node->parent = node->prev = node->next = NULL;
6174
116k
    return node;
6175
116k
}
6176
6177
6178
/** MARK: TY_(TrimEmptyElement)
6179
 *  Trims a single, empty element, returning the next node.
6180
 */
6181
Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element )
6182
438k
{
6183
438k
    if ( CanPrune(doc, element) )
6184
111k
    {
6185
111k
        if (element->type != TextNode)
6186
111k
        {
6187
111k
            doc->footnotes |= FN_TRIM_EMPTY_ELEMENT;
6188
111k
            TY_(Report)(doc, element, NULL, TRIM_EMPTY_ELEMENT);
6189
111k
        }
6190
6191
111k
        return TY_(DiscardElement)(doc, element);
6192
111k
    }
6193
326k
    return element->next;
6194
438k
}
6195
6196
6197
/** MARK: TY_(XMLPreserveWhiteSpace)
6198
 *  Indicates whether or not whitespace is to be preserved in XHTML/XML
6199
 *  documents.
6200
 */
6201
Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element)
6202
27.4k
{
6203
27.4k
    AttVal *attribute;
6204
6205
    /* search attributes for xml:space */
6206
30.0k
    for (attribute = element->attributes; attribute; attribute = attribute->next)
6207
3.29k
    {
6208
3.29k
        if (attrIsXML_SPACE(attribute))
6209
667
        {
6210
667
            if (AttrValueIs(attribute, "preserve"))
6211
55
                return yes;
6212
6213
612
            return no;
6214
667
        }
6215
3.29k
    }
6216
6217
26.7k
    if (element->element == NULL)
6218
144
        return no;
6219
        
6220
    /* kludge for html docs without explicit xml:space attribute */
6221
26.5k
    if (nodeIsPRE(element)    ||
6222
26.5k
        nodeIsSCRIPT(element) ||
6223
26.5k
        nodeIsSTYLE(element)  ||
6224
26.5k
        TY_(FindParser)(doc, element) == TY_(ParsePre))
6225
488
        return yes;
6226
6227
    /* kludge for XSL docs */
6228
26.1k
    if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 )
6229
0
        return yes;
6230
6231
26.1k
    return no;
6232
26.1k
}
6233
6234
6235
/***************************************************************************//*
6236
 ** MARK: - Internal API Implementation - Main Parsers
6237
 ***************************************************************************/
6238
6239
6240
/** MARK: TY_(ParseDocument)
6241
 *  Parses an HTML document after lexing. It begins by properly configuring
6242
 *  the overall HTML structure, and subsequently processes all remaining
6243
 *  nodes.
6244
 */
6245
void TY_(ParseDocument)(TidyDocImpl* doc)
6246
352
{
6247
352
    Node *node, *html, *doctype = NULL;
6248
6249
1.08k
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
6250
1.08k
    {
6251
1.08k
        if (node->type == XmlDecl)
6252
30
        {
6253
30
            doc->xmlDetected = yes;
6254
6255
30
            if (TY_(FindXmlDecl)(doc) && doc->root.content)
6256
29
            {
6257
29
                TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6258
29
                TY_(FreeNode)(doc, node);
6259
29
                continue;
6260
29
            }
6261
1
            if (node->line > 1 || node->column != 1)
6262
1
            {
6263
1
                TY_(Report)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL);
6264
1
            }
6265
1
        }
6266
6267
        /* deal with comments etc. */
6268
1.05k
        if (InsertMisc( &doc->root, node ))
6269
685
            continue;
6270
6271
370
        if (node->type == DocTypeTag)
6272
11
        {
6273
11
            if (doctype == NULL)
6274
11
            {
6275
11
                TY_(InsertNodeAtEnd)( &doc->root, node);
6276
11
                doctype = node;
6277
11
            }
6278
0
            else
6279
0
            {
6280
0
                TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6281
0
                TY_(FreeNode)( doc, node);
6282
0
            }
6283
11
            continue;
6284
11
        }
6285
6286
359
        if (node->type == EndTag)
6287
9
        {
6288
9
            TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6289
9
            TY_(FreeNode)( doc, node);
6290
9
            continue;
6291
9
        }
6292
6293
350
        if (node->type == StartTag && nodeIsHTML(node))
6294
2
        {
6295
2
            AttVal *xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS);
6296
6297
2
            if (AttrValueIs(xmlns, XHTML_NAMESPACE))
6298
2
            {
6299
2
                Bool htmlOut = cfgBool( doc, TidyHtmlOut );
6300
2
                doc->lexer->isvoyager = yes;                  /* Unless plain HTML */
6301
2
                TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/
6302
2
                TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut );   /* will be XHTML. */
6303
6304
                /* adjust other config options, just as in config.c */
6305
2
                if ( !htmlOut )
6306
0
                {
6307
0
                    TY_(SetOptionBool)( doc, TidyUpperCaseTags, no );
6308
0
                    TY_(SetOptionInt)( doc, TidyUpperCaseAttrs, no );
6309
0
                }
6310
2
            }
6311
2
        }
6312
6313
350
        if ( node->type != StartTag || !nodeIsHTML(node) )
6314
348
        {
6315
348
            TY_(UngetToken)( doc );
6316
348
            html = TY_(InferredTag)(doc, TidyTag_HTML);
6317
348
        }
6318
2
        else
6319
2
            html = node;
6320
6321
        /*\
6322
         *  #72, avoid MISSING_DOCTYPE if show-body-only.
6323
         *  #191, also if --doctype omit, that is TidyDoctypeOmit
6324
         *  #342, adjust tags to html4-- if not 'auto' or 'html5'
6325
        \*/
6326
350
        if (!TY_(FindDocType)(doc))
6327
339
        {
6328
339
            ulong dtmode = cfg( doc, TidyDoctypeMode );
6329
339
            if ((dtmode != TidyDoctypeOmit) && !showingBodyOnly(doc))
6330
339
                TY_(Report)(doc, NULL, NULL, MISSING_DOCTYPE);
6331
339
            if ((dtmode != TidyDoctypeAuto) && (dtmode != TidyDoctypeHtml5))
6332
0
            {
6333
                /*\
6334
                 *  Issue #342 - if not doctype 'auto', or 'html5'
6335
                 *  then reset mode htm4-- parsing
6336
                \*/
6337
0
                TY_(AdjustTags)(doc); /* Dynamically modify the tags table to html4-- mode */
6338
0
            }
6339
339
        }
6340
350
        TY_(InsertNodeAtEnd)( &doc->root, html);
6341
350
        ParseHTMLWithNode( doc, html );
6342
350
        break;
6343
359
    }
6344
6345
    /* do this before any more document fixes */
6346
352
    if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 )
6347
0
        TY_(AccessibilityChecks)( doc );
6348
6349
352
    if (!TY_(FindHTML)(doc))
6350
2
    {
6351
        /* a later check should complain if <body> is empty */
6352
2
        html = TY_(InferredTag)(doc, TidyTag_HTML);
6353
2
        TY_(InsertNodeAtEnd)( &doc->root, html);
6354
2
        ParseHTMLWithNode( doc, html );
6355
2
    }
6356
6357
352
    node = TY_(FindTITLE)(doc);
6358
352
    if (!node)
6359
343
    {
6360
343
        Node* head = TY_(FindHEAD)(doc);
6361
        /* #72, avoid MISSING_TITLE_ELEMENT if show-body-only (but allow InsertNodeAtEnd to avoid new warning) */
6362
686
        if (!showingBodyOnly(doc))
6363
343
        {
6364
343
            TY_(Report)(doc, head, NULL, MISSING_TITLE_ELEMENT);
6365
343
        }
6366
343
        TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE));
6367
343
    }
6368
9
    else if (!node->content && !showingBodyOnly(doc))
6369
6
    {
6370
        /* Is #839 - warn node is blank in HTML5 */
6371
6
        if (TY_(IsHTML5Mode)(doc))
6372
3
        {
6373
3
            TY_(Report)(doc, node, NULL, BLANK_TITLE_ELEMENT);
6374
3
        }
6375
6
    }
6376
6377
352
    AttributeChecks(doc, &doc->root);
6378
352
    ReplaceObsoleteElements(doc, &doc->root);
6379
352
    TY_(DropEmptyElements)(doc, &doc->root);
6380
352
    CleanSpaces(doc, &doc->root);
6381
6382
352
    if (cfgBool(doc, TidyEncloseBodyText))
6383
0
        EncloseBodyText(doc);
6384
352
    if (cfgBool(doc, TidyEncloseBlockText))
6385
0
        EncloseBlockText(doc, &doc->root);
6386
352
}
6387
6388
6389
/** MARK: TY_(ParseXMLDocument)
6390
 *  Parses the document using Tidy's XML parser.
6391
 */
6392
void TY_(ParseXMLDocument)(TidyDocImpl* doc)
6393
50
{
6394
50
    Node *node, *doctype = NULL;
6395
6396
50
    TY_(SetOptionBool)( doc, TidyXmlTags, yes );
6397
6398
50
    doc->xmlDetected = yes;
6399
6400
195
    while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL)
6401
145
    {
6402
        /* discard unexpected end tags */
6403
145
        if (node->type == EndTag)
6404
0
        {
6405
0
            TY_(Report)(doc, NULL, node, UNEXPECTED_ENDTAG);
6406
0
            TY_(FreeNode)( doc, node);
6407
0
            continue;
6408
0
        }
6409
6410
         /* deal with comments etc. */
6411
145
        if (InsertMisc( &doc->root, node))
6412
5
            continue;
6413
6414
140
        if (node->type == DocTypeTag)
6415
0
        {
6416
0
            if (doctype == NULL)
6417
0
            {
6418
0
                TY_(InsertNodeAtEnd)( &doc->root, node);
6419
0
                doctype = node;
6420
0
            }
6421
0
            else
6422
0
            {
6423
0
                TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6424
0
                TY_(FreeNode)( doc, node);
6425
0
            }
6426
0
            continue;
6427
0
        }
6428
6429
140
        if (node->type == StartEndTag)
6430
1
        {
6431
1
            TY_(InsertNodeAtEnd)( &doc->root, node);
6432
1
            continue;
6433
1
        }
6434
6435
       /* if start tag then parse element's content */
6436
139
        if (node->type == StartTag)
6437
61
        {
6438
61
            TY_(InsertNodeAtEnd)( &doc->root, node );
6439
61
            ParseHTMLWithNode( doc, node );
6440
61
            continue;
6441
61
        }
6442
6443
78
        TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED);
6444
78
        TY_(FreeNode)( doc, node);
6445
78
    }
6446
6447
    /* ensure presence of initial <?xml version="1.0"?> */
6448
50
    if ( cfgBool(doc, TidyXmlDecl) )
6449
0
        TY_(FixXmlDecl)( doc );
6450
50
}