Coverage Report

Created: 2026-02-26 06:20

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tidy-html5/src/clean.c
Line
Count
Source
1
/*
2
  clean.c -- clean up misuse of presentation markup
3
4
  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5
  See tidy.h for the copyright notice.
6
7
  Filters from other formats such as Microsoft Word
8
  often make excessive use of presentation markup such
9
  as font tags, B, I, and the align attribute. By applying
10
  a set of production rules, it is straight forward to
11
  transform this to use CSS.
12
13
  Some rules replace some of the children of an element by
14
  style properties on the element, e.g.
15
16
  <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
17
18
  Such rules are applied to the element's content and then
19
  to the element itself until none of the rules more apply.
20
  Having applied all the rules to an element, it will have
21
  a style attribute with one or more properties. 
22
23
  Other rules strip the element they apply to, replacing
24
  it by style properties on the contents, e.g.
25
  
26
  <dir><li><p>...</li></dir> -> <p style="margin-left 1em">...
27
      
28
  These rules are applied to an element before processing
29
  its content and replace the current element by the first
30
  element in the exposed content.
31
32
  After applying both sets of rules, you can replace the
33
  style attribute by a class value and style rule in the
34
  document head. To support this, an association of styles
35
  and class names is built.
36
37
  A naive approach is to rely on string matching to test
38
  when two property lists are the same. A better approach
39
  would be to first sort the properties before matching.
40
41
*/
42
43
#include <stdio.h>
44
#include <stdlib.h>
45
#include <string.h>
46
47
#include "tidy-int.h"
48
#include "clean.h"
49
#include "lexer.h"
50
#include "parser.h"
51
#include "attrs.h"
52
#include "message.h"
53
#include "tmbstr.h"
54
#include "utf8.h"
55
56
static Node* CleanNode( TidyDocImpl* doc, Node *node );
57
58
static void RenameElem( TidyDocImpl* doc, Node* node, TidyTagId tid )
59
12.9k
{
60
12.9k
    const Dict* dict = TY_(LookupTagDef)( tid );
61
12.9k
    TidyDocFree( doc, node->element );
62
12.9k
    node->element = TY_(tmbstrdup)( doc->allocator, dict->name );
63
12.9k
    node->tag = dict;
64
12.9k
}
65
66
static void FreeStyleProps(TidyDocImpl* doc, StyleProp *props)
67
1.44k
{
68
1.44k
    StyleProp *next;
69
70
7.19k
    while (props)
71
5.75k
    {
72
5.75k
        next = props->next;
73
5.75k
        TidyDocFree(doc, props->name);
74
5.75k
        TidyDocFree(doc, props->value);
75
5.75k
        TidyDocFree(doc, props);
76
5.75k
        props = next;
77
5.75k
    }
78
1.44k
}
79
80
static StyleProp *InsertProperty( TidyDocImpl* doc, StyleProp* props, ctmbstr name, ctmbstr value )
81
14.8k
{
82
14.8k
    StyleProp *first, *prev, *prop;
83
14.8k
    int cmp;
84
85
14.8k
    prev = NULL;
86
14.8k
    first = props;
87
88
58.1k
    while (props)
89
54.9k
    {
90
54.9k
        cmp = TY_(tmbstrcmp)(props->name, name);
91
92
54.9k
        if (cmp == 0)
93
9.07k
        {
94
            /* this property is already defined, ignore new value */
95
9.07k
            return first;
96
9.07k
        }
97
98
45.9k
        if (cmp > 0)
99
2.54k
        {
100
            /* insert before this */
101
102
2.54k
            prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
103
2.54k
            prop->name = TY_(tmbstrdup)(doc->allocator, name);
104
2.54k
            prop->value = TY_(tmbstrdup)(doc->allocator, value);
105
2.54k
            prop->next = props;
106
107
2.54k
            if (prev)
108
1.23k
                prev->next = prop;
109
1.31k
            else
110
1.31k
                first = prop;
111
112
2.54k
            return first;
113
2.54k
        }
114
115
43.3k
        prev = props;
116
43.3k
        props = props->next;
117
43.3k
    }
118
119
3.20k
    prop = (StyleProp *)TidyDocAlloc(doc, sizeof(StyleProp));
120
3.20k
    prop->name = TY_(tmbstrdup)(doc->allocator, name);
121
3.20k
    prop->value = TY_(tmbstrdup)(doc->allocator, value);
122
3.20k
    prop->next = NULL;
123
124
3.20k
    if (prev)
125
1.76k
        prev->next = prop;
126
1.44k
    else
127
1.44k
        first = prop;
128
129
3.20k
    return first;
130
14.8k
}
131
132
/*
133
 Create sorted linked list of properties from style string
134
 It temporarily places nulls in place of ':' and ';' to
135
 delimit the strings for the property name and value.
136
 Some systems don't allow you to NULL literal strings,
137
 so to avoid this, a copy is made first.
138
*/
139
static StyleProp* CreateProps( TidyDocImpl* doc, StyleProp* prop, ctmbstr style )
140
2.88k
{
141
2.88k
    tmbstr name, value = NULL, name_end, value_end, line;
142
2.88k
    Bool more;
143
144
2.88k
    line = TY_(tmbstrdup)(doc->allocator, style);
145
2.88k
    name = line;
146
147
15.8k
    while (*name)
148
15.2k
    {
149
15.9k
        while (*name == ' ')
150
682
            ++name;
151
152
15.2k
        name_end = name;
153
154
49.5k
        while (*name_end)
155
49.1k
        {
156
49.1k
            if (*name_end == ':')
157
14.8k
            {
158
14.8k
                value = name_end + 1;
159
14.8k
                break;
160
14.8k
            }
161
162
34.2k
            ++name_end;
163
34.2k
        }
164
165
15.2k
        if (*name_end != ':')
166
471
            break;
167
168
17.9k
        while ( value && *value == ' ')
169
3.09k
            ++value;
170
171
14.8k
        value_end = value;
172
14.8k
        more = no;
173
174
22.0k
        while (*value_end)
175
20.2k
        {
176
20.2k
            if (*value_end == ';')
177
13.0k
            {
178
13.0k
                more = yes;
179
13.0k
                break;
180
13.0k
            }
181
182
7.25k
            ++value_end;
183
7.25k
        }
184
185
14.8k
        *name_end = '\0';
186
14.8k
        *value_end = '\0';
187
188
14.8k
        prop = InsertProperty(doc, prop, name, value);
189
14.8k
        *name_end = ':';
190
191
14.8k
        if (more)
192
13.0k
        {
193
13.0k
            *value_end = ';';
194
13.0k
            name = value_end + 1;
195
13.0k
            continue;
196
13.0k
        }
197
198
1.82k
        break;
199
14.8k
    }
200
201
2.88k
    TidyDocFree(doc, line);  /* free temporary copy */
202
2.88k
    return prop;
203
2.88k
}
204
205
static tmbstr CreatePropString(TidyDocImpl* doc, StyleProp *props)
206
1.44k
{
207
1.44k
    tmbstr style, p, s;
208
1.44k
    uint len;
209
1.44k
    StyleProp *prop;
210
211
    /* compute length */
212
213
7.19k
    for (len = 0, prop = props; prop; prop = prop->next)
214
5.75k
    {
215
5.75k
        len += TY_(tmbstrlen)(prop->name) + 2;
216
5.75k
        if (prop->value)
217
5.75k
            len += TY_(tmbstrlen)(prop->value) + 2;
218
5.75k
    }
219
220
1.44k
    style = (tmbstr) TidyDocAlloc(doc, len+1);
221
1.44k
    style[0] = '\0';
222
223
5.75k
    for (p = style, prop = props; prop; prop = prop->next)
224
5.75k
    {
225
5.75k
        s = prop->name;
226
227
34.0k
        while((*p++ = *s++))
228
28.2k
            continue;
229
230
5.75k
        if (prop->value)
231
5.75k
        {
232
5.75k
            *--p = ':';
233
5.75k
            *++p = ' ';
234
5.75k
            ++p;
235
236
5.75k
            s = prop->value;
237
12.9k
            while((*p++ = *s++))
238
7.16k
                continue;
239
5.75k
        }
240
5.75k
        if (prop->next == NULL)
241
1.44k
            break;
242
243
4.31k
        *--p = ';';
244
4.31k
        *++p = ' ';
245
4.31k
        ++p;
246
4.31k
    }
247
248
1.44k
    return style;
249
1.44k
}
250
251
/*
252
  create string with merged properties
253
static tmbstr AddProperty( ctmbstr style, ctmbstr property )
254
{
255
    tmbstr line;
256
    StyleProp *prop;
257
258
    prop = CreateProps(doc, NULL, style);
259
    prop = CreateProps(doc, prop, property);
260
    line = CreatePropString(doc, prop);
261
    FreeStyleProps(doc, prop);
262
    return line;
263
}
264
*/
265
266
void TY_(FreeStyles)( TidyDocImpl* doc )
267
19.1k
{
268
19.1k
    Lexer* lexer = doc->lexer;
269
19.1k
    if ( lexer )
270
19.1k
    {
271
19.1k
        TagStyle *style, *next;
272
19.1k
        for ( style = lexer->styles; style; style = next )
273
0
        {
274
0
            next = style->next;
275
0
            TidyDocFree( doc, style->tag );
276
0
            TidyDocFree( doc, style->tag_class );
277
0
            TidyDocFree( doc, style->properties );
278
0
            TidyDocFree( doc, style );
279
0
        }
280
19.1k
    }
281
19.1k
}
282
283
static tmbstr GensymClass( TidyDocImpl* doc )
284
0
{
285
0
    tmbchar buf[512];  /* CSSPrefix is limited to 256 characters */
286
0
    ctmbstr pfx = cfgStr(doc, TidyCSSPrefix);
287
0
    if ( pfx == NULL || *pfx == 0 )
288
0
      pfx = "c";
289
290
0
    TY_(tmbsnprintf)(buf, sizeof(buf), "%s%u", pfx, ++doc->nClassId );
291
0
    return TY_(tmbstrdup)(doc->allocator, buf);
292
0
}
293
294
static ctmbstr FindStyle( TidyDocImpl* doc, ctmbstr tag, ctmbstr properties )
295
0
{
296
0
    Lexer* lexer = doc->lexer;
297
0
    TagStyle* style;
298
299
0
    for (style = lexer->styles; style; style=style->next)
300
0
    {
301
0
        if (TY_(tmbstrcmp)(style->tag, tag) == 0 &&
302
0
            TY_(tmbstrcmp)(style->properties, properties) == 0)
303
0
            return style->tag_class;
304
0
    }
305
306
0
    style = (TagStyle *)TidyDocAlloc( doc, sizeof(TagStyle) );
307
0
    style->tag = TY_(tmbstrdup)(doc->allocator, tag);
308
0
    style->tag_class = GensymClass( doc );
309
0
    style->properties = TY_(tmbstrdup)( doc->allocator, properties );
310
0
    style->next = lexer->styles;
311
0
    lexer->styles = style;
312
0
    return style->tag_class;
313
0
}
314
315
/*
316
 Add class="foo" to node
317
*/
318
static void AddClass( TidyDocImpl* doc, Node* node, ctmbstr classname )
319
0
{
320
0
    AttVal *classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);;
321
322
    /*
323
     if there already is a class attribute
324
     then append class name after a space.
325
    */
326
0
    if (classattr)
327
0
        TY_(AppendToClassAttr)( doc, classattr, classname );
328
0
    else /* create new class attribute */
329
0
        TY_(AddAttribute)( doc, node, "class", classname );
330
0
}
331
332
void TY_(AddStyleAsClass)( TidyDocImpl* doc, Node *node, ctmbstr stylevalue )
333
0
{
334
0
    ctmbstr classname;
335
336
0
    classname = FindStyle( doc, node->element, stylevalue );
337
0
    AddClass( doc, node, classname);
338
0
}
339
340
/*
341
 Find style attribute in node, and replace it
342
 by corresponding class attribute. Search for
343
 class in style dictionary otherwise gensym
344
 new class and add to dictionary.
345
346
 Assumes that node doesn't have a class attribute
347
*/
348
static void Style2Rule( TidyDocImpl* doc, Node *node)
349
0
{
350
0
    AttVal *styleattr, *classattr;
351
0
    ctmbstr classname;
352
353
0
    styleattr = TY_(AttrGetById)(node, TidyAttr_STYLE);
354
355
0
    if (styleattr)
356
0
    {
357
        /* fix for http://tidy.sf.net/bug/850215 */
358
0
        if (!styleattr->value)
359
0
        {
360
0
            TY_(RemoveAttribute)(doc, node, styleattr);
361
0
            return;
362
0
        }
363
364
0
        classname = FindStyle( doc, node->element, styleattr->value );
365
0
        classattr = TY_(AttrGetById)(node, TidyAttr_CLASS);
366
367
        /*
368
         if there already is a class attribute
369
         then append class name after an underscore
370
        */
371
0
        if (classattr)
372
0
        {
373
0
            TY_(AppendToClassAttr)( doc, classattr, classname );
374
0
            TY_(RemoveAttribute)( doc, node, styleattr );
375
0
        }
376
0
        else /* reuse style attribute for class attribute */
377
0
        {
378
0
            TidyDocFree(doc, styleattr->attribute);
379
0
            TidyDocFree(doc, styleattr->value);
380
0
            styleattr->attribute = TY_(tmbstrdup)(doc->allocator, "class");
381
0
            styleattr->value = TY_(tmbstrdup)(doc->allocator, classname);
382
0
        }
383
0
    }
384
0
}
385
386
static void AddColorRule( Lexer* lexer, ctmbstr selector, ctmbstr color )
387
0
{
388
0
    if ( selector && color )
389
0
    {
390
0
        TY_(AddStringLiteral)(lexer, selector);
391
0
        TY_(AddStringLiteral)(lexer, " { color: ");
392
0
        TY_(AddStringLiteral)(lexer, color);
393
0
        TY_(AddStringLiteral)(lexer, " }\n");
394
0
    }
395
0
}
396
397
/*
398
 move presentation attribs from body to style element
399
400
 background="foo" ->  body { background-image: url(foo) }
401
 bgcolor="foo"    ->  body { background-color: foo }
402
 text="foo"       ->  body { color: foo }
403
 link="foo"       ->  :link { color: foo }
404
 vlink="foo"      ->  :visited { color: foo }
405
 alink="foo"      ->  :active { color: foo }
406
*/
407
static void CleanBodyAttrs( TidyDocImpl* doc, Node* body )
408
0
{
409
0
    Lexer* lexer  = doc->lexer;
410
0
    tmbstr bgurl   = NULL;
411
0
    tmbstr bgcolor = NULL;
412
0
    tmbstr color   = NULL;
413
0
    AttVal* attr;
414
    
415
0
    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BACKGROUND)))
416
0
    {
417
0
        bgurl = attr->value;
418
0
        attr->value = NULL;
419
0
        TY_(RemoveAttribute)( doc, body, attr );
420
0
    }
421
422
0
    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_BGCOLOR)))
423
0
    {
424
0
        bgcolor = attr->value;
425
0
        attr->value = NULL;
426
0
        TY_(RemoveAttribute)( doc, body, attr );
427
0
    }
428
429
0
    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_TEXT)))
430
0
    {
431
0
        color = attr->value;
432
0
        attr->value = NULL;
433
0
        TY_(RemoveAttribute)( doc, body, attr );
434
0
    }
435
436
0
    if ( bgurl || bgcolor || color )
437
0
    {
438
0
        TY_(AddStringLiteral)(lexer, " body {\n");
439
0
        if (bgurl)
440
0
        {
441
0
            TY_(AddStringLiteral)(lexer, "  background-image: url(");
442
0
            TY_(AddStringLiteral)(lexer, bgurl);
443
0
            TY_(AddStringLiteral)(lexer, ");\n");
444
0
            TidyDocFree(doc, bgurl);
445
0
        }
446
0
        if (bgcolor)
447
0
        {
448
0
            TY_(AddStringLiteral)(lexer, "  background-color: ");
449
0
            TY_(AddStringLiteral)(lexer, bgcolor);
450
0
            TY_(AddStringLiteral)(lexer, ";\n");
451
0
            TidyDocFree(doc, bgcolor);
452
0
        }
453
0
        if (color)
454
0
        {
455
0
            TY_(AddStringLiteral)(lexer, "  color: ");
456
0
            TY_(AddStringLiteral)(lexer, color);
457
0
            TY_(AddStringLiteral)(lexer, ";\n");
458
0
            TidyDocFree(doc, color);
459
0
        }
460
461
0
        TY_(AddStringLiteral)(lexer, " }\n");
462
0
    }
463
464
0
    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_LINK)))
465
0
    {
466
0
        AddColorRule(lexer, " :link", attr->value);
467
0
        TY_(RemoveAttribute)( doc, body, attr );
468
0
    }
469
470
0
    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_VLINK)))
471
0
    {
472
0
        AddColorRule(lexer, " :visited", attr->value);
473
0
        TY_(RemoveAttribute)( doc, body, attr );
474
0
    }
475
476
0
    if (NULL != (attr = TY_(AttrGetById)(body, TidyAttr_ALINK)))
477
0
    {
478
0
        AddColorRule(lexer, " :active", attr->value);
479
0
        TY_(RemoveAttribute)( doc, body, attr );
480
0
    }
481
0
}
482
483
static Bool NiceBody( TidyDocImpl* doc )
484
0
{
485
0
    Node* node = TY_(FindBody)(doc);
486
0
    if (node)
487
0
    {
488
0
        if (TY_(AttrGetById)(node, TidyAttr_BACKGROUND) ||
489
0
            TY_(AttrGetById)(node, TidyAttr_BGCOLOR)    ||
490
0
            TY_(AttrGetById)(node, TidyAttr_TEXT)       ||
491
0
            TY_(AttrGetById)(node, TidyAttr_LINK)       ||
492
0
            TY_(AttrGetById)(node, TidyAttr_VLINK)      ||
493
0
            TY_(AttrGetById)(node, TidyAttr_ALINK))
494
0
        {
495
0
            doc->badLayout |= USING_BODY;
496
0
            return no;
497
0
        }
498
0
    }
499
500
0
    return yes;
501
0
}
502
503
/* create style element using rules from dictionary */
504
static void CreateStyleElement( TidyDocImpl* doc )
505
0
{
506
0
    Lexer* lexer = doc->lexer;
507
0
    Node *node, *head, *body;
508
0
    TagStyle *style;
509
0
    AttVal *av;
510
511
0
    if ( lexer->styles == NULL && NiceBody(doc) )
512
0
        return;
513
514
0
    node = TY_(NewNode)( doc->allocator, lexer );
515
0
    node->type = StartTag;
516
0
    node->implicit = yes;
517
0
    node->element = TY_(tmbstrdup)(doc->allocator, "style");
518
0
    TY_(FindTag)( doc, node );
519
520
    /* insert type attribute */
521
0
    av = TY_(NewAttributeEx)( doc, "type", "text/css", '"' );
522
0
    TY_(InsertAttributeAtStart)( node, av );
523
524
0
    body = TY_(FindBody)( doc );
525
0
    lexer->txtstart = lexer->lexsize;
526
0
    if ( body )
527
0
        CleanBodyAttrs( doc, body );
528
529
0
    for (style = lexer->styles; style; style = style->next)
530
0
    {
531
0
        TY_(AddCharToLexer)(lexer, ' ');
532
0
        TY_(AddStringLiteral)(lexer, style->tag);
533
0
        TY_(AddCharToLexer)(lexer, '.');
534
0
        TY_(AddStringLiteral)(lexer, style->tag_class);
535
0
        TY_(AddCharToLexer)(lexer, ' ');
536
0
        TY_(AddCharToLexer)(lexer, '{');
537
0
        TY_(AddStringLiteral)(lexer, style->properties);
538
0
        TY_(AddCharToLexer)(lexer, '}');
539
0
        TY_(AddCharToLexer)(lexer, '\n');
540
0
    }
541
542
0
    lexer->txtend = lexer->lexsize;
543
544
0
    TY_(InsertNodeAtEnd)( node, TY_(TextToken)(lexer) );
545
546
    /*
547
     now insert style element into document head
548
549
     doc is root node. search its children for html node
550
     the head node should be first child of html node
551
    */
552
0
    if ( NULL != (head = TY_(FindHEAD)( doc )) )
553
0
        TY_(InsertNodeAtEnd)( head, node );
554
0
}
555
556
557
/* ensure bidirectional links are consistent */
558
void TY_(FixNodeLinks)(Node *node)
559
1.85k
{
560
1.85k
    Node *child;
561
562
1.85k
    if (node->prev)
563
619
        node->prev->next = node;
564
1.23k
    else
565
1.23k
        node->parent->content = node;
566
567
1.85k
    if (node->next)
568
375
        node->next->prev = node;
569
1.48k
    else
570
1.48k
        node->parent->last = node;
571
572
3.33k
    for (child = node->content; child; child = child->next)
573
1.47k
        child->parent = node;
574
1.85k
}
575
576
/*
577
 used to strip child of node when
578
 the node has one and only one child
579
*/
580
static void StripOnlyChild(TidyDocImpl* doc, Node *node)
581
13.0k
{
582
13.0k
    Node *child;
583
584
13.0k
    child = node->content;
585
13.0k
    node->content = child->content;
586
13.0k
    node->last = child->last;
587
13.0k
    child->content = NULL;
588
13.0k
    TY_(FreeNode)(doc, child);
589
590
30.5k
    for (child = node->content; child; child = child->next)
591
17.4k
        child->parent = node;
592
13.0k
}
593
594
/*
595
  used to strip font start and end tags.
596
  Extricate "element", replace it by its content and delete it.
597
*/
598
static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
599
6.25k
{
600
6.25k
    if (element->content)
601
5.80k
    {
602
5.80k
        Node *node, *parent = element->parent;
603
604
5.80k
        element->last->next = element->next;
605
606
5.80k
        if (element->next)
607
703
        {
608
703
            element->next->prev = element->last;
609
703
        }
610
5.09k
        else
611
5.09k
            parent->last = element->last;
612
613
5.80k
        if (element->prev)
614
3.77k
        {
615
3.77k
            element->content->prev = element->prev;
616
3.77k
            element->prev->next = element->content;
617
3.77k
        }
618
2.02k
        else
619
2.02k
            parent->content = element->content;
620
621
30.1k
        for (node = element->content; node; node = node->next)
622
24.3k
            node->parent = parent;
623
624
5.80k
        *pnode = element->content;
625
626
5.80k
        element->next = element->content = NULL;
627
5.80k
        TY_(FreeNode)(doc, element);
628
5.80k
    }
629
454
    else
630
454
    {
631
454
        *pnode = TY_(DiscardElement)(doc, element);
632
454
    }
633
6.25k
}
634
635
/*
636
  Create new string that consists of the
637
  combined style properties in s1 and s2
638
639
  To merge property lists, we build a linked
640
  list of property/values and insert properties
641
  into the list in order, merging values for
642
  the same property name.
643
*/
644
static tmbstr MergeProperties( TidyDocImpl* doc, ctmbstr s1, ctmbstr s2 )
645
1.44k
{
646
1.44k
    tmbstr s;
647
1.44k
    StyleProp *prop;
648
649
1.44k
    prop = CreateProps(doc, NULL, s1);
650
1.44k
    prop = CreateProps(doc, prop, s2);
651
1.44k
    s = CreatePropString(doc, prop);
652
1.44k
    FreeStyleProps(doc, prop);
653
1.44k
    return s;
654
1.44k
}
655
656
/*
657
 Add style property to element, creating style
658
 attribute as needed and adding ; delimiter
659
*/
660
void TY_(AddStyleProperty)(TidyDocImpl* doc, Node *node, ctmbstr property )
661
12.8k
{
662
12.8k
    AttVal *av = TY_(AttrGetById)(node, TidyAttr_STYLE);
663
664
    /* if style attribute already exists then insert property */
665
666
12.8k
    if ( av )
667
1.71k
    {
668
1.71k
        if (av->value != NULL)
669
1.44k
        {
670
1.44k
            tmbstr s = MergeProperties( doc, av->value, property );
671
1.44k
            TidyDocFree( doc, av->value );
672
1.44k
            av->value = s;
673
1.44k
        }
674
270
        else
675
270
        {
676
270
            av->value = TY_(tmbstrdup)( doc->allocator, property );
677
270
        }
678
1.71k
    }
679
11.1k
    else /* else create new style attribute */
680
11.1k
    {
681
11.1k
        av = TY_(NewAttributeEx)( doc, "style", property, '"' );
682
11.1k
        TY_(InsertAttributeAtStart)( node, av );
683
11.1k
    }
684
12.8k
}
685
686
static void MergeClasses(TidyDocImpl* doc, Node *node, Node *child)
687
0
{
688
0
    AttVal *av;
689
0
    tmbstr s1, s2, names;
690
691
0
    for (s2 = NULL, av = child->attributes; av; av = av->next)
692
0
    {
693
0
        if (attrIsCLASS(av))
694
0
        {
695
0
            s2 = av->value;
696
0
            break;
697
0
        }
698
0
    }
699
700
0
    for (s1 = NULL, av = node->attributes; av; av = av->next)
701
0
    {
702
0
        if (attrIsCLASS(av))
703
0
        {
704
0
            s1 = av->value;
705
0
            break;
706
0
        }
707
0
    }
708
709
0
    if (s1)
710
0
    {
711
0
        if (s2)  /* merge class names from both */
712
0
        {
713
0
            uint l1, l2;
714
0
            l1 = TY_(tmbstrlen)(s1);
715
0
            l2 = TY_(tmbstrlen)(s2);
716
0
            names = (tmbstr) TidyDocAlloc(doc, l1 + l2 + 2);
717
0
            TY_(tmbstrcpy)(names, s1);
718
0
            names[l1] = ' ';
719
0
            TY_(tmbstrcpy)(names+l1+1, s2);
720
0
            TidyDocFree(doc, av->value);
721
0
            av->value = names;
722
0
        }
723
0
    }
724
0
    else if (s2)  /* copy class names from child */
725
0
    {
726
0
        av = TY_(NewAttributeEx)( doc, "class", s2, '"' );
727
0
        TY_(InsertAttributeAtStart)( node, av );
728
0
    }
729
0
}
730
731
static void MergeStyles(TidyDocImpl* doc, Node *node, Node *child)
732
0
{
733
0
    AttVal *av;
734
0
    tmbstr s1, s2, style;
735
736
    /*
737
       the child may have a class attribute used
738
       for attaching styles, if so the class name
739
       needs to be copied to node's class
740
    */
741
0
    MergeClasses(doc, node, child);
742
743
0
    for (s2 = NULL, av = child->attributes; av; av = av->next)
744
0
    {
745
0
        if (attrIsSTYLE(av))
746
0
        {
747
0
            s2 = av->value;
748
0
            break;
749
0
        }
750
0
    }
751
752
0
    for (s1 = NULL, av = node->attributes; av; av = av->next)
753
0
    {
754
0
        if (attrIsSTYLE(av))
755
0
        {
756
0
            s1 = av->value;
757
0
            break;
758
0
        }
759
0
    }
760
761
0
    if (s1)
762
0
    {
763
0
        if (s2)  /* merge styles from both */
764
0
        {
765
0
            style = MergeProperties(doc, s1, s2);
766
0
            TidyDocFree(doc, av->value);
767
0
            av->value = style;
768
0
        }
769
0
    }
770
0
    else if (s2)  /* copy style of child */
771
0
    {
772
0
        av = TY_(NewAttributeEx)( doc, "style", s2, '"' );
773
0
        TY_(InsertAttributeAtStart)( node, av );
774
0
    }
775
0
}
776
777
static ctmbstr FontSize2Name(ctmbstr size)
778
0
{
779
0
    static const ctmbstr sizes[7] =
780
0
    {
781
0
        "60%", "70%", "80%", NULL,
782
0
        "120%", "150%", "200%"
783
0
    };
784
785
    /* increment of 0.8 */
786
0
    static const ctmbstr minussizes[] =
787
0
    {
788
0
        "100%", "80%", "64%", "51%",
789
0
        "40%", "32%", "26%"
790
0
    };
791
792
    /* increment of 1.2 */
793
0
    static const ctmbstr plussizes[] =
794
0
    {
795
0
        "100%", "120%", "144%", "172%",
796
0
        "207%", "248%", "298%"
797
0
    };
798
799
0
    if (size[0] == '\0')
800
0
        return NULL;
801
802
0
    if ('0' <= size[0] && size[0] <= '6')
803
0
    {
804
0
        int n = size[0] - '0';
805
0
        return sizes[n];
806
0
    }
807
808
0
    if (size[0] == '-')
809
0
    {
810
0
        if ('0' <= size[1] && size[1] <= '6')
811
0
        {
812
0
            int n = size[1] - '0';
813
0
            return minussizes[n];
814
0
        }
815
0
        return "smaller"; /*"70%"; */
816
0
    }
817
818
0
    if ('0' <= size[1] && size[1] <= '6')
819
0
    {
820
0
        int n = size[1] - '0';
821
0
        return plussizes[n];
822
0
    }
823
824
0
    return "larger"; /* "140%" */
825
0
}
826
827
static void AddFontFace( TidyDocImpl* doc, Node *node, ctmbstr face )
828
0
{
829
0
    tmbchar buf[256];
830
0
    TY_(tmbsnprintf)(buf, sizeof(buf), "font-family: %s", face );
831
0
    TY_(AddStyleProperty)( doc, node, buf );
832
0
}
833
834
static void AddFontSize( TidyDocImpl* doc, Node* node, ctmbstr size )
835
0
{
836
0
    ctmbstr value = NULL;
837
838
0
    if (nodeIsP(node))
839
0
    {
840
0
        if (TY_(tmbstrcmp)(size, "6") == 0)
841
0
            value = "h1";
842
0
        else if (TY_(tmbstrcmp)(size, "5") == 0)
843
0
            value = "h2";
844
0
        else if (TY_(tmbstrcmp)(size, "4") == 0)
845
0
            value = "h3";
846
847
0
        if (value)
848
0
        {
849
0
            TidyDocFree(doc, node->element);
850
0
            node->element = TY_(tmbstrdup)(doc->allocator, value);
851
0
            TY_(FindTag)(doc, node);
852
0
            return;
853
0
        }
854
0
    }
855
856
0
    value = FontSize2Name(size);
857
858
0
    if (value)
859
0
    {
860
0
        tmbchar buf[64];
861
0
        TY_(tmbsnprintf)(buf, sizeof(buf), "font-size: %s", value);
862
0
        TY_(AddStyleProperty)( doc, node, buf );
863
0
    }
864
0
}
865
866
static void AddFontColor( TidyDocImpl* doc, Node *node, ctmbstr color)
867
0
{
868
0
    tmbchar buf[128];
869
0
    TY_(tmbsnprintf)(buf, sizeof(buf), "color: %s", color);
870
0
    TY_(AddStyleProperty)( doc, node, buf );
871
0
}
872
873
/* force alignment value to lower case */
874
static void AddAlign( TidyDocImpl* doc, Node *node, ctmbstr align )
875
0
{
876
0
    uint i;
877
0
    tmbchar buf[128];
878
879
0
    TY_(tmbstrcpy)( buf, "text-align: " );
880
0
    for ( i = 12; i < sizeof(buf)/sizeof(buf[0])-1; ++i )
881
0
    {
882
0
        if ( (buf[i] = (tmbchar)TY_(ToLower)(*align++)) == '\0' )
883
0
            break;
884
0
    }
885
0
    buf[i] = '\0';
886
0
    TY_(AddStyleProperty)( doc, node, buf );
887
0
}
888
889
/*
890
 add style properties to node corresponding to
891
 the font face, size and color attributes
892
*/
893
static void AddFontStyles( TidyDocImpl* doc, Node *node, AttVal *av)
894
0
{
895
0
    while (av)
896
0
    {
897
0
        if (AttrHasValue(av))
898
0
        {
899
0
            if (attrIsFACE(av))
900
0
                AddFontFace( doc, node, av->value );
901
0
            else if (attrIsSIZE(av))
902
0
                AddFontSize( doc, node, av->value );
903
0
            else if (attrIsCOLOR(av))
904
0
                AddFontColor( doc, node, av->value );
905
0
        }
906
0
        av = av->next;
907
0
    }
908
0
}
909
910
/*
911
    Symptom: <p align=center>
912
    Action: <p style="text-align: center">
913
*/
914
static void TextAlign( TidyDocImpl* doc, Node* node )
915
0
{
916
0
    AttVal *av, *prev;
917
918
0
    prev = NULL;
919
920
0
    for (av = node->attributes; av; av = av->next)
921
0
    {
922
0
        if (attrIsALIGN(av))
923
0
        {
924
0
            if (prev)
925
0
                prev->next = av->next;
926
0
            else
927
0
                node->attributes = av->next;
928
929
0
            if (av->value)
930
0
                AddAlign( doc, node, av->value );
931
932
0
            TY_(FreeAttribute)(doc, av);
933
0
            break;
934
0
        }
935
936
0
        prev = av;
937
0
    }
938
0
}
939
940
/*
941
    Symptom: <table bgcolor="red">
942
    Action: <table style="background-color: red">
943
*/
944
static void TableBgColor( TidyDocImpl* doc, Node* node )
945
0
{
946
0
    AttVal* attr;
947
0
    tmbchar buf[256];
948
949
0
    if (NULL != (attr = TY_(AttrGetById)(node, TidyAttr_BGCOLOR)))
950
0
    {
951
0
        TY_(tmbsnprintf)(buf, sizeof(buf), "background-color: %s", attr->value );
952
0
        TY_(RemoveAttribute)( doc, node, attr );
953
0
        TY_(AddStyleProperty)( doc, node, buf );
954
0
    }
955
0
}
956
957
/*
958
   The clean up rules use the pnode argument to return the
959
   next node when the original node has been deleted
960
*/
961
962
/*
963
    Symptom: <dir> <li> where <li> is only child
964
    Action: coerce <dir> <li> to <div> with indent.
965
*/
966
967
static Bool Dir2Div( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode))
968
0
{
969
0
    Node *child;
970
971
0
    if ( nodeIsDIR(node) || nodeIsUL(node) || nodeIsOL(node) )
972
0
    {
973
0
        child = node->content;
974
975
0
        if (child == NULL)
976
0
            return no;
977
978
        /* check child has no peers */
979
980
0
        if (child->next)
981
0
            return no;
982
983
0
        if ( !nodeIsLI(child) )
984
0
            return no;
985
986
0
        if ( !child->implicit )
987
0
            return no;
988
989
        /* coerce dir to div */
990
0
        node->tag = TY_(LookupTagDef)( TidyTag_DIV );
991
0
        TidyDocFree( doc, node->element );
992
0
        node->element = TY_(tmbstrdup)(doc->allocator, "div");
993
0
        TY_(AddStyleProperty)( doc, node, "margin-left: 2em" );
994
0
        StripOnlyChild( doc, node );
995
0
        return yes;
996
0
    }
997
998
0
    return no;
999
0
}
1000
1001
/*
1002
    Symptom: <center>
1003
    Action: replace <center> by <div style="text-align: center">
1004
*/
1005
1006
static Bool Center2Div( TidyDocImpl* doc, Node *node, Node **pnode)
1007
0
{
1008
0
    if ( nodeIsCENTER(node) )
1009
0
    {
1010
0
        RenameElem( doc, node, TidyTag_DIV );
1011
0
        TY_(AddStyleProperty)( doc, node, "text-align: center" );
1012
0
        return yes;
1013
0
    }
1014
1015
0
    return no;
1016
0
}
1017
1018
/* Copy child attributes to node. Duplicate attributes are overwritten.
1019
   Unique attributes (such as ID) disable the action.
1020
   Attributes style and class are not dealt with. A call to MergeStyles
1021
   will do that.
1022
*/
1023
static Bool CopyAttrs( TidyDocImpl* doc, Node *node, Node *child)
1024
0
{
1025
0
    AttVal *av1, *av2;
1026
0
    TidyAttrId id;
1027
1028
    /* Detect attributes that cannot be merged or overwritten. */
1029
0
    if (TY_(AttrGetById)(child, TidyAttr_ID) != NULL
1030
0
        && TY_(AttrGetById)(node, TidyAttr_ID) != NULL)
1031
0
        return no;
1032
1033
    /* Move child attributes to node. Attributes in node
1034
     can be overwritten or merged. */
1035
0
    for (av2 = child->attributes; av2; )
1036
0
    {
1037
        /* Dealt by MergeStyles. */
1038
0
        if (attrIsSTYLE(av2) || attrIsCLASS(av2))
1039
0
        {
1040
0
            av2 = av2->next;
1041
0
            continue;
1042
0
        }
1043
        /* Avoid duplicates in node */
1044
0
        if ((id=AttrId(av2)) != TidyAttr_UNKNOWN
1045
0
            && (av1=TY_(AttrGetById)(node, id))!= NULL)
1046
0
            TY_(RemoveAttribute)( doc, node, av1 );
1047
1048
        /* Move attribute from child to node */
1049
0
        TY_(DetachAttribute)( child, av2 );
1050
0
        av1 = av2;
1051
0
        av2 = av2->next;
1052
0
        av1->next = NULL;
1053
0
        TY_(InsertAttributeAtEnd)( node, av1 );
1054
0
    }
1055
1056
0
    return yes;
1057
0
}
1058
1059
/*
1060
    Symptom <XX><XX>...</XX></XX>
1061
    Action: merge the two XXs
1062
1063
  For instance, this is useful after nested <dir>s used by Word
1064
  for indenting have been converted to <div>s
1065
1066
  If state is "no", no merging.
1067
  If state is "yes", inner element is discarded. Only Style and Class
1068
  attributes are merged using MergeStyles().
1069
  If state is "auto", atttibutes are merged as described in CopyAttrs().
1070
  Style and Class attributes are merged using MergeStyles().
1071
*/
1072
static Bool MergeNestedElements( TidyDocImpl* doc,
1073
                                 TidyTagId Id, TidyTriState state, Node *node,
1074
                                 Node **ARG_UNUSED(pnode))
1075
0
{
1076
0
    Node *child;
1077
1078
0
    if ( state == TidyNoState
1079
0
         || !TagIsId(node, Id) )
1080
0
        return no;
1081
1082
0
    child = node->content;
1083
1084
0
    if ( child == NULL
1085
0
         || child->next != NULL
1086
0
         || !TagIsId(child, Id) )
1087
0
        return no;
1088
1089
0
    if ( state == TidyAutoState
1090
0
         && CopyAttrs(doc, node, child) == no )
1091
0
        return no;
1092
1093
0
    MergeStyles( doc, node, child );
1094
0
    StripOnlyChild( doc, node );
1095
0
    return yes;
1096
0
}
1097
1098
/*
1099
    Symptom: <ul><li><ul>...</ul></li></ul>
1100
    Action: discard outer list
1101
*/
1102
1103
static Bool NestedList( TidyDocImpl* doc, Node *node, Node **pnode )
1104
0
{
1105
0
    Node *child, *list;
1106
1107
0
    if ( nodeIsUL(node) || nodeIsOL(node) )
1108
0
    {
1109
0
        child = node->content;
1110
1111
0
        if (child == NULL)
1112
0
            return no;
1113
1114
        /* check child has no peers */
1115
1116
0
        if (child->next)
1117
0
            return no;
1118
1119
0
        list = child->content;
1120
1121
0
        if (!list)
1122
0
            return no;
1123
1124
0
        if (list->tag != node->tag)
1125
0
            return no;
1126
1127
        /* check list has no peers */
1128
0
        if (list->next)
1129
0
            return no;
1130
1131
0
        *pnode = list;  /* Set node to resume iteration */
1132
1133
        /* move inner list node into position of outer node */
1134
0
        list->prev = node->prev;
1135
0
        list->next = node->next;
1136
0
        list->parent = node->parent;
1137
0
        TY_(FixNodeLinks)(list);
1138
1139
        /* get rid of outer ul and its li */
1140
0
        child->content = NULL;
1141
0
        TY_(FreeNode)( doc, child ); /* See test #427841. */
1142
0
        child = NULL;
1143
0
        node->content = NULL;
1144
0
        node->next = NULL;
1145
0
        TY_(FreeNode)( doc, node );
1146
0
        node = NULL;
1147
1148
        /*
1149
          If prev node was a list the chances are this node
1150
          should be appended to that list. Word has no way of
1151
          recognizing nested lists and just uses indents
1152
        */
1153
1154
0
        if (list->prev)
1155
0
        {
1156
0
            if ( (nodeIsUL(list->prev) || nodeIsOL(list->prev))
1157
0
                 && list->prev->last )
1158
0
            {
1159
0
                node = list;
1160
0
                list = node->prev;
1161
1162
0
                child = list->last;  /* <li> */
1163
1164
0
                list->next = node->next;
1165
0
                TY_(FixNodeLinks)(list);
1166
1167
0
                node->parent = child;
1168
0
                node->next = NULL;
1169
0
                node->prev = child->last;
1170
0
                TY_(FixNodeLinks)(node);
1171
0
                CleanNode( doc, node );
1172
0
            }
1173
0
        }
1174
1175
0
        return yes;
1176
0
    }
1177
1178
0
    return no;
1179
0
}
1180
1181
/* Find CSS equivalent in a SPAN element */
1182
static
1183
Bool FindCSSSpanEq( Node *node, ctmbstr *s, Bool deprecatedOnly )
1184
0
{
1185
0
    struct
1186
0
    {
1187
0
        TidyTagId id;
1188
0
        ctmbstr CSSeq;
1189
0
        Bool deprecated;
1190
0
    }
1191
0
    const CSS_SpanEq[] =
1192
0
        {
1193
0
            { TidyTag_B, "font-weight: bold", no },
1194
0
            { TidyTag_I, "font-style: italic", no },
1195
0
            { TidyTag_S, "text-decoration: line-through", yes},
1196
0
            { TidyTag_STRIKE, "text-decoration: line-through", yes},
1197
0
            { TidyTag_U, "text-decoration: underline", yes},
1198
0
            { TidyTag_UNKNOWN, NULL, no }
1199
0
        };
1200
0
    uint i;
1201
1202
0
    for (i=0; CSS_SpanEq[i].CSSeq; ++i)
1203
0
        if ( (!deprecatedOnly || CSS_SpanEq[i].deprecated)
1204
0
             && TagIsId(node, CSS_SpanEq[i].id) )
1205
0
        {
1206
0
            *s = CSS_SpanEq[i].CSSeq;
1207
0
            return yes;
1208
0
        }
1209
0
    return no; 
1210
0
}
1211
1212
/* Necessary conditions to apply BlockStyle(). */
1213
static Bool CanApplyBlockStyle( Node *node )
1214
0
{
1215
0
    if (TY_(nodeHasCM)(node,CM_BLOCK | CM_LIST | CM_DEFLIST | CM_TABLE)
1216
0
        && !nodeIsDIV(node) && !nodeIsP(node)
1217
0
        && !nodeIsTABLE(node) && !nodeIsTR(node) && !nodeIsLI(node) )
1218
0
    {
1219
0
        return yes;
1220
0
    }
1221
0
    return no;
1222
0
}
1223
1224
/*
1225
  Symptom: the only child of a block-level element is a
1226
  presentation element such as B, I or FONT
1227
1228
  Action: add style "font-weight: bold" to the block and
1229
  strip the <b> element, leaving its children.
1230
1231
  example:
1232
1233
    <p>
1234
      <b><font face="Arial" size="6">Draft Recommended Practice</font></b>
1235
    </p>
1236
1237
  becomes:
1238
1239
      <p style="font-weight: bold; font-family: Arial; font-size: 6">
1240
        Draft Recommended Practice
1241
      </p>
1242
1243
  This code also replaces the align attribute by a style attribute.
1244
  However, to avoid CSS problems with Navigator 4, this isn't done
1245
  for the elements: caption, tr and table
1246
*/
1247
static Bool BlockStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1248
0
{
1249
0
    Node *child;
1250
0
    ctmbstr CSSeq;
1251
1252
    /* check for bgcolor */
1253
0
    if (   nodeIsTABLE(node)
1254
0
        || nodeIsTD(node) || nodeIsTH(node) || nodeIsTR( node ))
1255
0
        TableBgColor( doc, node );
1256
1257
0
    if (CanApplyBlockStyle(node))
1258
0
    {
1259
        /* check for align attribute */
1260
0
        if ( !nodeIsCAPTION(node) )
1261
0
            TextAlign( doc, node );
1262
1263
0
        child = node->content;
1264
0
        if (child == NULL)
1265
0
            return no;
1266
1267
        /* check child has no peers */
1268
0
        if (child->next)
1269
0
            return no;
1270
1271
0
        if ( FindCSSSpanEq(child, &CSSeq, no) )
1272
0
        {
1273
0
            MergeStyles( doc, node, child );
1274
0
            TY_(AddStyleProperty)( doc, node, CSSeq );
1275
0
            StripOnlyChild( doc, node );
1276
0
            return yes;
1277
0
        }
1278
0
        else if ( nodeIsFONT(child) )
1279
0
        {
1280
0
            MergeStyles( doc, node, child );
1281
0
            AddFontStyles( doc, node, child->attributes );
1282
0
            StripOnlyChild( doc, node );
1283
0
            return yes;
1284
0
        }
1285
0
    }
1286
1287
0
    return no;
1288
0
}
1289
1290
/* Necessary conditions to apply InlineStyle(). */
1291
static Bool CanApplyInlineStyle( Node *node )
1292
0
{
1293
0
    return !nodeIsFONT(node) && TY_(nodeHasCM)(node, CM_INLINE|CM_ROW);
1294
0
}
1295
1296
/* the only child of table cell or an inline element such as em */
1297
static Bool InlineStyle( TidyDocImpl* doc, Node *node, Node **ARG_UNUSED(pnode) )
1298
0
{
1299
0
    Node *child;
1300
0
    ctmbstr CSSeq;
1301
1302
0
    if ( CanApplyInlineStyle(node) )
1303
0
    {
1304
0
        child = node->content;
1305
1306
0
        if (child == NULL)
1307
0
            return no;
1308
1309
        /* check child has no peers */
1310
1311
0
        if (child->next)
1312
0
            return no;
1313
1314
0
        if ( FindCSSSpanEq(child, &CSSeq, no) )
1315
0
        {
1316
0
            MergeStyles( doc, node, child );
1317
0
            TY_(AddStyleProperty)( doc, node, CSSeq );
1318
0
            StripOnlyChild( doc, node );
1319
0
            return yes;
1320
0
        }
1321
0
        else if ( nodeIsFONT(child) )
1322
0
        {
1323
0
            MergeStyles( doc, node, child );
1324
0
            AddFontStyles( doc, node, child->attributes );
1325
0
            StripOnlyChild( doc, node );
1326
0
            return yes;
1327
0
        }
1328
0
    }
1329
1330
0
    return no;
1331
0
}
1332
1333
/*
1334
    Transform element to equivalent CSS
1335
*/
1336
static Bool InlineElementToCSS( TidyDocImpl* doc, Node* node,
1337
                                Node **ARG_UNUSED(pnode)  )
1338
0
{
1339
0
    ctmbstr CSSeq;
1340
1341
    /* if node is the only child of parent element then leave alone
1342
          Do so only if BlockStyle may be successful. */
1343
0
    if ( node->parent->content == node && node->next == NULL &&
1344
0
         (CanApplyBlockStyle(node->parent)
1345
0
          || CanApplyInlineStyle(node->parent)) )
1346
0
        return no;
1347
1348
0
    if ( FindCSSSpanEq(node, &CSSeq, yes) )
1349
0
    {
1350
0
        RenameElem( doc, node, TidyTag_SPAN );
1351
0
        TY_(AddStyleProperty)( doc, node, CSSeq );
1352
0
        return yes;
1353
0
    }
1354
0
    return no;
1355
0
} 
1356
1357
/*
1358
  Replace font elements by span elements, deleting
1359
  the font element's attributes and replacing them
1360
  by a single style attribute.
1361
*/
1362
static Bool Font2Span( TidyDocImpl* doc, Node *node, Node **pnode )
1363
0
{
1364
0
    AttVal *av, *style, *next;
1365
1366
0
    if ( nodeIsFONT(node) )
1367
0
    {
1368
        /* if node is the only child of parent element then leave alone
1369
          Do so only if BlockStyle may be successful. */
1370
0
        if ( node->parent->content == node && node->next == NULL &&
1371
0
             CanApplyBlockStyle(node->parent) )
1372
0
            return no;
1373
1374
0
        AddFontStyles( doc, node, node->attributes );
1375
1376
        /* extract style attribute and free the rest */
1377
0
        av = node->attributes;
1378
0
        style = NULL;
1379
1380
0
        while (av)
1381
0
        {
1382
0
            next = av->next;
1383
1384
0
            if (attrIsSTYLE(av))
1385
0
            {
1386
0
                av->next = NULL;
1387
0
                style = av;
1388
0
            }
1389
0
            else
1390
0
            {
1391
0
                TY_(FreeAttribute)( doc, av );
1392
0
            }
1393
0
            av = next;
1394
0
        }
1395
1396
0
        node->attributes = style;
1397
0
        RenameElem( doc, node, TidyTag_SPAN );
1398
0
        return yes;
1399
0
    }
1400
1401
0
    return no;
1402
0
}
1403
1404
/*
1405
  Applies all matching rules to a node.
1406
*/
1407
Node* CleanNode( TidyDocImpl* doc, Node *node )
1408
0
{
1409
0
    Node *next = NULL;
1410
0
    TidyTriState mergeDivs = cfgAutoBool(doc, TidyMergeDivs);
1411
0
    TidyTriState mergeSpans = cfgAutoBool(doc, TidyMergeSpans);
1412
1413
0
    for (next = node; TY_(nodeIsElement)(node); node = next)
1414
0
    {
1415
0
        if ( Dir2Div(doc, node, &next) )
1416
0
            continue;
1417
1418
        /* Special case: true result means
1419
        ** that arg node and its parent no longer exist.
1420
        ** So we must jump back up the CreateStyleProperties()
1421
        ** call stack until we have a valid node reference.
1422
        */
1423
0
        if ( NestedList(doc, node, &next) )
1424
0
            return next;
1425
1426
0
        if ( Center2Div(doc, node, &next) )
1427
0
            continue;
1428
1429
0
        if ( MergeNestedElements(doc, TidyTag_DIV, mergeDivs, node, &next) )
1430
0
            continue;
1431
1432
0
        if ( MergeNestedElements(doc, TidyTag_SPAN, mergeSpans, node, &next) )
1433
0
            continue;
1434
1435
0
        if ( BlockStyle(doc, node, &next) )
1436
0
            continue;
1437
1438
0
        if ( InlineStyle(doc, node, &next) )
1439
0
            continue;
1440
1441
0
        if ( InlineElementToCSS(doc, node, &next) )
1442
0
            continue;
1443
1444
0
        if ( Font2Span(doc, node, &next) )
1445
0
            continue;
1446
1447
0
        break;
1448
0
    }
1449
1450
0
    return next;
1451
0
}
1452
1453
/* Special case: if the current node is destroyed by
1454
** CleanNode() lower in the tree, this node and its parent
1455
** no longer exist.  So we must jump back up the CleanTree()
1456
** call stack until we have a valid node reference.
1457
*/
1458
1459
static Node* CleanTree( TidyDocImpl* doc, Node *node )
1460
0
{
1461
0
    if (node->content)
1462
0
    {
1463
0
        Node *child;
1464
0
        for (child = node->content; child != NULL; child = child->next)
1465
0
        {
1466
0
            child = CleanTree( doc, child );
1467
0
            if ( !child )
1468
0
                break;
1469
0
        }
1470
0
    }
1471
1472
0
    return CleanNode( doc, node );
1473
0
}
1474
1475
static void DefineStyleRules( TidyDocImpl* doc, Node *node )
1476
0
{
1477
0
    Node *child;
1478
1479
0
    if (node->content)
1480
0
    {
1481
0
        for (child = node->content;
1482
0
                child != NULL; child = child->next)
1483
0
        {
1484
0
            DefineStyleRules( doc, child );
1485
0
        }
1486
0
    }
1487
1488
0
    Style2Rule( doc, node );
1489
0
}
1490
1491
void TY_(CleanDocument)( TidyDocImpl* doc )
1492
0
{
1493
    /* placeholder.  CleanTree()/CleanNode() will not
1494
    ** zap root element 
1495
    */
1496
0
    CleanTree( doc, &doc->root );
1497
1498
0
    if ( cfgBool(doc, TidyMakeClean) )
1499
0
    {
1500
0
        DefineStyleRules( doc, &doc->root );
1501
0
        CreateStyleElement( doc );
1502
0
    }
1503
0
}
1504
1505
/* simplifies <b><b> ... </b> ...</b> etc. */
1506
void TY_(NestedEmphasis)( TidyDocImpl* doc, Node* node )
1507
1.52M
{
1508
1.52M
    Node *next;
1509
1510
4.54M
    while (node)
1511
3.02M
    {
1512
3.02M
        next = node->next;
1513
1514
3.02M
        if ( (nodeIsB(node) || nodeIsI(node))
1515
38.4k
             && node->parent && node->parent->tag == node->tag)
1516
6.25k
        {
1517
            /* strip redundant inner element */
1518
6.25k
            DiscardContainer( doc, node, &next );
1519
6.25k
            node = next;
1520
6.25k
            continue;
1521
6.25k
        }
1522
1523
3.01M
        if ( node->content )
1524
1.50M
            TY_(NestedEmphasis)( doc, node->content );
1525
1526
3.01M
        node = next;
1527
3.01M
    }
1528
1.52M
}
1529
1530
1531
1532
/* replace i by em and b by strong */
1533
void TY_(EmFromI)( TidyDocImpl* doc, Node* node )
1534
0
{
1535
0
    while (node)
1536
0
    {
1537
0
        if ( nodeIsI(node) )
1538
0
            RenameElem( doc, node, TidyTag_EM );
1539
0
        else if ( nodeIsB(node) )
1540
0
            RenameElem( doc, node, TidyTag_STRONG );
1541
1542
0
        if ( node->content )
1543
0
            TY_(EmFromI)( doc, node->content );
1544
1545
0
        node = node->next;
1546
0
    }
1547
0
}
1548
1549
static Bool HasOneChild(Node *node)
1550
71.6k
{
1551
71.6k
    return (node->content && node->content->next == NULL);
1552
71.6k
}
1553
1554
/*
1555
 Some people use dir or ul without an li
1556
 to indent the content. The pattern to
1557
 look for is a list with a single implicit
1558
 li. This is recursively replaced by an
1559
 implicit blockquote.
1560
*/
1561
void TY_(List2BQ)( TidyDocImpl* doc, Node* node )
1562
1.52M
{
1563
4.53M
    while (node)
1564
3.01M
    {
1565
3.01M
        if (node->content)
1566
1.50M
            TY_(List2BQ)( doc, node->content );
1567
1568
3.01M
        if ( node->tag && node->tag->parser == TY_(ParseList) &&
1569
63.4k
             HasOneChild(node) && node->content->implicit )
1570
8.95k
        {
1571
8.95k
            StripOnlyChild( doc, node );
1572
8.95k
            RenameElem( doc, node, TidyTag_BLOCKQUOTE );
1573
8.95k
            node->implicit = yes;
1574
8.95k
        }
1575
1576
3.01M
        node = node->next;
1577
3.01M
    }
1578
1.52M
}
1579
1580
1581
/*
1582
 Replace implicit blockquote by div with an indent
1583
 taking care to reduce nested blockquotes to a single
1584
 div with the indent set to match the nesting depth
1585
*/
1586
void TY_(BQ2Div)( TidyDocImpl* doc, Node *node )
1587
19.1k
{
1588
19.1k
    Stack *stack = TY_(newStack)(doc, 16);
1589
19.1k
    Node *next;
1590
    
1591
19.1k
    tmbchar indent_buf[ 32 ];
1592
19.1k
    uint indent;
1593
1594
3.02M
    while (node)
1595
3.00M
    {
1596
3.00M
        next = node->next;
1597
        
1598
3.00M
        if ( nodeIsBLOCKQUOTE(node) && node->implicit )
1599
4.00k
        {
1600
4.00k
            indent = 1;
1601
1602
8.14k
            while( HasOneChild(node) &&
1603
8.14k
                   nodeIsBLOCKQUOTE(node->content) &&
1604
4.14k
                   node->implicit)
1605
4.14k
            {
1606
4.14k
                ++indent;
1607
4.14k
                StripOnlyChild( doc, node );
1608
4.14k
            }
1609
1610
4.00k
            TY_(tmbsnprintf)(indent_buf, sizeof(indent_buf), "margin-left: %dem",
1611
4.00k
                             2*indent);
1612
1613
4.00k
            RenameElem( doc, node, TidyTag_DIV );
1614
4.00k
            TY_(AddStyleProperty)(doc, node, indent_buf );
1615
1616
4.00k
            if (node->content)
1617
3.83k
            {
1618
3.83k
                TY_(push)(stack, next);
1619
3.83k
                node = node->content;
1620
3.83k
                continue;
1621
3.83k
            }
1622
4.00k
        }
1623
2.99M
        else if (node->content)
1624
1.48M
        {
1625
1.48M
            TY_(push)(stack, next);
1626
1.48M
            node = node->content;
1627
1.48M
            continue;
1628
1.48M
        }
1629
1630
1.51M
        node = next ? next : TY_(pop)(stack);
1631
1.51M
    }
1632
19.1k
    TY_(freeStack)(stack);
1633
19.1k
}
1634
1635
1636
static Node* FindEnclosingCell( TidyDocImpl* ARG_UNUSED(doc), Node *node)
1637
0
{
1638
0
    Node *check;
1639
1640
0
    for ( check=node; check; check = check->parent )
1641
0
    {
1642
0
      if ( nodeIsTD(check) )
1643
0
        return check;
1644
0
    }
1645
0
    return NULL;
1646
0
}
1647
1648
/* node is <![if ...]> prune up to <![endif]> */
1649
static Node* PruneSection( TidyDocImpl* doc, Node *node )
1650
0
{
1651
0
    Lexer* lexer = doc->lexer;
1652
1653
0
    for (;;)
1654
0
    {
1655
0
        if (node == NULL)
1656
0
            return NULL;
1657
        
1658
0
        ctmbstr lexbuf = lexer->lexbuf + node->start;
1659
0
        if ( TY_(tmbstrncmp)(lexbuf, "if !supportEmptyParas", 21) == 0 )
1660
0
        {
1661
0
          Node* cell = FindEnclosingCell( doc, node );
1662
0
          if ( cell )
1663
0
          {
1664
            /* Need to put &nbsp; into cell so it doesn't look weird
1665
            */
1666
0
            Node* nbsp = TY_(NewLiteralTextNode)( lexer, "\240" );
1667
0
            assert( (byte)'\240' == (byte)160 );
1668
0
            TY_(InsertNodeBeforeElement)( node, nbsp );
1669
0
          }
1670
0
        }
1671
1672
        /* discard node and returns next, unless it is a text node */
1673
0
        if ( node->type == TextNode )
1674
0
            node = node->next;
1675
0
        else
1676
0
            node = TY_(DiscardElement)( doc, node );
1677
1678
0
        if (node == NULL)
1679
0
            return NULL;
1680
        
1681
0
        if (node->type == SectionTag)
1682
0
        {
1683
0
            if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0)
1684
0
            {
1685
0
                node = PruneSection( doc, node );
1686
0
                continue;
1687
0
            }
1688
1689
0
            if (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "endif", 5) == 0)
1690
0
            {
1691
0
                node = TY_(DiscardElement)( doc, node );
1692
0
                break;
1693
0
            }
1694
0
        }
1695
0
    }
1696
1697
0
    return node;
1698
0
}
1699
1700
void TY_(DropSections)( TidyDocImpl* doc, Node* node )
1701
0
{
1702
0
    Lexer* lexer = doc->lexer;
1703
0
    while (node)
1704
0
    {
1705
0
        if (node->type == SectionTag)
1706
0
        {
1707
            /* prune up to matching endif */
1708
0
            if ((TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if", 2) == 0) &&
1709
0
                (TY_(tmbstrncmp)(lexer->lexbuf + node->start, "if !vml", 7) != 0)) /* #444394 - fix 13 Sep 01 */
1710
0
            {
1711
0
                node = PruneSection( doc, node );
1712
0
                continue;
1713
0
            }
1714
1715
            /* discard others as well */
1716
0
            node = TY_(DiscardElement)( doc, node );
1717
0
            continue;
1718
0
        }
1719
1720
0
        if (node->content)
1721
0
            TY_(DropSections)( doc, node->content );
1722
1723
0
        node = node->next;
1724
0
    }
1725
0
}
1726
1727
static void PurgeWord2000Attributes( TidyDocImpl* doc, Node* node )
1728
0
{
1729
0
    AttVal *attr, *next, *prev = NULL;
1730
1731
0
    for ( attr = node->attributes; attr; attr = next )
1732
0
    {
1733
0
        next = attr->next;
1734
1735
        /* special check for class="Code" denoting pre text */
1736
        /* Pass thru user defined styles as HTML class names */
1737
0
        if (attrIsCLASS(attr))
1738
0
        {
1739
0
            if (AttrValueIs(attr, "Code") ||
1740
0
                 TY_(tmbstrncmp)(attr->value, "Mso", 3) != 0 )
1741
0
            {
1742
0
                prev = attr;
1743
0
                continue;
1744
0
            }
1745
0
        }
1746
1747
0
        if (attrIsCLASS(attr) ||
1748
0
            attrIsSTYLE(attr) ||
1749
0
            attrIsLANG(attr)  ||
1750
0
             ( (attrIsHEIGHT(attr) || attrIsWIDTH(attr)) &&
1751
0
               (nodeIsTD(node) || nodeIsTR(node) || nodeIsTH(node)) ) ||
1752
0
             (attr->attribute && TY_(tmbstrncmp)(attr->attribute, "x:", 2) == 0) )
1753
0
        {
1754
0
            if (prev)
1755
0
                prev->next = next;
1756
0
            else
1757
0
                node->attributes = next;
1758
1759
0
            TY_(FreeAttribute)( doc, attr );
1760
0
        }
1761
0
        else
1762
0
            prev = attr;
1763
0
    }
1764
0
}
1765
1766
/* Word2000 uses span excessively, so we strip span out */
1767
static Node* StripSpan( TidyDocImpl* doc, Node* span )
1768
0
{
1769
0
    Node *node, *prev = NULL, *content;
1770
1771
    /*
1772
     deal with span elements that have content
1773
     by splicing the content in place of the span
1774
     after having processed it
1775
    */
1776
1777
0
    TY_(CleanWord2000)( doc, span->content );
1778
0
    content = span->content;
1779
1780
0
    if (span->prev)
1781
0
        prev = span->prev;
1782
0
    else if (content)
1783
0
    {
1784
0
        node = content;
1785
0
        content = content->next;
1786
0
        TY_(RemoveNode)(node);
1787
0
        TY_(InsertNodeBeforeElement)(span, node);
1788
0
        prev = node;
1789
0
    }
1790
1791
0
    while (content)
1792
0
    {
1793
0
        node = content;
1794
0
        content = content->next;
1795
0
        TY_(RemoveNode)(node);
1796
0
        TY_(InsertNodeAfterElement)(prev, node);
1797
0
        prev = node;
1798
0
    }
1799
1800
0
    if (span->next == NULL)
1801
0
        span->parent->last = prev;
1802
1803
0
    node = span->next;
1804
0
    span->content = NULL;
1805
0
    TY_(DiscardElement)( doc, span );
1806
0
    return node;
1807
0
}
1808
1809
/* map non-breaking spaces to regular spaces */
1810
void TY_(NormalizeSpaces)(Lexer *lexer, Node *node)
1811
51.5k
{
1812
127k
    while ( node )
1813
76.0k
    {
1814
76.0k
        if ( node->content )
1815
44.7k
            TY_(NormalizeSpaces)( lexer, node->content );
1816
1817
76.0k
        if (TY_(nodeIsText)(node))
1818
13.4k
        {
1819
13.4k
            uint i, c;
1820
13.4k
            tmbstr p = lexer->lexbuf + node->start;
1821
1822
8.65M
            for (i = node->start; i < node->end; ++i)
1823
8.64M
            {
1824
8.64M
                c = (byte) lexer->lexbuf[i];
1825
1826
                /* look for UTF-8 multibyte character */
1827
8.64M
                if ( c > 0x7F )
1828
186k
                    i += TY_(GetUTF8)( lexer->lexbuf + i, &c );
1829
1830
8.64M
                if ( c == 160 )
1831
229
                    c = ' ';
1832
1833
8.64M
                p = TY_(PutUTF8)(p, c);
1834
8.64M
            }
1835
13.4k
            node->end = p - lexer->lexbuf;
1836
13.4k
        }
1837
1838
76.0k
        node = node->next;
1839
76.0k
    }
1840
51.5k
}
1841
1842
/* used to hunt for hidden preformatted sections */
1843
static Bool NoMargins(Node *node)
1844
0
{
1845
0
    AttVal *attval = TY_(AttrGetById)(node, TidyAttr_STYLE);
1846
1847
0
    if ( !AttrHasValue(attval) )
1848
0
        return no;
1849
1850
    /* search for substring "margin-top: 0" */
1851
0
    if (!TY_(tmbsubstr)(attval->value, "margin-top: 0"))
1852
0
        return no;
1853
1854
    /* search for substring "margin-bottom: 0" */
1855
0
    if (!TY_(tmbsubstr)(attval->value, "margin-bottom: 0"))
1856
0
        return no;
1857
1858
0
    return yes;
1859
0
}
1860
1861
/* does element have a single space as its content? */
1862
static Bool SingleSpace( Lexer* lexer, Node* node )
1863
0
{
1864
0
    if ( node->content )
1865
0
    {
1866
0
        node = node->content;
1867
1868
0
        if ( node->next != NULL )
1869
0
            return no;
1870
1871
0
        if ( node->type != TextNode )
1872
0
            return no;
1873
1874
0
        if ( (node->end - node->start) == 1 &&
1875
0
             lexer->lexbuf[node->start] == ' ' )
1876
0
            return yes;
1877
1878
0
        if ( (node->end - node->start) == 2 )
1879
0
        {
1880
0
            uint c = 0;
1881
0
            TY_(GetUTF8)( lexer->lexbuf + node->start, &c );
1882
0
            if ( c == 160 )
1883
0
                return yes;
1884
0
        }
1885
0
    }
1886
1887
0
    return no;
1888
0
}
1889
1890
/*
1891
 This is a major clean up to strip out all the extra stuff you get
1892
 when you save as web page from Word 2000. It doesn't yet know what
1893
 to do with VML tags, but these will appear as errors unless you
1894
 declare them as new tags, such as o:p which needs to be declared
1895
 as inline.
1896
*/
1897
void TY_(CleanWord2000)( TidyDocImpl* doc, Node *node)
1898
0
{
1899
    /* used to a list from a sequence of bulleted p's */
1900
0
    Lexer* lexer = doc->lexer;
1901
0
    Node* list = NULL;
1902
0
    AttVal *next_attr, *attval;
1903
1904
0
    while ( node )
1905
0
    {
1906
        /* get rid of Word's xmlns attributes */
1907
0
        if ( nodeIsHTML(node) )
1908
0
        {
1909
            /* check that it's a Word 2000 document */
1910
0
            if ( !TY_(IsWord2000) (doc) ) /* Is. #896 */
1911
0
                return;
1912
1913
            /* Output proprietary attributes to maintain errout compatibility
1914
             * with traditional Tidy. This is a result of moving all of the
1915
             * proprietary checks to near the end of the cleanup process,
1916
             * meaning this result would not ordinarily be displayed. 
1917
             */
1918
0
            attval = node->attributes;
1919
0
            while ( attval ) {
1920
0
                next_attr = attval->next;
1921
1922
                /* Issue #591 - take care of a NULL attribute, too. */
1923
0
                if ( !attval->attribute || ( strcmp(attval->attribute, "xmlns") != 0 ))
1924
0
                    TY_(ReportAttrError)(doc, node, attval, PROPRIETARY_ATTRIBUTE);
1925
0
                attval = next_attr;
1926
0
            }
1927
1928
0
            TY_(FreeAttrs)( doc, node );
1929
0
        }
1930
1931
        /* fix up preformatted sections by looking for a
1932
        ** sequence of paragraphs with zero top/bottom margin
1933
        */
1934
0
        if ( nodeIsP(node) )
1935
0
        {
1936
0
            if (NoMargins(node))
1937
0
            {
1938
0
                Node *pre, *next;
1939
0
                TY_(CoerceNode)(doc, node, TidyTag_PRE, no, yes);
1940
1941
0
                PurgeWord2000Attributes( doc, node );
1942
1943
0
                if (node->content)
1944
0
                    TY_(CleanWord2000)( doc, node->content );
1945
1946
0
                pre = node;
1947
0
                node = node->next;
1948
1949
                /* continue to strip p's */
1950
1951
0
                while ( nodeIsP(node) && NoMargins(node) )
1952
0
                {
1953
0
                    next = node->next;
1954
0
                    TY_(RemoveNode)(node);
1955
0
                    TY_(InsertNodeAtEnd)(pre, TY_(NewLineNode)(lexer));
1956
0
                    TY_(InsertNodeAtEnd)(pre, node);
1957
0
                    StripSpan( doc, node );
1958
0
                    node = next;
1959
0
                }
1960
1961
0
                if (node == NULL)
1962
0
                    break;
1963
0
            }
1964
0
        }
1965
1966
0
        if (node->tag && (node->tag->model & CM_BLOCK)
1967
0
            && SingleSpace(lexer, node))
1968
0
        {
1969
0
            node = StripSpan( doc, node );
1970
0
            continue;
1971
0
        }
1972
        /* discard Word's style verbiage */
1973
0
        if ( nodeIsSTYLE(node) || nodeIsMETA(node) ||
1974
0
             node->type == CommentTag )
1975
0
        {
1976
0
            node = TY_(DiscardElement)( doc, node );
1977
0
            continue;
1978
0
        }
1979
1980
        /* strip out all span and font tags Word scatters so liberally! */
1981
0
        if ( nodeIsSPAN(node) || nodeIsFONT(node) )
1982
0
        {
1983
0
            node = StripSpan( doc, node );
1984
0
            continue;
1985
0
        }
1986
1987
0
        if ( nodeIsLINK(node) )
1988
0
        {
1989
0
            AttVal *attr = TY_(AttrGetById)(node, TidyAttr_REL);
1990
1991
0
            if (AttrValueIs(attr, "File-List"))
1992
0
            {
1993
0
                node = TY_(DiscardElement)( doc, node );
1994
0
                continue;
1995
0
            }
1996
0
        }
1997
1998
        /* discards <o:p> which encodes the paragraph mark */
1999
0
        if ( node->tag && TY_(tmbstrcmp)(node->tag->name,"o:p")==0)
2000
0
        {
2001
            /* Output proprietary elements to maintain errout compatibility
2002
             * with traditional Tidy. This is a result of moving all of the
2003
             * proprietary checks to near the end of the cleanup process,
2004
             * meaning this result would not ordinarily be displayed.
2005
             */
2006
0
            Node* next;
2007
0
            TY_(Report)(doc, NULL, node, PROPRIETARY_ELEMENT);
2008
0
            DiscardContainer( doc, node, &next );
2009
0
            node = next;
2010
0
            continue;
2011
0
        }
2012
2013
        /* discard empty paragraphs */
2014
2015
0
        if ( node->content == NULL && nodeIsP(node) )
2016
0
        {
2017
            /*  Use the existing function to ensure consistency */
2018
0
            Node *next = TY_(TrimEmptyElement)( doc, node );
2019
0
            node = next;
2020
0
            continue;
2021
0
        }
2022
2023
0
        if ( nodeIsP(node) )
2024
0
        {
2025
0
            AttVal *attr, *atrStyle;
2026
            
2027
0
            attr = TY_(AttrGetById)(node, TidyAttr_CLASS);
2028
0
            atrStyle = TY_(AttrGetById)(node, TidyAttr_STYLE);
2029
            /*
2030
               (JES) Sometimes Word marks a list item with the following hokie syntax
2031
               <p class="MsoNormal" style="...;mso-list:l1 level1 lfo1;
2032
                translate these into <li>
2033
            */
2034
            /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
2035
            /* map <p class="MsoListNumber"> to <ol>...</ol> */
2036
0
            if ( AttrValueIs(attr, "MsoListBullet") ||
2037
0
                 AttrValueIs(attr, "MsoListNumber") ||
2038
0
                 AttrContains(atrStyle, "mso-list:") )
2039
0
            {
2040
0
                TidyTagId listType = TidyTag_UL;
2041
0
                if (AttrValueIs(attr, "MsoListNumber"))
2042
0
                    listType = TidyTag_OL;
2043
2044
0
                TY_(CoerceNode)(doc, node, TidyTag_LI, no, yes);
2045
2046
0
                if ( !list || TagId(list) != listType )
2047
0
                {
2048
0
                    const Dict* tag = TY_(LookupTagDef)( listType );
2049
0
                    list = TY_(InferredTag)(doc, tag->id);
2050
0
                    TY_(InsertNodeBeforeElement)(node, list);
2051
0
                }
2052
2053
0
                PurgeWord2000Attributes( doc, node );
2054
2055
0
                if ( node->content )
2056
0
                    TY_(CleanWord2000)( doc, node->content );
2057
2058
                /* remove node and append to contents of list */
2059
0
                TY_(RemoveNode)(node);
2060
0
                TY_(InsertNodeAtEnd)(list, node);
2061
0
                node = list;
2062
0
            }
2063
            /* map sequence of <p class="Code"> to <pre>...</pre> */
2064
0
            else if (AttrValueIs(attr, "Code"))
2065
0
            {
2066
0
                Node *br = TY_(NewLineNode)(lexer);
2067
0
                TY_(NormalizeSpaces)(lexer, node->content);
2068
2069
0
                if ( !list || TagId(list) != TidyTag_PRE )
2070
0
                {
2071
0
                    list = TY_(InferredTag)(doc, TidyTag_PRE);
2072
0
                    TY_(InsertNodeBeforeElement)(node, list);
2073
0
                }
2074
2075
                /* remove node and append to contents of list */
2076
0
                TY_(RemoveNode)(node);
2077
0
                TY_(InsertNodeAtEnd)(list, node);
2078
0
                StripSpan( doc, node );
2079
0
                TY_(InsertNodeAtEnd)(list, br);
2080
0
                node = list->next;
2081
0
            }
2082
0
            else
2083
0
                list = NULL;
2084
0
        }
2085
0
        else
2086
0
            list = NULL;
2087
2088
0
        if (!node)
2089
0
            return;
2090
2091
        /* strip out style and class attributes */
2092
0
        if (TY_(nodeIsElement)(node))
2093
0
            PurgeWord2000Attributes( doc, node );
2094
2095
0
        if (node->content)
2096
0
            TY_(CleanWord2000)( doc, node->content );
2097
2098
0
        node = node->next;
2099
0
    }
2100
0
}
2101
2102
Bool TY_(IsWord2000)( TidyDocImpl* doc )
2103
0
{
2104
0
    AttVal *attval;
2105
0
    Node *node, *head;
2106
0
    Node *html = TY_(FindHTML)( doc );
2107
2108
0
    if (html && TY_(GetAttrByName)(html, "xmlns:o"))
2109
0
        return yes;
2110
    
2111
    /* search for <meta name="GENERATOR" content="Microsoft ..."> */
2112
0
    head = TY_(FindHEAD)( doc );
2113
2114
0
    if (head)
2115
0
    {
2116
0
        for (node = head->content; node; node = node->next)
2117
0
        {
2118
0
            if ( !nodeIsMETA(node) )
2119
0
                continue;
2120
2121
0
            attval = TY_(AttrGetById)( node, TidyAttr_NAME );
2122
2123
0
            if ( !AttrValueIs(attval, "generator") )
2124
0
                continue;
2125
2126
0
            attval =  TY_(AttrGetById)( node, TidyAttr_CONTENT );
2127
2128
0
            if ( AttrContains(attval, "Microsoft") )
2129
0
                return yes;
2130
0
        }
2131
0
    }
2132
2133
0
    return no;
2134
0
}
2135
2136
/* where appropriate move object elements from head to body */
2137
void TY_(BumpObject)( TidyDocImpl* doc, Node *html )
2138
81.9k
{
2139
81.9k
    Node *node, *next, *head = NULL, *body = NULL;
2140
2141
81.9k
    if (!html)
2142
406
        return;
2143
2144
266k
    for ( node = html->content; node != NULL; node = node->next )
2145
185k
    {
2146
185k
        if ( nodeIsHEAD(node) )
2147
72.2k
            head = node;
2148
2149
185k
        if ( nodeIsBODY(node) )
2150
107k
            body = node;
2151
185k
    }
2152
2153
81.5k
    if ( head != NULL && body != NULL )
2154
72.2k
    {
2155
89.3k
        for (node = head->content; node != NULL; node = next)
2156
17.1k
        {
2157
17.1k
            next = node->next;
2158
2159
17.1k
            if ( nodeIsOBJECT(node) )
2160
5.95k
            {
2161
5.95k
                Node *child;
2162
5.95k
                Bool bump = no;
2163
2164
10.1k
                for (child = node->content; child != NULL; child = child->next)
2165
5.99k
                {
2166
                    /* bump to body unless content is param */
2167
5.99k
                    if ( (TY_(nodeIsText)(child) && !TY_(IsBlank)(doc->lexer, node))
2168
5.07k
                         || !nodeIsPARAM(child) )
2169
1.85k
                    {
2170
1.85k
                            bump = yes;
2171
1.85k
                            break;
2172
1.85k
                    }
2173
5.99k
                }
2174
2175
5.95k
                if ( bump )
2176
1.85k
                {
2177
1.85k
                    TY_(RemoveNode)( node );
2178
1.85k
                    TY_(InsertNodeAtStart)( body, node );
2179
1.85k
                }
2180
5.95k
            }
2181
17.1k
        }
2182
72.2k
    }
2183
81.5k
}
2184
2185
2186
/*\
2187
*  Issue #456 - Check meta charset
2188
*  1. if there is no meta charset, it adds one, according to doctype, no warning.
2189
*  2. if there is a meta charset, it moves it to the top if HEAD. Not sure this required?
2190
*  3. if it doesn't match the output encoding, and fix. Naybe no warning?
2191
*  4. if there are duplicates, discard them, with warning.
2192
\*/
2193
Bool TY_(TidyMetaCharset)(TidyDocImpl* doc)
2194
19.1k
{
2195
19.1k
    AttVal *charsetAttr;
2196
19.1k
    AttVal *contentAttr;
2197
19.1k
    AttVal *httpEquivAttr;
2198
19.1k
    Bool charsetFound = no;
2199
19.1k
    uint outenc = cfg(doc, TidyOutCharEncoding);
2200
19.1k
    ctmbstr enc = TY_(GetEncodingNameFromTidyId)(outenc);
2201
19.1k
    Node *currentNode;
2202
19.1k
    Node *head = TY_(FindHEAD)(doc);
2203
19.1k
    Node *metaTag;
2204
19.1k
    Node *prevNode;
2205
19.1k
    TidyBuffer buf;
2206
19.1k
    TidyBuffer charsetString;
2207
    /* tmbstr httpEquivAttrValue; */
2208
    /* tmbstr lcontent; */
2209
19.1k
    tmbstr newValue;
2210
19.1k
    Bool add_meta = cfgBool(doc, TidyMetaCharset);
2211
2212
    /* We can't do anything we don't have a head or encoding is NULL */
2213
19.1k
    if (!head || !enc || !TY_(tmbstrlen)(enc))
2214
0
        return no;
2215
19.1k
    if (outenc == RAW)
2216
0
        return no;
2217
19.1k
#ifndef NO_NATIVE_ISO2022_SUPPORT
2218
19.1k
    if (outenc == ISO2022)
2219
0
        return no;
2220
19.1k
#endif
2221
19.1k
    if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
2222
0
        return no; /* nothing to do here if showing body only */
2223
2224
19.1k
    tidyBufInit(&charsetString);
2225
    /* Set up the content test 'charset=value' */
2226
19.1k
    tidyBufClear(&charsetString);
2227
19.1k
    tidyBufAppend(&charsetString, "charset=", 8);
2228
19.1k
    tidyBufAppend(&charsetString, (char*)enc, TY_(tmbstrlen)(enc));
2229
19.1k
    tidyBufAppend(&charsetString, "\0", 1); /* zero terminate the buffer */
2230
    /* process the children of the head */
2231
    /* Issue #656 - guard against 'currentNode' being set NULL in loop */
2232
59.5k
    for (currentNode = head->content; currentNode; 
2233
40.4k
        currentNode = (currentNode ? currentNode->next : NULL))
2234
40.4k
    {
2235
40.4k
        if (!nodeIsMETA(currentNode))
2236
36.8k
            continue;   /* not a meta node */
2237
3.55k
        charsetAttr = attrGetCHARSET(currentNode);
2238
3.55k
        httpEquivAttr = attrGetHTTP_EQUIV(currentNode);
2239
3.55k
        if (!charsetAttr && !httpEquivAttr)
2240
1.55k
            continue;   /* has no charset attribute */
2241
                        /*
2242
                        Meta charset comes in quite a few flavors:
2243
                        1. <meta charset="value"> - expected for (X)HTML5.
2244
                        */
2245
2.00k
        if (charsetAttr && !httpEquivAttr)
2246
354
        {
2247
            /* we already found one, so remove the rest. */
2248
354
            if (charsetFound || !charsetAttr->value)
2249
341
            {
2250
341
                prevNode = currentNode->prev;
2251
341
                TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2252
341
                TY_(DiscardElement)(doc, currentNode);
2253
341
                currentNode = prevNode;
2254
341
                continue;
2255
341
            }
2256
13
            charsetFound = yes;
2257
            /* Fix mismatched attribute value */
2258
13
            if (TY_(tmbstrcasecmp)(charsetAttr->value, enc) != 0)
2259
13
            {
2260
13
                newValue = (tmbstr)TidyDocAlloc(doc, TY_(tmbstrlen)(enc) + 1);   /* allocate + 1 for 0 */
2261
13
                TY_(tmbstrcpy)(newValue, enc);
2262
                /* Note: previously http-equiv had been modified, without warning
2263
                in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2264
                */
2265
13
                TY_(ReportAttrError)(doc, currentNode, charsetAttr, ATTRIBUTE_VALUE_REPLACED);
2266
13
                TidyDocFree(doc, charsetAttr->value);   /* free current value */
2267
13
                charsetAttr->value = newValue;
2268
13
            }
2269
            /* Make sure it's the first element. */
2270
13
            if (currentNode != head->content->next) {
2271
12
                TY_(RemoveNode)(currentNode);
2272
12
                TY_(InsertNodeAtStart)(head, currentNode);
2273
12
            }
2274
13
            continue;
2275
354
        }
2276
        /*
2277
        2. <meta http-equiv="content-type" content="text/html; charset=UTF-8">
2278
        expected for HTML4. This is normally ok - but can clash.
2279
        */
2280
1.64k
        if (httpEquivAttr && !charsetAttr)
2281
1.44k
        {
2282
1.44k
            contentAttr = TY_(AttrGetById)(currentNode, TidyAttr_CONTENT);
2283
1.44k
            if (!contentAttr)
2284
396
                continue;   /* has no 'content' attribute */
2285
1.04k
            if (!httpEquivAttr->value)
2286
338
            {
2287
338
                prevNode = currentNode->prev;
2288
338
                TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2289
338
                TY_(DiscardElement)(doc, currentNode);
2290
338
                currentNode = prevNode;
2291
338
                continue;
2292
338
            }
2293
            /* httpEquivAttrValue = TY_(tmbstrtolower)(httpEquivAttr->value); */
2294
711
            if (TY_(tmbstrcasecmp)(httpEquivAttr->value, (tmbstr) "content-type") != 0)
2295
332
                continue;   /* is not 'content-type' */
2296
379
            if (!contentAttr->value)
2297
18
            {
2298
18
                continue; /* has no 'content' attribute has NO VALUE! */
2299
18
            }
2300
            /* check encoding matches
2301
            If a miss-match found here, fix it. previous silently done
2302
            in void TY_(VerifyHTTPEquiv)(TidyDocImpl* doc, Node *head)
2303
            lcontent = TY_(tmbstrtolower)(contentAttr->value);
2304
            */
2305
361
            if (TY_(tmbstrcasecmp)(contentAttr->value, (ctmbstr)charsetString.bp) == 0)
2306
82
            {
2307
                /* we already found one, so remove the rest. */
2308
82
                if (charsetFound)
2309
75
                {
2310
75
                    prevNode = currentNode->prev;
2311
75
                    TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2312
75
                    TY_(DiscardElement)(doc, currentNode);
2313
75
                    currentNode = prevNode;
2314
75
                    continue;
2315
75
                }
2316
7
                charsetFound = yes;
2317
7
            }
2318
279
            else
2319
279
            {
2320
                /* fix a mismatch */
2321
279
                if (charsetFound)
2322
268
                {
2323
268
                    prevNode = currentNode->prev;
2324
268
                    TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2325
268
                    TY_(DiscardElement)(doc, currentNode);
2326
268
                    currentNode = prevNode;
2327
268
                }
2328
11
                else
2329
11
                {
2330
                    /* correct the content */
2331
11
                    newValue = (tmbstr)TidyDocAlloc(doc, 19 + TY_(tmbstrlen)(enc) + 1);
2332
11
                    TY_(tmbstrcpy)(newValue, "text/html; charset=");
2333
11
                    TY_(tmbstrcpy)(newValue + 19, enc);
2334
11
                    if (cfgBool(doc, TidyShowMetaChange))   /* Issue #456 - backward compatibility only */
2335
0
                        TY_(ReportAttrError)(doc, currentNode, contentAttr, ATTRIBUTE_VALUE_REPLACED);
2336
11
                    TidyDocFree(doc, contentAttr->value);
2337
11
                    contentAttr->value = newValue;
2338
11
                    charsetFound = yes;
2339
11
                }
2340
279
            }
2341
286
            continue;
2342
361
        }
2343
        /*
2344
        3. <meta charset="utf-8" http-equiv="Content-Type" content="...">
2345
        This is generally bad. Discard and warn.
2346
        */
2347
204
        if (httpEquivAttr && charsetAttr)
2348
204
        {
2349
            /* printf("WARN ABOUT HTTP EQUIV AND CHARSET ATTR! \n"); */
2350
204
            prevNode = currentNode->prev;
2351
204
            TY_(Report)(doc, head, currentNode, DISCARDING_UNEXPECTED);
2352
204
            TY_(DiscardElement)(doc, currentNode);
2353
204
            currentNode = prevNode;
2354
204
        }
2355
204
    }
2356
2357
    /* completed head scan - add appropriate meta - if 'yes' and none exists */
2358
19.1k
    if (add_meta && !charsetFound)
2359
0
    {
2360
        /* add appropriate meta charset tag - no warning */
2361
0
        metaTag = TY_(InferredTag)(doc, TidyTag_META);
2362
0
        switch (TY_(HTMLVersion)(doc))
2363
0
        {
2364
0
        case HT50:
2365
0
        case XH50:
2366
0
            TY_(AddAttribute)(doc, metaTag, "charset", enc);
2367
0
            break;
2368
0
        default:
2369
0
            tidyBufInit(&buf);
2370
0
            tidyBufAppend(&buf, "text/html; ", 11);
2371
0
            tidyBufAppend(&buf, charsetString.bp, TY_(tmbstrlen)((ctmbstr)charsetString.bp));
2372
0
            tidyBufAppend(&buf, "\0", 1);   /* zero terminate the buffer */
2373
0
            TY_(AddAttribute)(doc, metaTag, "http-equiv", "Content-Type"); /* add 'http-equiv' const. */
2374
0
            TY_(AddAttribute)(doc, metaTag, "content", (char*)buf.bp);  /* add 'content="<enc>"' */
2375
0
            tidyBufFree(&buf);
2376
0
        }
2377
0
        TY_(InsertNodeAtStart)(head, metaTag);
2378
0
        TY_(Report)(doc, metaTag, head, ADDED_MISSING_CHARSET); /* actually just 'Info:' */
2379
0
    }
2380
19.1k
    tidyBufFree(&charsetString);
2381
19.1k
    return yes;
2382
19.1k
}
2383
2384
2385
void TY_(DropComments)(TidyDocImpl* doc, Node* node)
2386
0
{
2387
0
    Node* next;
2388
2389
0
    while (node)
2390
0
    {
2391
0
        next = node->next;
2392
2393
0
        if (node->type == CommentTag)
2394
0
        {
2395
0
            TY_(RemoveNode)(node);
2396
0
            TY_(FreeNode)(doc, node);
2397
0
            node = next;
2398
0
            continue;
2399
0
        }
2400
2401
0
        if (node->content)
2402
0
            TY_(DropComments)(doc, node->content);
2403
2404
0
        node = next;
2405
0
    }
2406
0
}
2407
2408
void TY_(DropFontElements)(TidyDocImpl* doc, Node* node, Node **ARG_UNUSED(pnode))
2409
0
{
2410
0
    Node* next;
2411
2412
0
    while (node)
2413
0
    {
2414
0
        next = node->next;
2415
2416
0
        if (nodeIsFONT(node))
2417
0
        {
2418
0
            DiscardContainer(doc, node, &next);
2419
0
            node = next;
2420
0
            continue;
2421
0
        }
2422
2423
0
        if (node->content)
2424
0
            TY_(DropFontElements)(doc, node->content, &next);
2425
2426
0
        node = next;
2427
0
    }
2428
0
}
2429
2430
void TY_(WbrToSpace)(TidyDocImpl* doc, Node* node)
2431
0
{
2432
0
    Node* next;
2433
2434
0
    while (node)
2435
0
    {
2436
0
        next = node->next;
2437
2438
0
        if (nodeIsWBR(node))
2439
0
        {
2440
0
            Node* text;
2441
0
            text = TY_(NewLiteralTextNode)(doc->lexer, " ");
2442
0
            TY_(InsertNodeAfterElement)(node, text);
2443
0
            TY_(RemoveNode)(node);
2444
0
            TY_(FreeNode)(doc, node);
2445
0
            node = next;
2446
0
            continue;
2447
0
        }
2448
2449
0
        if (node->content)
2450
0
            TY_(WbrToSpace)(doc, node->content);
2451
2452
0
        node = next;
2453
0
   }
2454
0
}
2455
2456
/*
2457
  Filters from Word and PowerPoint often use smart
2458
  quotes resulting in character codes between 128
2459
  and 159. Unfortunately, the corresponding HTML 4.0
2460
  entities for these are not widely supported. The
2461
  following converts dashes and quotation marks to
2462
  the nearest ASCII equivalent. My thanks to
2463
  Andrzej Novosiolov for his help with this code.
2464
2465
  Note: The old code in the pretty printer applied
2466
  this to all node types and attribute values while
2467
  this routine applies it only to text nodes. First,
2468
  Microsoft Office products rarely put the relevant
2469
  characters into these tokens, second support for
2470
  them is much better now and last but not least, it
2471
  can be harmful to replace these characters since
2472
  US-ASCII quote marks are often used as syntax
2473
  characters, a simple
2474
2475
    <a onmouseover="alert('&#x2018;')">...</a>
2476
2477
  would be broken if the U+2018 is replaced by "'".
2478
  The old code would neither take care whether the
2479
  quote mark is already used as delimiter,
2480
2481
    <p title='&#x2018;'>...</p>
2482
2483
  got
2484
  
2485
    <p title='''>...</p>
2486
2487
  Since browser support is much better nowadays and
2488
  high-quality typography is better than ASCII it'd
2489
  be probably a good idea to drop the feature...
2490
*/
2491
void TY_(DowngradeTypography)(TidyDocImpl* doc, Node* node)
2492
0
{
2493
0
    Node* next;
2494
0
    Lexer* lexer = doc->lexer;
2495
2496
0
    while (node)
2497
0
    {
2498
0
        next = node->next;
2499
2500
0
        if (TY_(nodeIsText)(node))
2501
0
        {
2502
0
            uint i, c;
2503
0
            tmbstr p = lexer->lexbuf + node->start;
2504
2505
0
            for (i = node->start; i < node->end; ++i)
2506
0
            {
2507
0
                c = (unsigned char) lexer->lexbuf[i];
2508
2509
0
                if (c > 0x7F)
2510
0
                    i += TY_(GetUTF8)(lexer->lexbuf + i, &c);
2511
2512
0
                if (c >= 0x2013 && c <= 0x201E)
2513
0
                {
2514
0
                    switch (c)
2515
0
                    {
2516
0
                    case 0x2013: /* en dash */
2517
0
                    case 0x2014: /* em dash */
2518
0
                        c = '-';
2519
0
                        break;
2520
0
                    case 0x2018: /* left single  quotation mark */
2521
0
                    case 0x2019: /* right single quotation mark */
2522
0
                    case 0x201A: /* single low-9 quotation mark */
2523
0
                        c = '\'';
2524
0
                        break;
2525
0
                    case 0x201C: /* left double  quotation mark */
2526
0
                    case 0x201D: /* right double quotation mark */
2527
0
                    case 0x201E: /* double low-9 quotation mark */
2528
0
                        c = '"';
2529
0
                        break;
2530
0
                    }
2531
0
                }
2532
2533
0
                p = TY_(PutUTF8)(p, c);
2534
0
            }
2535
2536
0
            node->end = p - lexer->lexbuf;
2537
0
        }
2538
2539
0
        if (node->content)
2540
0
            TY_(DowngradeTypography)(doc, node->content);
2541
2542
0
        node = next;
2543
0
    }
2544
0
}
2545
2546
void TY_(ReplacePreformattedSpaces)(TidyDocImpl* doc, Node* node)
2547
1.45M
{
2548
1.45M
    Node* next;
2549
2550
4.40M
    while (node)
2551
2.95M
    {
2552
2.95M
        next = node->next;
2553
2554
2.95M
        if (node->tag && node->tag->parser == TY_(ParsePre))
2555
6.83k
        {
2556
6.83k
            TY_(NormalizeSpaces)(doc->lexer, node->content);
2557
6.83k
            node = next;
2558
6.83k
            continue;
2559
6.83k
        }
2560
2561
2.94M
        if (node->content)
2562
1.43M
            TY_(ReplacePreformattedSpaces)(doc, node->content);
2563
2564
2.94M
        node = next;
2565
2.94M
    }
2566
1.45M
}
2567
2568
void TY_(ConvertCDATANodes)(TidyDocImpl* doc, Node* node)
2569
0
{
2570
0
    Node* next;
2571
2572
0
    while (node)
2573
0
    {
2574
0
        next = node->next;
2575
2576
0
        if (node->type == CDATATag)
2577
0
            node->type = TextNode;
2578
2579
0
        if (node->content)
2580
0
            TY_(ConvertCDATANodes)(doc, node->content);
2581
2582
0
        node = next;
2583
0
    }
2584
0
}
2585
2586
/*
2587
  FixLanguageInformation ensures that the document contains (only)
2588
  the attributes for language information desired by the output
2589
  document type. For example, for XHTML 1.0 documents both
2590
  'xml:lang' and 'lang' are desired, for XHTML 1.1 only 'xml:lang'
2591
  is desired and for HTML 4.01 only 'lang' is desired.
2592
*/
2593
void TY_(FixLanguageInformation)(TidyDocImpl* doc, Node* node, Bool wantXmlLang, Bool wantLang)
2594
19.1k
{
2595
19.1k
    Stack *stack = TY_(newStack)(doc, 16);
2596
19.1k
    Node* next;
2597
2598
3.03M
    while (node)
2599
3.01M
    {
2600
3.01M
        next = node->next;
2601
2602
        /* todo: report modifications made here to the report system */
2603
2604
3.01M
        if (TY_(nodeIsElement)(node))
2605
1.63M
        {
2606
1.63M
            AttVal* lang = TY_(AttrGetById)(node, TidyAttr_LANG);
2607
1.63M
            AttVal* xmlLang = TY_(AttrGetById)(node, TidyAttr_XML_LANG);
2608
2609
1.63M
            if (lang && xmlLang)
2610
441
            {
2611
                /*
2612
                  todo: check whether both attributes are in sync,
2613
                  here or elsewhere, where elsewhere is probably
2614
                  preferable.
2615
                  AD - March 2005: not mandatory according the standards.
2616
                */
2617
441
            }
2618
1.63M
            else if (lang && wantXmlLang)
2619
3.12k
            {
2620
3.12k
                if (TY_(NodeAttributeVersions)( node, TidyAttr_XML_LANG )
2621
3.12k
                    & doc->lexer->versionEmitted)
2622
2.86k
                    TY_(RepairAttrValue)(doc, node, "xml:lang", lang->value);
2623
3.12k
            }
2624
1.63M
            else if (xmlLang && wantLang)
2625
8.21k
            {
2626
8.21k
                if (TY_(NodeAttributeVersions)( node, TidyAttr_LANG )
2627
8.21k
                    & doc->lexer->versionEmitted)
2628
7.93k
                    TY_(RepairAttrValue)(doc, node, "lang", xmlLang->value);
2629
8.21k
            }
2630
2631
1.63M
            if (lang && !wantLang)
2632
0
                TY_(RemoveAttribute)(doc, node, lang);
2633
            
2634
1.63M
            if (xmlLang && !wantXmlLang)
2635
0
                TY_(RemoveAttribute)(doc, node, xmlLang);
2636
1.63M
        }
2637
2638
3.01M
        if (node->content)
2639
1.48M
        {
2640
1.48M
            TY_(push)(stack, next);
2641
1.48M
            node = node->content;
2642
1.48M
            continue;
2643
1.48M
        }
2644
2645
1.52M
        node = next ? next : TY_(pop)(stack);
2646
1.52M
    }
2647
19.1k
    TY_(freeStack)(stack);
2648
19.1k
}
2649
2650
/*
2651
  Set/fix/remove <html xmlns='...'>
2652
*/
2653
void TY_(FixXhtmlNamespace)(TidyDocImpl* doc, Bool wantXmlns)
2654
19.1k
{
2655
19.1k
    Node* html = TY_(FindHTML)(doc);
2656
19.1k
    AttVal* xmlns;
2657
2658
19.1k
    if (!html)
2659
0
        return;
2660
2661
19.1k
    xmlns = TY_(AttrGetById)(html, TidyAttr_XMLNS);
2662
2663
19.1k
    if (wantXmlns)
2664
19.1k
    {
2665
19.1k
        if (!AttrValueIs(xmlns, XHTML_NAMESPACE))
2666
19.1k
            TY_(RepairAttrValue)(doc, html, "xmlns", XHTML_NAMESPACE);
2667
19.1k
    }
2668
0
    else if (xmlns)
2669
0
    {
2670
0
        TY_(RemoveAttribute)(doc, html, xmlns);
2671
0
    }
2672
19.1k
}
2673
2674
/*
2675
  ...
2676
*/
2677
void TY_(FixAnchors)(TidyDocImpl* doc, Node *node, Bool wantName, Bool wantId)
2678
19.1k
{
2679
19.1k
    Stack *stack = TY_(newStack)(doc, 16);
2680
19.1k
    Node* next;
2681
2682
3.03M
    while (node)
2683
3.01M
    {
2684
3.01M
        next = node->next;
2685
2686
3.01M
        if (TY_(IsAnchorElement)(doc, node))
2687
163k
        {
2688
163k
            AttVal *name = TY_(AttrGetById)(node, TidyAttr_NAME);
2689
163k
            AttVal *id = TY_(AttrGetById)(node, TidyAttr_ID);
2690
163k
            Bool hadName = name!=NULL;
2691
163k
            Bool hadId = id!=NULL;
2692
163k
            Bool IdEmitted = no;
2693
163k
            Bool NameEmitted = no;
2694
2695
            /* todo: how are empty name/id attributes handled? */
2696
2697
163k
            if (name && id)
2698
1.63k
            {
2699
1.63k
                Bool NameHasValue = AttrHasValue(name);
2700
1.63k
                Bool IdHasValue = AttrHasValue(id);
2701
1.63k
                if ( (NameHasValue != IdHasValue) ||
2702
1.41k
                     (NameHasValue && IdHasValue &&
2703
1.14k
                     TY_(tmbstrcmp)(name->value, id->value) != 0 ) )
2704
514
                    TY_(ReportAttrError)( doc, node, name, ID_NAME_MISMATCH);
2705
1.63k
            }
2706
162k
            else if (name && wantId)
2707
34.7k
            {
2708
34.7k
                if (TY_(NodeAttributeVersions)( node, TidyAttr_ID )
2709
34.7k
                    & doc->lexer->versionEmitted)
2710
34.2k
                {
2711
34.2k
                    if (TY_(IsValidHTMLID)(name->value))
2712
31.0k
                    {
2713
31.0k
                        TY_(RepairAttrValue)(doc, node, "id", name->value);
2714
31.0k
                        IdEmitted = yes;
2715
31.0k
                    }
2716
3.20k
                    else
2717
3.20k
                        TY_(ReportAttrError)(doc, node, name, INVALID_XML_ID);
2718
34.2k
                 }
2719
34.7k
            }
2720
127k
            else if (id && wantName)
2721
2.25k
            {
2722
2.25k
                if (TY_(NodeAttributeVersions)( node, TidyAttr_NAME )
2723
2.25k
                    & doc->lexer->versionEmitted)
2724
1.09k
                {
2725
                    /* todo: do not assume id is valid */
2726
1.09k
                    TY_(RepairAttrValue)(doc, node, "name", id->value);
2727
1.09k
                    NameEmitted = yes;
2728
1.09k
                }
2729
2.25k
            }
2730
2731
163k
            if (id && !wantId
2732
                /* make sure that Name has been emitted if requested */
2733
0
                && (hadName || !wantName || NameEmitted) ) {
2734
0
                if (!wantId && !wantName)
2735
0
                    TY_(RemoveAnchorByNode)(doc, id->value, node);
2736
0
                TY_(RemoveAttribute)(doc, node, id);
2737
0
            }
2738
2739
163k
            if (name && !wantName
2740
                /* make sure that Id has been emitted if requested */
2741
0
                && (hadId || !wantId || IdEmitted) ) {
2742
0
                if (!wantId && !wantName)
2743
0
                    TY_(RemoveAnchorByNode)(doc, name->value, node);
2744
0
                TY_(RemoveAttribute)(doc, node, name);
2745
0
            }
2746
163k
        }
2747
2748
3.01M
        if (node->content)
2749
1.48M
        {
2750
1.48M
            TY_(push)(stack, next);
2751
1.48M
            node = node->content;
2752
1.48M
            continue;
2753
1.48M
        }
2754
2755
1.52M
        node = next ? next : TY_(pop)(stack);
2756
1.52M
    }
2757
19.1k
    TY_(freeStack)(stack);
2758
19.1k
}
2759
2760
/* Issue #567 - move style elements from body to head 
2761
 * ==================================================
2762
 */
2763
static void StyleToHead(TidyDocImpl* doc, Node *head, Node *node, Bool fix, int indent)
2764
18.9k
{
2765
18.9k
    Stack *stack = TY_(newStack)(doc, 16);
2766
18.9k
    Node *next;
2767
    
2768
2.67M
    while (node)
2769
2.65M
    {
2770
2.65M
        next = node->next;
2771
        
2772
2.65M
        if (nodeIsSTYLE(node))
2773
1.74k
        {
2774
1.74k
            if (fix)
2775
1.74k
            {
2776
1.74k
                TY_(RemoveNode)(node); /* unhook style node from body */
2777
1.74k
                TY_(InsertNodeAtEnd)(head, node);   /* add to end of head */
2778
1.74k
                TY_(Report)(doc, node, head, MOVED_STYLE_TO_HEAD); /* report move */
2779
1.74k
            }
2780
0
            else
2781
0
            {
2782
0
                TY_(Report)(doc, node, head, FOUND_STYLE_IN_BODY);
2783
0
            }
2784
1.74k
        }
2785
2.65M
        else if (node->content)
2786
1.40M
        {
2787
1.40M
            TY_(push)(stack, next);
2788
1.40M
            node = node->content;
2789
1.40M
            indent++;
2790
1.40M
            continue;
2791
1.40M
        }
2792
        
2793
1.25M
        if (next)
2794
1.16M
            node = next;
2795
95.2k
        else
2796
95.2k
        {
2797
95.2k
            node = TY_(pop)(stack);
2798
95.2k
            indent--;
2799
95.2k
        }
2800
1.25M
    }
2801
18.9k
    TY_(freeStack)(stack);
2802
18.9k
}
2803
2804
2805
void TY_(CleanStyle)(TidyDocImpl* doc, Node *html)
2806
19.1k
{
2807
19.1k
    Node *head = NULL, *body = NULL;
2808
19.1k
    Bool fix = cfgBool(doc, TidyStyleTags);
2809
2810
19.1k
    if (!html)
2811
0
        return; /* oops, not given a start node */
2812
2813
19.1k
    head = TY_(FindHEAD)( doc );
2814
19.1k
    body = TY_(FindBody)( doc );
2815
2816
19.1k
    if ((head != NULL) && (body != NULL))
2817
18.9k
    {
2818
18.9k
    StyleToHead(doc, head, body, fix, 0); /* found head and body */
2819
18.9k
    }
2820
19.1k
}
2821
/* ==================================================
2822
 */
2823
2824
/*
2825
 * CleanHead - clean the head node, if it exists, and we
2826
 * are going to show it in the output.
2827
 * Issue #692 - Remove multiple title elements
2828
 */
2829
void TY_(CleanHead)(TidyDocImpl* doc)
2830
19.1k
{
2831
19.1k
    Node *head, *node, *next;
2832
19.1k
    uint titles = 0;
2833
19.1k
    if (cfgAutoBool(doc, TidyBodyOnly) == TidyYesState)
2834
0
        return; /* not going to show head, so forget it */
2835
19.1k
    head = TY_(FindHEAD)(doc);
2836
19.1k
    if (!head)
2837
0
        return;
2838
19.1k
    node = head->content;
2839
77.4k
    while (node)
2840
58.3k
    {
2841
58.3k
        next = node->next;  /* get any 'next' */
2842
58.3k
        if (nodeIsTITLE(node))
2843
25.6k
        {
2844
25.6k
            titles++;
2845
25.6k
            if (titles > 1)
2846
6.52k
            {
2847
6.52k
                TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED);
2848
6.52k
                TY_(DiscardElement)(doc, node); /* delete this node */
2849
6.52k
            }
2850
25.6k
        }
2851
58.3k
        node = next;
2852
58.3k
    }
2853
19.1k
}
2854
2855
/*
2856
 * local variables:
2857
 * mode: c
2858
 * indent-tabs-mode: nil
2859
 * c-basic-offset: 4
2860
 * eval: (c-set-offset 'substatement-open 0)
2861
 * end:
2862
 */