Coverage Report

Created: 2026-05-30 06:47

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/tidy-html5/src/gdoc.c
Line
Count
Source
1
/*
2
  gdoc.c -- clean up misuse of presentation markup in HTML exported by Google Docs
3
4
  (c) 1998-2008 (W3C) MIT, ERCIM, Keio University
5
  See tidy.h for the copyright notice.
6
7
  Filters from other formats such as Microsoft Word
8
  often make excessive use of presentation markup such
9
  as font tags, B, I, and the align attribute. By applying
10
  a set of production rules, it is straightforward to
11
  transform this to use CSS.
12
13
  Some rules replace some of the children of an element by
14
  style properties on the element, e.g.
15
16
  <p><b>...</b></p> -> <p style="font-weight: bold">...</p>
17
18
  Such rules are applied to the element's content and then
19
  to the element itself until no more of the rules apply.
20
  Having applied all the rules to an element, it will have
21
  a style attribute with one or more properties. 
22
23
  Other rules strip the element they apply to, replacing
24
  it by style properties on the contents, e.g.
25
  
26
  <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">...
27
      
28
  These rules are applied to an element before processing
29
  its content and replace the current element by the first
30
  element in the exposed content.
31
32
  After applying both sets of rules, you can replace the
33
  style attribute by a class value and style rule in the
34
  document head. To support this, an association of styles
35
  and class names is built.
36
37
  A naive approach is to rely on string matching to test
38
  when two property lists are the same. A better approach
39
  would be to first sort the properties before matching.
40
41
*/
42
43
#include <stdio.h>
44
#include <stdlib.h>
45
#include <string.h>
46
47
#include "tidy-int.h"
48
#include "gdoc.h"
49
#include "lexer.h"
50
#include "parser.h"
51
#include "tags.h"
52
#include "attrs.h"
53
#include "message.h"
54
#include "tmbstr.h"
55
#include "utf8.h"
56
57
/*
58
  Extricate "element", replace it by its content and delete it.
59
*/
60
static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode)
61
441
{
62
441
    if (element->content)
63
441
    {
64
441
        Node *node, *parent = element->parent;
65
66
441
        element->last->next = element->next;
67
68
441
        if (element->next)
69
207
        {
70
207
            element->next->prev = element->last;
71
207
        }
72
234
        else
73
234
            parent->last = element->last;
74
75
441
        if (element->prev)
76
186
        {
77
186
            element->content->prev = element->prev;
78
186
            element->prev->next = element->content;
79
186
        }
80
255
        else
81
255
            parent->content = element->content;
82
83
19.3k
        for (node = element->content; node; node = node->next)
84
18.8k
            node->parent = parent;
85
86
441
        *pnode = element->content;
87
88
441
        element->next = element->content = NULL;
89
441
        TY_(FreeNode)(doc, element);
90
441
    }
91
0
    else
92
0
    {
93
0
        *pnode = TY_(DiscardElement)(doc, element);
94
0
    }
95
441
}
96
97
/*
  Walk the tree below "node" and apply the Google Docs clean-up rules
  to every element found.  Only the content of "node" is processed, so
  "node" itself is never removed.

    - <style> elements are discarded
    - <p> elements without content are discarded
    - <span> elements are replaced by their content
    - <a> elements without content are discarded; the value of their
      "name" attribute (or, failing that, "id") is set as the "id" of
      the parent element
    - any other element loses its "class" attribute and its children
      are visited in turn

  The walk is iterative: before descending into an element, the sibling
  to resume from is pushed on an explicit stack.
*/
static void CleanNode( TidyDocImpl* doc, Node *node )
{
    Stack *stack;
    Node *child, *next;

    /* Nothing to walk: return before allocating the stack so that it
    ** cannot be leaked.
    */
    if ( !node->content )
        return;

    stack = TY_(newStack)(doc, 16);
    child = node->content;

    while (child)
    {
        /* Remember the sibling now; child may be discarded below. */
        next = child->next;

        if (TY_(nodeIsElement)(child))
        {
            if (nodeIsSTYLE(child))
            {
                /* This must be part of the else-if chain: once the
                ** element has been discarded, child must not be
                ** inspected or descended into again.
                */
                TY_(DiscardElement)(doc, child);
            }
            else if (nodeIsP(child) && !child->content)
            {
                TY_(DiscardElement)(doc, child);
            }
            else if (nodeIsSPAN(child))
            {
                /* next is updated to the node the walk continues from */
                DiscardContainer( doc, child, &next);
            }
            else if (nodeIsA(child) && !child->content)
            {
                AttVal *id = TY_(GetAttrByName)( child, "name" );
                /* Recent Google Docs is using "id" instead of "name" in
                ** the exported html.
                */
                if (!id)
                    id = TY_(GetAttrByName)( child, "id" );

                if (id)
                    TY_(RepairAttrValue)( doc, child->parent, "id", id->value );

                TY_(DiscardElement)(doc, child);
            }
            else
            {
                if (child->attributes)
                    TY_(DropAttrByName)( doc, child, "class" );

                /* Only descend when there is something to descend
                ** into.  Stepping into the NULL content of a childless
                ** element would end the loop and leave the remaining
                ** siblings and the pending stack entries unprocessed.
                */
                if (child->content)
                {
                    TY_(push)(stack,next);
                    child = child->content;
                    continue;
                }
            }
        }

        /* Move on to the sibling, or resume at an outer level. */
        child = next ? next : TY_(pop)(stack);
    }

    TY_(freeStack)(stack);
}
145
146
/* insert meta element to force browser to recognize doc as UTF8 */
147
static void SetUTF8( TidyDocImpl* doc )
148
848
{
149
848
    Node *head = TY_(FindHEAD)( doc );
150
151
848
    if (head)
152
848
    {
153
848
        Node *node = TY_(InferredTag)(doc, TidyTag_META);
154
848
        TY_(AddAttribute)( doc, node, "http-equiv", "Content-Type" );
155
848
        TY_(AddAttribute)( doc, node, "content", "text/html; charset=UTF-8" );
156
848
        TY_(InsertNodeAtStart)( head, node );
157
848
    }
158
848
}
159
160
/* clean html exported by Google Docs
161
162
    - strip the script element, as the style sheet is a mess
163
    - strip class attributes
164
    - strip span elements, leaving their content in place
165
    - replace <a name=...></a> by id on parent element
166
    - strip empty <p> elements
167
*/
168
void TY_(CleanGoogleDocument)( TidyDocImpl* doc )
169
848
{
170
    /* placeholder.  CleanTree()/CleanNode() will not
171
    ** zap root element 
172
    */
173
848
    CleanNode( doc, &doc->root );
174
848
    SetUTF8( doc );
175
848
}
176
177
/*
178
 * local variables:
179
 * mode: c
180
 * indent-tabs-mode: nil
181
 * c-basic-offset: 4
182
 * eval: (c-set-offset 'substatement-open 0)
183
 * end:
184
 */