/src/tidy-html5/src/gdoc.c
Line | Count | Source |
1 | | /* |
2 | | gdoc.c -- clean up misuse of presentation markup |
3 | | |
4 | | (c) 1998-2008 (W3C) MIT, ERCIM, Keio University |
5 | | See tidy.h for the copyright notice. |
6 | | |
7 | | Filters from other formats such as Microsoft Word |
8 | | often make excessive use of presentation markup such |
9 | | as font tags, B, I, and the align attribute. By applying |
10 | | a set of production rules, it is straightforward to |
11 | | transform this to use CSS. |
12 | | |
13 | | Some rules replace some of the children of an element by |
14 | | style properties on the element, e.g. |
15 | | |
16 | | <p><b>...</b></p> -> <p style="font-weight: bold">...</p> |
17 | | |
18 | | Such rules are applied to the element's content and then |
19 | | to the element itself until no more rules apply. |
20 | | Having applied all the rules to an element, it will have |
21 | | a style attribute with one or more properties. |
22 | | |
23 | | Other rules strip the element they apply to, replacing |
24 | | it by style properties on the contents, e.g. |
25 | | |
26 | | <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">... |
27 | | |
28 | | These rules are applied to an element before processing |
29 | | its content and replace the current element by the first |
30 | | element in the exposed content. |
31 | | |
32 | | After applying both sets of rules, you can replace the |
33 | | style attribute by a class value and style rule in the |
34 | | document head. To support this, an association of styles |
35 | | and class names is built. |
36 | | |
37 | | A naive approach is to rely on string matching to test |
38 | | when two property lists are the same. A better approach |
39 | | would be to first sort the properties before matching. |
40 | | |
41 | | */ |
42 | | |
43 | | #include <stdio.h> |
44 | | #include <stdlib.h> |
45 | | #include <string.h> |
46 | | |
47 | | #include "tidy-int.h" |
48 | | #include "gdoc.h" |
49 | | #include "lexer.h" |
50 | | #include "parser.h" |
51 | | #include "tags.h" |
52 | | #include "attrs.h" |
53 | | #include "message.h" |
54 | | #include "tmbstr.h" |
55 | | #include "utf8.h" |
56 | | |
57 | | /* |
58 | | Extricate "element", replace it by its content and delete it. |
59 | | */ |
60 | | static void DiscardContainer( TidyDocImpl* doc, Node *element, Node **pnode) |
61 | 441 | { |
62 | 441 | if (element->content) |
63 | 441 | { |
64 | 441 | Node *node, *parent = element->parent; |
65 | | |
66 | 441 | element->last->next = element->next; |
67 | | |
68 | 441 | if (element->next) |
69 | 207 | { |
70 | 207 | element->next->prev = element->last; |
71 | 207 | } |
72 | 234 | else |
73 | 234 | parent->last = element->last; |
74 | | |
75 | 441 | if (element->prev) |
76 | 186 | { |
77 | 186 | element->content->prev = element->prev; |
78 | 186 | element->prev->next = element->content; |
79 | 186 | } |
80 | 255 | else |
81 | 255 | parent->content = element->content; |
82 | | |
83 | 19.3k | for (node = element->content; node; node = node->next) |
84 | 18.8k | node->parent = parent; |
85 | | |
86 | 441 | *pnode = element->content; |
87 | | |
88 | 441 | element->next = element->content = NULL; |
89 | 441 | TY_(FreeNode)(doc, element); |
90 | 441 | } |
91 | 0 | else |
92 | 0 | { |
93 | 0 | *pnode = TY_(DiscardElement)(doc, element); |
94 | 0 | } |
95 | 441 | } |
96 | | |
97 | | static void CleanNode( TidyDocImpl* doc, Node *node ) |
98 | 848 | { |
99 | 848 | Stack *stack = TY_(newStack)(doc, 16); |
100 | 848 | Node *child, *next; |
101 | | |
102 | 848 | if ( (child = node->content) ) |
103 | 848 | { |
104 | 47.0k | while (child) |
105 | 46.2k | { |
106 | 46.2k | next = child->next; |
107 | | |
108 | 46.2k | if (TY_(nodeIsElement)(child)) |
109 | 44.0k | { |
110 | 44.0k | if (nodeIsSTYLE(child)) |
111 | 0 | TY_(DiscardElement)(doc, child); |
112 | 44.0k | if (nodeIsP(child) && !child->content) |
113 | 129 | TY_(DiscardElement)(doc, child); |
114 | 43.8k | else if (nodeIsSPAN(child)) |
115 | 441 | DiscardContainer( doc, child, &next); |
116 | 43.4k | else if (nodeIsA(child) && !child->content) |
117 | 132 | { |
118 | 132 | AttVal *id = TY_(GetAttrByName)( child, "name" ); |
119 | | /* Recent Google Docs is using "id" instead of "name" in |
120 | | ** the exported html. |
121 | | */ |
122 | 132 | if (!id) |
123 | 79 | id = TY_(GetAttrByName)( child, "id" ); |
124 | | |
125 | 132 | if (id) |
126 | 96 | TY_(RepairAttrValue)( doc, child->parent, "id", id->value ); |
127 | | |
128 | 132 | TY_(DiscardElement)(doc, child); |
129 | 132 | } |
130 | 43.3k | else |
131 | 43.3k | { |
132 | 43.3k | if (child->attributes) |
133 | 818 | TY_(DropAttrByName)( doc, child, "class" ); |
134 | | |
135 | 43.3k | TY_(push)(stack,next); |
136 | 43.3k | child = child->content; |
137 | 43.3k | continue; |
138 | 43.3k | } |
139 | 44.0k | } |
140 | 2.93k | child = next ? next : TY_(pop)(stack); |
141 | 2.93k | } |
142 | 848 | TY_(freeStack)(stack); |
143 | 848 | } |
144 | 848 | } |
145 | | |
146 | | /* insert meta element to force browser to recognize doc as UTF8 */ |
147 | | static void SetUTF8( TidyDocImpl* doc ) |
148 | 848 | { |
149 | 848 | Node *head = TY_(FindHEAD)( doc ); |
150 | | |
151 | 848 | if (head) |
152 | 848 | { |
153 | 848 | Node *node = TY_(InferredTag)(doc, TidyTag_META); |
154 | 848 | TY_(AddAttribute)( doc, node, "http-equiv", "Content-Type" ); |
155 | 848 | TY_(AddAttribute)( doc, node, "content", "text/html; charset=UTF-8" ); |
156 | 848 | TY_(InsertNodeAtStart)( head, node ); |
157 | 848 | } |
158 | 848 | } |
159 | | |
160 | | /* clean html exported by Google Docs |
161 | | |
162 | | - strip the script element, as the style sheet is a mess |
163 | | - strip class attributes |
164 | | - strip span elements, leaving their content in place |
165 | | - replace <a name=...></a> by id on parent element |
166 | | - strip empty <p> elements |
167 | | */ |
168 | | void TY_(CleanGoogleDocument)( TidyDocImpl* doc ) |
169 | 848 | { |
170 | | /* placeholder. CleanTree()/CleanNode() will not |
171 | | ** zap root element |
172 | | */ |
173 | 848 | CleanNode( doc, &doc->root ); |
174 | 848 | SetUTF8( doc ); |
175 | 848 | } |
176 | | |
177 | | /* |
178 | | * local variables: |
179 | | * mode: c |
180 | | * indent-tabs-mode: nil |
181 | | * c-basic-offset: 4 |
182 | | * eval: (c-set-offset 'substatement-open 0) |
183 | | * end: |
184 | | */ |