/src/tidy-html5/src/parser.c
Line | Count | Source |
1 | | /* parser.c -- HTML Parser |
2 | | |
3 | | (c) 1998-2007 (W3C) MIT, ERCIM, Keio University |
4 | | See tidy.h for the copyright notice. |
5 | | |
6 | | */ |
7 | | |
8 | | #include "tidy-int.h" |
9 | | #include "lexer.h" |
10 | | #include "parser.h" |
11 | | #include "message.h" |
12 | | #include "clean.h" |
13 | | #include "tags.h" |
14 | | #include "tmbstr.h" |
15 | | #include "sprtf.h" |
16 | | |
17 | | |
18 | | /****************************************************************************//* |
19 | | ** MARK: - Configuration Options |
20 | | ***************************************************************************/ |
21 | | |
22 | | |
/**
 *  Evaluates to `yes` when the TidyBodyOnly option of `doc` is in the
 *  TidyYesState state, and to `no` for any other state.
 *  Issue #72  - Needed so that error reporting can be avoided when
 *               --show-body-only yes is in effect.
 *  Issue #132 - Likewise avoid warning if showing body only.
 *
 *  The complete expansion is parenthesised so that the conditional operator
 *  cannot absorb operands that surround the macro at its point of use
 *  (e.g. `showingBodyOnly(doc) == no` or `showingBodyOnly(doc) && x`).
 */
#define showingBodyOnly(doc) ((cfgAutoBool((doc),TidyBodyOnly) == TidyYesState) ? yes : no)
29 | | |
30 | | |
31 | | /****************************************************************************//* |
32 | | ** MARK: - Forward Declarations |
33 | | ***************************************************************************/ |
34 | | |
35 | | |
36 | | static Node* ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode); |
37 | | |
38 | | |
39 | | /****************************************************************************//* |
40 | | ** MARK: - Node Operations |
41 | | ***************************************************************************/ |
42 | | |
43 | | |
44 | | /** |
45 | | * Generalised search for duplicate elements. |
46 | | * Issue #166 - repeated <main> element. |
47 | | */ |
48 | | static Bool findNodeWithId( Node *node, TidyTagId tid ) |
49 | 2.06k | { |
50 | 2.06k | Node *content; |
51 | 5.22k | while (node) |
52 | 4.01k | { |
53 | 4.01k | if (TagIsId(node,tid)) |
54 | 284 | return yes; |
55 | | /*\ |
56 | | * Issue #459 - Under certain circumstances, with many node this use of |
57 | | * 'for (content = node->content; content; content = content->content)' |
58 | | * would produce a **forever** circle, or at least a very extended loop... |
59 | | * It is sufficient to test the content, if it exists, |
60 | | * to quickly iterate all nodes. Now all nodes are tested only once. |
61 | | \*/ |
62 | 3.73k | content = node->content; |
63 | 3.73k | if (content) |
64 | 1.64k | { |
65 | 1.64k | if ( findNodeWithId(content,tid) ) |
66 | 569 | return yes; |
67 | 1.64k | } |
68 | 3.16k | node = node->next; |
69 | 3.16k | } |
70 | 1.20k | return no; |
71 | 2.06k | } |
72 | | |
73 | | |
74 | | /** |
75 | | * Perform a global search for an element. |
76 | | * Issue #166 - repeated <main> element |
77 | | */ |
78 | | static Bool findNodeById( TidyDocImpl* doc, TidyTagId tid ) |
79 | 417 | { |
80 | 417 | Node *node = (doc ? doc->root.content : NULL); |
81 | 417 | return findNodeWithId( node,tid ); |
82 | 417 | } |
83 | | |
84 | | |
85 | | /** |
86 | | * Inserts node into element at an appropriate location based |
87 | | * on the type of node being inserted. |
88 | | */ |
89 | | static Bool InsertMisc(Node *element, Node *node) |
90 | 3.27M | { |
91 | 3.27M | if (node->type == CommentTag || |
92 | 3.06M | node->type == ProcInsTag || |
93 | 3.00M | node->type == CDATATag || |
94 | 3.00M | node->type == SectionTag || |
95 | 2.98M | node->type == AspTag || |
96 | 2.97M | node->type == JsteTag || |
97 | 2.97M | node->type == PhpTag ) |
98 | 301k | { |
99 | 301k | TY_(InsertNodeAtEnd)(element, node); |
100 | 301k | return yes; |
101 | 301k | } |
102 | | |
103 | 2.97M | if ( node->type == XmlDecl ) |
104 | 60.8k | { |
105 | 60.8k | Node* root = element; |
106 | 184k | while ( root && root->parent ) |
107 | 123k | root = root->parent; |
108 | 60.8k | if ( root && !(root->content && root->content->type == XmlDecl)) |
109 | 2.26k | { |
110 | 2.26k | TY_(InsertNodeAtStart)( root, node ); |
111 | 2.26k | return yes; |
112 | 2.26k | } |
113 | 60.8k | } |
114 | | |
115 | | /* Declared empty tags seem to be slipping through |
116 | | ** the cracks. This is an experiment to figure out |
117 | | ** a decent place to pick them up. |
118 | | */ |
119 | 2.97M | if ( node->tag && |
120 | 2.50M | TY_(nodeIsElement)(node) && |
121 | 2.40M | TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN && |
122 | 0 | (node->tag->versions & VERS_PROPRIETARY) != 0 ) |
123 | 0 | { |
124 | 0 | TY_(InsertNodeAtEnd)(element, node); |
125 | 0 | return yes; |
126 | 0 | } |
127 | | |
128 | 2.97M | return no; |
129 | 2.97M | } |
130 | | |
131 | | |
/**
 *  Insert "node" into markup tree in place of "element"
 *  which is moved to become the child of the node.
 *
 *  After the call `element` is the only child of `node`, and `node` holds
 *  the parent and sibling links that `element` had before.
 *  `element->parent` is dereferenced without a check, so `element` must
 *  already have a parent.
 */
static void InsertNodeAsParent(Node *element, Node *node)
{
    /* element becomes the sole child of node */
    node->content = element;
    node->last = element;
    node->parent = element->parent;
    element->parent = node;

    /* fix up the old parent's first/last child pointers */
    if (node->parent->content == element)
        node->parent->content = node;

    if (node->parent->last == element)
        node->parent->last = node;

    /* node takes over element's place in the sibling chain */
    node->prev = element->prev;
    element->prev = NULL;

    if (node->prev)
        node->prev->next = node;

    node->next = element->next;
    element->next = NULL;

    if (node->next)
        node->next->prev = node;
}
161 | | |
162 | | |
163 | | /** |
164 | | * Unexpected content in table row is moved to just before the table in |
165 | | * in accordance with Netscape and IE. This code assumes that node hasn't |
166 | | * been inserted into the row. |
167 | | */ |
168 | | static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row, |
169 | | Node *node ) |
170 | 42.9k | { |
171 | 42.9k | Node *table; |
172 | | |
173 | | /* first find the table element */ |
174 | 1.92M | for (table = row->parent; table; table = table->parent) |
175 | 1.89M | { |
176 | 1.89M | if ( nodeIsTABLE(table) ) |
177 | 10.5k | { |
178 | 10.5k | TY_(InsertNodeBeforeElement)( table, node ); |
179 | 10.5k | return; |
180 | 10.5k | } |
181 | 1.89M | } |
182 | | /* No table element */ |
183 | 32.4k | TY_(InsertNodeBeforeElement)( row->parent, node ); |
184 | 32.4k | } |
185 | | |
186 | | |
187 | | /** |
188 | | * Moves given node to end of body element. |
189 | | */ |
190 | | static void MoveNodeToBody( TidyDocImpl* doc, Node* node ) |
191 | 1.09k | { |
192 | 1.09k | Node* body = TY_(FindBody)( doc ); |
193 | 1.09k | if ( body ) |
194 | 662 | { |
195 | 662 | TY_(RemoveNode)( node ); |
196 | 662 | TY_(InsertNodeAtEnd)( body, node ); |
197 | 662 | } |
198 | 1.09k | } |
199 | | |
200 | | |
201 | | /** |
202 | | * Move node to the head, where element is used as starting |
203 | | * point in hunt for head. Normally called during parsing. |
204 | | */ |
205 | | static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node ) |
206 | 6.65k | { |
207 | 6.65k | Node *head = NULL; |
208 | | |
209 | 6.65k | TY_(RemoveNode)( node ); /* make sure that node is isolated */ |
210 | | |
211 | 6.65k | if ( TY_(nodeIsElement)(node) ) |
212 | 6.57k | { |
213 | 6.57k | TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN ); |
214 | | |
215 | 6.57k | head = TY_(FindHEAD)(doc); |
216 | 6.57k | assert(head != NULL); |
217 | | |
218 | 6.57k | TY_(InsertNodeAtEnd)(head, node); |
219 | | |
220 | 6.57k | if ( node->tag->parser ) |
221 | 6.57k | { |
222 | | /* Only one of the existing test cases as of 2021-08-14 invoke |
223 | | MoveToHead, and it doesn't go deeper than one level. The |
224 | | parser() call is supposed to return a node if additional |
225 | | parsing is needed. Keep this in mind if we start to get bug |
226 | | reports. |
227 | | */ |
228 | 6.57k | Parser* parser = node->tag->parser; |
229 | 6.57k | parser( doc, node, IgnoreWhitespace ); |
230 | 6.57k | } |
231 | 6.57k | } |
232 | 75 | else |
233 | 75 | { |
234 | 75 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
235 | 75 | TY_(FreeNode)( doc, node ); |
236 | 75 | } |
237 | 6.65k | } |
238 | | |
239 | | |
240 | | /***************************************************************************//* |
241 | | ** MARK: - Decision Making |
242 | | ***************************************************************************/ |
243 | | |
244 | | |
245 | | /** |
246 | | * Indicates whether or not element can be pruned based on content, |
247 | | * user settings, etc. |
248 | | */ |
249 | | static Bool CanPrune( TidyDocImpl* doc, Node *element ) |
250 | 2.08M | { |
251 | 2.08M | if ( !cfgBool(doc, TidyDropEmptyElems) ) |
252 | 0 | return no; |
253 | | |
254 | 2.08M | if ( TY_(nodeIsText)(element) ) |
255 | 11.3k | return yes; |
256 | | |
257 | 2.07M | if ( element->content ) |
258 | 1.49M | return no; |
259 | | |
260 | 581k | if ( element->tag == NULL ) |
261 | 337 | return no; |
262 | | |
263 | 581k | if ( element->tag->model & CM_BLOCK && element->attributes != NULL ) |
264 | 61.9k | return no; |
265 | | |
266 | 519k | if ( nodeIsA(element) && element->attributes != NULL ) |
267 | 3.94k | return no; |
268 | | |
269 | 515k | if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) ) |
270 | 0 | return no; |
271 | | |
272 | 515k | if ( element->tag->model & CM_ROW ) |
273 | 6.44k | return no; |
274 | | |
275 | 509k | if ( element->tag->model & CM_EMPTY ) |
276 | 55.0k | return no; |
277 | | |
278 | 454k | if ( nodeIsAPPLET(element) ) |
279 | 737 | return no; |
280 | | |
281 | 453k | if ( nodeIsOBJECT(element) ) |
282 | 586 | return no; |
283 | | |
284 | 452k | if ( nodeIsSCRIPT(element) && attrGetSRC(element) ) |
285 | 0 | return no; |
286 | | |
287 | 452k | if ( nodeIsTITLE(element) ) |
288 | 18.9k | return no; |
289 | | |
290 | | /* #433359 - fix by Randy Waki 12 Mar 01 */ |
291 | 434k | if ( nodeIsIFRAME(element) ) |
292 | 184 | return no; |
293 | | |
294 | | /* fix for bug 770297 */ |
295 | 433k | if (nodeIsTEXTAREA(element)) |
296 | 473 | return no; |
297 | | |
298 | | /* fix for ISSUE #7 https://github.com/w3c/tidy-html5/issues/7 */ |
299 | 433k | if (nodeIsCANVAS(element)) |
300 | 207 | return no; |
301 | | |
302 | 433k | if (nodeIsPROGRESS(element)) |
303 | 337 | return no; |
304 | | |
305 | 432k | if ( attrGetID(element) || attrGetNAME(element) ) |
306 | 19.8k | return no; |
307 | | |
308 | | /* fix for bug 695408; a better fix would look for unknown and */ |
309 | | /* known proprietary attributes that make the element significant */ |
310 | 412k | if (attrGetDATAFLD(element)) |
311 | 0 | return no; |
312 | | |
313 | | /* fix for bug 723772, don't trim new-...-tags */ |
314 | 412k | if (element->tag->id == TidyTag_UNKNOWN) |
315 | 0 | return no; |
316 | | |
317 | 412k | if (nodeIsBODY(element)) |
318 | 7.23k | return no; |
319 | | |
320 | 405k | if (nodeIsCOLGROUP(element)) |
321 | 1.90k | return no; |
322 | | |
323 | | /* HTML5 - do NOT drop empty option if it has attributes */ |
324 | 403k | if ( nodeIsOPTION(element) && element->attributes != NULL ) |
325 | 123 | return no; |
326 | | |
327 | | /* fix for #103 - don't drop empty dd tags lest document not validate */ |
328 | 403k | if (nodeIsDD(element)) |
329 | 2.02k | return no; |
330 | | |
331 | 401k | return yes; |
332 | 403k | } |
333 | | |
334 | | |
335 | | /** |
336 | | * Indicates whether or not node is a descendant of a tag of the given tid. |
337 | | */ |
338 | | static Bool DescendantOf( Node *element, TidyTagId tid ) |
339 | 275k | { |
340 | 275k | Node *parent; |
341 | 275k | for ( parent = element->parent; |
342 | 149M | parent != NULL; |
343 | 149M | parent = parent->parent ) |
344 | 149M | { |
345 | 149M | if ( TagIsId(parent, tid) ) |
346 | 51.9k | return yes; |
347 | 149M | } |
348 | 223k | return no; |
349 | 275k | } |
350 | | |
351 | | |
352 | | /** |
353 | | * Indicates whether or not node is a descendant of a pre tag. |
354 | | */ |
355 | | static Bool IsPreDescendant(Node* node) |
356 | 1.12M | { |
357 | 1.12M | Node *parent = node->parent; |
358 | | |
359 | 1.42G | while (parent) |
360 | 1.42G | { |
361 | 1.42G | if (parent->tag && parent->tag->parser == TY_(ParsePre)) |
362 | 52.4k | return yes; |
363 | | |
364 | 1.42G | parent = parent->parent; |
365 | 1.42G | } |
366 | | |
367 | 1.07M | return no; |
368 | 1.12M | } |
369 | | |
370 | | |
371 | | /** |
372 | | * Indicates whether or not the only content model for the given node |
373 | | * is CM_INLINE. |
374 | | */ |
375 | | static Bool nodeCMIsOnlyInline( Node* node ) |
376 | 0 | { |
377 | 0 | return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK ); |
378 | 0 | } |
379 | | |
380 | | |
381 | | /** |
382 | | * Indicates whether or not the content of the given node is acceptable |
383 | | * content for pre elements |
384 | | */ |
385 | | static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node ) |
386 | 134k | { |
387 | | /* p is coerced to br's, Text OK too */ |
388 | 134k | if ( nodeIsP(node) || TY_(nodeIsText)(node) ) |
389 | 6.17k | return yes; |
390 | | |
391 | 128k | if ( node->tag == NULL || |
392 | 128k | nodeIsPARAM(node) || |
393 | 128k | !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) ) |
394 | 123k | return no; |
395 | | |
396 | 5.36k | return yes; |
397 | 128k | } |
398 | | |
399 | | |
400 | | /** |
401 | | * Indicates whether or not leading whitespace should be cleaned. |
402 | | */ |
403 | | static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node) |
404 | 222k | { |
405 | 222k | if (!TY_(nodeIsText)(node)) |
406 | 0 | return no; |
407 | | |
408 | 222k | if (node->parent->type == DocTypeTag) |
409 | 564 | return no; |
410 | | |
411 | 221k | if (IsPreDescendant(node)) |
412 | 10.8k | return no; |
413 | | |
414 | 210k | if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) |
415 | 4.39k | return no; |
416 | | |
417 | | /* #523, prevent blank spaces after script if the next item is script. |
418 | | * This is actually more generalized as, if the preceding element is |
419 | | * a body level script, then indicate that we want to clean leading |
420 | | * whitespace. |
421 | | */ |
422 | 206k | if ( node->prev && nodeIsSCRIPT(node->prev) && nodeIsBODY(node->prev->parent) ) |
423 | 251 | return yes; |
424 | | |
425 | | /* <p>...<br> <em>...</em>...</p> */ |
426 | 206k | if (nodeIsBR(node->prev)) |
427 | 2.32k | return yes; |
428 | | |
429 | | /* <p> ...</p> */ |
430 | 203k | if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE)) |
431 | 61.9k | return yes; |
432 | | |
433 | | /* <h4>...</h4> <em>...</em> */ |
434 | 141k | if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) && |
435 | 88.3k | TY_(nodeIsElement)(node->prev)) |
436 | 7.04k | return yes; |
437 | | |
438 | | /* <p><span> ...</span></p> */ |
439 | 134k | if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE)) |
440 | 6.80k | return yes; |
441 | | |
442 | 127k | return no; |
443 | 134k | } |
444 | | |
445 | | |
446 | | /** |
447 | | * Indicates whether or not trailing whitespace should be cleaned. |
448 | | */ |
449 | | static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) |
450 | 222k | { |
451 | 222k | Node* next; |
452 | | |
453 | 222k | if (!TY_(nodeIsText)(node)) |
454 | 0 | return no; |
455 | | |
456 | 222k | if (node->parent->type == DocTypeTag) |
457 | 564 | return no; |
458 | | |
459 | 221k | if (IsPreDescendant(node)) |
460 | 10.8k | return no; |
461 | | |
462 | 210k | if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) |
463 | 4.39k | return no; |
464 | | |
465 | | /* #523, prevent blank spaces after script if the next item is script. |
466 | | * This is actually more generalized as, if the next element is |
467 | | * a body level script, then indicate that we want to clean trailing |
468 | | * whitespace. |
469 | | */ |
470 | 206k | if ( node->next && nodeIsSCRIPT(node->next) && nodeIsBODY(node->next->parent) ) |
471 | 295 | return yes; |
472 | | |
473 | 206k | next = node->next; |
474 | | |
475 | | /* <p>... </p> */ |
476 | 206k | if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE)) |
477 | 43.2k | return yes; |
478 | | |
479 | | /* <div><small>... </small><h3>...</h3></div> */ |
480 | 162k | if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE)) |
481 | 6.10k | return yes; |
482 | | |
483 | 156k | if (!next) |
484 | 21.0k | return no; |
485 | | |
486 | 135k | if (nodeIsBR(next)) |
487 | 1.91k | return yes; |
488 | | |
489 | 133k | if (TY_(nodeHasCM)(next, CM_INLINE)) |
490 | 33.7k | return no; |
491 | | |
492 | | /* <a href='/'>...</a> <p>...</p> */ |
493 | 99.9k | if (next->type == StartTag) |
494 | 16.7k | return yes; |
495 | | |
496 | | /* <strong>...</strong> <hr /> */ |
497 | 83.1k | if (next->type == StartEndTag) |
498 | 1.32k | return yes; |
499 | | |
500 | | /* evil adjacent text nodes, Tidy should not generate these :-( */ |
501 | 81.8k | if (TY_(nodeIsText)(next) && next->start < next->end |
502 | 51.6k | && TY_(IsWhite)(doc->lexer->lexbuf[next->start])) |
503 | 20.4k | return yes; |
504 | | |
505 | 61.4k | return no; |
506 | 81.8k | } |
507 | | |
508 | | |
509 | | /***************************************************************************//* |
510 | | ** MARK: - Information Accumulation |
511 | | ***************************************************************************/ |
512 | | |
513 | | |
514 | | /** |
515 | | * Errors in positioning of form start or end tags |
516 | | * generally require human intervention to fix. |
517 | | * Issue #166 - repeated <main> element also uses this flag |
518 | | * to indicate duplicates, discarded. |
519 | | */ |
520 | | static void BadForm( TidyDocImpl* doc ) |
521 | 1.66k | { |
522 | 1.66k | doc->badForm |= flg_BadForm; |
523 | 1.66k | } |
524 | | |
525 | | |
526 | | /***************************************************************************//* |
527 | | ** MARK: - Fixes and Touchup |
528 | | ***************************************************************************/ |
529 | | |
530 | | |
531 | | /** |
532 | | * Adds style information as a class in the document or a property |
533 | | * of the node to prevent indentation of inferred UL tags. |
534 | | */ |
535 | | static void AddClassNoIndent( TidyDocImpl* doc, Node *node ) |
536 | 752 | { |
537 | 752 | ctmbstr sprop = |
538 | 752 | "padding-left: 2ex; margin-left: 0ex" |
539 | 752 | "; margin-top: 0ex; margin-bottom: 0ex"; |
540 | 752 | if ( !cfgBool(doc, TidyDecorateInferredUL) ) |
541 | 752 | return; |
542 | 0 | if ( cfgBool(doc, TidyMakeClean) ) |
543 | 0 | TY_(AddStyleAsClass)( doc, node, sprop ); |
544 | 0 | else |
545 | 0 | TY_(AddStyleProperty)( doc, node, sprop ); |
546 | 0 | } |
547 | | |
548 | | |
/**
 *  Cleans whitespace from text nodes, and drops such nodes if emptied
 *  completely as a result.
 *
 *  Starting at `node`, the tree is walked depth first without recursion:
 *  before descending into a node's content, its next sibling is pushed on
 *  a local stack and picked up again once the subtree has been handled.
 */
static void CleanSpaces(TidyDocImpl* doc, Node* node)
{
    Stack *stack = TY_(newStack)(doc, 16);
    Node *next;

    while (node)
    {
        /* remember the sibling now; node may be freed further down */
        next = node->next;

        /* strip leading white characters by advancing the start offset */
        if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node))
            while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start]))
                ++(node->start);

        /* strip trailing white characters by pulling back the end offset */
        if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node))
            while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1]))
                --(node->end);

        /* a text node that ended up empty is unlinked and freed */
        if (TY_(nodeIsText)(node) && !(node->start < node->end))
        {
            TY_(RemoveNode)(node);
            TY_(FreeNode)(doc, node);
            node = next ? next : TY_(pop)(stack);
            continue;
        }

        if (node->content)
        {
            /* NOTE(review): next may be NULL here; presumably push ignores
               NULL, otherwise a later pop would end the walk early - confirm
               against the Stack implementation. */
            TY_(push)(stack, next);
            node = node->content;
            continue;
        }

        /* leaf: continue with the sibling, or resume a saved position */
        node = next ? next : TY_(pop)(stack);
    }
    TY_(freeStack)(stack);
}
589 | | |
590 | | |
591 | | /** |
592 | | * If a table row is empty then insert an empty cell. This practice is |
593 | | * consistent with browser behavior and avoids potential problems with |
594 | | * row spanning cells. |
595 | | */ |
596 | | static void FixEmptyRow(TidyDocImpl* doc, Node *row) |
597 | 9.16k | { |
598 | 9.16k | Node *cell; |
599 | | |
600 | 9.16k | if (row->content == NULL) |
601 | 4.34k | { |
602 | 4.34k | cell = TY_(InferredTag)(doc, TidyTag_TD); |
603 | 4.34k | TY_(InsertNodeAtEnd)(row, cell); |
604 | 4.34k | TY_(Report)(doc, row, cell, MISSING_STARTTAG); |
605 | 4.34k | } |
606 | 9.16k | } |
607 | | |
608 | | |
609 | | /** |
610 | | * The doctype has been found after other tags, |
611 | | * and needs moving to before the html element |
612 | | */ |
613 | | static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype ) |
614 | 8.87k | { |
615 | 8.87k | Node* existing = TY_(FindDocType)( doc ); |
616 | 8.87k | if ( existing ) |
617 | 4.31k | { |
618 | 4.31k | TY_(Report)(doc, element, doctype, DISCARDING_UNEXPECTED ); |
619 | 4.31k | TY_(FreeNode)( doc, doctype ); |
620 | 4.31k | } |
621 | 4.55k | else |
622 | 4.55k | { |
623 | 4.55k | TY_(Report)(doc, element, doctype, DOCTYPE_AFTER_TAGS ); |
624 | 9.15k | while ( !nodeIsHTML(element) ) |
625 | 4.60k | element = element->parent; |
626 | 4.55k | TY_(InsertNodeBeforeElement)( element, doctype ); |
627 | 4.55k | } |
628 | 8.87k | } |
629 | | |
630 | | |
631 | | /** |
632 | | * This maps |
633 | | * <p>hello<em> world</em> |
634 | | * to |
635 | | * <p>hello <em>world</em> |
636 | | * |
637 | | * Trims initial space, by moving it before the |
638 | | * start tag, or if this element is the first in |
639 | | * parent's content, then by discarding the space |
640 | | */ |
641 | | static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text ) |
642 | 65.3k | { |
643 | 65.3k | Lexer* lexer = doc->lexer; |
644 | 65.3k | Node *prev, *node; |
645 | | |
646 | 65.3k | if ( TY_(nodeIsText)(text) && |
647 | 65.3k | lexer->lexbuf[text->start] == ' ' && |
648 | 10.5k | text->start < text->end ) |
649 | 9.25k | { |
650 | 9.25k | if ( (element->tag->model & CM_INLINE) && |
651 | 8.55k | !(element->tag->model & CM_FIELD) ) |
652 | 8.44k | { |
653 | 8.44k | prev = element->prev; |
654 | | |
655 | 8.44k | if (TY_(nodeIsText)(prev)) |
656 | 917 | { |
657 | 917 | if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ') |
658 | 592 | lexer->lexbuf[(prev->end)++] = ' '; |
659 | | |
660 | 917 | ++(element->start); |
661 | 917 | } |
662 | 7.53k | else /* create new node */ |
663 | 7.53k | { |
664 | 7.53k | node = TY_(NewNode)(lexer->allocator, lexer); |
665 | 7.53k | node->start = (element->start)++; |
666 | 7.53k | node->end = element->start; |
667 | 7.53k | lexer->lexbuf[node->start] = ' '; |
668 | 7.53k | TY_(InsertNodeBeforeElement)(element ,node); |
669 | 7.53k | DEBUG_LOG(SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n", |
670 | 7.53k | (element->element ? element->element : "unknown"))); |
671 | 7.53k | } |
672 | 8.44k | } |
673 | | |
674 | | /* discard the space in current node */ |
675 | 9.25k | ++(text->start); |
676 | 9.25k | } |
677 | 65.3k | } |
678 | | |
679 | | |
680 | | /** |
681 | | * This maps |
682 | | * <em>hello </em><strong>world</strong> |
683 | | * to |
684 | | * <em>hello</em> <strong>world</strong> |
685 | | * |
686 | | * If last child of element is a text node |
687 | | * then trim trailing white space character |
688 | | * moving it to after element's end tag. |
689 | | */ |
690 | | static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last ) |
691 | 42.5k | { |
692 | 42.5k | Lexer* lexer = doc->lexer; |
693 | 42.5k | byte c; |
694 | | |
695 | 42.5k | if (TY_(nodeIsText)(last)) |
696 | 42.5k | { |
697 | 42.5k | if (last->end > last->start) |
698 | 39.9k | { |
699 | 39.9k | c = (byte) lexer->lexbuf[ last->end - 1 ]; |
700 | | |
701 | 39.9k | if ( c == ' ' ) |
702 | 5.05k | { |
703 | 5.05k | last->end -= 1; |
704 | 5.05k | if ( (element->tag->model & CM_INLINE) && |
705 | 4.50k | !(element->tag->model & CM_FIELD) ) |
706 | 4.43k | lexer->insertspace = yes; |
707 | 5.05k | } |
708 | 39.9k | } |
709 | 42.5k | } |
710 | 42.5k | } |
711 | | |
712 | | |
713 | | /** |
714 | | * Move initial and trailing space out. |
715 | | * This routine maps: |
716 | | * hello<em> world</em> |
717 | | * to |
718 | | * hello <em>world</em> |
719 | | * and |
720 | | * <em>hello </em><strong>world</strong> |
721 | | * to |
722 | | * <em>hello</em> <strong>world</strong> |
723 | | */ |
724 | | static void TrimSpaces( TidyDocImpl* doc, Node *element) |
725 | 700k | { |
726 | 700k | Node* text = element->content; |
727 | | |
728 | 700k | if (nodeIsPRE(element) || IsPreDescendant(element)) |
729 | 46.1k | return; |
730 | | |
731 | 654k | if (TY_(nodeIsText)(text)) |
732 | 64.8k | TrimInitialSpace(doc, element, text); |
733 | | |
734 | 654k | text = element->last; |
735 | | |
736 | 654k | if (TY_(nodeIsText)(text)) |
737 | 42.5k | TrimTrailingSpace(doc, element, text); |
738 | 654k | } |
739 | | |
740 | | |
741 | | /***************************************************************************//* |
742 | | ** MARK: - Parsers Support |
743 | | ***************************************************************************/ |
744 | | |
745 | | |
/**
 *  Structure used by FindDescendant_cb. It is passed to the callback
 *  through the traversal's opaque pointer.
 */
struct MatchingDescendantData
{
    /* output: */
    Node *found_node;           /* set by the callback to the matching node */
    Bool *passed_marker_node;   /* optional; when non-NULL, the Bool it points
                                   to is set to yes once the callback visits
                                   marker_node without it being a match */

    /* input: */
    TidyTagId matching_tagId;   /* tag id a node must have to match */
    Node *node_to_find;         /* supplies the element name that is compared
                                   when matching_tagId is TidyTag_UNKNOWN */
    Node *marker_node;          /* node watched for by passed_marker_node */
};
759 | | |
760 | | |
761 | | /** |
762 | | * The main engine for FindMatchingDescendant. |
763 | | */ |
764 | | static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate) |
765 | 52.5k | { |
766 | 52.5k | struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate; |
767 | | |
768 | 52.5k | if (TagId(node) == cb_data->matching_tagId) |
769 | 9.66k | { |
770 | | /* make sure we match up 'unknown' tags exactly! */ |
771 | 9.66k | if (cb_data->matching_tagId != TidyTag_UNKNOWN || |
772 | 5.88k | (node->element != NULL && |
773 | 5.32k | cb_data->node_to_find != NULL && |
774 | 5.32k | cb_data->node_to_find->element != NULL && |
775 | 5.32k | 0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element))) |
776 | 4.33k | { |
777 | 4.33k | cb_data->found_node = node; |
778 | 4.33k | return ExitTraversal; |
779 | 4.33k | } |
780 | 9.66k | } |
781 | | |
782 | 48.2k | if (cb_data->passed_marker_node && node == cb_data->marker_node) |
783 | 0 | *cb_data->passed_marker_node = yes; |
784 | | |
785 | 48.2k | return VisitParent; |
786 | 52.5k | } |
787 | | |
788 | | |
789 | | /** |
790 | | * Search the parent chain (from `parent` upwards up to the root) for a node |
791 | | * matching the given 'node'. |
792 | | * |
793 | | * When the search passes beyond the `marker_node` (which is assumed to sit |
794 | | * in the parent chain), this will be flagged by setting the boolean |
795 | | * referenced by `is_parent_of_marker` to `yes`. |
796 | | * |
797 | | * 'is_parent_of_marker' and 'marker_node' are optional parameters and may |
798 | | * be NULL. |
799 | | */ |
800 | | static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker ) |
801 | 5.72k | { |
802 | 5.72k | struct MatchingDescendantData cb_data = { 0 }; |
803 | 5.72k | cb_data.matching_tagId = TagId(node); |
804 | 5.72k | cb_data.node_to_find = node; |
805 | 5.72k | cb_data.marker_node = marker_node; |
806 | | |
807 | 5.72k | assert(node); |
808 | | |
809 | 5.72k | if (is_parent_of_marker) |
810 | 5.72k | *is_parent_of_marker = no; |
811 | | |
812 | 5.72k | TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data); |
813 | 5.72k | return cb_data.found_node; |
814 | 5.72k | } |
815 | | |
816 | | |
817 | | /** |
818 | | * Finds the last list item for the given list, providing it in the |
819 | | * in-out parameter. Returns yes or no if the item was the last list |
820 | | * item. |
821 | | */ |
822 | | static Bool FindLastLI( Node *list, Node **lastli ) |
823 | 18.7k | { |
824 | 18.7k | Node *node; |
825 | | |
826 | 18.7k | *lastli = NULL; |
827 | 23.5k | for ( node = list->content; node ; node = node->next ) |
828 | 4.79k | if ( nodeIsLI(node) && node->type == StartTag ) |
829 | 671 | *lastli=node; |
830 | 18.7k | return *lastli ? yes:no; |
831 | 18.7k | } |
832 | | |
833 | | |
834 | | /***************************************************************************//* |
835 | | ** MARK: - Parser Stack |
836 | | ***************************************************************************/ |
837 | | |
838 | | |
839 | | /** |
840 | | * Allocates and initializes the parser's stack. |
841 | | */ |
842 | | void TY_(InitParserStack)( TidyDocImpl* doc ) |
843 | 16.9k | { |
844 | 16.9k | enum { default_size = 32 }; |
845 | 16.9k | TidyParserMemory *content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * default_size ); |
846 | | |
847 | 16.9k | doc->stack.content = content; |
848 | 16.9k | doc->stack.size = default_size; |
849 | 16.9k | doc->stack.top = -1; |
850 | 16.9k | } |
851 | | |
852 | | |
853 | | /** |
854 | | * Frees the parser's stack when done. |
855 | | */ |
856 | | void TY_(FreeParserStack)( TidyDocImpl* doc ) |
857 | 16.9k | { |
858 | 16.9k | TidyFree( doc->allocator, doc->stack.content ); |
859 | | |
860 | 16.9k | doc->stack.content = NULL; |
861 | 16.9k | doc->stack.size = 0; |
862 | 16.9k | doc->stack.top = -1; |
863 | 16.9k | } |
864 | | |
865 | | |
866 | | /** |
867 | | * Increase the stack size. |
868 | | */ |
869 | | static void growParserStack( TidyDocImpl* doc ) |
870 | 4.15k | { |
871 | 4.15k | TidyParserMemory *content; |
872 | 4.15k | content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * doc->stack.size * 2 ); |
873 | | |
874 | 4.15k | memcpy( content, doc->stack.content, sizeof(TidyParserMemory) * (doc->stack.top + 1) ); |
875 | 4.15k | TidyFree(doc->allocator, doc->stack.content); |
876 | | |
877 | 4.15k | doc->stack.content = content; |
878 | 4.15k | doc->stack.size = doc->stack.size * 2; |
879 | 4.15k | } |
880 | | |
881 | | |
882 | | /** |
883 | | * Indicates whether or not the stack is empty. |
884 | | */ |
885 | | Bool TY_(isEmptyParserStack)( TidyDocImpl* doc ) |
886 | 3.63M | { |
887 | 3.63M | return doc->stack.top < 0; |
888 | 3.63M | } |
889 | | |
890 | | |
891 | | /** |
892 | | * Peek at the parser memory. |
893 | | */ |
894 | | TidyParserMemory TY_(peekMemory)( TidyDocImpl* doc ) |
895 | 0 | { |
896 | 0 | return doc->stack.content[doc->stack.top]; |
897 | 0 | } |
898 | | |
899 | | |
900 | | /** |
901 | | * Peek at the parser memory "identity" field. This is just a convenience |
902 | | * to avoid having to create a new struct instance in the caller. |
903 | | */ |
904 | | Parser* TY_(peekMemoryIdentity)( TidyDocImpl* doc ) |
905 | 1.81M | { |
906 | 1.81M | return doc->stack.content[doc->stack.top].identity; |
907 | 1.81M | } |
908 | | |
909 | | |
910 | | /** |
911 | | * Peek at the parser memory "mode" field. This is just a convenience |
912 | | * to avoid having to create a new struct instance in the caller. |
913 | | */ |
914 | | GetTokenMode TY_(peekMemoryMode)( TidyDocImpl* doc ) |
915 | 17.5k | { |
916 | 17.5k | return doc->stack.content[doc->stack.top].mode; |
917 | 17.5k | } |
918 | | |
919 | | |
920 | | /** |
921 | | * Pop out a parser memory. |
922 | | */ |
923 | | TidyParserMemory TY_(popMemory)( TidyDocImpl* doc ) |
924 | 1.81M | { |
925 | 1.81M | if ( !TY_(isEmptyParserStack)( doc ) ) |
926 | 1.81M | { |
927 | 1.81M | TidyParserMemory data = doc->stack.content[doc->stack.top]; |
928 | 1.81M | DEBUG_LOG(SPRTF("\n" |
929 | 1.81M | "<--POP original: %s @ %p\n" |
930 | 1.81M | " reentry: %s @ %p\n" |
931 | 1.81M | " stack depth: %lu @ %p\n" |
932 | 1.81M | " mode: %u\n" |
933 | 1.81M | " register 1: %i\n" |
934 | 1.81M | " register 2: %i\n\n", |
935 | 1.81M | data.original_node ? data.original_node->element : "none", data.original_node, |
936 | 1.81M | data.reentry_node ? data.reentry_node->element : "none", data.reentry_node, |
937 | 1.81M | doc->stack.top, &doc->stack.content[doc->stack.top], |
938 | 1.81M | data.mode, |
939 | 1.81M | data.register_1, |
940 | 1.81M | data.register_2 |
941 | 1.81M | )); |
942 | 1.81M | doc->stack.top = doc->stack.top - 1; |
943 | 1.81M | return data; |
944 | 1.81M | } |
945 | 0 | TidyParserMemory blank = { NULL }; |
946 | 0 | return blank; |
947 | 1.81M | } |
948 | | |
949 | | |
950 | | /** |
951 | | * Push the parser memory to the stack. |
952 | | */ |
953 | | void TY_(pushMemory)( TidyDocImpl* doc, TidyParserMemory data ) |
954 | 1.94M | { |
955 | 1.94M | if ( doc->stack.top == doc->stack.size - 1 ) |
956 | 4.15k | growParserStack( doc ); |
957 | | |
958 | 1.94M | doc->stack.top++; |
959 | | |
960 | 1.94M | doc->stack.content[doc->stack.top] = data; |
961 | 1.94M | DEBUG_LOG(SPRTF("\n" |
962 | 1.94M | "-->PUSH original: %s @ %p\n" |
963 | 1.94M | " reentry: %s @ %p\n" |
964 | 1.94M | " stack depth: %lu @ %p\n" |
965 | 1.94M | " mode: %u\n" |
966 | 1.94M | " register 1: %i\n" |
967 | 1.94M | " register 2: %i\n\n", |
968 | 1.94M | data.original_node ? data.original_node->element : "none", data.original_node, |
969 | 1.94M | data.reentry_node ? data.reentry_node->element : "none", data.reentry_node, |
970 | 1.94M | doc->stack.top, &doc->stack.content[doc->stack.top], |
971 | 1.94M | data.mode, |
972 | 1.94M | data.register_1, |
973 | 1.94M | data.register_2 |
974 | 1.94M | )); |
975 | 1.94M | } |
976 | | |
977 | | |
978 | | /***************************************************************************//* |
979 | | ** MARK: Convenience Logging Macros |
980 | | ***************************************************************************/ |
981 | | |
982 | | |
/*
   Tracing helpers used by the parsers. With ENABLE_DEBUG_LOG defined they
   print via SPRTF; otherwise every macro expands to nothing.

   The statement-like macros keep their own trailing semicolon because some
   call sites invoke them without one.
 */
#if defined(ENABLE_DEBUG_LOG)
   /* Printable name for NODE; safe when NODE or its element name is NULL
      (passing NULL for %s is undefined behaviour). */
#  define DEBUG_LOG_NODE_NAME(NODE) (((NODE) && (NODE)->element) ? (NODE)->element : "(null)")
#  define DEBUG_LOG_COUNTERS \
     static int depth_parser = 0;\
     static int count_parser = 0;\
     int old_mode = IgnoreWhitespace;
#  define DEBUG_LOG_GET_OLD_MODE old_mode = mode;
#  define DEBUG_LOG_REENTER_WITH_NODE(NODE) SPRTF("\n>>>Re-Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, __LINE__, DEBUG_LOG_NODE_NAME(NODE), mode, ++depth_parser, ++count_parser);
#  define DEBUG_LOG_ENTER_WITH_NODE(NODE) SPRTF("\n>>>Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, __LINE__, DEBUG_LOG_NODE_NAME(NODE), mode, ++depth_parser, ++count_parser);
#  define DEBUG_LOG_CHANGE_MODE SPRTF("+++%s-%u Changing mode to %u (was %u)\n", __FUNCTION__, __LINE__, mode, old_mode);
   /* The token may legitimately be NULL here: callers log before testing it. */
#  define DEBUG_LOG_GOT_TOKEN(NODE) SPRTF("---%s-%u got token '%s' with mode '%u'.\n", __FUNCTION__, __LINE__, DEBUG_LOG_NODE_NAME(NODE), mode);
#  define DEBUG_LOG_EXIT_WITH_NODE(NODE) SPRTF("<<<Exit %s-%u with a node to parse: '%s', depth: %d\n", __FUNCTION__, __LINE__, DEBUG_LOG_NODE_NAME(NODE), depth_parser--);
#  define DEBUG_LOG_EXIT SPRTF("<<<Exit %s-%u, depth: %d\n", __FUNCTION__, __LINE__, depth_parser--);
#else
#  define DEBUG_LOG_COUNTERS
#  define DEBUG_LOG_GET_OLD_MODE
#  define DEBUG_LOG_REENTER_WITH_NODE(NODE)
#  define DEBUG_LOG_ENTER_WITH_NODE(NODE)
#  define DEBUG_LOG_CHANGE_MODE
#  define DEBUG_LOG_GOT_TOKEN(NODE)
#  define DEBUG_LOG_EXIT_WITH_NODE(NODE)
#  define DEBUG_LOG_EXIT
#endif
1005 | | |
1006 | | |
1007 | | /***************************************************************************//* |
1008 | | ** MARK: - Parser Search and Instantiation |
1009 | | ***************************************************************************/ |
1010 | | |
1011 | | |
1012 | | /** |
1013 | | * Retrieves the correct parser for the given node, accounting for various |
1014 | | * conditions, and readies the lexer for parsing that node. |
1015 | | */ |
1016 | | static Parser* GetParserForNode( TidyDocImpl* doc, Node *node ) |
1017 | 1.97M | { |
1018 | 1.97M | Lexer* lexer = doc->lexer; |
1019 | | |
1020 | 1.97M | if ( cfgBool( doc, TidyXmlTags ) ) |
1021 | 0 | return ParseXMLElement; |
1022 | | |
1023 | | /* [i_a]2 prevent crash for active content (php, asp) docs */ |
1024 | 1.97M | if (!node || node->tag == NULL) |
1025 | 5.96k | return NULL; |
1026 | | |
1027 | | /* |
1028 | | Fix by GLP 2000-12-21. Need to reset insertspace if this is both |
1029 | | a non-inline and empty tag (base, link, meta, isindex, hr, area). |
1030 | | */ |
1031 | 1.97M | if (node->tag->model & CM_EMPTY) |
1032 | 34.7k | { |
1033 | 34.7k | lexer->waswhite = no; |
1034 | 34.7k | if (node->tag->parser == NULL) |
1035 | 0 | return NULL; |
1036 | 34.7k | } |
1037 | 1.93M | else if (!(node->tag->model & CM_INLINE)) |
1038 | 471k | lexer->insertspace = no; |
1039 | | |
1040 | 1.97M | if (node->tag->parser == NULL) |
1041 | 0 | return NULL; |
1042 | | |
1043 | 1.97M | if (node->type == StartEndTag) |
1044 | 10.9k | return NULL; |
1045 | | |
1046 | | /* [i_a]2 added this - not sure why - CHECKME: */ |
1047 | 1.95M | lexer->parent = node; |
1048 | | |
1049 | 1.95M | return (node->tag->parser); |
1050 | 1.97M | } |
1051 | | |
1052 | | |
1053 | | /** |
1054 | | * This parser controller initiates the parsing process with the document's |
1055 | | * root starting with the provided node, which should be the HTML node after |
1056 | | * the pre-HTML stuff is handled at a higher level. |
1057 | | * |
1058 | | * This controller is responsible for calling each of the individual parsers, |
1059 | | * based on the tokens it pulls from the lexer, or the tokens passed back via |
1060 | | * the parserMemory stack from each of the parsers. Having a main, central |
1061 | | * looping dispatcher in this fashion allows the prevention of recursion. |
1062 | | */ |
1063 | | void ParseHTMLWithNode( TidyDocImpl* doc, Node* node ) |
1064 | 16.9k | { |
1065 | 16.9k | GetTokenMode mode = IgnoreWhitespace; |
1066 | 16.9k | Parser* parser = GetParserForNode( doc, node ); |
1067 | 16.9k | Bool something_to_do = yes; |
1068 | | |
1069 | | /* |
1070 | | This main loop is only extinguished when all of the parser tokens are |
1071 | | consumed. Ideally, EVERY parser will return nodes to this loop for |
1072 | | dispatch to the appropriate parser, but some of the recursive parsers |
1073 | | still consume some tokens on their own. |
1074 | | */ |
1075 | 3.78M | while (something_to_do) |
1076 | 3.77M | { |
1077 | 3.77M | node = parser ? parser( doc, node, mode ) : NULL; |
1078 | | |
1079 | | /* |
1080 | | We have a node, so anything deferred was already pushed to the stack |
1081 | | to be dealt with later. |
1082 | | */ |
1083 | 3.77M | if ( node ) |
1084 | 1.94M | { |
1085 | 1.94M | parser = GetParserForNode( doc, node ); |
1086 | 1.94M | continue; |
1087 | 1.94M | } |
1088 | | |
1089 | | /* |
1090 | | We weren't given a node, which means this particular leaf is bottomed |
1091 | | out. We'll re-enter the parsers using information from the stack. |
1092 | | */ |
1093 | 1.82M | if ( !TY_(isEmptyParserStack)(doc)) |
1094 | 1.81M | { |
1095 | 1.81M | parser = TY_(peekMemoryIdentity)(doc); |
1096 | 1.81M | if (parser) |
1097 | 1.79M | { |
1098 | 1.79M | continue; |
1099 | 1.79M | } |
1100 | 17.5k | else |
1101 | 17.5k | { |
1102 | | /* No parser means we're only passing back a parsing mode. */ |
1103 | 17.5k | mode = TY_(peekMemoryMode)( doc ); |
1104 | 17.5k | TY_(popMemory)( doc ); |
1105 | 17.5k | } |
1106 | 1.81M | } |
1107 | | |
1108 | | /* |
1109 | | At this point, there's nothing being returned from parsers, and |
1110 | | nothing on the stack, so we can draw a new node from the lexer. |
1111 | | */ |
1112 | 30.0k | node = TY_(GetToken)( doc, mode ); |
1113 | 30.0k | DEBUG_LOG_GOT_TOKEN(node); |
1114 | | |
1115 | 30.0k | if (node) |
1116 | 13.0k | parser = GetParserForNode( doc, node ); |
1117 | 16.9k | else |
1118 | 16.9k | something_to_do = no; |
1119 | 30.0k | } |
1120 | 16.9k | } |
1121 | | |
1122 | | |
1123 | | /***************************************************************************//* |
1124 | | ** MARK: - Parsers |
1125 | | ***************************************************************************/ |
1126 | | |
1127 | | |
1128 | | /** MARK: TY_(ParseBlock) |
1129 | | * `element` is a node created by the lexer upon seeing the start tag, or |
1130 | | * by the parser when the start tag is inferred |
1131 | | * |
1132 | | * This is a non-recursing parser. It uses the document's parser memory stack |
1133 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
1134 | | * This parser is also re-enterable, so that post-processing can occur after |
1135 | | * such dispatching. |
1136 | | */ |
1137 | | Node* TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode ) |
1138 | 363k | { |
1139 | 363k | Lexer* lexer = doc->lexer; |
1140 | 363k | Node *node = NULL; |
1141 | 363k | Bool checkstack = yes; |
1142 | 363k | uint istackbase = 0; |
1143 | 363k | DEBUG_LOG_COUNTERS; |
1144 | | |
1145 | 363k | if ( element == NULL ) |
1146 | 179k | { |
1147 | 179k | TidyParserMemory memory = TY_(popMemory)( doc ); |
1148 | 179k | node = memory.reentry_node; /* Throwaway, because the loop overwrites this immediately. */ |
1149 | 179k | DEBUG_LOG_REENTER_WITH_NODE(node); |
1150 | 179k | element = memory.original_node; |
1151 | 179k | DEBUG_LOG_GET_OLD_MODE; |
1152 | 179k | mode = memory.reentry_mode; |
1153 | 179k | DEBUG_LOG_CHANGE_MODE; |
1154 | 179k | } |
1155 | 183k | else |
1156 | 183k | { |
1157 | 183k | DEBUG_LOG_ENTER_WITH_NODE(element); |
1158 | | |
1159 | 183k | if ( element->tag->model & CM_EMPTY ) |
1160 | 231 | { |
1161 | 231 | DEBUG_LOG_EXIT; |
1162 | 231 | return NULL; |
1163 | 231 | } |
1164 | | |
1165 | 183k | if ( nodeIsDIV(element) && nodeIsDL(element->parent) && TY_(IsHTML5Mode)(doc) ) |
1166 | 43 | { |
1167 | 43 | DEBUG_LOG_EXIT; |
1168 | 43 | return TY_(ParseDefList)(doc, element, mode); /* @warning: possible recursion! */ |
1169 | 43 | } |
1170 | | |
1171 | 183k | if ( nodeIsFORM(element) && DescendantOf(element, TidyTag_FORM) ) |
1172 | 4.31k | { |
1173 | 4.31k | TY_(Report)(doc, element, NULL, ILLEGAL_NESTING ); |
1174 | 4.31k | } |
1175 | | |
1176 | | /* |
1177 | | InlineDup() asks the lexer to insert inline emphasis tags |
1178 | | currently pushed on the istack, but take care to avoid |
1179 | | propagating inline emphasis inside OBJECT or APPLET. |
1180 | | For these elements a fresh inline stack context is created |
1181 | | and disposed of upon reaching the end of the element. |
1182 | | They thus behave like table cells in this respect. |
1183 | | */ |
1184 | 183k | if (element->tag->model & CM_OBJECT) |
1185 | 13.4k | { |
1186 | 13.4k | istackbase = lexer->istackbase; |
1187 | 13.4k | lexer->istackbase = lexer->istacksize; |
1188 | 13.4k | } |
1189 | | |
1190 | 183k | if (!(element->tag->model & CM_MIXED)) |
1191 | 173k | { |
1192 | 173k | TY_(InlineDup)( doc, NULL ); |
1193 | 173k | } |
1194 | | |
1195 | | /*\ |
1196 | | * Issue #212 - If it is likely that it may be necessary |
1197 | | * to move a leading space into a text node before this |
1198 | | * element, then keep the mode MixedContent to keep any |
1199 | | * leading space |
1200 | | \*/ |
1201 | 183k | if ( !(element->tag->model & CM_INLINE) || |
1202 | 127k | (element->tag->model & CM_FIELD ) ) |
1203 | 56.4k | { |
1204 | 56.4k | DEBUG_LOG_GET_OLD_MODE; |
1205 | 56.4k | mode = IgnoreWhitespace; |
1206 | 56.4k | DEBUG_LOG_CHANGE_MODE; |
1207 | 56.4k | } |
1208 | 127k | else if (mode == IgnoreWhitespace) |
1209 | 127k | { |
1210 | | /* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace' |
1211 | | when such a leading space may need to be inserted before this element to |
1212 | | preserve the browser view */ |
1213 | 127k | DEBUG_LOG_GET_OLD_MODE; |
1214 | 127k | mode = MixedContent; |
1215 | 127k | DEBUG_LOG_CHANGE_MODE; |
1216 | 127k | } |
1217 | 183k | } /* Re-Entering */ |
1218 | | |
1219 | | /* |
1220 | | Main Loop |
1221 | | */ |
1222 | | |
1223 | 436k | while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL) |
1224 | 299k | { |
1225 | 299k | DEBUG_LOG_GOT_TOKEN(node); |
1226 | | /* end tag for this element */ |
1227 | 299k | if (node->type == EndTag && node->tag && |
1228 | 18.0k | (node->tag == element->tag || element->was == node->tag)) |
1229 | 6.00k | { |
1230 | 6.00k | TY_(FreeNode)( doc, node ); |
1231 | | |
1232 | 6.00k | if (element->tag->model & CM_OBJECT) |
1233 | 630 | { |
1234 | | /* pop inline stack */ |
1235 | 3.69k | while (lexer->istacksize > lexer->istackbase) |
1236 | 3.06k | TY_(PopInline)( doc, NULL ); |
1237 | 630 | lexer->istackbase = istackbase; |
1238 | 630 | } |
1239 | | |
1240 | 6.00k | element->closed = yes; |
1241 | 6.00k | TrimSpaces( doc, element ); |
1242 | 6.00k | DEBUG_LOG_EXIT; |
1243 | 6.00k | return NULL; |
1244 | 6.00k | } |
1245 | | |
1246 | 292k | if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) ) |
1247 | 2.33k | { |
1248 | 2.33k | if ( TY_(nodeIsElement)(node) ) |
1249 | 1.34k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1250 | 2.33k | TY_(FreeNode)( doc, node ); |
1251 | 2.33k | continue; |
1252 | 2.33k | } |
1253 | | |
1254 | | |
1255 | 290k | if (node->type == EndTag) |
1256 | 11.8k | { |
1257 | 11.8k | if (node->tag == NULL) |
1258 | 765 | { |
1259 | 765 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1260 | 765 | TY_(FreeNode)( doc, node ); |
1261 | 765 | continue; |
1262 | 765 | } |
1263 | 11.0k | else if ( nodeIsBR(node) ) |
1264 | 658 | { |
1265 | 658 | node->type = StartTag; |
1266 | 658 | } |
1267 | 10.3k | else if ( nodeIsP(node) ) |
1268 | 1.05k | { |
1269 | | /* Cannot have a block inside a paragraph, so no checking |
1270 | | for an ancestor is necessary -- but we _can_ have |
1271 | | paragraphs inside a block, so change it to an implicit |
1272 | | empty paragraph, to be dealt with according to the user's |
1273 | | options |
1274 | | */ |
1275 | 1.05k | node->type = StartEndTag; |
1276 | 1.05k | node->implicit = yes; |
1277 | 1.05k | } |
1278 | 9.32k | else if (DescendantOf( element, node->tag->id )) |
1279 | 4.47k | { |
1280 | | /* |
1281 | | if this is the end tag for an ancestor element |
1282 | | then infer end tag for this element |
1283 | | */ |
1284 | 4.47k | TY_(UngetToken)( doc ); |
1285 | 4.47k | break; |
1286 | 4.47k | } |
1287 | 4.85k | else |
1288 | 4.85k | { |
1289 | | /* special case </tr> etc. for stuff moved in front of table */ |
1290 | 4.85k | if ( lexer->exiled |
1291 | 1.40k | && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) |
1292 | 802 | { |
1293 | 802 | TY_(UngetToken)( doc ); |
1294 | 802 | TrimSpaces( doc, element ); |
1295 | 802 | DEBUG_LOG_EXIT; |
1296 | 802 | return NULL; |
1297 | 802 | } |
1298 | 4.85k | } |
1299 | 11.8k | } |
1300 | | |
1301 | | /* mixed content model permits text */ |
1302 | 284k | if (TY_(nodeIsText)(node)) |
1303 | 28.9k | { |
1304 | 28.9k | if ( checkstack ) |
1305 | 23.2k | { |
1306 | 23.2k | checkstack = no; |
1307 | 23.2k | if (!(element->tag->model & CM_MIXED)) |
1308 | 19.4k | { |
1309 | 19.4k | if ( TY_(InlineDup)(doc, node) > 0 ) |
1310 | 2.86k | continue; |
1311 | 19.4k | } |
1312 | 23.2k | } |
1313 | | |
1314 | 26.0k | TY_(InsertNodeAtEnd)(element, node); |
1315 | 26.0k | DEBUG_LOG_GET_OLD_MODE |
1316 | 26.0k | mode = MixedContent; |
1317 | 26.0k | DEBUG_LOG_CHANGE_MODE; |
1318 | | /* |
1319 | | HTML4 strict doesn't allow mixed content for |
1320 | | elements with %block; as their content model |
1321 | | */ |
1322 | | /* |
1323 | | But only body, map, blockquote, form and |
1324 | | noscript have content model %block; |
1325 | | */ |
1326 | 26.0k | if ( nodeIsBODY(element) || |
1327 | 26.0k | nodeIsMAP(element) || |
1328 | 26.0k | nodeIsBLOCKQUOTE(element) || |
1329 | 26.0k | nodeIsFORM(element) || |
1330 | 18.5k | nodeIsNOSCRIPT(element) ) |
1331 | 7.80k | TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT ); |
1332 | 26.0k | continue; |
1333 | 28.9k | } |
1334 | | |
1335 | 255k | if ( InsertMisc(element, node) ) |
1336 | 4.45k | continue; |
1337 | | |
1338 | | /* allow PARAM elements? */ |
1339 | 251k | if ( nodeIsPARAM(node) ) |
1340 | 7.20k | { |
1341 | 7.20k | if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) ) |
1342 | 6.93k | { |
1343 | 6.93k | TY_(InsertNodeAtEnd)(element, node); |
1344 | 6.93k | continue; |
1345 | 6.93k | } |
1346 | | |
1347 | | /* otherwise discard it */ |
1348 | 261 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1349 | 261 | TY_(FreeNode)( doc, node ); |
1350 | 261 | continue; |
1351 | 7.20k | } |
1352 | | |
1353 | | /* allow AREA elements? */ |
1354 | 244k | if ( nodeIsAREA(node) ) |
1355 | 3.96k | { |
1356 | 3.96k | if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) ) |
1357 | 1.78k | { |
1358 | 1.78k | TY_(InsertNodeAtEnd)(element, node); |
1359 | 1.78k | continue; |
1360 | 1.78k | } |
1361 | | |
1362 | | /* otherwise discard it */ |
1363 | 2.17k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1364 | 2.17k | TY_(FreeNode)( doc, node ); |
1365 | 2.17k | continue; |
1366 | 3.96k | } |
1367 | | |
1368 | | /* ignore unknown start/end tags */ |
1369 | 240k | if ( node->tag == NULL ) |
1370 | 14.9k | { |
1371 | 14.9k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1372 | 14.9k | TY_(FreeNode)( doc, node ); |
1373 | 14.9k | continue; |
1374 | 14.9k | } |
1375 | | |
1376 | | /* |
1377 | | Allow CM_INLINE elements here. |
1378 | | |
1379 | | Allow CM_BLOCK elements here unless |
1380 | | lexer->excludeBlocks is yes. |
1381 | | |
1382 | | LI and DD are special cased. |
1383 | | |
1384 | | Otherwise infer end tag for this element. |
1385 | | */ |
1386 | | |
1387 | 225k | if ( !TY_(nodeHasCM)(node, CM_INLINE) ) |
1388 | 89.5k | { |
1389 | 89.5k | if ( !TY_(nodeIsElement)(node) ) |
1390 | 895 | { |
1391 | 895 | if ( nodeIsFORM(node) ) |
1392 | 208 | BadForm( doc ); |
1393 | | |
1394 | 895 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1395 | 895 | TY_(FreeNode)( doc, node ); |
1396 | 895 | continue; |
1397 | 895 | } |
1398 | | |
1399 | | /* #427671 - Fix by Randy Waki - 10 Aug 00 */ |
1400 | | /* |
1401 | | If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION |
1402 | | start tag, discard the start tag and let the subsequent content get |
1403 | | parsed as content of the enclosing LI. This seems to mimic IE and |
1404 | | Netscape, and avoids an infinite loop: without this check, |
1405 | | ParseBlock (which is parsing the LI's content) and ParseList (which |
1406 | | is parsing the LI's parent's content) repeatedly defer to each |
1407 | | other to parse the illegal start tag, each time inferring a missing |
1408 | | </li> or <li> respectively. |
1409 | | |
1410 | | NOTE: This check is a bit fragile. It specifically checks for the |
1411 | | four tags that happen to weave their way through the current series |
1412 | | of tests performed by ParseBlock and ParseList to trigger the |
1413 | | infinite loop. |
1414 | | */ |
1415 | 88.6k | if ( nodeIsLI(element) ) |
1416 | 23.3k | { |
1417 | 23.3k | if ( nodeIsFRAME(node) || |
1418 | 23.3k | nodeIsFRAMESET(node) || |
1419 | 23.3k | nodeIsOPTGROUP(node) || |
1420 | 22.5k | nodeIsOPTION(node) ) |
1421 | 937 | { |
1422 | 937 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1423 | 937 | TY_(FreeNode)( doc, node ); /* DSR - 27Apr02 avoid memory leak */ |
1424 | 937 | continue; |
1425 | 937 | } |
1426 | 23.3k | } |
1427 | | |
1428 | 87.6k | if ( nodeIsTD(element) || nodeIsTH(element) ) |
1429 | 14.3k | { |
1430 | | /* if parent is a table cell, avoid inferring the end of the cell */ |
1431 | | |
1432 | 14.3k | if ( TY_(nodeHasCM)(node, CM_HEAD) ) |
1433 | 603 | { |
1434 | 603 | MoveToHead( doc, element, node ); |
1435 | 603 | continue; |
1436 | 603 | } |
1437 | | |
1438 | 13.7k | if ( TY_(nodeHasCM)(node, CM_LIST) ) |
1439 | 279 | { |
1440 | 279 | TY_(UngetToken)( doc ); |
1441 | 279 | node = TY_(InferredTag)(doc, TidyTag_UL); |
1442 | 279 | AddClassNoIndent(doc, node); |
1443 | 279 | lexer->excludeBlocks = yes; |
1444 | 279 | } |
1445 | 13.4k | else if ( TY_(nodeHasCM)(node, CM_DEFLIST) ) |
1446 | 5.09k | { |
1447 | 5.09k | TY_(UngetToken)( doc ); |
1448 | 5.09k | node = TY_(InferredTag)(doc, TidyTag_DL); |
1449 | 5.09k | lexer->excludeBlocks = yes; |
1450 | 5.09k | } |
1451 | | |
1452 | | /* infer end of current table cell */ |
1453 | 13.7k | if ( !TY_(nodeHasCM)(node, CM_BLOCK) ) |
1454 | 2.92k | { |
1455 | 2.92k | TY_(UngetToken)( doc ); |
1456 | 2.92k | TrimSpaces( doc, element ); |
1457 | 2.92k | DEBUG_LOG_EXIT; |
1458 | 2.92k | return NULL; |
1459 | 2.92k | } |
1460 | 13.7k | } |
1461 | 73.3k | else if ( TY_(nodeHasCM)(node, CM_BLOCK) ) |
1462 | 42.2k | { |
1463 | 42.2k | if ( lexer->excludeBlocks ) |
1464 | 3.24k | { |
1465 | 3.24k | if ( !TY_(nodeHasCM)(element, CM_OPT) ) |
1466 | 2.67k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); |
1467 | | |
1468 | 3.24k | TY_(UngetToken)( doc ); |
1469 | | |
1470 | 3.24k | if ( TY_(nodeHasCM)(element, CM_OBJECT) ) |
1471 | 50 | lexer->istackbase = istackbase; |
1472 | | |
1473 | 3.24k | TrimSpaces( doc, element ); |
1474 | 3.24k | DEBUG_LOG_EXIT; |
1475 | 3.24k | return NULL; |
1476 | 3.24k | } |
1477 | 42.2k | } |
1478 | 31.0k | else if ( ! nodeIsTEMPLATE( element ) )/* things like list items */ |
1479 | 31.0k | { |
1480 | 31.0k | if (node->tag->model & CM_HEAD) |
1481 | 1.64k | { |
1482 | 1.64k | MoveToHead( doc, element, node ); |
1483 | 1.64k | continue; |
1484 | 1.64k | } |
1485 | | |
1486 | | /* |
1487 | | special case where a form start tag |
1488 | | occurs in a tr and is followed by td or th |
1489 | | */ |
1490 | | |
1491 | 29.3k | if ( nodeIsFORM(element) && |
1492 | 29.3k | nodeIsTD(element->parent) && |
1493 | 2.69k | element->parent->implicit ) |
1494 | 771 | { |
1495 | 771 | if ( nodeIsTD(node) ) |
1496 | 88 | { |
1497 | 88 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1498 | 88 | TY_(FreeNode)( doc, node ); |
1499 | 88 | continue; |
1500 | 88 | } |
1501 | | |
1502 | 683 | if ( nodeIsTH(node) ) |
1503 | 308 | { |
1504 | 308 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1505 | 308 | TY_(FreeNode)( doc, node ); |
1506 | 308 | node = element->parent; |
1507 | 308 | TidyDocFree(doc, node->element); |
1508 | 308 | node->element = TY_(tmbstrdup)(doc->allocator, "th"); |
1509 | 308 | node->tag = TY_(LookupTagDef)( TidyTag_TH ); |
1510 | 308 | continue; |
1511 | 308 | } |
1512 | 683 | } |
1513 | | |
1514 | 29.0k | if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit ) |
1515 | 14.6k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); |
1516 | | |
1517 | | /* #521, warn on missing optional end-tags if not omitting them. */ |
1518 | 29.0k | if ( cfgBool( doc, TidyOmitOptionalTags ) == no && TY_(nodeHasCM)(element, CM_OPT) ) |
1519 | 13.9k | TY_(Report)(doc, element, node, MISSING_ENDTAG_OPTIONAL ); |
1520 | | |
1521 | | |
1522 | 29.0k | TY_(UngetToken)( doc ); |
1523 | | |
1524 | 29.0k | if ( TY_(nodeHasCM)(node, CM_LIST) ) |
1525 | 2.16k | { |
1526 | 2.16k | if ( element->parent && element->parent->tag && |
1527 | 2.08k | element->parent->tag->parser == TY_(ParseList) ) |
1528 | 1.86k | { |
1529 | 1.86k | TrimSpaces( doc, element ); |
1530 | 1.86k | DEBUG_LOG_EXIT; |
1531 | 1.86k | return NULL; |
1532 | 1.86k | } |
1533 | | |
1534 | 307 | node = TY_(InferredTag)(doc, TidyTag_UL); |
1535 | 307 | AddClassNoIndent(doc, node); |
1536 | 307 | } |
1537 | 26.8k | else if ( TY_(nodeHasCM)(node, CM_DEFLIST) ) |
1538 | 3.35k | { |
1539 | 3.35k | if ( nodeIsDL(element->parent) ) |
1540 | 2.87k | { |
1541 | 2.87k | TrimSpaces( doc, element ); |
1542 | 2.87k | DEBUG_LOG_EXIT; |
1543 | 2.87k | return NULL; |
1544 | 2.87k | } |
1545 | | |
1546 | 479 | node = TY_(InferredTag)(doc, TidyTag_DL); |
1547 | 479 | } |
1548 | 23.4k | else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) ) |
1549 | 15.8k | { |
1550 | | /* http://tidy.sf.net/issue/1316307 */ |
1551 | | /* In exiled mode, return so table processing can |
1552 | | continue. */ |
1553 | 15.8k | if (lexer->exiled) |
1554 | 5.08k | { |
1555 | 5.08k | DEBUG_LOG_EXIT; |
1556 | 5.08k | return NULL; |
1557 | 5.08k | } |
1558 | 10.7k | node = TY_(InferredTag)(doc, TidyTag_TABLE); |
1559 | 10.7k | } |
1560 | 7.64k | else if ( TY_(nodeHasCM)(element, CM_OBJECT) ) |
1561 | 597 | { |
1562 | | /* pop inline stack */ |
1563 | 6.64k | while ( lexer->istacksize > lexer->istackbase ) |
1564 | 6.04k | TY_(PopInline)( doc, NULL ); |
1565 | 597 | lexer->istackbase = istackbase; |
1566 | 597 | TrimSpaces( doc, element ); |
1567 | 597 | DEBUG_LOG_EXIT; |
1568 | 597 | return NULL; |
1569 | | |
1570 | 597 | } |
1571 | 7.05k | else |
1572 | 7.05k | { |
1573 | 7.05k | TrimSpaces( doc, element ); |
1574 | 7.05k | DEBUG_LOG_EXIT; |
1575 | 7.05k | return NULL; |
1576 | 7.05k | } |
1577 | 29.0k | } |
1578 | 87.6k | } |
1579 | | |
1580 | | /*\ |
1581 | | * Issue #307 - an <A> tag to ends any open <A> element |
1582 | | * Like #427827 - fixed by Randy Waki and Bjoern Hoehrmann 23 Aug 00 |
1583 | | * in ParseInline(), fix copied HERE to ParseBlock() |
1584 | | * href: http://www.w3.org/TR/html-markup/a.html |
1585 | | * The interactive element a must not appear as a descendant of the a element. |
1586 | | \*/ |
1587 | 196k | if ( nodeIsA(node) && !node->implicit && |
1588 | 14.4k | (nodeIsA(element) || DescendantOf(element, TidyTag_A)) ) |
1589 | 10.3k | { |
1590 | 10.3k | if (node->type != EndTag && node->attributes == NULL |
1591 | 3.86k | && cfgBool(doc, TidyCoerceEndTags) ) |
1592 | 3.86k | { |
1593 | 3.86k | node->type = EndTag; |
1594 | 3.86k | TY_(Report)(doc, element, node, COERCE_TO_ENDTAG); |
1595 | 3.86k | TY_(UngetToken)( doc ); |
1596 | 3.86k | continue; |
1597 | 3.86k | } |
1598 | | |
1599 | 6.45k | if (nodeIsA(element)) |
1600 | 3.49k | { |
1601 | 3.49k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); |
1602 | 3.49k | TY_(UngetToken)( doc ); |
1603 | 3.49k | } |
1604 | 2.96k | else |
1605 | 2.96k | { |
1606 | | /* Issue #597 - if we not 'UngetToken' then it is being discarded. |
1607 | | Add message, and 'FreeNode' - thanks @ralfjunker */ |
1608 | 2.96k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
1609 | 2.96k | TY_(FreeNode)(doc, node); |
1610 | 2.96k | } |
1611 | | |
1612 | 6.45k | if (!(mode & Preformatted)) |
1613 | 6.45k | TrimSpaces(doc, element); |
1614 | | |
1615 | 6.45k | DEBUG_LOG_EXIT; |
1616 | 6.45k | return NULL; |
1617 | 10.3k | } |
1618 | | |
1619 | | /* parse known element */ |
1620 | 186k | if (TY_(nodeIsElement)(node)) |
1621 | 185k | { |
1622 | 185k | if (node->tag->model & CM_INLINE) |
1623 | 123k | { |
1624 | 123k | if (checkstack && !node->implicit) |
1625 | 43.4k | { |
1626 | 43.4k | checkstack = no; |
1627 | | |
1628 | 43.4k | if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */ |
1629 | 41.9k | { |
1630 | 41.9k | if ( TY_(InlineDup)(doc, node) > 0 ) |
1631 | 947 | continue; |
1632 | 41.9k | } |
1633 | 43.4k | } |
1634 | | |
1635 | 123k | DEBUG_LOG_GET_OLD_MODE; |
1636 | 123k | mode = MixedContent; |
1637 | 123k | DEBUG_LOG_CHANGE_MODE; |
1638 | 123k | } |
1639 | 61.4k | else |
1640 | 61.4k | { |
1641 | 61.4k | checkstack = yes; |
1642 | 61.4k | DEBUG_LOG_GET_OLD_MODE; |
1643 | 61.4k | mode = IgnoreWhitespace; |
1644 | 61.4k | DEBUG_LOG_CHANGE_MODE; |
1645 | 61.4k | } |
1646 | | |
1647 | | /* trim white space before <br> */ |
1648 | 184k | if ( nodeIsBR(node) ) |
1649 | 2.25k | TrimSpaces( doc, element ); |
1650 | | |
1651 | 184k | TY_(InsertNodeAtEnd)(element, node); |
1652 | | |
1653 | 184k | if (node->implicit) |
1654 | 93.7k | TY_(Report)(doc, element, node, INSERTING_TAG ); |
1655 | | |
1656 | | /* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an |
1657 | | effort has been made above to set a 'MixedContent' mode in some cases? |
1658 | | WHY IS THE 'mode' VARIABLE NOT USED HERE???? */ |
1659 | | |
1660 | 184k | { |
1661 | 184k | TidyParserMemory memory = {0}; |
1662 | 184k | memory.identity = TY_(ParseBlock); |
1663 | 184k | memory.reentry_node = node; |
1664 | 184k | memory.reentry_mode = mode; |
1665 | 184k | memory.original_node = element; |
1666 | 184k | TY_(pushMemory)(doc, memory); |
1667 | 184k | DEBUG_LOG_EXIT_WITH_NODE(node); |
1668 | 184k | } |
1669 | 184k | return node; |
1670 | 185k | } |
1671 | | |
1672 | | /* discard unexpected tags */ |
1673 | 1.26k | if (node->type == EndTag) |
1674 | 1.26k | TY_(PopInline)( doc, node ); /* if inline end tag */ |
1675 | | |
1676 | 1.26k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1677 | 1.26k | TY_(FreeNode)( doc, node ); |
1678 | 1.26k | continue; |
1679 | 186k | } |
1680 | | |
1681 | 141k | if (!(element->tag->model & CM_OPT)) |
1682 | 110k | TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR); |
1683 | | |
1684 | 141k | if (element->tag->model & CM_OBJECT) |
1685 | 10.9k | { |
1686 | | /* pop inline stack */ |
1687 | 11.4k | while ( lexer->istacksize > lexer->istackbase ) |
1688 | 499 | TY_(PopInline)( doc, NULL ); |
1689 | 10.9k | lexer->istackbase = istackbase; |
1690 | 10.9k | } |
1691 | | |
1692 | 141k | TrimSpaces( doc, element ); |
1693 | | |
1694 | 141k | DEBUG_LOG_EXIT; |
1695 | 141k | return NULL; |
1696 | 363k | } |
1697 | | |
1698 | | |
1699 | | /** MARK: TY_(ParseBody) |
1700 | | * Parses the `body` tag. |
1701 | | * |
1702 | | * This is a non-recursing parser. It uses the document's parser memory stack |
1703 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
1704 | | * This parser is also re-enterable, so that post-processing can occur after |
1705 | | * such dispatching. |
1706 | | */ |
Node* TY_(ParseBody)( TidyDocImpl* doc, Node *body, GetTokenMode mode )
{
    /*
      Overview: tokens are pulled one at a time and either inserted into
      `body`, discarded with a report, moved elsewhere, or - for elements
      that need their own parser - inserted and returned to the caller
      after this parser's state has been pushed on the document's parser
      memory stack. A NULL return means this parser has nothing further
      to hand back.
    */
    Lexer* lexer = doc->lexer;
    Node *node = NULL;
    /* When yes, the next text node or non-implicit inline element is
       offered to InlineDup() once before being inserted (see both uses
       below); it is re-armed after every non-inline element. */
    Bool checkstack = no;
    /* Recomputed per token: yes when the token is a text node spanning at
       most one buffer position and that position holds a space. */
    Bool iswhitenode = no;
    DEBUG_LOG_COUNTERS;

    /* The incoming `mode` argument is overwritten: a fresh entry always
       starts in IgnoreWhitespace. On re-entry it is replaced again below
       by the saved mode. */
    mode = IgnoreWhitespace;
    checkstack = yes;

    /*
      If we're re-entering, then we need to setup from a previous state,
      instead of starting fresh. We can pull what we need from the document's
      stack.
    */
    if ( body == NULL )
    {
        TidyParserMemory memory = TY_(popMemory)( doc );
        node = memory.reentry_node; /* Throwaway, as main loop overwrites anyway. */
        DEBUG_LOG_REENTER_WITH_NODE(node);
        body = memory.original_node;
        checkstack = memory.register_1;
        iswhitenode = memory.register_2;
        DEBUG_LOG_GET_OLD_MODE;
        mode = memory.mode;
        DEBUG_LOG_CHANGE_MODE;
    }
    else
    {
        DEBUG_LOG_ENTER_WITH_NODE(body);
        /* NOTE(review): BumpObject is defined elsewhere; presumably it
           relocates <object> elements relative to head/body - confirm. */
        TY_(BumpObject)( doc, body->parent );
    }

    while ((node = TY_(GetToken)(doc, mode)) != NULL)
    {
        DEBUG_LOG_GOT_TOKEN(node);
        /* find and discard multiple <body> elements */
        if (node->tag == body->tag && node->type == StartTag)
        {
            TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)(doc, node);
            continue;
        }

        /* #538536 Extra endtags not detected */
        if ( nodeIsHTML(node) )
        {
            /* Only the first </html> is accepted silently; any <html>
               start tag or repeated </html> is reported. Either way the
               token is dropped. */
            if (TY_(nodeIsElement)(node) || lexer->seenEndHtml)
                TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
            else
                lexer->seenEndHtml = 1;

            TY_(FreeNode)( doc, node);
            continue;
        }

        /* Tags seen after </body> are reported, but processing of the
           token continues below regardless. */
        if ( lexer->seenEndBody &&
             ( node->type == StartTag ||
               node->type == EndTag   ||
               node->type == StartEndTag ) )
        {
            TY_(Report)(doc, body, node, CONTENT_AFTER_BODY );
        }

        /* </body>: mark closed and remember it, but keep consuming tokens
           unless this body lives inside <noframes>. */
        if ( node->tag == body->tag && node->type == EndTag )
        {
            body->closed = yes;
            TrimSpaces(doc, body);
            TY_(FreeNode)( doc, node);
            lexer->seenEndBody = 1;
            DEBUG_LOG_GET_OLD_MODE;
            mode = IgnoreWhitespace;
            DEBUG_LOG_CHANGE_MODE;

            if ( nodeIsNOFRAMES(body->parent) )
                break;

            continue;
        }

        if ( nodeIsNOFRAMES(node) )
        {
            if (node->type == StartTag)
            {
                TidyParserMemory memory = {0};

                TY_(InsertNodeAtEnd)(body, node);

                /* Save this parser's state so it can be resumed after
                   the returned <noframes> node has been handled. */
                memory.identity = TY_(ParseBody);
                memory.original_node = body;
                memory.reentry_node = node;
                memory.register_1 = checkstack;
                memory.register_2 = iswhitenode;
                memory.mode = mode;
                TY_(pushMemory)( doc, memory );
                DEBUG_LOG_EXIT_WITH_NODE(node);
                return node;
            }

            /* </noframes> while parsing a body nested in <noframes>:
               put the token back and stop. Other <noframes> tokens fall
               through to the generic handling below. */
            if (node->type == EndTag && nodeIsNOFRAMES(body->parent) )
            {
                TrimSpaces(doc, body);
                TY_(UngetToken)( doc );
                break;
            }
        }

        /* A frame/frameset token also ends a body nested in <noframes>. */
        if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node))
             && nodeIsNOFRAMES(body->parent) )
        {
            TrimSpaces(doc, body);
            TY_(UngetToken)( doc );
            break;
        }

        iswhitenode = no;

        if ( TY_(nodeIsText)(node) &&
             node->end <= node->start + 1 &&
             lexer->lexbuf[node->start] == ' ' )
            iswhitenode = yes;

        /* deal with comments etc. */
        if (InsertMisc(body, node))
            continue;

        /* mixed content model permits text */
        if (TY_(nodeIsText)(node))
        {
            /* A lone space is dropped while whitespace is being ignored. */
            if (iswhitenode && mode == IgnoreWhitespace)
            {
                TY_(FreeNode)( doc, node);
                continue;
            }

            /* HTML 2 and HTML4 strict don't allow text here */
            TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20));

            if (checkstack)
            {
                checkstack = no;

                /* A positive InlineDup result means the token is not
                   inserted on this pass. NOTE(review): presumably it was
                   pushed back behind re-opened inline tags - confirm. */
                if ( TY_(InlineDup)(doc, node) > 0 )
                    continue;
            }

            TY_(InsertNodeAtEnd)(body, node);
            DEBUG_LOG_GET_OLD_MODE;
            mode = MixedContent;
            DEBUG_LOG_CHANGE_MODE;
            continue;
        }

        if (node->type == DocTypeTag)
        {
            InsertDocType(doc, body, node);
            continue;
        }
        /* discard unknown and PARAM tags */
        if ( node->tag == NULL || nodeIsPARAM(node) )
        {
            TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
            TY_(FreeNode)( doc, node);
            continue;
        }

        /*
          Netscape allows LI and DD directly in BODY
          We infer UL or DL respectively and use this
          Bool to exclude block-level elements so as
          to match Netscape's observed behaviour.
        */
        lexer->excludeBlocks = no;

        /* Taken for <li> always, and - outside HTML5 mode - for <input>
           and for any tag that is neither block nor inline. */
        if ((( nodeIsINPUT(node) ||
             (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE))
           ) && !TY_(IsHTML5Mode)(doc)) || nodeIsLI(node) )
        {
            /* avoid this error message being issued twice */
            if (!(node->tag->model & CM_HEAD))
                TY_(Report)(doc, body, node, TAG_NOT_ALLOWED_IN);

            if (node->tag->model & CM_HTML)
            {
                /* copy body attributes if current body was inferred */
                if ( nodeIsBODY(node) && body->implicit
                     && body->attributes == NULL )
                {
                    body->attributes = node->attributes;
                    node->attributes = NULL;
                }

                TY_(FreeNode)( doc, node);
                continue;
            }

            if (node->tag->model & CM_HEAD)
            {
                MoveToHead(doc, body, node);
                continue;
            }

            /* For the cases below the offending token is put back and an
               inferred container replaces it as the current node. */
            if (node->tag->model & CM_LIST)
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_UL);
                AddClassNoIndent(doc, node);
                lexer->excludeBlocks = yes;
            }
            else if (node->tag->model & CM_DEFLIST)
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_DL);
                lexer->excludeBlocks = yes;
            }
            else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW))
            {
                /* http://tidy.sf.net/issue/2855621 */
                /* An end tag is not pushed back; it stays the current
                   node and is handled by the end-tag logic further down. */
                if (node->type != EndTag) {
                    TY_(UngetToken)( doc );
                    node = TY_(InferredTag)(doc, TidyTag_TABLE);
                }
                lexer->excludeBlocks = yes;
            }
            else if ( nodeIsINPUT(node) )
            {
                TY_(UngetToken)( doc );
                node = TY_(InferredTag)(doc, TidyTag_FORM);
                lexer->excludeBlocks = yes;
            }
            else
            {
                /* Anything that is not a row/field element is put back
                   and this parser returns without saving state. */
                if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) )
                {
                    TY_(UngetToken)( doc );
                    DEBUG_LOG_EXIT;
                    return NULL;
                }

                /* ignore </td> </th> <option> etc. */
                TY_(FreeNode)( doc, node );
                continue;
            }
        }

        if (node->type == EndTag)
        {
            /* </br> is treated as <br>, and </p> as an implicit empty
               <p/>; other inline end tags go to PopInline. */
            if ( nodeIsBR(node) )
            {
                node->type = StartTag;
            }
            else if ( nodeIsP(node) )
            {
                node->type = StartEndTag;
                node->implicit = yes;
            }
            else if ( TY_(nodeHasCM)(node, CM_INLINE) )
                TY_(PopInline)( doc, node );
        }

        if (TY_(nodeIsElement)(node))
        {
            if (nodeIsMAIN(node))
            {
                /*\ Issue #166 - repeated <main> element
                 *  How to efficiently search for a previous main element?
                \*/
                if ( findNodeById(doc, TidyTag_MAIN) )
                {
                    doc->badForm |= flg_BadMain; /* this is an ERROR in format */
                    TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
                    TY_(FreeNode)( doc, node);
                    continue;
                }
            }
            /* Issue #20 - merging from Ger Hobbelt fork put back CM_MIXED, which had been
               removed to fix this issue - reverting to fix 880221e
             */
            if ( TY_(nodeHasCM)(node, CM_INLINE) )
            {
                /* HTML4 strict doesn't allow inline content here */
                /* but HTML2 does allow img elements as children of body */
                if ( nodeIsIMG(node) )
                    TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT);
                else
                    TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20));

                if (checkstack && !node->implicit)
                {
                    checkstack = no;

                    if ( TY_(InlineDup)(doc, node) > 0 )
                        continue;
                }

                DEBUG_LOG_GET_OLD_MODE;
                mode = MixedContent;
                DEBUG_LOG_CHANGE_MODE;
            }
            else
            {
                /* Non-inline element: re-arm the InlineDup check and go
                   back to ignoring whitespace. */
                checkstack = yes;
                DEBUG_LOG_GET_OLD_MODE;
                mode = IgnoreWhitespace;
                DEBUG_LOG_CHANGE_MODE;
            }

            if (node->implicit)
            {
                TY_(Report)(doc, body, node, INSERTING_TAG);
            }

            TY_(InsertNodeAtEnd)(body, node);

            /* Save state and hand the element back so it can be
               dispatched; this parser resumes afterwards with
               body == NULL. */
            {
                TidyParserMemory memory = {0};
                memory.identity = TY_(ParseBody);
                memory.original_node = body;
                memory.reentry_node = node;
                memory.register_1 = checkstack;
                memory.register_2 = iswhitenode;
                memory.mode = mode;
                TY_(pushMemory)( doc, memory );
            }
            DEBUG_LOG_EXIT_WITH_NODE(node);
            return node;
        }

        /* discard unexpected tags */
        TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED);
        TY_(FreeNode)( doc, node);
    }
    /* Reached when the token stream is exhausted or one of the
       <noframes>-related breaks above fired. */
    DEBUG_LOG_EXIT;
    return NULL;
}
2043 | | |
2044 | | |
2045 | | /** MARK: TY_(ParseColGroup) |
2046 | | * Parses the `colgroup` tag. |
2047 | | * |
2048 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2049 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2050 | | * This parser is also re-enterable, so that post-processing can occur after |
2051 | | * such dispatching. |
2052 | | */ |
2053 | | Node* TY_(ParseColGroup)( TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode) ) |
2054 | 2.97k | { |
2055 | 2.97k | Node *node, *parent; |
2056 | 2.97k | DEBUG_LOG_COUNTERS; |
2057 | | |
2058 | | /* |
2059 | | If we're re-entering, then we need to setup from a previous state, |
2060 | | instead of starting fresh. We can pull what we need from the document's |
2061 | | stack. |
2062 | | */ |
2063 | 2.97k | if ( colgroup == NULL ) |
2064 | 738 | { |
2065 | 738 | TidyParserMemory memory = TY_(popMemory)( doc ); |
2066 | 738 | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
2067 | 738 | DEBUG_LOG_REENTER_WITH_NODE(node); |
2068 | 738 | colgroup = memory.original_node; |
2069 | 738 | DEBUG_LOG_GET_OLD_MODE; |
2070 | 738 | mode = memory.mode; |
2071 | 738 | DEBUG_LOG_CHANGE_MODE; |
2072 | 738 | } |
2073 | 2.24k | else |
2074 | 2.24k | { |
2075 | 2.24k | DEBUG_LOG_ENTER_WITH_NODE(colgroup); |
2076 | 2.24k | if (colgroup->tag->model & CM_EMPTY) |
2077 | 0 | return NULL; |
2078 | 2.24k | } |
2079 | | |
2080 | 5.00k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
2081 | 4.42k | { |
2082 | 4.42k | DEBUG_LOG_GOT_TOKEN(node); |
2083 | | |
2084 | 4.42k | if (node->tag == colgroup->tag && node->type == EndTag) |
2085 | 196 | { |
2086 | 196 | TY_(FreeNode)( doc, node); |
2087 | 196 | colgroup->closed = yes; |
2088 | 196 | return NULL; |
2089 | 196 | } |
2090 | | |
2091 | | /* |
2092 | | if this is the end tag for an ancestor element |
2093 | | then infer end tag for this element |
2094 | | */ |
2095 | 4.22k | if (node->type == EndTag) |
2096 | 1.33k | { |
2097 | 1.33k | if ( nodeIsFORM(node) ) |
2098 | 243 | { |
2099 | 243 | BadForm( doc ); |
2100 | 243 | TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED); |
2101 | 243 | TY_(FreeNode)( doc, node); |
2102 | 243 | continue; |
2103 | 243 | } |
2104 | | |
2105 | 1.09k | for ( parent = colgroup->parent; |
2106 | 111k | parent != NULL; |
2107 | 110k | parent = parent->parent ) |
2108 | 110k | { |
2109 | 110k | if (node->tag == parent->tag) |
2110 | 223 | { |
2111 | 223 | TY_(UngetToken)( doc ); |
2112 | 223 | DEBUG_LOG_EXIT; |
2113 | 223 | return NULL; |
2114 | 223 | } |
2115 | 110k | } |
2116 | 1.09k | } |
2117 | | |
2118 | 3.76k | if (TY_(nodeIsText)(node)) |
2119 | 425 | { |
2120 | 425 | TY_(UngetToken)( doc ); |
2121 | 425 | DEBUG_LOG_EXIT; |
2122 | 425 | return NULL; |
2123 | 425 | } |
2124 | | |
2125 | | /* deal with comments etc. */ |
2126 | 3.33k | if (InsertMisc(colgroup, node)) |
2127 | 73 | continue; |
2128 | | |
2129 | | /* discard unknown tags */ |
2130 | 3.26k | if (node->tag == NULL) |
2131 | 837 | { |
2132 | 837 | TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED); |
2133 | 837 | TY_(FreeNode)( doc, node); |
2134 | 837 | continue; |
2135 | 837 | } |
2136 | | |
2137 | 2.42k | if ( !nodeIsCOL(node) ) |
2138 | 817 | { |
2139 | 817 | TY_(UngetToken)( doc ); |
2140 | 817 | DEBUG_LOG_EXIT; |
2141 | 817 | return NULL; |
2142 | 817 | } |
2143 | | |
2144 | 1.60k | if (node->type == EndTag) |
2145 | 871 | { |
2146 | 871 | TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED); |
2147 | 871 | TY_(FreeNode)( doc, node); |
2148 | 871 | continue; |
2149 | 871 | } |
2150 | | |
2151 | | /* node should be <COL> */ |
2152 | 738 | TY_(InsertNodeAtEnd)(colgroup, node); |
2153 | | |
2154 | 738 | { |
2155 | 738 | TidyParserMemory memory = {0}; |
2156 | 738 | memory.identity = TY_(ParseColGroup); |
2157 | 738 | memory.original_node = colgroup; |
2158 | 738 | memory.reentry_node = node; |
2159 | 738 | memory.mode = mode; |
2160 | 738 | TY_(pushMemory)( doc, memory ); |
2161 | 738 | DEBUG_LOG_EXIT_WITH_NODE(node); |
2162 | 738 | } |
2163 | 738 | DEBUG_LOG_EXIT; |
2164 | 738 | return node; |
2165 | 1.60k | } |
2166 | 580 | DEBUG_LOG_EXIT; |
2167 | 580 | return NULL; |
2168 | 2.97k | } |
2169 | | |
2170 | | |
2171 | | /** MARK: TY_(ParseDatalist) |
2172 | | * Parses the `datalist` tag. |
2173 | | * |
2174 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2175 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2176 | | * This parser is also re-enterable, so that post-processing can occur after |
2177 | | * such dispatching. |
2178 | | */ |
2179 | | Node* TY_(ParseDatalist)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) ) |
2180 | 3.58k | { |
2181 | 3.58k | Lexer* lexer = doc->lexer; |
2182 | 3.58k | Node *node; |
2183 | 3.58k | DEBUG_LOG_COUNTERS; |
2184 | | |
2185 | 3.58k | if ( field == NULL ) |
2186 | 1.97k | { |
2187 | 1.97k | TidyParserMemory memory = TY_(popMemory)( doc ); |
2188 | 1.97k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
2189 | 1.97k | DEBUG_LOG_REENTER_WITH_NODE(node); |
2190 | 1.97k | field = memory.original_node; |
2191 | 1.97k | DEBUG_LOG_GET_OLD_MODE; |
2192 | 1.97k | mode = memory.mode; |
2193 | 1.97k | DEBUG_LOG_CHANGE_MODE; |
2194 | 1.97k | } |
2195 | 1.60k | else |
2196 | 1.60k | { |
2197 | 1.60k | DEBUG_LOG_ENTER_WITH_NODE(field); |
2198 | 1.60k | } |
2199 | | |
2200 | 3.58k | lexer->insert = NULL; /* defer implicit inline start tags */ |
2201 | | |
2202 | 6.23k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
2203 | 4.91k | { |
2204 | 4.91k | if (node->tag == field->tag && node->type == EndTag) |
2205 | 284 | { |
2206 | 284 | TY_(FreeNode)( doc, node); |
2207 | 284 | field->closed = yes; |
2208 | 284 | TrimSpaces(doc, field); |
2209 | | |
2210 | 284 | DEBUG_LOG_EXIT; |
2211 | 284 | return NULL; |
2212 | 284 | } |
2213 | | |
2214 | | /* deal with comments etc. */ |
2215 | 4.62k | if (InsertMisc(field, node)) |
2216 | 317 | continue; |
2217 | | |
2218 | 4.30k | if ( node->type == StartTag && |
2219 | 3.28k | ( nodeIsOPTION(node) || |
2220 | 3.28k | nodeIsOPTGROUP(node) || |
2221 | 3.28k | nodeIsDATALIST(node) || |
2222 | 3.28k | nodeIsSCRIPT(node)) |
2223 | 4.30k | ) |
2224 | 1.97k | { |
2225 | 1.97k | TidyParserMemory memory = {0}; |
2226 | 1.97k | memory.identity = TY_(ParseDatalist); |
2227 | 1.97k | memory.original_node = field; |
2228 | 1.97k | memory.reentry_node = node; |
2229 | 1.97k | memory.reentry_mode = IgnoreWhitespace; |
2230 | | |
2231 | 1.97k | TY_(InsertNodeAtEnd)(field, node); |
2232 | 1.97k | TY_(pushMemory)(doc, memory); |
2233 | 1.97k | DEBUG_LOG_EXIT_WITH_NODE(node); |
2234 | 1.97k | return node; |
2235 | 1.97k | } |
2236 | | |
2237 | | /* discard unexpected tags */ |
2238 | 2.33k | TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED); |
2239 | 2.33k | TY_(FreeNode)( doc, node); |
2240 | 2.33k | } |
2241 | | |
2242 | 1.32k | TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR); |
2243 | | |
2244 | 1.32k | DEBUG_LOG_EXIT; |
2245 | 1.32k | return NULL; |
2246 | 3.58k | } |
2247 | | |
2248 | | |
2249 | | /** MARK: TY_(ParseDefList) |
2250 | | * Parses the `dl` tag. |
2251 | | * |
2252 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2253 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2254 | | * This parser is also re-enterable, so that post-processing can occur after |
2255 | | * such dispatching. |
2256 | | */ |
2257 | | Node* TY_(ParseDefList)( TidyDocImpl* doc, Node *list, GetTokenMode mode ) |
2258 | 27.8k | { |
2259 | 27.8k | Lexer* lexer = doc->lexer; |
2260 | 27.8k | Node *node = NULL; |
2261 | 27.8k | Node *parent = NULL; |
2262 | 27.8k | DEBUG_LOG_COUNTERS; |
2263 | | |
2264 | 27.8k | enum parserState { |
2265 | 27.8k | STATE_INITIAL, /* This is the initial state for every parser. */ |
2266 | 27.8k | STATE_POST_NODEISCENTER, /* To-do after re-entering after checks. */ |
2267 | 27.8k | STATE_COMPLETE, /* Done with the switch. */ |
2268 | 27.8k | } state = STATE_INITIAL; |
2269 | | |
2270 | 27.8k | if ( list == NULL ) |
2271 | 17.6k | { |
2272 | 17.6k | TidyParserMemory memory = TY_(popMemory)( doc ); |
2273 | 17.6k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
2274 | 17.6k | DEBUG_LOG_REENTER_WITH_NODE(node); |
2275 | 17.6k | list = memory.original_node; |
2276 | 17.6k | state = memory.reentry_state; |
2277 | 17.6k | DEBUG_LOG_GET_OLD_MODE; |
2278 | 17.6k | mode = memory.mode; |
2279 | 17.6k | DEBUG_LOG_CHANGE_MODE; |
2280 | 17.6k | } |
2281 | 10.1k | else |
2282 | 10.1k | { |
2283 | 10.1k | DEBUG_LOG_ENTER_WITH_NODE(list); |
2284 | 10.1k | } |
2285 | | |
2286 | 27.8k | if (list->tag->model & CM_EMPTY) |
2287 | 0 | return NULL; |
2288 | | |
2289 | 27.8k | lexer->insert = NULL; /* defer implicit inline start tags */ |
2290 | | |
2291 | 39.6k | while ( state != STATE_COMPLETE ) |
2292 | 32.2k | { |
2293 | 32.2k | if ( state == STATE_INITIAL ) |
2294 | 31.1k | node = TY_(GetToken)( doc, IgnoreWhitespace); |
2295 | | |
2296 | 32.2k | switch ( state) |
2297 | 32.2k | { |
2298 | 31.1k | case STATE_INITIAL: |
2299 | 31.1k | { |
2300 | 31.1k | if ( node == NULL) |
2301 | 7.35k | { |
2302 | 7.35k | state = STATE_COMPLETE; |
2303 | 7.35k | continue; |
2304 | 7.35k | } |
2305 | | |
2306 | 23.7k | if (node->tag == list->tag && node->type == EndTag) |
2307 | 114 | { |
2308 | 114 | TY_(FreeNode)( doc, node); |
2309 | 114 | list->closed = yes; |
2310 | 114 | DEBUG_LOG_EXIT; |
2311 | 114 | return NULL; |
2312 | 114 | } |
2313 | | |
2314 | | /* deal with comments etc. */ |
2315 | 23.6k | if (InsertMisc(list, node)) |
2316 | 287 | continue; |
2317 | | |
2318 | 23.3k | if (TY_(nodeIsText)(node)) |
2319 | 2.90k | { |
2320 | 2.90k | TY_(UngetToken)( doc ); |
2321 | 2.90k | node = TY_(InferredTag)(doc, TidyTag_DT); |
2322 | 2.90k | TY_(Report)(doc, list, node, MISSING_STARTTAG); |
2323 | 2.90k | } |
2324 | | |
2325 | 23.3k | if (node->tag == NULL) |
2326 | 545 | { |
2327 | 545 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
2328 | 545 | TY_(FreeNode)( doc, node); |
2329 | 545 | continue; |
2330 | 545 | } |
2331 | | |
2332 | | /* |
2333 | | if this is the end tag for an ancestor element |
2334 | | then infer end tag for this element |
2335 | | */ |
2336 | 22.8k | if (node->type == EndTag) |
2337 | 3.44k | { |
2338 | 3.44k | Bool discardIt = no; |
2339 | 3.44k | if ( nodeIsFORM(node) ) |
2340 | 200 | { |
2341 | 200 | BadForm( doc ); |
2342 | 200 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
2343 | 200 | TY_(FreeNode)( doc, node ); |
2344 | 200 | continue; |
2345 | 200 | } |
2346 | | |
2347 | 3.24k | for (parent = list->parent; |
2348 | 195k | parent != NULL; parent = parent->parent) |
2349 | 194k | { |
2350 | | /* Do not match across BODY to avoid infinite loop |
2351 | | between ParseBody and this parser, |
2352 | | See http://tidy.sf.net/bug/1098012. */ |
2353 | 194k | if (nodeIsBODY(parent)) |
2354 | 869 | { |
2355 | 869 | discardIt = yes; |
2356 | 869 | break; |
2357 | 869 | } |
2358 | 193k | if (node->tag == parent->tag) |
2359 | 799 | { |
2360 | 799 | TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE); |
2361 | 799 | TY_(UngetToken)( doc ); |
2362 | | |
2363 | 799 | DEBUG_LOG_EXIT; |
2364 | 799 | return NULL; |
2365 | 799 | } |
2366 | 193k | } |
2367 | 2.44k | if (discardIt) |
2368 | 869 | { |
2369 | 869 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
2370 | 869 | TY_(FreeNode)( doc, node); |
2371 | 869 | continue; |
2372 | 869 | } |
2373 | 2.44k | } |
2374 | | |
2375 | | /* center in a dt or a dl breaks the dl list in two */ |
2376 | 20.9k | if ( nodeIsCENTER(node) ) |
2377 | 1.13k | { |
2378 | 1.13k | if (list->content) |
2379 | 900 | TY_(InsertNodeAfterElement)(list, node); |
2380 | 238 | else /* trim empty dl list */ |
2381 | 238 | { |
2382 | 238 | TY_(InsertNodeBeforeElement)(list, node); |
2383 | 238 | } |
2384 | | |
2385 | | /* #426885 - fix by Glenn Carroll 19 Apr 00, and |
2386 | | Gary Dechaines 11 Aug 00 */ |
2387 | | /* ParseTag can destroy node, if it finds that |
2388 | | * this <center> is followed immediately by </center>. |
2389 | | * It's awkward but necessary to determine if this |
2390 | | * has happened. |
2391 | | */ |
2392 | 1.13k | parent = node->parent; |
2393 | | |
2394 | | /* and parse contents of center */ |
2395 | 1.13k | lexer->excludeBlocks = no; |
2396 | | |
2397 | 1.13k | { |
2398 | 1.13k | TidyParserMemory memory = {0}; |
2399 | 1.13k | memory.identity = TY_(ParseDefList); |
2400 | 1.13k | memory.original_node = list; |
2401 | 1.13k | memory.reentry_node = node; |
2402 | 1.13k | memory.reentry_state = STATE_POST_NODEISCENTER; |
2403 | 1.13k | TY_(pushMemory)( doc, memory ); |
2404 | 1.13k | DEBUG_LOG_EXIT_WITH_NODE(node); |
2405 | 1.13k | return node; |
2406 | 1.13k | } |
2407 | 1.13k | } |
2408 | | |
2409 | 19.8k | if ( !( nodeIsDT(node) || nodeIsDD(node) || ( nodeIsDIV(node) && TY_(IsHTML5Mode)(doc) ) ) ) |
2410 | 5.75k | { |
2411 | 5.75k | TY_(UngetToken)( doc ); |
2412 | | |
2413 | 5.75k | if (!(node->tag->model & (CM_BLOCK | CM_INLINE))) |
2414 | 1.08k | { |
2415 | 1.08k | TY_(Report)(doc, list, node, TAG_NOT_ALLOWED_IN); |
2416 | 1.08k | DEBUG_LOG_EXIT; |
2417 | 1.08k | return NULL; |
2418 | 1.08k | } |
2419 | | |
2420 | | /* if DD appeared directly in BODY then exclude blocks */ |
2421 | 4.67k | if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks) |
2422 | 838 | { |
2423 | 838 | DEBUG_LOG_EXIT; |
2424 | 838 | return NULL; |
2425 | 838 | } |
2426 | | |
2427 | 3.83k | node = TY_(InferredTag)(doc, TidyTag_DD); |
2428 | 3.83k | TY_(Report)(doc, list, node, MISSING_STARTTAG); |
2429 | 3.83k | } |
2430 | | |
2431 | 17.9k | if (node->type == EndTag) |
2432 | 1.39k | { |
2433 | 1.39k | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
2434 | 1.39k | TY_(FreeNode)( doc, node); |
2435 | 1.39k | continue; |
2436 | 1.39k | } |
2437 | | |
2438 | | /* node should be <DT> or <DD> or <DIV>*/ |
2439 | 16.5k | TY_(InsertNodeAtEnd)(list, node); |
2440 | 16.5k | { |
2441 | 16.5k | TidyParserMemory memory = {0}; |
2442 | 16.5k | memory.identity = TY_(ParseDefList); |
2443 | 16.5k | memory.original_node = list; |
2444 | 16.5k | memory.reentry_node = node; |
2445 | 16.5k | memory.reentry_state = STATE_INITIAL; |
2446 | 16.5k | TY_(pushMemory)( doc, memory ); |
2447 | 16.5k | DEBUG_LOG_EXIT; |
2448 | 16.5k | return node; |
2449 | 17.9k | } |
2450 | 17.9k | } break; |
2451 | | |
2452 | | |
2453 | 1.13k | case STATE_POST_NODEISCENTER: |
2454 | 1.13k | { |
2455 | 1.13k | lexer->excludeBlocks = yes; |
2456 | | |
2457 | | /* now create a new dl element, |
2458 | | * unless node has been blown away because the |
2459 | | * center was empty, as above. |
2460 | | */ |
2461 | 1.13k | if (parent && parent->last == node) |
2462 | 0 | { |
2463 | 0 | list = TY_(InferredTag)(doc, TidyTag_DL); |
2464 | 0 | TY_(InsertNodeAfterElement)(node, list); |
2465 | 0 | } |
2466 | 1.13k | state = STATE_INITIAL; |
2467 | 1.13k | continue; |
2468 | 17.9k | } break; |
2469 | | |
2470 | | |
2471 | 0 | default: |
2472 | 0 | break; |
2473 | 32.2k | } /* switch */ |
2474 | 32.2k | } /* while */ |
2475 | | |
2476 | 7.35k | TY_(Report)(doc, list, node, MISSING_ENDTAG_FOR); |
2477 | 7.35k | DEBUG_LOG_EXIT; |
2478 | 7.35k | return NULL; |
2479 | 27.8k | } |
2480 | | |
2481 | | |
2482 | | /** MARK: TY_(ParseEmpty) |
2483 | | * Parse empty element nodes. |
2484 | | * |
2485 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2486 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2487 | | * This parser is also re-enterable, so that post-processing can occur after |
2488 | | * such dispatching. |
2489 | | */ |
2490 | | Node* TY_(ParseEmpty)( TidyDocImpl* doc, Node *element, GetTokenMode mode ) |
2491 | 39.5k | { |
2492 | 39.5k | Lexer* lexer = doc->lexer; |
2493 | 39.5k | if ( lexer->isvoyager ) |
2494 | 695 | { |
2495 | 695 | Node *node = TY_(GetToken)( doc, mode); |
2496 | 695 | if ( node ) |
2497 | 642 | { |
2498 | 642 | if ( !(node->type == EndTag && node->tag == element->tag) ) |
2499 | 554 | { |
2500 | | /* TY_(Report)(doc, element, node, ELEMENT_NOT_EMPTY); */ |
2501 | 554 | TY_(UngetToken)( doc ); |
2502 | 554 | } |
2503 | 88 | else |
2504 | 88 | { |
2505 | 88 | TY_(FreeNode)( doc, node ); |
2506 | 88 | } |
2507 | 642 | } |
2508 | 695 | } |
2509 | 39.5k | return NULL; |
2510 | 39.5k | } |
2511 | | |
2512 | | |
2513 | | /** MARK: TY_(ParseFrameSet) |
2514 | | * Parses the `frameset` tag. |
2515 | | * |
2516 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2517 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2518 | | * This parser is also re-enterable, so that post-processing can occur after |
2519 | | * such dispatching. |
2520 | | */ |
2521 | | Node* TY_(ParseFrameSet)( TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode) ) |
2522 | 40.2k | { |
2523 | 40.2k | Lexer* lexer = doc->lexer; |
2524 | 40.2k | Node *node; |
2525 | 40.2k | DEBUG_LOG_COUNTERS; |
2526 | | |
2527 | | /* |
2528 | | If we're re-entering, then we need to setup from a previous state, |
2529 | | instead of starting fresh. We can pull what we need from the document's |
2530 | | stack. |
2531 | | */ |
2532 | 40.2k | if ( frameset == NULL ) |
2533 | 18.6k | { |
2534 | 18.6k | TidyParserMemory memory = TY_(popMemory)( doc ); |
2535 | 18.6k | node = memory.reentry_node; /* Throwaway, because we replace it entering the loop. */ |
2536 | 18.6k | DEBUG_LOG_REENTER_WITH_NODE(node); |
2537 | 18.6k | frameset = memory.original_node; |
2538 | 18.6k | DEBUG_LOG_GET_OLD_MODE; |
2539 | 18.6k | mode = memory.mode; |
2540 | 18.6k | DEBUG_LOG_CHANGE_MODE; |
2541 | 18.6k | } |
2542 | 21.6k | else |
2543 | 21.6k | { |
2544 | 21.6k | DEBUG_LOG_ENTER_WITH_NODE(frameset); |
2545 | 21.6k | if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) |
2546 | 21.6k | { |
2547 | 21.6k | doc->badAccess |= BA_USING_FRAMES; |
2548 | 21.6k | } |
2549 | 21.6k | } |
2550 | | |
2551 | 53.4k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
2552 | 32.2k | { |
2553 | 32.2k | if (node->tag == frameset->tag && node->type == EndTag) |
2554 | 452 | { |
2555 | 452 | TY_(FreeNode)( doc, node); |
2556 | 452 | frameset->closed = yes; |
2557 | 452 | TrimSpaces(doc, frameset); |
2558 | 452 | DEBUG_LOG_EXIT; |
2559 | 452 | return NULL; |
2560 | 452 | } |
2561 | | |
2562 | | /* deal with comments etc. */ |
2563 | 31.7k | if (InsertMisc(frameset, node)) |
2564 | 2.86k | continue; |
2565 | | |
2566 | 28.9k | if (node->tag == NULL) |
2567 | 5.52k | { |
2568 | 5.52k | TY_(Report)(doc, frameset, node, DISCARDING_UNEXPECTED); |
2569 | 5.52k | TY_(FreeNode)( doc, node); |
2570 | 5.52k | continue; |
2571 | 5.52k | } |
2572 | | |
2573 | 23.3k | if (TY_(nodeIsElement)(node)) |
2574 | 22.4k | { |
2575 | 22.4k | if (node->tag && node->tag->model & CM_HEAD) |
2576 | 465 | { |
2577 | 465 | MoveToHead(doc, frameset, node); |
2578 | 465 | continue; |
2579 | 465 | } |
2580 | 22.4k | } |
2581 | | |
2582 | 22.9k | if ( nodeIsBODY(node) ) |
2583 | 984 | { |
2584 | 984 | TY_(UngetToken)( doc ); |
2585 | 984 | node = TY_(InferredTag)(doc, TidyTag_NOFRAMES); |
2586 | 984 | TY_(Report)(doc, frameset, node, INSERTING_TAG); |
2587 | 984 | } |
2588 | | |
2589 | 22.9k | if (node->type == StartTag && (node->tag && node->tag->model & CM_FRAMES)) |
2590 | 18.6k | { |
2591 | 18.6k | TY_(InsertNodeAtEnd)(frameset, node); |
2592 | 18.6k | lexer->excludeBlocks = no; |
2593 | | |
2594 | | /* |
2595 | | * We don't really have to do anything when re-entering, except |
2596 | | * setting up the state when we left. No post-processing means |
2597 | | * this stays simple. |
2598 | | */ |
2599 | 18.6k | TidyParserMemory memory = {0}; |
2600 | 18.6k | memory.identity = TY_(ParseFrameSet); |
2601 | 18.6k | memory.original_node = frameset; |
2602 | 18.6k | memory.reentry_node = node; |
2603 | 18.6k | memory.mode = MixedContent; |
2604 | 18.6k | TY_(pushMemory)( doc, memory ); |
2605 | 18.6k | DEBUG_LOG_EXIT_WITH_NODE(node); |
2606 | 18.6k | return node; |
2607 | 18.6k | } |
2608 | 4.27k | else if (node->type == StartEndTag && (node->tag && node->tag->model & CM_FRAMES)) |
2609 | 695 | { |
2610 | 695 | TY_(InsertNodeAtEnd)(frameset, node); |
2611 | 695 | continue; |
2612 | 695 | } |
2613 | | |
2614 | | /* discard unexpected tags */ |
2615 | | /* WAI [6.5.1.4] link is being discarded outside of NOFRAME */ |
2616 | 3.58k | if ( nodeIsA(node) ) |
2617 | 214 | doc->badAccess |= BA_INVALID_LINK_NOFRAMES; |
2618 | | |
2619 | 3.58k | TY_(Report)(doc, frameset, node, DISCARDING_UNEXPECTED); |
2620 | 3.58k | TY_(FreeNode)( doc, node); |
2621 | 3.58k | } |
2622 | | |
2623 | 21.2k | TY_(Report)(doc, frameset, node, MISSING_ENDTAG_FOR); |
2624 | 21.2k | DEBUG_LOG_EXIT; |
2625 | 21.2k | return NULL; |
2626 | 40.2k | } |
2627 | | |
2628 | | |
2629 | | /** MARK: TY_(ParseHead) |
2630 | | * Parses the content of the `head` element, handing back one child element per call. |
2631 | | * |
2632 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2633 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2634 | | * This parser is also re-enterable: when `head` is NULL, the saved record is popped and `head` plus the title and base counters are restored from it. |
2635 | | * Returns the child element just appended to `head` (after pushing a re-entry record), or NULL once the head is finished: its end tag was seen, no tokens are left, or a token that does not belong in the head was pushed back. |
2636 | | */ |
2637 | | Node* TY_(ParseHead)( TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode) ) |
2638 | 39.8k | { |
2639 | 39.8k | Lexer* lexer = doc->lexer; |
2640 | 39.8k | Node *node; |
2641 | 39.8k | int HasTitle = 0; |
2642 | 39.8k | int HasBase = 0; |
2643 | 39.8k | DEBUG_LOG_COUNTERS; |
2644 | | /* A NULL head means we are resuming after a child element was handed back. */ |
2645 | 39.8k | if ( head == NULL ) |
2646 | 14.5k | { |
2647 | 14.5k | TidyParserMemory memory = TY_(popMemory)( doc ); |
2648 | 14.5k | node = memory.reentry_node; /* Throwaway, as main loop overwrites anyway. */ |
2649 | 14.5k | DEBUG_LOG_REENTER_WITH_NODE(node); |
2650 | 14.5k | head = memory.original_node; |
2651 | 14.5k | HasTitle = memory.register_1; |
2652 | 14.5k | HasBase = memory.register_2; |
2653 | 14.5k | DEBUG_LOG_GET_OLD_MODE; |
2654 | 14.5k | mode = memory.mode; /* NOTE(review): the record pushed at the bottom of this function never sets .mode, so presumably this restores the zero-initialised value - confirm. */ |
2655 | 14.5k | DEBUG_LOG_CHANGE_MODE; |
2656 | 14.5k | } |
2657 | 25.3k | else |
2658 | 25.3k | { |
2659 | 25.3k | DEBUG_LOG_ENTER_WITH_NODE(head); |
2660 | 25.3k | } |
2661 | | /* Pull tokens until the head is closed explicitly or something forces it to end. */ |
2662 | 53.1k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
2663 | 49.6k | { |
2664 | 49.6k | if (node->tag == head->tag && node->type == EndTag) |
2665 | 339 | { |
2666 | 339 | TY_(FreeNode)( doc, node); |
2667 | 339 | head->closed = yes; |
2668 | 339 | break; |
2669 | 339 | } |
2670 | | |
2671 | | /* find and discard multiple <head> elements */ |
2672 | | /* find and discard <html> in <head> elements */ |
2673 | 49.2k | if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag) |
2674 | 1.13k | { |
2675 | 1.13k | TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED); |
2676 | 1.13k | TY_(FreeNode)(doc, node); |
2677 | 1.13k | continue; |
2678 | 1.13k | } |
2679 | | /* Text ends the head: the token is pushed back with UngetToken and the loop stops. */ |
2680 | 48.1k | if (TY_(nodeIsText)(node)) |
2681 | 7.53k | { |
2682 | | /*\ Issue #132 - avoid warning for missing body tag, |
2683 | | * if configured to --omit-optional-tags yes |
2684 | | * Issue #314 - and if --show-body-only |
2685 | | \*/ |
2686 | 7.53k | if (!cfgBool( doc, TidyOmitOptionalTags ) && |
2687 | 7.53k | !showingBodyOnly(doc) ) |
2688 | 7.53k | { |
2689 | 7.53k | TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN); |
2690 | 7.53k | } |
2691 | 7.53k | TY_(UngetToken)( doc ); |
2692 | 7.53k | break; |
2693 | 7.53k | } |
2694 | | /* An xml-stylesheet processing instruction is reported, then inserted before the html element instead of into the head. */ |
2695 | 40.6k | if (node->type == ProcInsTag && node->element && |
2696 | 1.06k | TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0) |
2697 | 707 | { |
2698 | 707 | TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN); |
2699 | 707 | TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node); |
2700 | 707 | continue; |
2701 | 707 | } |
2702 | | |
2703 | | /* deal with comments etc. */ |
2704 | 39.9k | if (InsertMisc(head, node)) |
2705 | 710 | continue; |
2706 | | |
2707 | 39.1k | if (node->type == DocTypeTag) |
2708 | 2.51k | { |
2709 | 2.51k | InsertDocType(doc, head, node); |
2710 | 2.51k | continue; |
2711 | 2.51k | } |
2712 | | |
2713 | | /* discard unknown tags */ |
2714 | 36.6k | if (node->tag == NULL) |
2715 | 7.86k | { |
2716 | 7.86k | TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED); |
2717 | 7.86k | TY_(FreeNode)( doc, node); |
2718 | 7.86k | continue; |
2719 | 7.86k | } |
2720 | | |
2721 | | /* |
2722 | | if it doesn't belong in the head then |
2723 | | treat as implicit end of head and deal |
2724 | | with as part of the body |
2725 | | */ |
2726 | 28.8k | if (!(node->tag->model & CM_HEAD)) |
2727 | 13.5k | { |
2728 | | /* #545067 Implicit closing of head broken - warn only for XHTML input */ |
2729 | 13.5k | if ( lexer->isvoyager ) |
2730 | 151 | TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN ); |
2731 | 13.5k | TY_(UngetToken)( doc ); |
2732 | 13.5k | break; |
2733 | 13.5k | } |
2734 | | |
2735 | 15.2k | if (TY_(nodeIsElement)(node)) |
2736 | 14.9k | { |
2737 | 14.9k | if ( nodeIsTITLE(node) ) |
2738 | 1.11k | { |
2739 | 1.11k | ++HasTitle; |
2740 | | /* NOTE(review): head was already dereferenced earlier in this loop, so the TOO_MANY_ELEMENTS arm of the ternaries below looks unreachable. */ |
2741 | 1.11k | if (HasTitle > 1) |
2742 | 790 | TY_(Report)(doc, head, node, |
2743 | 790 | head ? |
2744 | 790 | TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS); |
2745 | 1.11k | } |
2746 | 13.8k | else if ( nodeIsBASE(node) ) |
2747 | 3.00k | { |
2748 | 3.00k | ++HasBase; |
2749 | | |
2750 | 3.00k | if (HasBase > 1) |
2751 | 1.93k | TY_(Report)(doc, head, node, |
2752 | 1.93k | head ? |
2753 | 1.93k | TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS); |
2754 | 3.00k | } |
2755 | | |
2756 | 14.9k | TY_(InsertNodeAtEnd)(head, node); |
2757 | | /* Save head and both counters, then hand the new child back; a later call with head == NULL resumes the loop. */ |
2758 | 14.9k | { |
2759 | 14.9k | TidyParserMemory memory = {0}; |
2760 | 14.9k | memory.identity = TY_(ParseHead); |
2761 | 14.9k | memory.original_node = head; |
2762 | 14.9k | memory.reentry_node = node; |
2763 | 14.9k | memory.register_1 = HasTitle; |
2764 | 14.9k | memory.register_2 = HasBase; |
2765 | 14.9k | TY_(pushMemory)( doc, memory ); |
2766 | 14.9k | DEBUG_LOG_EXIT_WITH_NODE(node); |
2767 | 14.9k | return node; |
2768 | 14.9k | } |
2769 | 14.9k | } |
2770 | | |
2771 | | /* discard anything left over, e.g. unexpected end tags (text nodes were already handled above) */ |
2772 | 303 | TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED); |
2773 | 303 | TY_(FreeNode)( doc, node); |
2774 | 303 | } |
2775 | 24.9k | DEBUG_LOG_EXIT; |
2776 | 24.9k | return NULL; |
2777 | 39.8k | } |
2778 | | |
2779 | | |
2780 | | /** MARK: TY_(ParseHTML) |
2781 | | * Parses the `html` tag. At this point, other root-level stuff (doctype, |
2782 | | * comments) are already set up, and here we handle all of the complexities |
2783 | | * of things such as frameset documents, etc. |
2784 | | * The work is organised as a small state machine (see `enum parserState` below). |
2785 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2786 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2787 | | * This parser is also re-enterable: when `html` is NULL, the saved record is popped and `node`, `html`, `state` and `mode` are restored from it. |
2788 | | * Returns the head, body, frameset or noframes node that is to be parsed next, or NULL once the state machine reaches STATE_COMPLETE. |
2789 | | */ |
2790 | | Node* TY_(ParseHTML)( TidyDocImpl *doc, Node *html, GetTokenMode mode ) |
2791 | 59.3k | { |
2792 | 59.3k | Node *node = NULL; |
2793 | 59.3k | Node *head = NULL; |
2794 | 59.3k | Node *frameset = NULL; |
2795 | 59.3k | Node *noframes = NULL; |
2796 | 59.3k | DEBUG_LOG_COUNTERS; |
2797 | | /* The current state is also written into each re-entry record (reentry_state) so that a later call can resume where this one left off. */ |
2798 | 59.3k | enum parserState { |
2799 | 59.3k | STATE_INITIAL, /* This is the initial state for every parser. */ |
2800 | 59.3k | STATE_COMPLETE, /* Complete! */ |
2801 | 59.3k | STATE_PRE_BODY, /* In this state, we'll consider frames vs. body. */ |
2802 | 59.3k | STATE_PARSE_BODY, /* In this state, we can parse the body. */ |
2803 | 59.3k | STATE_PARSE_HEAD, /* In this state, we will setup head for parsing. */ |
2804 | 59.3k | STATE_PARSE_HEAD_REENTER, /* Resume here after parsing head. */ |
2805 | 59.3k | STATE_PARSE_NOFRAMES, /* In this state, we can parse noframes content. */ |
2806 | 59.3k | STATE_PARSE_NOFRAMES_REENTER, /* In this state, we can restore more state. */ |
2807 | 59.3k | STATE_PARSE_FRAMESET, /* In this state, we will parse frameset content. */ |
2808 | 59.3k | STATE_PARSE_FRAMESET_REENTER, /* We need to cleanup some things after parsing frameset. */ |
2809 | 59.3k | } state = STATE_INITIAL; |
2810 | | /* Done on every call, re-entry included: the TidyXmlTags option is forced to no. */ |
2811 | 59.3k | TY_(SetOptionBool)( doc, TidyXmlTags, no ); |
2812 | | /* A NULL html means we are resuming after a node returned earlier was dispatched. */ |
2813 | 59.3k | if ( html == NULL ) |
2814 | 34.7k | { |
2815 | 34.7k | TidyParserMemory memory = TY_(popMemory)( doc ); |
2816 | 34.7k | node = memory.reentry_node; |
2817 | 34.7k | DEBUG_LOG_REENTER_WITH_NODE(node); |
2818 | 34.7k | html = memory.original_node; |
2819 | 34.7k | state = memory.reentry_state; |
2820 | 34.7k | DEBUG_LOG_GET_OLD_MODE; |
2821 | 34.7k | mode = memory.reentry_mode; |
2822 | 34.7k | DEBUG_LOG_CHANGE_MODE; |
2823 | 34.7k | } |
2824 | 24.5k | else |
2825 | 24.5k | { |
2826 | 24.5k | DEBUG_LOG_ENTER_WITH_NODE(html); |
2827 | 24.5k | } |
2828 | | /* NOTE(review): head, frameset and noframes are locals reset to NULL on every call; only the *_REENTER states below put values back into them. */ |
2829 | | /* |
2830 | | This main loop pulls tokens from the lexer until we're out of tokens, |
2831 | | or until there's no more work to do. |
2832 | | */ |
2833 | 163k | while ( state != STATE_COMPLETE ) |
2834 | 162k | { |
2835 | 162k | if ( state == STATE_INITIAL || state == STATE_PRE_BODY ) |
2836 | 69.0k | { |
2837 | 69.0k | node = TY_(GetToken)( doc, IgnoreWhitespace ); |
2838 | 69.0k | DEBUG_LOG_GOT_TOKEN(node); |
2839 | 69.0k | } |
2840 | | |
2841 | 162k | switch ( state ) |
2842 | 162k | { |
2843 | | /************************************************************** |
2844 | | This case is all about finding a head tag and dealing with |
2845 | | cases where we don't, so that we can move on to parsing a head |
2846 | | tag. |
2847 | | **************************************************************/ |
2848 | 28.1k | case STATE_INITIAL: |
2849 | 28.1k | { |
2850 | | /* |
2851 | | The only way we can possibly be here is if the lexer |
2852 | | had nothing to give us. Thus we'll create our own |
2853 | | head, and set the signal to start parsing it. |
2854 | | */ |
2855 | 28.1k | if (node == NULL) |
2856 | 2.72k | { |
2857 | 2.72k | node = TY_(InferredTag)(doc, TidyTag_HEAD); |
2858 | 2.72k | state = STATE_PARSE_HEAD; |
2859 | 2.72k | continue; |
2860 | 2.72k | } |
2861 | | |
2862 | | /* We found exactly what we expected: head. */ |
2863 | 25.4k | if ( nodeIsHEAD(node) ) |
2864 | 248 | { |
2865 | 248 | state = STATE_PARSE_HEAD; |
2866 | 248 | continue; |
2867 | 248 | } |
2868 | | |
2869 | | /* We did not expect to find an html closing tag here! */ |
2870 | 25.1k | if (html && (node->tag == html->tag) && (node->type == EndTag)) |
2871 | 1.11k | { |
2872 | 1.11k | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2873 | 1.11k | TY_(FreeNode)( doc, node); |
2874 | 1.11k | continue; |
2875 | 1.11k | } |
2876 | | |
2877 | | /* Find and discard multiple <html> elements. */ |
2878 | 24.0k | if (html && (node->tag == html->tag) && (node->type == StartTag)) |
2879 | 1.93k | { |
2880 | 1.93k | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2881 | 1.93k | TY_(FreeNode)(doc, node); |
2882 | 1.93k | continue; |
2883 | 1.93k | } |
2884 | | |
2885 | | /* Deal with comments, etc. */ |
2886 | 22.1k | if (InsertMisc(html, node)) |
2887 | 528 | continue; |
2888 | | |
2889 | | /* |
2890 | | At this point, we didn't find a head tag, so put the |
2891 | | token back and create our own head tag, so we can |
2892 | | move on. |
2893 | | */ |
2894 | 21.5k | TY_(UngetToken)( doc ); |
2895 | 21.5k | node = TY_(InferredTag)(doc, TidyTag_HEAD); |
2896 | 21.5k | state = STATE_PARSE_HEAD; |
2897 | 21.5k | continue; |
2898 | 22.1k | } break; |
2899 | | |
2900 | | |
2901 | | /************************************************************** |
2902 | | This case determines whether we're dealing with body or |
2903 | | frameset + noframes, and sets things up accordingly. |
2904 | | **************************************************************/ |
2905 | 40.9k | case STATE_PRE_BODY: |
2906 | 40.9k | { |
2907 | 40.9k | if (node == NULL ) |
2908 | 4.31k | { |
2909 | 4.31k | if (frameset == NULL) /* Implied body. */ |
2910 | 3.37k | { |
2911 | 3.37k | node = TY_(InferredTag)(doc, TidyTag_BODY); |
2912 | 3.37k | state = STATE_PARSE_BODY; |
2913 | 3.37k | } else { |
2914 | 944 | state = STATE_COMPLETE; |
2915 | 944 | } |
2916 | | |
2917 | 4.31k | continue; |
2918 | 4.31k | } |
2919 | | |
2920 | | /* Robustly handle html tags: any html tag seen here is dropped, and it is reported only when it is not a start tag and no frameset is known. */ |
2921 | 36.5k | if (node->tag == html->tag) |
2922 | 1.86k | { |
2923 | 1.86k | if (node->type != StartTag && frameset == NULL) |
2924 | 1.06k | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2925 | | |
2926 | 1.86k | TY_(FreeNode)( doc, node); |
2927 | 1.86k | continue; |
2928 | 1.86k | } |
2929 | | |
2930 | | /* Deal with comments, etc. */ |
2931 | 34.7k | if (InsertMisc(html, node)) |
2932 | 245 | continue; |
2933 | | |
2934 | | /* If frameset document, coerce <body> to <noframes> */ |
2935 | 34.4k | if ( nodeIsBODY(node) ) |
2936 | 805 | { |
2937 | 805 | if (node->type != StartTag) |
2938 | 94 | { |
2939 | 94 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2940 | 94 | TY_(FreeNode)( doc, node); |
2941 | 94 | continue; |
2942 | 94 | } |
2943 | | /* The coercion only happens when the accessibility check level is 0 and a frameset is known; the body token is pushed back first. */ |
2944 | 711 | if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) |
2945 | 711 | { |
2946 | 711 | if (frameset != NULL) |
2947 | 530 | { |
2948 | 530 | TY_(UngetToken)( doc ); |
2949 | | |
2950 | 530 | if (noframes == NULL) |
2951 | 356 | { |
2952 | 356 | noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES); |
2953 | 356 | TY_(InsertNodeAtEnd)(frameset, noframes); |
2954 | 356 | TY_(Report)(doc, html, noframes, INSERTING_TAG); |
2955 | 356 | } |
2956 | 174 | else |
2957 | 174 | { |
2958 | 174 | if (noframes->type == StartEndTag) |
2959 | 79 | noframes->type = StartTag; |
2960 | 174 | } |
2961 | | |
2962 | 530 | state = STATE_PARSE_NOFRAMES; |
2963 | 530 | continue; |
2964 | 530 | } |
2965 | 711 | } |
2966 | | /* Otherwise the body start tag is parsed as a regular body. */ |
2967 | 181 | TY_(ConstrainVersion)(doc, ~VERS_FRAMESET); |
2968 | 181 | state = STATE_PARSE_BODY; |
2969 | 181 | continue; |
2970 | 711 | } |
2971 | | |
2972 | | /* Flag an error if we see more than one frameset. */ |
2973 | 33.6k | if ( nodeIsFRAMESET(node) ) |
2974 | 5.21k | { |
2975 | 5.21k | if (node->type != StartTag) |
2976 | 247 | { |
2977 | 247 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2978 | 247 | TY_(FreeNode)( doc, node); |
2979 | 247 | continue; |
2980 | 247 | } |
2981 | | /* NOTE(review): a duplicate is reported but still goes to STATE_PARSE_FRAMESET, which inserts and returns it while saving the first frameset as the re-entry node. */ |
2982 | 4.96k | if (frameset != NULL) |
2983 | 3.41k | TY_(Report)(doc, html, node, DUPLICATE_FRAMESET); |
2984 | 1.54k | else |
2985 | 1.54k | frameset = node; |
2986 | | |
2987 | 4.96k | state = STATE_PARSE_FRAMESET; |
2988 | 4.96k | continue; |
2989 | 5.21k | } |
2990 | | |
2991 | | /* If not a frameset document coerce <noframes> to <body>. */ |
2992 | 28.4k | if ( nodeIsNOFRAMES(node) ) |
2993 | 4.57k | { |
2994 | 4.57k | if (node->type != StartTag) |
2995 | 252 | { |
2996 | 252 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2997 | 252 | TY_(FreeNode)( doc, node); |
2998 | 252 | continue; |
2999 | 252 | } |
3000 | | |
3001 | 4.32k | if (frameset == NULL) |
3002 | 243 | { |
3003 | 243 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
3004 | 243 | TY_(FreeNode)( doc, node); |
3005 | 243 | node = TY_(InferredTag)(doc, TidyTag_BODY); |
3006 | 243 | state = STATE_PARSE_BODY; |
3007 | 243 | continue; |
3008 | 243 | } |
3009 | | /* The first noframes is appended to the frameset and parsed; when one is already known, the new tag is freed and we stay in this state. */ |
3010 | 4.07k | if (noframes == NULL) |
3011 | 1.79k | { |
3012 | 1.79k | noframes = node; |
3013 | 1.79k | TY_(InsertNodeAtEnd)(frameset, noframes); |
3014 | 1.79k | state = STATE_PARSE_NOFRAMES; |
3015 | 1.79k | } |
3016 | 2.28k | else |
3017 | 2.28k | { |
3018 | 2.28k | TY_(FreeNode)( doc, node); |
3019 | 2.28k | } |
3020 | | |
3021 | 4.07k | continue; |
3022 | 4.32k | } |
3023 | | |
3024 | | /* Deal with some other element that we're not expecting. */ |
3025 | 23.8k | if (TY_(nodeIsElement)(node)) |
3026 | 15.1k | { |
3027 | 15.1k | if (node->tag && node->tag->model & CM_HEAD) |
3028 | 374 | { |
3029 | 374 | MoveToHead(doc, html, node); |
3030 | 374 | continue; |
3031 | 374 | } |
3032 | | |
3033 | | /* Discard illegal frame element following a frameset. */ |
3034 | 14.7k | if ( frameset != NULL && nodeIsFRAME(node) ) |
3035 | 753 | { |
3036 | 753 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
3037 | 753 | TY_(FreeNode)(doc, node); |
3038 | 753 | continue; |
3039 | 753 | } |
3040 | 14.7k | } |
3041 | | /* Everything else is pushed back before we pick noframes (frameset document) or an inferred body. */ |
3042 | 22.7k | TY_(UngetToken)( doc ); |
3043 | | |
3044 | | /* Insert other content into noframes element. */ |
3045 | 22.7k | if (frameset) |
3046 | 3.93k | { |
3047 | 3.93k | if (noframes == NULL) |
3048 | 2.18k | { |
3049 | 2.18k | noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES); |
3050 | 2.18k | TY_(InsertNodeAtEnd)(frameset, noframes); |
3051 | 2.18k | } |
3052 | 1.75k | else |
3053 | 1.75k | { |
3054 | 1.75k | TY_(Report)(doc, html, node, NOFRAMES_CONTENT); |
3055 | 1.75k | if (noframes->type == StartEndTag) |
3056 | 39 | noframes->type = StartTag; |
3057 | 1.75k | } |
3058 | | |
3059 | 3.93k | TY_(ConstrainVersion)(doc, VERS_FRAMESET); |
3060 | 3.93k | state = STATE_PARSE_NOFRAMES; |
3061 | 3.93k | continue; |
3062 | 3.93k | } |
3063 | | |
3064 | 18.8k | node = TY_(InferredTag)(doc, TidyTag_BODY); |
3065 | | |
3066 | | /* Issue #132 - disable inserting BODY tag warning |
3067 | | BUT only if NOT --show-body-only yes */ |
3068 | 37.6k | if (!showingBodyOnly(doc)) |
3069 | 18.8k | TY_(Report)(doc, html, node, INSERTING_TAG ); |
3070 | | |
3071 | 18.8k | TY_(ConstrainVersion)(doc, ~VERS_FRAMESET); |
3072 | 18.8k | state = STATE_PARSE_BODY; |
3073 | 18.8k | continue; |
3074 | 22.7k | } break; |
3075 | | |
3076 | | |
3077 | | /************************************************************** |
3078 | | In this case, we're ready to parse the head, and move on to |
3079 | | look for the body or body alternative. |
3080 | | **************************************************************/ |
3081 | 24.5k | case STATE_PARSE_HEAD: |
3082 | 24.5k | { |
3083 | 24.5k | TidyParserMemory memory = {0}; |
3084 | 24.5k | memory.identity = TY_(ParseHTML); |
3085 | 24.5k | memory.mode = mode; |
3086 | 24.5k | memory.original_node = html; |
3087 | 24.5k | memory.reentry_node = node; |
3088 | 24.5k | memory.reentry_mode = mode; |
3089 | 24.5k | memory.reentry_state = STATE_PARSE_HEAD_REENTER; |
3090 | 24.5k | TY_(InsertNodeAtEnd)(html, node); |
3091 | 24.5k | TY_(pushMemory)( doc, memory ); |
3092 | 24.5k | DEBUG_LOG_EXIT_WITH_NODE(node); |
3093 | 24.5k | return node; |
3094 | 22.7k | } break; |
3095 | | /* On resuming, node holds the reentry_node saved above, i.e. the head that was handed back. */ |
3096 | 24.1k | case STATE_PARSE_HEAD_REENTER: |
3097 | 24.1k | { |
3098 | 24.1k | head = node; |
3099 | 24.1k | state = STATE_PRE_BODY; |
3100 | 24.1k | } break; |
3101 | | |
3102 | | |
3103 | | /************************************************************** |
3104 | | In this case, we can finally parse a body. |
3105 | | **************************************************************/ |
3106 | 22.6k | case STATE_PARSE_BODY: |
3107 | 22.6k | { |
3108 | 22.6k | TidyParserMemory memory = {0}; |
3109 | 22.6k | memory.identity = NULL; /* we don't need to reenter */ |
3110 | 22.6k | memory.mode = mode; |
3111 | 22.6k | memory.original_node = html; |
3112 | 22.6k | memory.reentry_node = NULL; |
3113 | 22.6k | memory.reentry_mode = mode; |
3114 | 22.6k | memory.reentry_state = STATE_COMPLETE; |
3115 | 22.6k | TY_(InsertNodeAtEnd)(html, node); |
3116 | 22.6k | TY_(pushMemory)( doc, memory ); |
3117 | 22.6k | DEBUG_LOG_EXIT_WITH_NODE(node); |
3118 | 22.6k | return node; |
3119 | 22.7k | } break; |
3120 | | |
3121 | | |
3122 | | /************************************************************** |
3123 | | In this case, we will parse noframes. If necessary, the |
3124 | | node is already inserted in the proper spot. |
3125 | | **************************************************************/ |
3126 | 6.26k | case STATE_PARSE_NOFRAMES: |
3127 | 6.26k | { |
3128 | 6.26k | TidyParserMemory memory = {0}; |
3129 | 6.26k | memory.identity = TY_(ParseHTML); |
3130 | 6.26k | memory.mode = mode; |
3131 | 6.26k | memory.original_node = html; |
3132 | 6.26k | memory.reentry_node = frameset; |
3133 | 6.26k | memory.reentry_mode = mode; |
3134 | 6.26k | memory.reentry_state = STATE_PARSE_NOFRAMES_REENTER; |
3135 | 6.26k | TY_(pushMemory)( doc, memory ); |
3136 | 6.26k | DEBUG_LOG_EXIT_WITH_NODE(node); /* NOTE(review): this logs node, while noframes is what gets returned below. */ |
3137 | 6.26k | return noframes; |
3138 | 22.7k | } break; |
3139 | | /* NOTE(review): frameset comes back through reentry_node, but noframes is not restored here and so is NULL again after this state - confirm that is intended. */ |
3140 | 5.66k | case STATE_PARSE_NOFRAMES_REENTER: |
3141 | 5.66k | { |
3142 | 5.66k | frameset = node; |
3143 | 5.66k | state = STATE_PRE_BODY; |
3144 | 5.66k | } break; |
3145 | | |
3146 | | |
3147 | | /************************************************************** |
3148 | | In this case, we parse the frameset, and look for noframes |
3149 | | content to merge later if necessary. |
3150 | | **************************************************************/ |
3151 | 4.96k | case STATE_PARSE_FRAMESET: |
3152 | 4.96k | { |
3153 | 4.96k | TidyParserMemory memory = {0}; |
3154 | 4.96k | memory.identity = TY_(ParseHTML); |
3155 | 4.96k | memory.mode = mode; |
3156 | 4.96k | memory.original_node = html; |
3157 | 4.96k | memory.reentry_node = frameset; |
3158 | 4.96k | memory.reentry_mode = mode; |
3159 | 4.96k | memory.reentry_state = STATE_PARSE_FRAMESET_REENTER; |
3160 | 4.96k | TY_(InsertNodeAtEnd)(html, node); |
3161 | 4.96k | TY_(pushMemory)( doc, memory ); |
3162 | 4.96k | DEBUG_LOG_EXIT_WITH_NODE(node); |
3163 | 4.96k | return node; |
3164 | 22.7k | } break; |
3165 | | /* On resuming, node holds the frameset saved as reentry_node above. */ |
3166 | 4.95k | case (STATE_PARSE_FRAMESET_REENTER): |
3167 | 4.95k | { |
3168 | 4.95k | frameset = node; |
3169 | | /* |
3170 | | See if it includes a noframes element so that |
3171 | | we can merge subsequent noframes elements (the loop does not stop early, so the last match wins). |
3172 | | */ |
3173 | 1.41M | for (node = frameset->content; node; node = node->next) |
3174 | 1.41M | { |
3175 | 1.41M | if ( nodeIsNOFRAMES(node) ) |
3176 | 1.40M | noframes = node; |
3177 | 1.41M | } |
3178 | 4.95k | state = STATE_PRE_BODY; |
3179 | 4.95k | } break; |
3180 | | |
3181 | | |
3182 | | /************************************************************** |
3183 | | We really shouldn't get here, but if we do, finish nicely. |
3184 | | **************************************************************/ |
3185 | 0 | default: |
3186 | 0 | { |
3187 | 0 | state = STATE_COMPLETE; |
3188 | 0 | } |
3189 | 162k | } /* switch */ |
3190 | 162k | } /* while */ |
3191 | | |
3192 | 944 | DEBUG_LOG_EXIT; |
3193 | 944 | return NULL; |
3194 | 59.3k | } |
3195 | | |
3196 | | |
3197 | | /** MARK: TY_(ParseInline) |
3198 | | * Parse inline element nodes. |
3199 | | * |
3200 | | * This is a non-recursing parser. It uses the document's parser memory stack |
3201 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
3202 | | * This parser is also re-enterable, so that post-processing can occur after |
3203 | | * such dispatching. |
3204 | | */ |
3205 | | Node* TY_(ParseInline)( TidyDocImpl *doc, Node *element, GetTokenMode mode ) |
3206 | 2.56M | { |
3207 | 2.56M | Lexer* lexer = doc->lexer; |
3208 | 2.56M | Node *node = NULL; |
3209 | 2.56M | Node *parent = NULL; |
3210 | 2.56M | DEBUG_LOG_COUNTERS; |
3211 | | |
3212 | 2.56M | if ( element == NULL ) |
3213 | 1.19M | { |
3214 | 1.19M | TidyParserMemory memory = TY_(popMemory)( doc ); |
3215 | 1.19M | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
3216 | 1.19M | DEBUG_LOG_REENTER_WITH_NODE(node); |
3217 | 1.19M | element = memory.original_node; |
3218 | 1.19M | DEBUG_LOG_GET_OLD_MODE; |
3219 | 1.19M | mode = memory.reentry_mode; |
3220 | 1.19M | DEBUG_LOG_CHANGE_MODE; |
3221 | 1.19M | } |
3222 | 1.37M | else |
3223 | 1.37M | { |
3224 | 1.37M | DEBUG_LOG_ENTER_WITH_NODE(element); |
3225 | | |
3226 | 1.37M | if (element->tag->model & CM_EMPTY) |
3227 | 0 | { |
3228 | 0 | DEBUG_LOG_EXIT; |
3229 | 0 | return NULL; |
3230 | 0 | } |
3231 | | |
3232 | | /* |
3233 | | ParseInline is used for some block level elements like H1 to H6 |
3234 | | For such elements we need to insert inline emphasis tags currently |
3235 | | on the inline stack. For Inline elements, we normally push them |
3236 | | onto the inline stack provided they aren't implicit or OBJECT/APPLET. |
3237 | | This test is carried out in PushInline and PopInline, see istack.c |
3238 | | |
3239 | | InlineDup(...) is not called for elements with a CM_MIXED (inline and |
3240 | | block) content model, e.g. <del> or <ins>, otherwise constructs like |
3241 | | |
3242 | | <p>111<a name='foo'>222<del>333</del>444</a>555</p> |
3243 | | <p>111<span>222<del>333</del>444</span>555</p> |
3244 | | <p>111<em>222<del>333</del>444</em>555</p> |
3245 | | |
3246 | | will get corrupted. |
3247 | | */ |
3248 | 1.37M | if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) && |
3249 | 69.6k | !TY_(nodeHasCM)(element, CM_MIXED)) |
3250 | 63.4k | TY_(InlineDup)(doc, NULL); |
3251 | 1.31M | else if (TY_(nodeHasCM)(element, CM_INLINE)) |
3252 | 1.30M | TY_(PushInline)(doc, element); |
3253 | | |
3254 | 1.37M | if ( nodeIsNOBR(element) ) |
3255 | 1.18k | doc->badLayout |= USING_NOBR; |
3256 | 1.37M | else if ( nodeIsFONT(element) ) |
3257 | 1.05M | doc->badLayout |= USING_FONT; |
3258 | | |
3259 | | /* Inline elements may or may not be within a preformatted element */ |
3260 | 1.37M | if (mode != Preformatted) |
3261 | 1.37M | { |
3262 | 1.37M | DEBUG_LOG_GET_OLD_MODE; |
3263 | 1.37M | mode = MixedContent; |
3264 | 1.37M | DEBUG_LOG_CHANGE_MODE; |
3265 | 1.37M | } |
3266 | 1.37M | } |
3267 | | |
3268 | 2.71M | while ((node = TY_(GetToken)(doc, mode)) != NULL) |
3269 | 1.86M | { |
3270 | | /* end tag for current element */ |
3271 | 1.86M | if (node->tag == element->tag && node->type == EndTag) |
3272 | 11.2k | { |
3273 | 11.2k | if (element->tag->model & CM_INLINE) |
3274 | 8.02k | TY_(PopInline)( doc, node ); |
3275 | | |
3276 | 11.2k | TY_(FreeNode)( doc, node ); |
3277 | | |
3278 | 11.2k | if (!(mode & Preformatted)) |
3279 | 11.2k | TrimSpaces(doc, element); |
3280 | | |
3281 | | /* |
3282 | | if a font element wraps an anchor and nothing else |
3283 | | then move the font element inside the anchor since |
3284 | | otherwise it won't alter the anchor text color |
3285 | | */ |
3286 | 11.2k | if ( nodeIsFONT(element) && |
3287 | 2.16k | element->content && element->content == element->last ) |
3288 | 1.69k | { |
3289 | 1.69k | Node *child = element->content; |
3290 | | |
3291 | 1.69k | if ( nodeIsA(child) ) |
3292 | 989 | { |
3293 | 989 | child->parent = element->parent; |
3294 | 989 | child->next = element->next; |
3295 | 989 | child->prev = element->prev; |
3296 | | |
3297 | 989 | element->next = NULL; |
3298 | 989 | element->prev = NULL; |
3299 | 989 | element->parent = child; |
3300 | | |
3301 | 989 | element->content = child->content; |
3302 | 989 | element->last = child->last; |
3303 | 989 | child->content = element; |
3304 | | |
3305 | 989 | TY_(FixNodeLinks)(child); |
3306 | 989 | TY_(FixNodeLinks)(element); |
3307 | 989 | } |
3308 | 1.69k | } |
3309 | | |
3310 | 11.2k | element->closed = yes; |
3311 | 11.2k | TrimSpaces( doc, element ); |
3312 | | |
3313 | 11.2k | DEBUG_LOG_EXIT; |
3314 | 11.2k | return NULL; |
3315 | 11.2k | } |
3316 | | |
3317 | | /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */ |
3318 | | /* (see additional conditions below) */ |
3319 | | /* otherwise emphasis nesting is probably unintentional */ |
3320 | | /* big, small, sub, sup have a cumulative effect, so leave them alone */ |
3321 | 1.85M | if ( node->type == StartTag |
3322 | 1.70M | && node->tag == element->tag |
3323 | 1.10M | && TY_(IsPushed)( doc, node ) |
3324 | 1.05M | && !node->implicit |
3325 | 135k | && !element->implicit |
3326 | 132k | && node->tag && (node->tag->model & CM_INLINE) |
3327 | 1.85M | && !nodeIsA(node) |
3328 | 1.85M | && !nodeIsFONT(node) |
3329 | 1.85M | && !nodeIsBIG(node) |
3330 | 1.85M | && !nodeIsSMALL(node) |
3331 | 1.85M | && !nodeIsSUB(node) |
3332 | 1.85M | && !nodeIsSUP(node) |
3333 | 1.85M | && !nodeIsQ(node) |
3334 | 1.85M | && !nodeIsSPAN(node) |
3335 | 11.5k | && cfgBool(doc, TidyCoerceEndTags) |
3336 | 1.85M | ) |
3337 | 11.5k | { |
3338 | | /* proceeds only if "node" does not have any attribute and |
3339 | | follows a text node not finishing with a space */ |
3340 | 11.5k | if (element->content != NULL && node->attributes == NULL |
3341 | 3.58k | && TY_(nodeIsText)(element->last) |
3342 | 2.42k | && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) ) |
3343 | 1.53k | { |
3344 | 1.53k | TY_(Report)(doc, element, node, COERCE_TO_ENDTAG); |
3345 | 1.53k | node->type = EndTag; |
3346 | 1.53k | TY_(UngetToken)(doc); |
3347 | 1.53k | continue; |
3348 | 1.53k | } |
3349 | | |
3350 | 9.99k | if (node->attributes == NULL || element->attributes == NULL) |
3351 | 8.27k | TY_(Report)(doc, element, node, NESTED_EMPHASIS); |
3352 | 9.99k | } |
3353 | 1.84M | else if ( TY_(IsPushed)(doc, node) && node->type == StartTag && |
3354 | 1.19M | nodeIsQ(node) ) |
3355 | 17.2k | { |
3356 | | /*\ |
3357 | | * Issue #215 - such nested quotes are NOT a problem if HTML5, so |
3358 | | * only issue this warning if NOT HTML5 mode. |
3359 | | \*/ |
3360 | 17.2k | if (TY_(HTMLVersion)(doc) != HT50) |
3361 | 17.2k | { |
3362 | 17.2k | TY_(Report)(doc, element, node, NESTED_QUOTATION); |
3363 | 17.2k | } |
3364 | 17.2k | } |
3365 | | |
3366 | 1.85M | if ( TY_(nodeIsText)(node) ) |
3367 | 77.0k | { |
3368 | | /* only called for 1st child */ |
3369 | 77.0k | if ( element->content == NULL && !(mode & Preformatted) ) |
3370 | 53.7k | TrimSpaces( doc, element ); |
3371 | | |
3372 | 77.0k | if ( node->start >= node->end ) |
3373 | 278 | { |
3374 | 278 | TY_(FreeNode)( doc, node ); |
3375 | 278 | continue; |
3376 | 278 | } |
3377 | | |
3378 | 76.7k | TY_(InsertNodeAtEnd)(element, node); |
3379 | 76.7k | continue; |
3380 | 77.0k | } |
3381 | | |
3382 | | /* mixed content model so allow text */ |
3383 | 1.77M | if (InsertMisc(element, node)) |
3384 | 8.95k | continue; |
3385 | | |
3386 | | /* deal with HTML tags */ |
3387 | 1.77M | if ( nodeIsHTML(node) ) |
3388 | 4.22k | { |
3389 | 4.22k | if ( TY_(nodeIsElement)(node) ) |
3390 | 3.87k | { |
3391 | 3.87k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
3392 | 3.87k | TY_(FreeNode)( doc, node ); |
3393 | 3.87k | continue; |
3394 | 3.87k | } |
3395 | | |
3396 | | /* otherwise infer end of inline element */ |
3397 | 348 | TY_(UngetToken)( doc ); |
3398 | | |
3399 | 348 | if (!(mode & Preformatted)) |
3400 | 348 | TrimSpaces(doc, element); |
3401 | | |
3402 | 348 | DEBUG_LOG_EXIT; |
3403 | 348 | return NULL; |
3404 | 4.22k | } |
3405 | | |
3406 | | /* within <dt> or <pre> map <p> to <br> */ |
3407 | 1.76M | if ( nodeIsP(node) && |
3408 | 205k | node->type == StartTag && |
3409 | 194k | ( (mode & Preformatted) || |
3410 | 194k | nodeIsDT(element) || |
3411 | 193k | DescendantOf(element, TidyTag_DT ) |
3412 | 194k | ) |
3413 | 1.76M | ) |
3414 | 1.53k | { |
3415 | 1.53k | node->tag = TY_(LookupTagDef)( TidyTag_BR ); |
3416 | 1.53k | TidyDocFree(doc, node->element); |
3417 | 1.53k | node->element = TY_(tmbstrdup)(doc->allocator, "br"); |
3418 | 1.53k | TrimSpaces(doc, element); |
3419 | 1.53k | TY_(InsertNodeAtEnd)(element, node); |
3420 | 1.53k | continue; |
3421 | 1.53k | } |
3422 | | |
3423 | | /* <p> allowed within <address> in HTML 4.01 Transitional */ |
3424 | 1.76M | if ( nodeIsP(node) && |
3425 | 204k | node->type == StartTag && |
3426 | 193k | nodeIsADDRESS(element) ) |
3427 | 0 | { |
3428 | 0 | TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT ); |
3429 | 0 | TY_(InsertNodeAtEnd)(element, node); |
3430 | 0 | (*node->tag->parser)( doc, node, mode ); |
3431 | 0 | continue; |
3432 | 0 | } |
3433 | | |
3434 | | /* ignore unknown and PARAM tags */ |
3435 | 1.76M | if ( node->tag == NULL || nodeIsPARAM(node) ) |
3436 | 45.4k | { |
3437 | 45.4k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3438 | 45.4k | TY_(FreeNode)( doc, node ); |
3439 | 45.4k | continue; |
3440 | 45.4k | } |
3441 | | |
3442 | 1.71M | if ( nodeIsBR(node) && node->type == EndTag ) |
3443 | 2.20k | node->type = StartTag; |
3444 | | |
3445 | 1.71M | if ( node->type == EndTag ) |
3446 | 51.6k | { |
3447 | | /* coerce </br> to <br> */ |
3448 | 51.6k | if ( nodeIsBR(node) ) |
3449 | 0 | node->type = StartTag; |
3450 | 51.6k | else if ( nodeIsP(node) ) |
3451 | 10.8k | { |
3452 | | /* coerce unmatched </p> to <br><br> */ |
3453 | 10.8k | if ( !DescendantOf(element, TidyTag_P) ) |
3454 | 1.66k | { |
3455 | 1.66k | TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); |
3456 | 1.66k | TrimSpaces( doc, element ); |
3457 | 1.66k | TY_(InsertNodeAtEnd)( element, node ); |
3458 | 1.66k | node = TY_(InferredTag)(doc, TidyTag_BR); |
3459 | 1.66k | TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */ |
3460 | 1.66k | continue; |
3461 | 1.66k | } |
3462 | 10.8k | } |
3463 | 40.8k | else if ( TY_(nodeHasCM)(node, CM_INLINE) |
3464 | 40.8k | && !nodeIsA(node) |
3465 | 8.54k | && !TY_(nodeHasCM)(node, CM_OBJECT) |
3466 | 4.87k | && TY_(nodeHasCM)(element, CM_INLINE) ) |
3467 | 4.03k | { |
3468 | | /* allow any inline end tag to end current element */ |
3469 | | |
3470 | | /* http://tidy.sf.net/issue/1426419 */ |
3471 | | /* but, like the browser, retain an earlier inline element. |
3472 | | This is implemented by setting the lexer into a mode |
3473 | | where it gets tokens from the inline stack rather than |
3474 | | from the input stream. Check if the scenerio fits. */ |
3475 | 4.03k | if ( !nodeIsA(element) |
3476 | 3.16k | && (node->tag != element->tag) |
3477 | 3.16k | && TY_(IsPushed)( doc, node ) |
3478 | 2.54k | && TY_(IsPushed)( doc, element ) ) |
3479 | 2.07k | { |
3480 | | /* we have something like |
3481 | | <b>bold <i>bold and italic</b> italics</i> */ |
3482 | 2.07k | if ( TY_(SwitchInline)( doc, element, node ) ) |
3483 | 714 | { |
3484 | 714 | TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG); |
3485 | 714 | TY_(UngetToken)( doc ); /* put this back */ |
3486 | 714 | TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */ |
3487 | 714 | if (!(mode & Preformatted)) |
3488 | 714 | TrimSpaces( doc, element ); |
3489 | | |
3490 | 714 | DEBUG_LOG_EXIT; |
3491 | 714 | return NULL; /* close <i>, but will re-open it, after </b> */ |
3492 | 714 | } |
3493 | 2.07k | } |
3494 | 3.32k | TY_(PopInline)( doc, element ); |
3495 | | |
3496 | 3.32k | if ( !nodeIsA(element) ) |
3497 | 2.45k | { |
3498 | 2.45k | if ( nodeIsA(node) && node->tag != element->tag ) |
3499 | 0 | { |
3500 | 0 | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); |
3501 | 0 | TY_(UngetToken)( doc ); |
3502 | 0 | } |
3503 | 2.45k | else |
3504 | 2.45k | { |
3505 | 2.45k | TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG); |
3506 | 2.45k | TY_(FreeNode)( doc, node); |
3507 | 2.45k | } |
3508 | | |
3509 | 2.45k | if (!(mode & Preformatted)) |
3510 | 2.45k | TrimSpaces(doc, element); |
3511 | | |
3512 | 2.45k | DEBUG_LOG_EXIT; |
3513 | 2.45k | return NULL; |
3514 | 2.45k | } |
3515 | | |
3516 | | /* if parent is <a> then discard unexpected inline end tag */ |
3517 | 870 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3518 | 870 | TY_(FreeNode)( doc, node); |
3519 | 870 | continue; |
3520 | 3.32k | } /* special case </tr> etc. for stuff moved in front of table */ |
3521 | 36.7k | else if ( lexer->exiled |
3522 | 6.63k | && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) |
3523 | 1.01k | { |
3524 | 1.01k | TY_(UngetToken)( doc ); |
3525 | 1.01k | TrimSpaces(doc, element); |
3526 | | |
3527 | 1.01k | DEBUG_LOG_EXIT; |
3528 | 1.01k | return NULL; |
3529 | 1.01k | } |
3530 | 51.6k | } |
3531 | | |
3532 | | /* allow any header tag to end current header */ |
3533 | 1.71M | if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) ) |
3534 | 2.07k | { |
3535 | | |
3536 | 2.07k | if ( node->tag == element->tag ) |
3537 | 783 | { |
3538 | 783 | TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG ); |
3539 | 783 | TY_(FreeNode)( doc, node); |
3540 | 783 | } |
3541 | 1.29k | else |
3542 | 1.29k | { |
3543 | 1.29k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); |
3544 | 1.29k | TY_(UngetToken)( doc ); |
3545 | 1.29k | } |
3546 | | |
3547 | 2.07k | if (!(mode & Preformatted)) |
3548 | 2.07k | TrimSpaces(doc, element); |
3549 | | |
3550 | 2.07k | DEBUG_LOG_EXIT; |
3551 | 2.07k | return NULL; |
3552 | 2.07k | } |
3553 | | |
3554 | | /* |
3555 | | an <A> tag ends any open <A> element |
3556 | | but <A href=...> is mapped to </A><A href=...> |
3557 | | */ |
3558 | | /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ |
3559 | | /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */ |
3560 | 1.71M | if ( nodeIsA(node) && !node->implicit && |
3561 | 37.3k | (nodeIsA(element) || DescendantOf(element, TidyTag_A)) ) |
3562 | 33.1k | { |
3563 | | /* coerce <a> to </a> unless it has some attributes */ |
3564 | | /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ |
3565 | | /* other fixes by Dave Raggett */ |
3566 | | /* if (node->attributes == NULL) */ |
3567 | 33.1k | if (node->type != EndTag && node->attributes == NULL |
3568 | 3.54k | && cfgBool(doc, TidyCoerceEndTags) ) |
3569 | 3.54k | { |
3570 | 3.54k | node->type = EndTag; |
3571 | 3.54k | TY_(Report)(doc, element, node, COERCE_TO_ENDTAG); |
3572 | | /* TY_(PopInline)( doc, node ); */ |
3573 | 3.54k | TY_(UngetToken)( doc ); |
3574 | 3.54k | continue; |
3575 | 3.54k | } |
3576 | | |
3577 | 29.5k | TY_(UngetToken)( doc ); |
3578 | 29.5k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); |
3579 | | /* TY_(PopInline)( doc, element ); */ |
3580 | | |
3581 | 29.5k | if (!(mode & Preformatted)) |
3582 | 29.5k | TrimSpaces(doc, element); |
3583 | | |
3584 | 29.5k | DEBUG_LOG_EXIT; |
3585 | 29.5k | return NULL; |
3586 | 33.1k | } |
3587 | | |
3588 | 1.67M | if (element->tag->model & CM_HEADING) |
3589 | 5.64k | { |
3590 | 5.64k | if ( nodeIsCENTER(node) || nodeIsDIV(node) ) |
3591 | 2.09k | { |
3592 | 2.09k | if (!TY_(nodeIsElement)(node)) |
3593 | 757 | { |
3594 | 757 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3595 | 757 | TY_(FreeNode)( doc, node); |
3596 | 757 | continue; |
3597 | 757 | } |
3598 | | |
3599 | 1.34k | TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN); |
3600 | | |
3601 | | /* insert center as parent if heading is empty */ |
3602 | 1.34k | if (element->content == NULL) |
3603 | 780 | { |
3604 | 780 | InsertNodeAsParent(element, node); |
3605 | 780 | continue; |
3606 | 780 | } |
3607 | | |
3608 | | /* split heading and make center parent of 2nd part */ |
3609 | 562 | TY_(InsertNodeAfterElement)(element, node); |
3610 | | |
3611 | 562 | if (!(mode & Preformatted)) |
3612 | 562 | TrimSpaces(doc, element); |
3613 | | |
3614 | 562 | element = TY_(CloneNode)( doc, element ); |
3615 | 562 | TY_(InsertNodeAtEnd)(node, element); |
3616 | 562 | continue; |
3617 | 1.34k | } |
3618 | | |
3619 | 3.55k | if ( nodeIsHR(node) ) |
3620 | 1.46k | { |
3621 | 1.46k | if ( !TY_(nodeIsElement)(node) ) |
3622 | 166 | { |
3623 | 166 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3624 | 166 | TY_(FreeNode)( doc, node); |
3625 | 166 | continue; |
3626 | 166 | } |
3627 | | |
3628 | 1.29k | TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN); |
3629 | | |
3630 | | /* insert hr before heading if heading is empty */ |
3631 | 1.29k | if (element->content == NULL) |
3632 | 315 | { |
3633 | 315 | TY_(InsertNodeBeforeElement)(element, node); |
3634 | 315 | continue; |
3635 | 315 | } |
3636 | | |
3637 | | /* split heading and insert hr before 2nd part */ |
3638 | 979 | TY_(InsertNodeAfterElement)(element, node); |
3639 | | |
3640 | 979 | if (!(mode & Preformatted)) |
3641 | 979 | TrimSpaces(doc, element); |
3642 | | |
3643 | 979 | element = TY_(CloneNode)( doc, element ); |
3644 | 979 | TY_(InsertNodeAfterElement)(node, element); |
3645 | 979 | continue; |
3646 | 1.29k | } |
3647 | 3.55k | } |
3648 | | |
3649 | 1.67M | if ( nodeIsDT(element) ) |
3650 | 7.70k | { |
3651 | 7.70k | if ( nodeIsHR(node) ) |
3652 | 2.17k | { |
3653 | 2.17k | Node *dd; |
3654 | 2.17k | if ( !TY_(nodeIsElement)(node) ) |
3655 | 207 | { |
3656 | 207 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3657 | 207 | TY_(FreeNode)( doc, node); |
3658 | 207 | continue; |
3659 | 207 | } |
3660 | | |
3661 | 1.96k | TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN); |
3662 | 1.96k | dd = TY_(InferredTag)(doc, TidyTag_DD); |
3663 | | |
3664 | | /* insert hr within dd before dt if dt is empty */ |
3665 | 1.96k | if (element->content == NULL) |
3666 | 1.03k | { |
3667 | 1.03k | TY_(InsertNodeBeforeElement)(element, dd); |
3668 | 1.03k | TY_(InsertNodeAtEnd)(dd, node); |
3669 | 1.03k | continue; |
3670 | 1.03k | } |
3671 | | |
3672 | | /* split dt and insert hr within dd before 2nd part */ |
3673 | 933 | TY_(InsertNodeAfterElement)(element, dd); |
3674 | 933 | TY_(InsertNodeAtEnd)(dd, node); |
3675 | | |
3676 | 933 | if (!(mode & Preformatted)) |
3677 | 933 | TrimSpaces(doc, element); |
3678 | | |
3679 | 933 | element = TY_(CloneNode)( doc, element ); |
3680 | 933 | TY_(InsertNodeAfterElement)(dd, element); |
3681 | 933 | continue; |
3682 | 1.96k | } |
3683 | 7.70k | } |
3684 | | |
3685 | | |
3686 | | /* |
3687 | | if this is the end tag for an ancestor element |
3688 | | then infer end tag for this element |
3689 | | */ |
3690 | 1.67M | if (node->type == EndTag) |
3691 | 21.1k | { |
3692 | 21.1k | for (parent = element->parent; |
3693 | 345k | parent != NULL; parent = parent->parent) |
3694 | 343k | { |
3695 | 343k | if (node->tag == parent->tag) |
3696 | 19.1k | { |
3697 | 19.1k | if (!(element->tag->model & CM_OPT) && !element->implicit) |
3698 | 7.29k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); |
3699 | | |
3700 | 19.1k | if( TY_(IsPushedLast)( doc, element, node ) ) |
3701 | 0 | TY_(PopInline)( doc, element ); |
3702 | 19.1k | TY_(UngetToken)( doc ); |
3703 | | |
3704 | 19.1k | if (!(mode & Preformatted)) |
3705 | 19.1k | TrimSpaces(doc, element); |
3706 | | |
3707 | 19.1k | DEBUG_LOG_EXIT; |
3708 | 19.1k | return NULL; |
3709 | 19.1k | } |
3710 | 343k | } |
3711 | 21.1k | } |
3712 | | |
3713 | | /*\ |
3714 | | * block level tags end this element |
3715 | | * Issue #333 - There seems an exception if the element is a 'span', |
3716 | | * and the node just collected is a 'meta'. The 'meta' can not have |
3717 | | * CM_INLINE added, nor can the 'span' have CM_MIXED added without |
3718 | | * big consequences. |
3719 | | * There may be other exceptions to be added... |
3720 | | \*/ |
3721 | 1.65M | if (!(node->tag->model & CM_INLINE) && |
3722 | 352k | !(element->tag->model & CM_MIXED) && |
3723 | 350k | !(nodeIsSPAN(element) && nodeIsMETA(node)) ) |
3724 | 350k | { |
3725 | 350k | if ( !TY_(nodeIsElement)(node) ) |
3726 | 977 | { |
3727 | 977 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3728 | 977 | TY_(FreeNode)( doc, node); |
3729 | 977 | continue; |
3730 | 977 | } |
3731 | | /* HTML5 */ |
3732 | 349k | if (nodeIsDATALIST(element)) { |
3733 | 0 | TY_(ConstrainVersion)( doc, ~VERS_HTML5 ); |
3734 | 0 | } else |
3735 | 349k | if (!(element->tag->model & CM_OPT)) |
3736 | 297k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); |
3737 | | |
3738 | 349k | if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK)) |
3739 | 1.96k | { |
3740 | 1.96k | MoveToHead(doc, element, node); |
3741 | 1.96k | continue; |
3742 | 1.96k | } |
3743 | | |
3744 | | /* |
3745 | | prevent anchors from propagating into block tags |
3746 | | except for headings h1 to h6 |
3747 | | */ |
3748 | 347k | if ( nodeIsA(element) ) |
3749 | 7.83k | { |
3750 | 7.83k | if (node->tag && !(node->tag->model & CM_HEADING)) |
3751 | 7.29k | TY_(PopInline)( doc, element ); |
3752 | 541 | else if (!(element->content)) |
3753 | 282 | { |
3754 | 282 | TY_(DiscardElement)( doc, element ); |
3755 | 282 | TY_(UngetToken)( doc ); |
3756 | | |
3757 | 282 | DEBUG_LOG_EXIT; |
3758 | 282 | return NULL; |
3759 | 282 | } |
3760 | 7.83k | } |
3761 | | |
3762 | 347k | TY_(UngetToken)( doc ); |
3763 | | |
3764 | 347k | if (!(mode & Preformatted)) |
3765 | 347k | TrimSpaces(doc, element); |
3766 | | |
3767 | 347k | DEBUG_LOG_EXIT; |
3768 | 347k | return NULL; |
3769 | 347k | } |
3770 | | |
3771 | | /* parse inline element */ |
3772 | 1.30M | if (TY_(nodeIsElement)(node)) |
3773 | 1.30M | { |
3774 | 1.30M | if (node->implicit) |
3775 | 1.06M | TY_(Report)(doc, element, node, INSERTING_TAG); |
3776 | | |
3777 | | /* trim white space before <br> */ |
3778 | 1.30M | if ( nodeIsBR(node) ) |
3779 | 5.97k | TrimSpaces(doc, element); |
3780 | | |
3781 | 1.30M | TY_(InsertNodeAtEnd)(element, node); |
3782 | | |
3783 | 1.30M | { |
3784 | 1.30M | TidyParserMemory memory = {0}; |
3785 | 1.30M | memory.identity = TY_(ParseInline); |
3786 | 1.30M | memory.original_node = element; |
3787 | 1.30M | memory.reentry_node = node; |
3788 | 1.30M | memory.mode = mode; |
3789 | 1.30M | memory.reentry_mode = mode; |
3790 | 1.30M | TY_(pushMemory)( doc, memory ); |
3791 | 1.30M | DEBUG_LOG_EXIT_WITH_NODE(node); |
3792 | 1.30M | return node; |
3793 | 1.30M | } |
3794 | 1.30M | } |
3795 | | |
3796 | | /* discard unexpected tags */ |
3797 | 1.03k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3798 | 1.03k | TY_(FreeNode)( doc, node ); |
3799 | 1.03k | continue; |
3800 | 1.30M | } |
3801 | | |
3802 | 850k | if (!(element->tag->model & CM_OPT)) |
3803 | 845k | TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR); |
3804 | | |
3805 | 850k | DEBUG_LOG_EXIT; |
3806 | 850k | return NULL; |
3807 | 2.56M | } |
3808 | | |
3809 | | |
3810 | | /** MARK: TY_(ParseList) |
3811 | | * Parses the content of a list element, reading tokens until the list's own end tag or the end of input. |
3812 | | * A token that is not an <li> is inserted directly, handed to the last <li>, or wrapped in an inferred <li>. |
3813 | | * This is a non-recursing parser. It uses the document's parser memory stack |
3814 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
3815 | | * This parser is also re-enterable: when called with list == NULL it pops its saved state from that stack. |
3816 | | * Returns the node the controller should dispatch next, or NULL when this parser is finished with the list. |
3817 | | */ |
3818 | | Node* TY_(ParseList)( TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode) ) |
3819 | 75.2k | { |
3820 | 75.2k | Lexer* lexer = doc->lexer; |
3821 | 75.2k | Node *node = NULL; |
3822 | 75.2k | Node *parent = NULL; |
3823 | 75.2k | Node *lastli = NULL;; |
3824 | 75.2k | Bool wasblock = no; |
3825 | 75.2k | Bool nodeisOL = nodeIsOL(list); /* NOTE(review): evaluated on the incoming argument, before a NULL list is restored below - confirm this is intended on re-entry */ |
3826 | 75.2k | DEBUG_LOG_COUNTERS; |
3827 | | 
3828 | 75.2k | if ( list == NULL ) |
3829 | 40.2k | { |
3830 | 40.2k | TidyParserMemory memory = TY_(popMemory)( doc ); |
3831 | 40.2k | node = memory.reentry_node; /* Throwaway, as main loop overwrites anyway. */ |
3832 | 40.2k | DEBUG_LOG_REENTER_WITH_NODE(node); |
3833 | 40.2k | list = memory.original_node; |
3834 | 40.2k | DEBUG_LOG_GET_OLD_MODE; |
3835 | 40.2k | mode = memory.mode; |
3836 | 40.2k | DEBUG_LOG_CHANGE_MODE; |
3837 | 40.2k | } |
3838 | 35.0k | else |
3839 | 35.0k | { |
3840 | 35.0k | DEBUG_LOG_ENTER_WITH_NODE(list); |
3841 | | 
3842 | 35.0k | if (list->tag->model & CM_EMPTY) /* first entry only: nothing is parsed for a tag flagged CM_EMPTY */ |
3843 | 0 | { |
3844 | 0 | DEBUG_LOG_EXIT; |
3845 | 0 | return NULL; |
3846 | 0 | } |
3847 | 35.0k | } |
3848 | | 
3849 | 75.2k | lexer->insert = NULL; /* defer implicit inline start tags */ |
3850 | | 
3851 | 82.0k | while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL) |
3852 | 56.3k | { |
3853 | 56.3k | Bool foundLI = no; |
3854 | 56.3k | if (node->tag == list->tag && node->type == EndTag) /* the list's own end tag: mark it closed and finish */ |
3855 | 709 | { |
3856 | 709 | TY_(FreeNode)( doc, node); |
3857 | 709 | list->closed = yes; |
3858 | 709 | DEBUG_LOG_EXIT; |
3859 | 709 | return NULL; |
3860 | 709 | } |
3861 | | 
3862 | | /* deal with comments etc. */ |
3863 | 55.6k | if (InsertMisc(list, node)) |
3864 | 1.27k | continue; |
3865 | | 
3866 | 54.3k | if (node->type != TextNode && node->tag == NULL) /* a non-text token without a tag is discarded */ |
3867 | 3.01k | { |
3868 | 3.01k | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
3869 | 3.01k | TY_(FreeNode)( doc, node); |
3870 | 3.01k | continue; |
3871 | 3.01k | } |
3872 | 51.3k | if (lexer && (node->type == TextNode)) |
3873 | 12.2k | { |
3874 | 12.2k | uint ch, ix = node->start; |
3875 | | /* Issue #572 - Skip whitespace. The scan also stops at a zero byte. */ |
3876 | 13.2k | while (ix < node->end && (ch = (lexer->lexbuf[ix] & 0xff)) |
3877 | 12.6k | && (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n')) |
3878 | 1.08k | ++ix; |
3879 | 12.2k | if (ix >= node->end) |
3880 | 302 | { |
3881 | | /* Issue #572 - Discard if ALL whitespace. */ |
3882 | 302 | TY_(FreeNode)(doc, node); |
3883 | 302 | continue; |
3884 | 302 | } |
3885 | 12.2k | } |
3886 | | 
3887 | | 
3888 | | /* |
3889 | | if this is the end tag for an ancestor element |
3890 | | then infer end tag for this element |
3891 | | */ |
3892 | 51.0k | if (node->type == EndTag) |
3893 | 2.49k | { |
3894 | 2.49k | if ( nodeIsFORM(node) ) |
3895 | 221 | { |
3896 | 221 | BadForm( doc ); |
3897 | 221 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
3898 | 221 | TY_(FreeNode)( doc, node ); |
3899 | 221 | continue; |
3900 | 221 | } |
3901 | | 
3902 | 2.27k | if (TY_(nodeHasCM)(node,CM_INLINE)) /* inline end tags are dropped here; the inline stack is popped as well */ |
3903 | 705 | { |
3904 | 705 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
3905 | 705 | TY_(PopInline)( doc, node ); |
3906 | 705 | TY_(FreeNode)( doc, node); |
3907 | 705 | continue; |
3908 | 705 | } |
3909 | | 
3910 | 1.56k | for ( parent = list->parent; |
3911 | 47.3k | parent != NULL; parent = parent->parent ) |
3912 | 46.8k | { |
3913 | | /* Do not match across BODY to avoid infinite loop |
3914 | | between ParseBody and this parser, |
3915 | | See http://tidy.sf.net/bug/1053626. */ |
3916 | 46.8k | if (nodeIsBODY(parent)) |
3917 | 700 | break; |
3918 | 46.1k | if (node->tag == parent->tag) /* end tag of an ancestor: put it back and let the caller handle it */ |
3919 | 311 | { |
3920 | 311 | TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE); |
3921 | 311 | TY_(UngetToken)( doc ); |
3922 | 311 | DEBUG_LOG_EXIT; |
3923 | 311 | return NULL; |
3924 | 311 | } |
3925 | 46.1k | } |
3926 | | 
3927 | 1.25k | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); /* end tag matching no ancestor below BODY */ |
3928 | 1.25k | TY_(FreeNode)( doc, node); |
3929 | 1.25k | continue; |
3930 | 1.56k | } |
3931 | | 
3932 | 48.5k | if ( !nodeIsLI(node) && nodeisOL ) |
3933 | 11.8k | { |
3934 | | /* Issue #572 - A <ol><li> can have nested <ol> elements */ |
3935 | 11.8k | foundLI = FindLastLI(list, &lastli); /* find last <li> */ |
3936 | 11.8k | } |
3937 | | 
3938 | 48.5k | if ( nodeIsLI(node) || (TY_(IsHTML5Mode)(doc) && !foundLI) ) |
3939 | 26.4k | { |
3940 | | /* node is <LI> OR |
3941 | | Issue #396 - A <ul> can have Zero or more <li> elements |
3942 | | */ |
3943 | 26.4k | TY_(InsertNodeAtEnd)(list,node); |
3944 | 26.4k | } |
3945 | 22.1k | else |
3946 | 22.1k | { |
3947 | 22.1k | TY_(UngetToken)( doc ); /* the token is re-read later by whichever parser runs next */ |
3948 | | 
3949 | 22.1k | if (TY_(nodeHasCM)(node,CM_BLOCK) && lexer->excludeBlocks) |
3950 | 512 | { |
3951 | 512 | TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE); |
3952 | 512 | DEBUG_LOG_EXIT; |
3953 | 512 | return NULL; |
3954 | 512 | } |
3955 | | /* http://tidy.sf.net/issue/1316307 */ |
3956 | | /* In exiled mode, return so table processing can continue. */ |
3957 | 21.6k | else if ( lexer->exiled |
3958 | 8.39k | && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW) |
3959 | 8.39k | || nodeIsTABLE(node)) ) |
3960 | 4.18k | { |
3961 | 4.18k | DEBUG_LOG_EXIT; |
3962 | 4.18k | return NULL; |
3963 | 4.18k | } |
3964 | | /* http://tidy.sf.net/issue/836462 |
3965 | | If "list" is an ordered list (<ol>), insert the next tag within |
3966 | | the last <li> to preserve the numbering to match the visual |
3967 | | rendering of most browsers. */ |
3968 | 17.4k | if ( nodeIsOL(list) && FindLastLI(list, &lastli) ) |
3969 | 381 | { |
3970 | | /* Create a node for error reporting */ |
3971 | 381 | node = TY_(InferredTag)(doc, TidyTag_LI); |
3972 | 381 | TY_(Report)(doc, list, node, MISSING_STARTTAG ); |
3973 | 381 | TY_(FreeNode)( doc, node); |
3974 | 381 | node = lastli; /* the existing last <li> is what gets returned for parsing */ |
3975 | 381 | } |
3976 | 17.0k | else |
3977 | 17.0k | { |
3978 | | /* Add an inferred <li> */ |
3979 | 17.0k | wasblock = TY_(nodeHasCM)(node,CM_BLOCK); |
3980 | 17.0k | node = TY_(InferredTag)(doc, TidyTag_LI); |
3981 | | /* Add "display: inline" to avoid a blank line after <li> with |
3982 | | Internet Explorer. See http://tidy.sf.net/issue/836462 */ |
3983 | 17.0k | TY_(AddStyleProperty)( doc, node, |
3984 | 17.0k | wasblock |
3985 | 17.0k | ? "list-style: none; display: inline" |
3986 | 17.0k | : "list-style: none" |
3987 | 17.0k | ); |
3988 | 17.0k | TY_(Report)(doc, list, node, MISSING_STARTTAG ); |
3989 | 17.0k | TY_(InsertNodeAtEnd)(list,node); |
3990 | 17.0k | } |
3991 | 17.4k | } |
3992 | | 
3993 | 43.8k | { |
3994 | 43.8k | TidyParserMemory memory = {0}; /* saved so a later call with list == NULL can resume this list */ |
3995 | 43.8k | memory.identity = TY_(ParseList); |
3996 | 43.8k | memory.original_node = list; |
3997 | 43.8k | memory.reentry_node = node; |
3998 | 43.8k | memory.mode = IgnoreWhitespace; |
3999 | 43.8k | TY_(pushMemory)( doc, memory ); |
4000 | 43.8k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4001 | 43.8k | return node; |
4002 | 48.5k | } |
4003 | 48.5k | } |
4004 | | 
4005 | 25.6k | TY_(Report)(doc, list, node, MISSING_ENDTAG_FOR); /* tokens ran out before the list's end tag; node is NULL here */ |
4006 | 25.6k | DEBUG_LOG_EXIT; |
4007 | 25.6k | return NULL; |
4008 | 75.2k | } |
4009 | | |
4010 | | |
4011 | | /** MARK: TY_(ParseNamespace) |
4012 | | * Act as a generic XML (sub)tree parser: collect each node and add it |
4013 | | * to the DOM, without any further validation. It's useful for tags that |
4014 | | * have XML-like content, such as `svg` and `math`. |
4015 | | * |
4016 | | * @note Perhaps this is poorly named, as we're not parsing the namespace |
4017 | | * of a particular tag, but a tag with XML-like content. |
4018 | | * |
4019 | | * @todo Add schema- or other-hierarchy-definition-based validation |
4020 | | * of the subtree here. |
4021 | | * |
4022 | | * This is a non-recursing parser. It builds the whole subtree in a single loop, |
4023 | | * tracking the current parent itself; it pushes nothing onto the document's parser |
4024 | | * memory stack and always returns NULL. The incoming `mode` is ignored: tokens are |
4025 | | * always read in OtherNamespace mode. |
4026 | | */ |
4027 | | Node* TY_(ParseNamespace)( TidyDocImpl* doc, Node *basenode, GetTokenMode mode ) |
4028 | 17.0k | { |
4029 | 17.0k | Lexer* lexer = doc->lexer; |
4030 | 17.0k | Node *node; |
4031 | 17.0k | Node *parent = basenode; /* element that currently receives new children */ |
4032 | 17.0k | uint istackbase; |
4033 | 17.0k | AttVal* av; /* #130 MathML attr and entity fix! */ |
4034 | | 
4035 | | /* a la <table>: defer popping elements off the inline stack */ |
4036 | 17.0k | TY_(DeferDup)( doc ); |
4037 | 17.0k | istackbase = lexer->istackbase; /* saved so it can be put back when basenode's end tag is reached */ |
4038 | 17.0k | lexer->istackbase = lexer->istacksize; |
4039 | | 
4040 | 17.0k | mode = OtherNamespace; /* Preformatted; IgnoreWhitespace; */ |
4041 | | 
4042 | 76.4k | while ((node = TY_(GetToken)(doc, mode)) != NULL) |
4043 | 59.7k | { |
4044 | | /* |
4045 | | fix check to skip action in InsertMisc for regular/empty |
4046 | | nodes, which we don't want here... |
4047 | | 
4048 | | The way we do it here is by checking and processing everything |
4049 | | and only what remains goes into InsertMisc() |
4050 | | */ |
4051 | | 
4052 | | /* is this a close tag? And does it match the current parent node? */ |
4053 | 59.7k | if (node->type == EndTag) |
4054 | 5.72k | { |
4055 | | /* |
4056 | | to prevent end tags flowing from one 'alternate namespace' we |
4057 | | check this in two phases: first we check if the tag is a |
4058 | | descendant of the current node, and when it is, we check whether |
4059 | | it is the end tag for a node /within/ or /outside/ the basenode. |
4060 | | */ |
4061 | 5.72k | Bool outside; |
4062 | 5.72k | Node *mp = FindMatchingDescendant(parent, node, basenode, &outside); |
4063 | | 
4064 | 5.72k | if (mp != NULL) |
4065 | 4.33k | { |
4066 | | /* |
4067 | | when mp != parent as we might expect, |
4068 | | infer end tags until we 'hit' the matched |
4069 | | parent or the basenode |
4070 | | */ |
4071 | 4.33k | Node *n; |
4072 | | 
4073 | 4.33k | for (n = parent; |
4074 | 5.31k | n != NULL && n != basenode->parent && n != mp; |
4075 | 4.33k | n = n->parent) |
4076 | 976 | { |
4077 | | /* n->implicit = yes; */ |
4078 | 976 | n->closed = yes; |
4079 | 976 | TY_(Report)(doc, n->parent, n, MISSING_ENDTAG_BEFORE); |
4080 | 976 | } |
4081 | | 
4082 | | /* Issue #369 - Since 'assert' is DEBUG only, and there are |
4083 | | simple cases where these can be fired, removing them |
4084 | | pending feedback from the original author! |
4085 | | assert(outside == no ? n == mp : 1); |
4086 | | assert(outside == yes ? n == basenode->parent : 1); |
4087 | | =================================================== */ |
4088 | | 
4089 | 4.33k | if (outside == no) |
4090 | 4.33k | { |
4091 | | /* EndTag for a node within the basenode subtree. Roll on... */ |
4092 | 4.33k | if (n) |
4093 | 4.33k | n->closed = yes; |
4094 | 4.33k | TY_(FreeNode)(doc, node); |
4095 | | 
4096 | 4.33k | node = n; /* the element just closed; new children now go to its parent */ |
4097 | 4.33k | parent = node ? node->parent : NULL; |
4098 | 4.33k | } |
4099 | 0 | else |
4100 | 0 | { |
4101 | | /* EndTag for a node outside the basenode subtree: let the caller handle that. */ |
4102 | 0 | TY_(UngetToken)( doc ); |
4103 | 0 | node = basenode; /* forces the node == basenode exit below */ |
4104 | 0 | parent = node->parent; |
4105 | 0 | } |
4106 | | 
4107 | | /* when we've arrived at the end-node for the base node, it's quitting time */ |
4108 | 4.33k | if (node == basenode) |
4109 | 272 | { |
4110 | 272 | lexer->istackbase = istackbase; |
4111 | 272 | assert(basenode && basenode->closed == yes); |
4112 | 272 | return NULL; |
4113 | 272 | } |
4114 | 4.33k | } |
4115 | 1.39k | else |
4116 | 1.39k | { |
4117 | | /* unmatched close tag: report an error and discard */ |
4118 | | /* TY_(Report)(doc, parent, node, NON_MATCHING_ENDTAG); Issue #308 - Seems wrong warning! */ |
4119 | 1.39k | TY_(Report)(doc, parent, node, DISCARDING_UNEXPECTED); |
4120 | 1.39k | assert(parent); |
4121 | | /* assert(parent->tag != node->tag); Issue #308 - Seems would always be true! */ |
4122 | 1.39k | TY_(FreeNode)( doc, node); /* Issue #308 - Discard unexpected end tag memory */ |
4123 | 1.39k | } |
4124 | 5.72k | } |
4125 | 54.0k | else if (node->type == StartTag) |
4126 | 27.1k | { |
4127 | | /* #130 MathML attr and entity fix! |
4128 | | Detach every attribute from its dictionary entry, in case a name accidentally matches a known attribute (presumably so it is not treated as that attribute later - confirm) */ |
4129 | 35.3k | for ( av = node->attributes; av; av = av->next ) |
4130 | 8.25k | { |
4131 | 8.25k | av->dict = 0; /* does something need to be freed? */ |
4132 | 8.25k | } |
4133 | | /* add another child to the current parent */ |
4134 | 27.1k | TY_(InsertNodeAtEnd)(parent, node); |
4135 | 27.1k | parent = node; /* descend: later tokens become children of this start tag */ |
4136 | 27.1k | } |
4137 | 26.9k | else |
4138 | 26.9k | { |
4139 | | /* #130 MathML attr and entity fix! |
4140 | | Same attribute handling as for start tags; any other token type is appended without changing the current parent */ |
4141 | 36.5k | for ( av = node->attributes; av; av = av->next ) |
4142 | 9.60k | { |
4143 | 9.60k | av->dict = 0; /* does something need to be freed? */ |
4144 | 9.60k | } |
4145 | 26.9k | TY_(InsertNodeAtEnd)(parent, node); |
4146 | 26.9k | } |
4147 | 59.7k | } |
4148 | | 
4149 | 16.7k | TY_(Report)(doc, basenode->parent, basenode, MISSING_ENDTAG_FOR); /* NOTE(review): lexer->istackbase is restored only on the return above, not on this end-of-input path - confirm intended */ |
4150 | 16.7k | return NULL; |
4151 | 17.0k | } |
4152 | | |
4153 | | |
4154 | | /** MARK: TY_(ParseNoFrames) |
4155 | | * Parses the `noframes` tag, routing its content into a real or inferred `body` element. |
4156 | | * |
4157 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4158 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4159 | | * This parser is also re-enterable: when called with noframes == NULL it pops its saved node, state and |
4160 | | * body_seen flag from that stack. Returns the node to dispatch next, or NULL when finished with this element. |
4161 | | */ |
4162 | | Node* TY_(ParseNoFrames)( TidyDocImpl* doc, Node *noframes, GetTokenMode mode ) |
4163 | 66.1k | { |
4164 | 66.1k | Lexer* lexer = doc->lexer; |
4165 | 66.1k | Node *node = NULL; |
4166 | 66.1k | Bool body_seen = no; |
4167 | 66.1k | DEBUG_LOG_COUNTERS; |
4168 | | 
4169 | 66.1k | enum parserState { |
4170 | 66.1k | STATE_INITIAL, /* This is the initial state for every parser. */ |
4171 | 66.1k | STATE_POST_NODEISBODY, /* To-do after re-entering after checks. */ |
4172 | 66.1k | STATE_COMPLETE, /* Done with the switch. */ |
4173 | 66.1k | } state = STATE_INITIAL; |
4174 | | 
4175 | | /* |
4176 | | If we're re-entering, then we need to setup from a previous state, |
4177 | | instead of starting fresh. We can pull what we need from the document's |
4178 | | stack. |
4179 | | */ |
4180 | 66.1k | if ( noframes == NULL ) |
4181 | 45.1k | { |
4182 | 45.1k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4183 | 45.1k | node = memory.reentry_node; /* Replaced by the next token when state is STATE_INITIAL; still used in STATE_POST_NODEISBODY. */ |
4184 | 45.1k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4185 | 45.1k | noframes = memory.original_node; |
4186 | 45.1k | state = memory.reentry_state; |
4187 | 45.1k | body_seen = memory.register_1; |
4188 | 45.1k | DEBUG_LOG_GET_OLD_MODE; |
4189 | 45.1k | mode = memory.mode; |
4190 | 45.1k | DEBUG_LOG_CHANGE_MODE; |
4191 | 45.1k | } |
4192 | 20.9k | else |
4193 | 20.9k | { |
4194 | 20.9k | DEBUG_LOG_ENTER_WITH_NODE(noframes); |
4195 | 20.9k | if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) /* first entry only: flag the use of noframes when the check level is 0 */ |
4196 | 20.9k | { |
4197 | 20.9k | doc->badAccess |= BA_USING_NOFRAMES; |
4198 | 20.9k | } |
4199 | 20.9k | } |
4200 | | 
4201 | 66.1k | mode = IgnoreWhitespace; /* overrides both the caller's mode and the mode restored on re-entry */ |
4202 | | 
4203 | 91.1k | while ( state != STATE_COMPLETE ) |
4204 | 75.9k | { |
4205 | 75.9k | if ( state == STATE_INITIAL ) /* a new token is fetched only in the initial state */ |
4206 | 73.5k | { |
4207 | 73.5k | node = TY_(GetToken)(doc, mode); |
4208 | 73.5k | DEBUG_LOG_GOT_TOKEN(node); |
4209 | 73.5k | } |
4210 | | 
4211 | 75.9k | switch ( state ) |
4212 | 75.9k | { |
4213 | 73.5k | case STATE_INITIAL: |
4214 | 73.5k | { |
4215 | 73.5k | if ( node == NULL ) /* out of tokens: leave the loop and report the missing end tag */ |
4216 | 15.2k | { |
4217 | 15.2k | state = STATE_COMPLETE; |
4218 | 15.2k | continue; |
4219 | 15.2k | } |
4220 | | 
4221 | 58.3k | if ( node->tag == noframes->tag && node->type == EndTag ) /* the element's own end tag: close it and finish */ |
4222 | 1.17k | { |
4223 | 1.17k | TY_(FreeNode)( doc, node); |
4224 | 1.17k | noframes->closed = yes; |
4225 | 1.17k | TrimSpaces(doc, noframes); |
4226 | 1.17k | DEBUG_LOG_EXIT; |
4227 | 1.17k | return NULL; |
4228 | 1.17k | } |
4229 | | 
4230 | 57.1k | if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) ) /* a frame or frameset tag ends this element either way */ |
4231 | 3.89k | { |
4232 | 3.89k | TrimSpaces(doc, noframes); |
4233 | 3.89k | if (node->type == EndTag) |
4234 | 64 | { |
4235 | 64 | TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED); |
4236 | 64 | TY_(FreeNode)( doc, node); /* Throw it away */ |
4237 | 64 | } |
4238 | 3.82k | else |
4239 | 3.82k | { |
4240 | 3.82k | TY_(Report)(doc, noframes, node, MISSING_ENDTAG_BEFORE); |
4241 | 3.82k | TY_(UngetToken)( doc ); |
4242 | 3.82k | } |
4243 | 3.89k | DEBUG_LOG_EXIT; |
4244 | 3.89k | return NULL; |
4245 | 3.89k | } |
4246 | | 
4247 | 53.3k | if ( nodeIsHTML(node) ) /* html tags are always dropped; only element tokens are reported */ |
4248 | 219 | { |
4249 | 219 | if (TY_(nodeIsElement)(node)) |
4250 | 85 | TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED); |
4251 | | 
4252 | 219 | TY_(FreeNode)( doc, node); |
4253 | 219 | continue; |
4254 | 219 | } |
4255 | | 
4256 | | /* deal with comments etc. */ |
4257 | 53.0k | if (InsertMisc(noframes, node)) |
4258 | 337 | continue; |
4259 | | 
4260 | 52.7k | if ( nodeIsBODY(node) && node->type == StartTag ) /* explicit <body>: hand it to the controller, resume in STATE_POST_NODEISBODY */ |
4261 | 2.45k | { |
4262 | 2.45k | TidyParserMemory memory = {0}; |
4263 | 2.45k | memory.identity = TY_(ParseNoFrames); |
4264 | 2.45k | memory.original_node = noframes; |
4265 | 2.45k | memory.reentry_node = node; |
4266 | 2.45k | memory.reentry_state = STATE_POST_NODEISBODY; |
4267 | 2.45k | memory.register_1 = lexer->seenEndBody; /* comes back as body_seen on re-entry */ |
4268 | 2.45k | memory.mode = IgnoreWhitespace; |
4269 | | 
4270 | 2.45k | TY_(InsertNodeAtEnd)(noframes, node); |
4271 | 2.45k | TY_(pushMemory)( doc, memory ); |
4272 | 2.45k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4273 | 2.45k | return node; |
4274 | 2.45k | } |
4275 | | 
4276 | | /* implicit body element inferred */ |
4277 | 50.2k | if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag)) |
4278 | 43.7k | { |
4279 | 43.7k | Node *body = TY_(FindBody)( doc ); |
4280 | 43.7k | if ( body || lexer->seenEndBody ) |
4281 | 8.81k | { |
4282 | 8.81k | if ( body == NULL ) /* an end of body was seen but no body can be found: drop the token */ |
4283 | 323 | { |
4284 | 323 | TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED); |
4285 | 323 | TY_(FreeNode)( doc, node); |
4286 | 323 | continue; |
4287 | 323 | } |
4288 | 8.49k | if ( TY_(nodeIsText)(node) ) /* text is put back and an inferred <p> is parsed in its place */ |
4289 | 2.35k | { |
4290 | 2.35k | TY_(UngetToken)( doc ); |
4291 | 2.35k | node = TY_(InferredTag)(doc, TidyTag_P); |
4292 | 2.35k | TY_(Report)(doc, noframes, node, CONTENT_AFTER_BODY ); |
4293 | 2.35k | } |
4294 | 8.49k | TY_(InsertNodeAtEnd)( body, node ); |
4295 | 8.49k | } |
4296 | 34.8k | else |
4297 | 34.8k | { |
4298 | 34.8k | TY_(UngetToken)( doc ); /* no body yet: put the token back and parse an inferred <body> instead */ |
4299 | 34.8k | node = TY_(InferredTag)(doc, TidyTag_BODY); |
4300 | 34.8k | if ( cfgBool(doc, TidyXmlOut) ) |
4301 | 34.8k | TY_(Report)(doc, noframes, node, INSERTING_TAG); |
4302 | 34.8k | TY_(InsertNodeAtEnd)( noframes, node ); |
4303 | 34.8k | } |
4304 | | 
4305 | 43.3k | { |
4306 | 43.3k | TidyParserMemory memory = {0}; /* register_1 stays zero here, so body_seen is restored as zero */ |
4307 | 43.3k | memory.identity = TY_(ParseNoFrames); |
4308 | 43.3k | memory.original_node = noframes; |
4309 | 43.3k | memory.reentry_node = node; |
4310 | 43.3k | memory.mode = IgnoreWhitespace; /*MixedContent*/ |
4311 | 43.3k | memory.reentry_state = STATE_INITIAL; |
4312 | 43.3k | TY_(pushMemory)( doc, memory ); |
4313 | 43.3k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4314 | 43.3k | return node; |
4315 | 43.7k | } |
4316 | 43.7k | } |
4317 | | 
4318 | | /* discard unexpected end tags */ |
4319 | 6.58k | TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED); |
4320 | 6.58k | TY_(FreeNode)( doc, node); |
4321 | 6.58k | } break; |
4322 | | 
4323 | | 
4324 | 2.33k | case STATE_POST_NODEISBODY: |
4325 | 2.33k | { |
4326 | | /* fix for bug http://tidy.sf.net/bug/887259 */ |
4327 | 2.33k | if (body_seen && TY_(FindBody)(doc) != node) /* node is the <body> returned to the controller before re-entry */ |
4328 | 1.09k | { |
4329 | 1.09k | TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no); |
4330 | 1.09k | MoveNodeToBody(doc, node); |
4331 | 1.09k | } |
4332 | 2.33k | state = STATE_INITIAL; |
4333 | 2.33k | continue; |
4334 | | 
4335 | 50.2k | } break; /* not reached: the continue above always runs first */ |
4336 | | 
4337 | | 
4338 | 0 | default: |
4339 | 0 | break; |
4340 | 75.9k | } /* switch */ |
4341 | 75.9k | } /* while */ |
4342 | | 
4343 | 15.2k | TY_(Report)(doc, noframes, node, MISSING_ENDTAG_FOR); /* reached only after the node == NULL case above set STATE_COMPLETE */ |
4344 | 15.2k | DEBUG_LOG_EXIT; |
4345 | 15.2k | return NULL; |
4346 | 66.1k | } |
4347 | | |
4348 | | |
4349 | | /** MARK: TY_(ParseOptGroup) |
4350 | | * Parses the `optgroup` tag. |
4351 | | * |
4352 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4353 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4354 | | * This parser is also re-enterable, so that post-processing can occur after |
4355 | | * such dispatching. |
4356 | | */ |
4357 | | Node* TY_(ParseOptGroup)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) ) |
4358 | 40.7k | { |
4359 | 40.7k | Lexer* lexer = doc->lexer; |
4360 | 40.7k | Node *node; |
4361 | 40.7k | DEBUG_LOG_COUNTERS; |
4362 | | |
4363 | 40.7k | if ( field == NULL ) |
4364 | 20.4k | { |
4365 | 20.4k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4366 | 20.4k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
4367 | 20.4k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4368 | 20.4k | field = memory.original_node; |
4369 | 20.4k | DEBUG_LOG_GET_OLD_MODE; |
4370 | 20.4k | mode = memory.mode; |
4371 | 20.4k | DEBUG_LOG_CHANGE_MODE; |
4372 | 20.4k | } |
4373 | 20.3k | else |
4374 | 20.3k | { |
4375 | 20.3k | DEBUG_LOG_ENTER_WITH_NODE(field); |
4376 | 20.3k | } |
4377 | | |
4378 | 40.7k | lexer->insert = NULL; /* defer implicit inline start tags */ |
4379 | | |
4380 | 59.2k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
4381 | 39.8k | { |
4382 | 39.8k | if (node->tag == field->tag && node->type == EndTag) |
4383 | 997 | { |
4384 | 997 | TY_(FreeNode)( doc, node); |
4385 | 997 | field->closed = yes; |
4386 | 997 | TrimSpaces(doc, field); |
4387 | 997 | DEBUG_LOG_EXIT; |
4388 | 997 | return NULL; |
4389 | 997 | } |
4390 | | |
4391 | | /* deal with comments etc. */ |
4392 | 38.8k | if (InsertMisc(field, node)) |
4393 | 11.1k | continue; |
4394 | | |
4395 | 27.7k | if ( node->type == StartTag && |
4396 | 23.7k | (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) ) |
4397 | 20.4k | { |
4398 | 20.4k | TidyParserMemory memory = {0}; |
4399 | | |
4400 | 20.4k | if ( nodeIsOPTGROUP(node) ) |
4401 | 19.7k | TY_(Report)(doc, field, node, CANT_BE_NESTED); |
4402 | | |
4403 | 20.4k | TY_(InsertNodeAtEnd)(field, node); |
4404 | | |
4405 | 20.4k | memory.identity = TY_(ParseOptGroup); |
4406 | 20.4k | memory.original_node = field; |
4407 | 20.4k | memory.reentry_node = node; |
4408 | 20.4k | TY_(pushMemory)( doc, memory ); |
4409 | 20.4k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4410 | 20.4k | return node; |
4411 | 20.4k | } |
4412 | | |
4413 | | /* discard unexpected tags */ |
4414 | 7.28k | TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED ); |
4415 | 7.28k | TY_(FreeNode)( doc, node); |
4416 | 7.28k | } |
4417 | 19.3k | DEBUG_LOG_EXIT; |
4418 | 19.3k | return NULL; |
4419 | 40.7k | } |
4420 | | |
4421 | | |
4422 | | /** MARK: TY_(ParsePre) |
4423 | | * Parses the `pre` tag. |
4424 | | * |
4425 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4426 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4427 | | * This parser is also re-enterable, so that post-processing can occur after |
4428 | | * such dispatching. |
4429 | | */ |
4430 | | Node* TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) |
4431 | 133k | { |
4432 | 133k | Node *node = NULL; |
4433 | 133k | DEBUG_LOG_COUNTERS; |
4434 | | |
4435 | 133k | enum parserState { |
4436 | 133k | STATE_INITIAL, /* This is the initial state for every parser. */ |
4437 | 133k | STATE_RENTRY_ACTION, /* To-do after re-entering after checks. */ |
4438 | 133k | STATE_COMPLETE, /* Done with the switch. */ |
4439 | 133k | } state = STATE_INITIAL; |
4440 | | |
4441 | | |
4442 | 133k | if ( pre == NULL ) |
4443 | 66.9k | { |
4444 | 66.9k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4445 | 66.9k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
4446 | 66.9k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4447 | 66.9k | pre = memory.original_node; |
4448 | 66.9k | state = memory.reentry_state; |
4449 | 66.9k | DEBUG_LOG_GET_OLD_MODE; |
4450 | 66.9k | mode = memory.mode; |
4451 | 66.9k | DEBUG_LOG_CHANGE_MODE; |
4452 | 66.9k | } |
4453 | 66.0k | else |
4454 | 66.0k | { |
4455 | 66.0k | DEBUG_LOG_ENTER_WITH_NODE(pre); |
4456 | 66.0k | if (pre->tag->model & CM_EMPTY) |
4457 | 0 | { |
4458 | 0 | DEBUG_LOG_EXIT; |
4459 | 0 | return NULL; |
4460 | 0 | } |
4461 | 66.0k | } |
4462 | | |
4463 | 133k | TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */ |
4464 | | |
4465 | 214k | while ( state != STATE_COMPLETE ) |
4466 | 210k | { |
4467 | 210k | if ( state == STATE_INITIAL ) |
4468 | 148k | node = TY_(GetToken)(doc, Preformatted); |
4469 | | |
4470 | 210k | switch ( state ) |
4471 | 210k | { |
4472 | 148k | case STATE_INITIAL: |
4473 | 148k | { |
4474 | 148k | if ( node == NULL ) |
4475 | 4.15k | { |
4476 | 4.15k | state = STATE_COMPLETE; |
4477 | 4.15k | continue; |
4478 | 4.15k | } |
4479 | | |
4480 | 143k | if ( node->type == EndTag && |
4481 | 12.1k | (node->tag == pre->tag || DescendantOf(pre, TagId(node))) ) |
4482 | 1.69k | { |
4483 | 1.69k | if (nodeIsBODY(node) || nodeIsHTML(node)) |
4484 | 868 | { |
4485 | 868 | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4486 | 868 | TY_(FreeNode)(doc, node); |
4487 | 868 | continue; |
4488 | 868 | } |
4489 | 831 | if (node->tag == pre->tag) |
4490 | 296 | { |
4491 | 296 | TY_(FreeNode)(doc, node); |
4492 | 296 | } |
4493 | 535 | else |
4494 | 535 | { |
4495 | 535 | TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE ); |
4496 | 535 | TY_(UngetToken)( doc ); |
4497 | 535 | } |
4498 | 831 | pre->closed = yes; |
4499 | 831 | TrimSpaces(doc, pre); |
4500 | 831 | DEBUG_LOG_EXIT; |
4501 | 831 | return NULL; |
4502 | 1.69k | } |
4503 | | |
4504 | 142k | if (TY_(nodeIsText)(node)) |
4505 | 3.20k | { |
4506 | 3.20k | TY_(InsertNodeAtEnd)(pre, node); |
4507 | 3.20k | continue; |
4508 | 3.20k | } |
4509 | | |
4510 | | /* deal with comments etc. */ |
4511 | 139k | if (InsertMisc(pre, node)) |
4512 | 135 | continue; |
4513 | | |
4514 | 138k | if (node->tag == NULL) |
4515 | 4.10k | { |
4516 | 4.10k | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4517 | 4.10k | TY_(FreeNode)(doc, node); |
4518 | 4.10k | continue; |
4519 | 4.10k | } |
4520 | | |
4521 | | /* strip unexpected tags */ |
4522 | 134k | if ( !PreContent(doc, node) ) |
4523 | 123k | { |
4524 | | /* fix for http://tidy.sf.net/bug/772205 */ |
4525 | 123k | if (node->type == EndTag) |
4526 | 9.25k | { |
4527 | | /* http://tidy.sf.net/issue/1590220 */ |
4528 | 9.25k | if ( doc->lexer->exiled |
4529 | 9.16k | && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) |
4530 | 8.90k | { |
4531 | 8.90k | TY_(UngetToken)(doc); |
4532 | 8.90k | TrimSpaces(doc, pre); |
4533 | 8.90k | DEBUG_LOG_EXIT; |
4534 | 8.90k | return NULL; |
4535 | 8.90k | } |
4536 | | |
4537 | 355 | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4538 | 355 | TY_(FreeNode)(doc, node); |
4539 | 355 | continue; |
4540 | 9.25k | } |
4541 | | /* http://tidy.sf.net/issue/1590220 */ |
4542 | 114k | else if (TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) |
4543 | 110k | || nodeIsTABLE(node) ) |
4544 | 48.6k | { |
4545 | 48.6k | if (!doc->lexer->exiled) |
4546 | | /* No missing close warning if exiled. */ |
4547 | 1.36k | TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE); |
4548 | | |
4549 | 48.6k | TY_(UngetToken)(doc); |
4550 | 48.6k | DEBUG_LOG_EXIT; |
4551 | 48.6k | return NULL; |
4552 | 48.6k | } |
4553 | | |
4554 | | /* |
4555 | | This is basically what Tidy 04 August 2000 did and far more accurate |
4556 | | with respect to browser behaivour than the code commented out above. |
4557 | | Tidy could try to propagate the <pre> into each disallowed child where |
4558 | | <pre> is allowed in order to replicate some browsers behaivour, but |
4559 | | there are a lot of exceptions, e.g. Internet Explorer does not propagate |
4560 | | <pre> into table cells while Mozilla does. Opera 6 never propagates |
4561 | | <pre> into blocklevel elements while Opera 7 behaves much like Mozilla. |
4562 | | |
4563 | | Tidy behaves thus mostly like Opera 6 except for nested <pre> elements |
4564 | | which are handled like Mozilla takes them (Opera6 closes all <pre> after |
4565 | | the first </pre>). |
4566 | | |
4567 | | There are similar issues like replacing <p> in <pre> with <br>, for |
4568 | | example |
4569 | | |
4570 | | <pre>...<p>...</pre> (Input) |
4571 | | <pre>...<br>...</pre> (Tidy) |
4572 | | <pre>...<br>...</pre> (Opera 7 and Internet Explorer) |
4573 | | <pre>...<br><br>...</pre> (Opera 6 and Mozilla) |
4574 | | |
4575 | | <pre>...<p>...</p>...</pre> (Input) |
4576 | | <pre>...<br>......</pre> (Tidy, BUG!) |
4577 | | <pre>...<br>...<br>...</pre> (Internet Explorer) |
4578 | | <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6) |
4579 | | <pre>...<br>...<br><br>...</pre> (Opera 7) |
4580 | | |
4581 | | or something similar, they could also be closing the <pre> and propagate |
4582 | | the <pre> into the newly opened <p>. |
4583 | | |
4584 | | Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are |
4585 | | disallowed in <pre>, Tidy neither detects this nor does it perform any |
4586 | | cleanup operation. Tidy should at least issue a warning if it encounters |
4587 | | such constructs. |
4588 | | |
4589 | | Todo: discarding </p> is abviously a bug, it should be replaced by <br>. |
4590 | | */ |
4591 | 65.3k | TY_(InsertNodeAfterElement)(pre, node); |
4592 | 65.3k | TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE); |
4593 | | |
4594 | 65.3k | { |
4595 | 65.3k | TidyParserMemory memory = {0}; |
4596 | 65.3k | memory.identity = TY_(ParsePre); |
4597 | 65.3k | memory.original_node = pre; |
4598 | 65.3k | memory.reentry_node = node; |
4599 | 65.3k | memory.reentry_state = STATE_RENTRY_ACTION; |
4600 | 65.3k | TY_(pushMemory)( doc, memory ); |
4601 | 65.3k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4602 | 65.3k | return node; |
4603 | 123k | } |
4604 | 123k | } |
4605 | | |
4606 | 11.5k | if ( nodeIsP(node) ) |
4607 | 6.17k | { |
4608 | 6.17k | if (node->type == StartTag) |
4609 | 5.83k | { |
4610 | 5.83k | TY_(Report)(doc, pre, node, USING_BR_INPLACE_OF); |
4611 | | |
4612 | | /* trim white space before <p> in <pre>*/ |
4613 | 5.83k | TrimSpaces(doc, pre); |
4614 | | |
4615 | | /* coerce both <p> and </p> to <br> */ |
4616 | 5.83k | TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); |
4617 | 5.83k | TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */ |
4618 | 5.83k | TY_(InsertNodeAtEnd)( pre, node ); |
4619 | 5.83k | } |
4620 | 343 | else |
4621 | 343 | { |
4622 | 343 | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4623 | 343 | TY_(FreeNode)( doc, node); |
4624 | 343 | } |
4625 | 6.17k | continue; |
4626 | 6.17k | } |
4627 | | |
4628 | 5.36k | if ( TY_(nodeIsElement)(node) ) |
4629 | 5.12k | { |
4630 | | /* trim white space before <br> */ |
4631 | 5.12k | if ( nodeIsBR(node) ) |
4632 | 131 | TrimSpaces(doc, pre); |
4633 | | |
4634 | 5.12k | TY_(InsertNodeAtEnd)(pre, node); |
4635 | | |
4636 | 5.12k | { |
4637 | 5.12k | TidyParserMemory memory = {0}; |
4638 | 5.12k | memory.identity = TY_(ParsePre); |
4639 | 5.12k | memory.original_node = pre; |
4640 | 5.12k | memory.reentry_node = node; |
4641 | 5.12k | memory.reentry_state = STATE_INITIAL; |
4642 | 5.12k | TY_(pushMemory)( doc, memory ); |
4643 | 5.12k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4644 | 5.12k | return node; |
4645 | 5.12k | } |
4646 | 5.12k | } |
4647 | | |
4648 | | /* discard unexpected tags */ |
4649 | 242 | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4650 | 242 | TY_(FreeNode)( doc, node); |
4651 | 242 | } break; |
4652 | | |
4653 | 61.9k | case STATE_RENTRY_ACTION: |
4654 | 61.9k | { |
4655 | 61.9k | Node* newnode = TY_(InferredTag)(doc, TidyTag_PRE); |
4656 | 61.9k | TY_(Report)(doc, pre, newnode, INSERTING_TAG); |
4657 | 61.9k | pre = newnode; |
4658 | 61.9k | TY_(InsertNodeAfterElement)(node, pre); |
4659 | 61.9k | state = STATE_INITIAL; |
4660 | 61.9k | continue; |
4661 | 5.36k | } break; |
4662 | | |
4663 | 0 | default: |
4664 | 0 | break; |
4665 | | |
4666 | 210k | } /* switch */ |
4667 | 210k | } /* while */ |
4668 | | |
4669 | 4.15k | TY_(Report)(doc, pre, node, MISSING_ENDTAG_FOR); |
4670 | 4.15k | DEBUG_LOG_EXIT; |
4671 | 4.15k | return NULL; |
4672 | 133k | } |
4673 | | |
4674 | | |
4675 | | /** MARK: TY_(ParseRow) |
4676 | | * Parses the `row` tag. |
4677 | | * |
4678 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4679 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4680 | | * This parser is also re-enterable, so that post-processing can occur after |
4681 | | * such dispatching. |
4682 | | */ |
4683 | | Node* TY_(ParseRow)( TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode) ) |
4684 | 36.4k | { |
4685 | 36.4k | Lexer* lexer = doc->lexer; |
4686 | 36.4k | Node *node = NULL; |
4687 | 36.4k | Bool exclude_state = no; |
4688 | 36.4k | DEBUG_LOG_COUNTERS; |
4689 | | |
4690 | 36.4k | enum parserState { |
4691 | 36.4k | STATE_INITIAL, /* This is the initial state for every parser. */ |
4692 | 36.4k | STATE_POST_NOT_ENDTAG, /* To-do after re-entering after !EndTag checks. */ |
4693 | 36.4k | STATE_POST_TD_TH, /* To-do after re-entering after TD/TH checks. */ |
4694 | 36.4k | STATE_COMPLETE, /* Done with the switch. */ |
4695 | 36.4k | } state = STATE_INITIAL; |
4696 | | |
4697 | 36.4k | if ( row == NULL ) |
4698 | 16.2k | { |
4699 | 16.2k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4700 | 16.2k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
4701 | 16.2k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4702 | 16.2k | row = memory.original_node; |
4703 | 16.2k | state = memory.reentry_state; |
4704 | 16.2k | exclude_state = memory.register_1; |
4705 | 16.2k | DEBUG_LOG_GET_OLD_MODE; |
4706 | 16.2k | mode = memory.mode; |
4707 | 16.2k | DEBUG_LOG_CHANGE_MODE; |
4708 | 16.2k | } |
4709 | 20.1k | else |
4710 | 20.1k | { |
4711 | 20.1k | DEBUG_LOG_ENTER_WITH_NODE(row); |
4712 | | |
4713 | 20.1k | if (row->tag->model & CM_EMPTY) |
4714 | 0 | return NULL; |
4715 | 20.1k | } |
4716 | | |
4717 | 130k | while ( state != STATE_COMPLETE ) |
4718 | 120k | { |
4719 | 120k | if ( state == STATE_INITIAL ) |
4720 | 104k | { |
4721 | 104k | node = TY_(GetToken)( doc, IgnoreWhitespace ); |
4722 | 104k | DEBUG_LOG_GOT_TOKEN(node); |
4723 | 104k | } |
4724 | | |
4725 | 120k | switch (state) |
4726 | 120k | { |
4727 | 104k | case STATE_INITIAL: |
4728 | 104k | { |
4729 | 104k | if ( node == NULL) |
4730 | 9.41k | { |
4731 | 9.41k | state = STATE_COMPLETE; |
4732 | 9.41k | continue; |
4733 | 9.41k | } |
4734 | | |
4735 | 95.1k | if (node->tag == row->tag) |
4736 | 9.16k | { |
4737 | 9.16k | if (node->type == EndTag) |
4738 | 446 | { |
4739 | 446 | TY_(FreeNode)( doc, node); |
4740 | 446 | row->closed = yes; |
4741 | 446 | FixEmptyRow( doc, row); |
4742 | 446 | DEBUG_LOG_EXIT; |
4743 | 446 | return NULL; |
4744 | 446 | } |
4745 | | |
4746 | | /* New row start implies end of current row */ |
4747 | 8.71k | TY_(UngetToken)( doc ); |
4748 | 8.71k | FixEmptyRow( doc, row); |
4749 | 8.71k | DEBUG_LOG_EXIT; |
4750 | 8.71k | return NULL; |
4751 | 9.16k | } |
4752 | | |
4753 | | /* |
4754 | | if this is the end tag for an ancestor element |
4755 | | then infer end tag for this element |
4756 | | */ |
4757 | 85.9k | if ( node->type == EndTag ) |
4758 | 8.82k | { |
4759 | 8.82k | if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node)) |
4760 | 4.55k | && DescendantOf(row, TagId(node)) ) |
4761 | 244 | { |
4762 | 244 | TY_(UngetToken)( doc ); |
4763 | 244 | DEBUG_LOG_EXIT; |
4764 | 244 | return NULL; |
4765 | 244 | } |
4766 | | |
4767 | 8.58k | if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) ) |
4768 | 1.70k | { |
4769 | 1.70k | if ( nodeIsFORM(node) ) |
4770 | 431 | BadForm( doc ); |
4771 | | |
4772 | 1.70k | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4773 | 1.70k | TY_(FreeNode)( doc, node); |
4774 | 1.70k | continue; |
4775 | 1.70k | } |
4776 | | |
4777 | 6.88k | if ( nodeIsTD(node) || nodeIsTH(node) ) |
4778 | 583 | { |
4779 | 583 | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4780 | 583 | TY_(FreeNode)( doc, node); |
4781 | 583 | continue; |
4782 | 583 | } |
4783 | 6.88k | } |
4784 | | |
4785 | | /* deal with comments etc. */ |
4786 | 83.4k | if (InsertMisc(row, node)) |
4787 | 15.2k | continue; |
4788 | | |
4789 | | /* discard unknown tags */ |
4790 | 68.1k | if (node->tag == NULL && node->type != TextNode) |
4791 | 10.8k | { |
4792 | 10.8k | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4793 | 10.8k | TY_(FreeNode)( doc, node); |
4794 | 10.8k | continue; |
4795 | 10.8k | } |
4796 | | |
4797 | | /* discard unexpected <table> element */ |
4798 | 57.3k | if ( nodeIsTABLE(node) ) |
4799 | 327 | { |
4800 | 327 | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4801 | 327 | TY_(FreeNode)( doc, node); |
4802 | 327 | continue; |
4803 | 327 | } |
4804 | | |
4805 | | /* THEAD, TFOOT or TBODY */ |
4806 | 57.0k | if ( TY_(nodeHasCM)(node, CM_ROWGRP) ) |
4807 | 1.09k | { |
4808 | 1.09k | TY_(UngetToken)( doc ); |
4809 | 1.09k | DEBUG_LOG_EXIT; |
4810 | 1.09k | return NULL; |
4811 | 1.09k | } |
4812 | | |
4813 | 55.9k | if (node->type == EndTag) |
4814 | 4.17k | { |
4815 | 4.17k | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4816 | 4.17k | TY_(FreeNode)( doc, node); |
4817 | 4.17k | continue; |
4818 | 4.17k | } |
4819 | | |
4820 | | /* |
4821 | | if text or inline or block move before table |
4822 | | if head content move to head |
4823 | | */ |
4824 | | |
4825 | 51.7k | if (node->type != EndTag) |
4826 | 51.7k | { |
4827 | 51.7k | if ( nodeIsFORM(node) ) |
4828 | 730 | { |
4829 | 730 | TY_(UngetToken)( doc ); |
4830 | 730 | node = TY_(InferredTag)(doc, TidyTag_TD); |
4831 | 730 | TY_(Report)(doc, row, node, MISSING_STARTTAG); |
4832 | 730 | } |
4833 | 51.0k | else if ( TY_(nodeIsText)(node) |
4834 | 28.1k | || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) ) |
4835 | 28.6k | { |
4836 | 28.6k | MoveBeforeTable( doc, row, node ); |
4837 | 28.6k | TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN); |
4838 | 28.6k | lexer->exiled = yes; |
4839 | 28.6k | exclude_state = lexer->excludeBlocks; |
4840 | 28.6k | lexer->excludeBlocks = no; |
4841 | | |
4842 | 28.6k | if (node->type != TextNode) |
4843 | 5.79k | { |
4844 | 5.79k | TidyParserMemory memory = {0}; |
4845 | 5.79k | memory.identity = TY_(ParseRow); |
4846 | 5.79k | memory.original_node = row; |
4847 | 5.79k | memory.reentry_node = node; |
4848 | 5.79k | memory.reentry_state = STATE_POST_NOT_ENDTAG; |
4849 | 5.79k | memory.register_1 = exclude_state; |
4850 | 5.79k | TY_(pushMemory)( doc, memory ); |
4851 | 5.79k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4852 | 5.79k | return node; |
4853 | 5.79k | } |
4854 | | |
4855 | 22.8k | lexer->exiled = no; |
4856 | 22.8k | lexer->excludeBlocks = exclude_state; |
4857 | 22.8k | continue; |
4858 | 28.6k | } |
4859 | 22.3k | else if (node->tag->model & CM_HEAD) |
4860 | 670 | { |
4861 | 670 | TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN); |
4862 | 670 | MoveToHead( doc, row, node); |
4863 | 670 | continue; |
4864 | 670 | } |
4865 | 51.7k | } |
4866 | | |
4867 | 22.4k | if ( !(nodeIsTD(node) || nodeIsTH(node)) ) |
4868 | 11.7k | { |
4869 | 11.7k | TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN); |
4870 | 11.7k | TY_(FreeNode)( doc, node); |
4871 | 11.7k | continue; |
4872 | 11.7k | } |
4873 | | |
4874 | | /* node should be <TD> or <TH> */ |
4875 | 10.7k | TY_(InsertNodeAtEnd)(row, node); |
4876 | 10.7k | exclude_state = lexer->excludeBlocks; |
4877 | 10.7k | lexer->excludeBlocks = no; |
4878 | 10.7k | { |
4879 | 10.7k | TidyParserMemory memory = {0}; |
4880 | 10.7k | memory.identity = TY_(ParseRow); |
4881 | 10.7k | memory.original_node = row; |
4882 | 10.7k | memory.reentry_node = node; |
4883 | 10.7k | memory.reentry_state = STATE_POST_TD_TH; |
4884 | 10.7k | memory.register_1 = exclude_state; |
4885 | 10.7k | TY_(pushMemory)( doc, memory ); |
4886 | 10.7k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4887 | 10.7k | return node; |
4888 | 22.4k | } |
4889 | 22.4k | } break; |
4890 | | |
4891 | | |
4892 | 5.59k | case STATE_POST_NOT_ENDTAG: |
4893 | 5.59k | { |
4894 | 5.59k | lexer->exiled = no; |
4895 | 5.59k | lexer->excludeBlocks = exclude_state; /* capture this in stack. */ |
4896 | 5.59k | state = STATE_INITIAL; |
4897 | 5.59k | continue; |
4898 | 22.4k | } break; |
4899 | | |
4900 | | |
4901 | 10.6k | case STATE_POST_TD_TH: |
4902 | 10.6k | { |
4903 | 10.6k | lexer->excludeBlocks = exclude_state; /* capture this in stack. */ |
4904 | | |
4905 | | /* pop inline stack */ |
4906 | 16.0k | while ( lexer->istacksize > lexer->istackbase ) |
4907 | 5.39k | TY_(PopInline)( doc, NULL ); |
4908 | | |
4909 | 10.6k | state = STATE_INITIAL; |
4910 | 10.6k | continue; |
4911 | 22.4k | } break; |
4912 | | |
4913 | | |
4914 | 0 | default: |
4915 | 0 | break; |
4916 | | |
4917 | 120k | } /* switch */ |
4918 | 120k | } /* while */ |
4919 | 9.41k | DEBUG_LOG_EXIT; |
4920 | 9.41k | return NULL; |
4921 | 36.4k | } |
4922 | | |
4923 | | |
4924 | | /** MARK: TY_(ParseRowGroup) |
4925 | | * Parses the `rowgroup` tag. |
4926 | | * |
4927 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4928 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4929 | | * This parser is also re-enterable, so that post-processing can occur after |
4930 | | * such dispatching. |
4931 | | */ |
4932 | | Node* TY_(ParseRowGroup)( TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode) ) |
4933 | 26.7k | { |
4934 | 26.7k | Lexer* lexer = doc->lexer; |
4935 | 26.7k | Node *node = NULL; |
4936 | 26.7k | Node *parent = NULL; |
4937 | 26.7k | DEBUG_LOG_COUNTERS; |
4938 | | |
4939 | 26.7k | enum parserState { |
4940 | 26.7k | STATE_INITIAL, /* This is the initial state for every parser. */ |
4941 | 26.7k | STATE_POST_NOT_TEXTNODE, /* To-do after re-entering after checks. */ |
4942 | 26.7k | STATE_COMPLETE, /* Done with the switch. */ |
4943 | 26.7k | } state = STATE_INITIAL; |
4944 | | |
4945 | 26.7k | if ( rowgroup == NULL ) |
4946 | 13.3k | { |
4947 | 13.3k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4948 | 13.3k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
4949 | 13.3k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4950 | 13.3k | rowgroup = memory.original_node; |
4951 | 13.3k | state = memory.reentry_state; |
4952 | 13.3k | DEBUG_LOG_GET_OLD_MODE; |
4953 | 13.3k | mode = memory.mode; |
4954 | 13.3k | DEBUG_LOG_CHANGE_MODE; |
4955 | 13.3k | } |
4956 | 13.3k | else |
4957 | 13.3k | { |
4958 | 13.3k | DEBUG_LOG_ENTER_WITH_NODE(rowgroup); |
4959 | 13.3k | if (rowgroup->tag->model & CM_EMPTY) |
4960 | 0 | { |
4961 | 0 | DEBUG_LOG_EXIT; |
4962 | 0 | return NULL; |
4963 | 0 | } |
4964 | 13.3k | } |
4965 | | |
4966 | 53.6k | while ( state != STATE_COMPLETE ) |
4967 | 47.3k | { |
4968 | 47.3k | if ( state == STATE_INITIAL ) |
4969 | 33.0k | node = TY_(GetToken)(doc, IgnoreWhitespace); |
4970 | | |
4971 | 47.3k | switch (state) |
4972 | 47.3k | { |
4973 | 33.0k | case STATE_INITIAL: |
4974 | 33.0k | { |
4975 | 33.0k | TidyParserMemory memory = {0}; |
4976 | | |
4977 | 33.0k | if (node == NULL) |
4978 | 6.26k | { |
4979 | 6.26k | state = STATE_COMPLETE; |
4980 | 6.26k | continue; |
4981 | 6.26k | } |
4982 | | |
4983 | 26.8k | if (node->tag == rowgroup->tag) |
4984 | 5.00k | { |
4985 | 5.00k | if (node->type == EndTag) |
4986 | 82 | { |
4987 | 82 | rowgroup->closed = yes; |
4988 | 82 | TY_(FreeNode)( doc, node); |
4989 | 82 | DEBUG_LOG_EXIT; |
4990 | 82 | return NULL; |
4991 | 82 | } |
4992 | | |
4993 | 4.92k | TY_(UngetToken)( doc ); |
4994 | 4.92k | DEBUG_LOG_EXIT; |
4995 | 4.92k | return NULL; |
4996 | 5.00k | } |
4997 | | |
4998 | | /* if </table> infer end tag */ |
4999 | 21.7k | if ( nodeIsTABLE(node) && node->type == EndTag ) |
5000 | 115 | { |
5001 | 115 | TY_(UngetToken)( doc ); |
5002 | 115 | DEBUG_LOG_EXIT; |
5003 | 115 | return NULL; |
5004 | 115 | } |
5005 | | |
5006 | | /* deal with comments etc. */ |
5007 | 21.6k | if (InsertMisc(rowgroup, node)) |
5008 | 490 | continue; |
5009 | | |
5010 | | /* discard unknown tags */ |
5011 | 21.1k | if (node->tag == NULL && node->type != TextNode) |
5012 | 1.29k | { |
5013 | 1.29k | TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED); |
5014 | 1.29k | TY_(FreeNode)( doc, node); |
5015 | 1.29k | continue; |
5016 | 1.29k | } |
5017 | | |
5018 | | /* |
5019 | | if TD or TH then infer <TR> |
5020 | | if text or inline or block move before table |
5021 | | if head content move to head |
5022 | | */ |
5023 | | |
5024 | 19.9k | if (node->type != EndTag) |
5025 | 17.6k | { |
5026 | 17.6k | if ( nodeIsTD(node) || nodeIsTH(node) ) |
5027 | 848 | { |
5028 | 848 | TY_(UngetToken)( doc ); |
5029 | 848 | node = TY_(InferredTag)(doc, TidyTag_TR); |
5030 | 848 | TY_(Report)(doc, rowgroup, node, MISSING_STARTTAG); |
5031 | 848 | } |
5032 | 16.8k | else if ( TY_(nodeIsText)(node) |
5033 | 14.1k | || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) ) |
5034 | 14.3k | { |
5035 | 14.3k | MoveBeforeTable( doc, rowgroup, node ); |
5036 | 14.3k | TY_(Report)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN); |
5037 | 14.3k | lexer->exiled = yes; |
5038 | | |
5039 | 14.3k | if (node->type != TextNode) |
5040 | 11.6k | { |
5041 | 11.6k | memory.identity = TY_(ParseRowGroup); |
5042 | 11.6k | memory.original_node = rowgroup; |
5043 | 11.6k | memory.reentry_node = node; |
5044 | 11.6k | memory.reentry_state = STATE_POST_NOT_TEXTNODE; |
5045 | 11.6k | TY_(pushMemory)( doc, memory ); |
5046 | 11.6k | DEBUG_LOG_EXIT_WITH_NODE(node); |
5047 | 11.6k | return node; |
5048 | 11.6k | } |
5049 | | |
5050 | 2.64k | state = STATE_POST_NOT_TEXTNODE; |
5051 | 2.64k | continue; |
5052 | 14.3k | } |
5053 | 2.47k | else if (node->tag->model & CM_HEAD) |
5054 | 78 | { |
5055 | 78 | TY_(Report)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN); |
5056 | 78 | MoveToHead(doc, rowgroup, node); |
5057 | 78 | continue; |
5058 | 78 | } |
5059 | 17.6k | } |
5060 | | |
5061 | | /* |
5062 | | if this is the end tag for ancestor element |
5063 | | then infer end tag for this element |
5064 | | */ |
5065 | 5.49k | if (node->type == EndTag) |
5066 | 2.24k | { |
5067 | 2.24k | if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) ) |
5068 | 558 | { |
5069 | 558 | if ( nodeIsFORM(node) ) |
5070 | 209 | BadForm( doc ); |
5071 | | |
5072 | 558 | TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED); |
5073 | 558 | TY_(FreeNode)( doc, node); |
5074 | 558 | continue; |
5075 | 558 | } |
5076 | | |
5077 | 1.69k | if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) ) |
5078 | 897 | { |
5079 | 897 | TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED); |
5080 | 897 | TY_(FreeNode)( doc, node); |
5081 | 897 | continue; |
5082 | 897 | } |
5083 | | |
5084 | 793 | for ( parent = rowgroup->parent; |
5085 | 9.73k | parent != NULL; |
5086 | 8.94k | parent = parent->parent ) |
5087 | 9.33k | { |
5088 | 9.33k | if (node->tag == parent->tag) |
5089 | 389 | { |
5090 | 389 | TY_(UngetToken)( doc ); |
5091 | 389 | DEBUG_LOG_EXIT; |
5092 | 389 | return NULL; |
5093 | 389 | } |
5094 | 9.33k | } |
5095 | 793 | } |
5096 | | |
5097 | | /* |
5098 | | if THEAD, TFOOT or TBODY then implied end tag |
5099 | | |
5100 | | */ |
5101 | 3.65k | if (node->tag->model & CM_ROWGRP) |
5102 | 1.68k | { |
5103 | 1.68k | if (node->type != EndTag) |
5104 | 1.57k | { |
5105 | 1.57k | TY_(UngetToken)( doc ); |
5106 | 1.57k | DEBUG_LOG_EXIT; |
5107 | 1.57k | return NULL; |
5108 | 1.57k | } |
5109 | 1.68k | } |
5110 | | |
5111 | 2.08k | if (node->type == EndTag) |
5112 | 404 | { |
5113 | 404 | TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED); |
5114 | 404 | TY_(FreeNode)( doc, node); |
5115 | 404 | continue; |
5116 | 404 | } |
5117 | | |
5118 | 1.67k | if ( !nodeIsTR(node) ) |
5119 | 434 | { |
5120 | 434 | node = TY_(InferredTag)(doc, TidyTag_TR); |
5121 | 434 | TY_(Report)(doc, rowgroup, node, MISSING_STARTTAG); |
5122 | 434 | TY_(UngetToken)( doc ); |
5123 | 434 | } |
5124 | | |
5125 | | /* node should be <TR> */ |
5126 | 1.67k | TY_(InsertNodeAtEnd)(rowgroup, node); |
5127 | 1.67k | memory.identity = TY_(ParseRowGroup); |
5128 | 1.67k | memory.original_node = rowgroup; |
5129 | 1.67k | memory.reentry_node = node; |
5130 | 1.67k | memory.reentry_state = STATE_INITIAL; |
5131 | 1.67k | TY_(pushMemory)( doc, memory ); |
5132 | 1.67k | DEBUG_LOG_EXIT_WITH_NODE(node); |
5133 | 1.67k | return node; |
5134 | 2.08k | } break; |
5135 | | |
5136 | | |
5137 | 14.3k | case STATE_POST_NOT_TEXTNODE: |
5138 | 14.3k | { |
5139 | 14.3k | lexer->exiled = no; |
5140 | 14.3k | state = STATE_INITIAL; |
5141 | 14.3k | continue; |
5142 | 2.08k | } break; |
5143 | | |
5144 | | |
5145 | 0 | default: |
5146 | 0 | break; |
5147 | 47.3k | } /* switch */ |
5148 | 47.3k | } /* while */ |
5149 | 6.26k | DEBUG_LOG_EXIT; |
5150 | 6.26k | return NULL; |
5151 | 26.7k | } |
5152 | | |
5153 | | |
5154 | | /** MARK: TY_(ParseScript) |
5155 | | * Parses the `script` tag. |
5156 | | * |
5157 | | * @todo This isn't quite right for CDATA content as it recognises tags |
5158 | | * within the content and parses them accordingly. This will unfortunately |
5159 | | * screw up scripts which include: |
5160 | | * < + letter |
5161 | | * < + ! |
5162 | | * < + ? |
5163 | | * < + / + letter |
5164 | | * |
5165 | | * This is a non-recursing parser. It uses the document's parser memory stack |
5166 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
5167 | | * This parser is also re-enterable, so that post-processing can occur after |
5168 | | * such dispatching. |
5169 | | */ |
5170 | | Node* TY_(ParseScript)( TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode) ) |
5171 | 4.61k | { |
5172 | 4.61k | Node *node = NULL; |
5173 | | #if defined(ENABLE_DEBUG_LOG) |
5174 | | static int depth_parser = 0; |
5175 | | static int count_parser = 0; |
5176 | | #endif |
5177 | | |
5178 | 4.61k | DEBUG_LOG_ENTER_WITH_NODE(script); |
5179 | | |
5180 | 4.61k | doc->lexer->parent = script; |
5181 | 4.61k | node = TY_(GetToken)(doc, CdataContent); |
5182 | 4.61k | doc->lexer->parent = NULL; |
5183 | | |
5184 | 4.61k | if (node) |
5185 | 4.54k | { |
5186 | 4.54k | TY_(InsertNodeAtEnd)(script, node); |
5187 | 4.54k | } |
5188 | 67 | else |
5189 | 67 | { |
5190 | | /* handle e.g. a document like "<script>" */ |
5191 | 67 | TY_(Report)(doc, script, NULL, MISSING_ENDTAG_FOR); |
5192 | 67 | DEBUG_LOG_EXIT; |
5193 | 67 | return NULL; |
5194 | 67 | } |
5195 | | |
5196 | 4.54k | node = TY_(GetToken)(doc, IgnoreWhitespace); |
5197 | 4.54k | DEBUG_LOG_GOT_TOKEN(node); |
5198 | | |
5199 | 4.54k | if (!(node && node->type == EndTag && node->tag && |
5200 | 3.00k | node->tag->id == script->tag->id)) |
5201 | 2.61k | { |
5202 | 2.61k | TY_(Report)(doc, script, node, MISSING_ENDTAG_FOR); |
5203 | | |
5204 | 2.61k | if (node) |
5205 | 2.04k | TY_(UngetToken)(doc); |
5206 | 2.61k | } |
5207 | 1.93k | else |
5208 | 1.93k | { |
5209 | 1.93k | TY_(FreeNode)(doc, node); |
5210 | 1.93k | } |
5211 | 4.54k | DEBUG_LOG_EXIT; |
5212 | 4.54k | return NULL; |
5213 | 4.61k | } |
5214 | | |
5215 | | |
5216 | | /** MARK: TY_(ParseSelect) |
5217 | | * Parses the `select` tag. |
5218 | | * |
5219 | | * This is a non-recursing parser. It uses the document's parser memory stack |
5220 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
5221 | | * This parser is also re-enterable, so that post-processing can occur after |
5222 | | * such dispatching. |
5223 | | */ |
5224 | | Node* TY_(ParseSelect)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) ) |
5225 | 3.52k | { |
5226 | 3.52k | Lexer* lexer = doc->lexer; |
5227 | 3.52k | Node *node; |
5228 | 3.52k | DEBUG_LOG_COUNTERS; |
5229 | | |
5230 | 3.52k | if ( field == NULL ) |
5231 | 2.58k | { |
5232 | 2.58k | TidyParserMemory memory = TY_(popMemory)( doc ); |
5233 | 2.58k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
5234 | 2.58k | DEBUG_LOG_REENTER_WITH_NODE(node); |
5235 | 2.58k | field = memory.original_node; |
5236 | 2.58k | DEBUG_LOG_GET_OLD_MODE; |
5237 | 2.58k | mode = memory.mode; |
5238 | 2.58k | DEBUG_LOG_CHANGE_MODE; |
5239 | 2.58k | } |
5240 | 945 | else |
5241 | 945 | { |
5242 | 945 | DEBUG_LOG_ENTER_WITH_NODE(field); |
5243 | 945 | } |
5244 | | |
5245 | 3.52k | lexer->insert = NULL; /* defer implicit inline start tags */ |
5246 | | |
5247 | 19.5k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
5248 | 18.8k | { |
5249 | 18.8k | if (node->tag == field->tag && node->type == EndTag) |
5250 | 251 | { |
5251 | 251 | TY_(FreeNode)( doc, node); |
5252 | 251 | field->closed = yes; |
5253 | 251 | TrimSpaces(doc, field); |
5254 | | |
5255 | 251 | DEBUG_LOG_EXIT; |
5256 | 251 | return NULL; |
5257 | 251 | } |
5258 | | |
5259 | | /* deal with comments etc. */ |
5260 | 18.6k | if (InsertMisc(field, node)) |
5261 | 1.45k | continue; |
5262 | | |
5263 | 17.1k | if ( node->type == StartTag && |
5264 | 14.6k | ( nodeIsOPTION(node) || |
5265 | 14.6k | nodeIsOPTGROUP(node) || |
5266 | 14.6k | nodeIsDATALIST(node) || |
5267 | 14.6k | nodeIsSCRIPT(node)) |
5268 | 17.1k | ) |
5269 | 2.58k | { |
5270 | 2.58k | TidyParserMemory memory = {0}; |
5271 | 2.58k | memory.identity = TY_(ParseSelect); |
5272 | 2.58k | memory.original_node = field; |
5273 | 2.58k | memory.reentry_node = node; |
5274 | | |
5275 | 2.58k | TY_(InsertNodeAtEnd)(field, node); |
5276 | 2.58k | TY_(pushMemory)( doc, memory ); |
5277 | 2.58k | DEBUG_LOG_EXIT_WITH_NODE(node); |
5278 | 2.58k | return node; |
5279 | 2.58k | } |
5280 | | |
5281 | | /* discard unexpected tags */ |
5282 | 14.5k | TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED); |
5283 | 14.5k | TY_(FreeNode)( doc, node); |
5284 | 14.5k | } |
5285 | | |
5286 | 694 | TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR); |
5287 | | |
5288 | 694 | DEBUG_LOG_EXIT; |
5289 | 694 | return NULL; |
5290 | 3.52k | } |
5291 | | |
5292 | | |
5293 | | /** MARK: TY_(ParseTableTag) |
5294 | | * Parses the `table` tag. |
5295 | | * |
5296 | | * This is a non-recursing parser. It uses the document's parser memory stack |
5297 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
5298 | | * This parser is also re-enterable, so that post-processing can occur after |
5299 | | * such dispatching. |
5300 | | */ |
5301 | | Node* TY_(ParseTableTag)( TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode) ) |
5302 | 37.7k | { |
5303 | 37.7k | Lexer* lexer = doc->lexer; |
5304 | 37.7k | Node *node, *parent; |
5305 | 37.7k | uint istackbase; |
5306 | 37.7k | DEBUG_LOG_COUNTERS; |
5307 | | |
5308 | 37.7k | if ( table == NULL ) |
5309 | 20.9k | { |
5310 | 20.9k | TidyParserMemory memory = TY_(popMemory)( doc ); |
5311 | 20.9k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
5312 | 20.9k | DEBUG_LOG_REENTER_WITH_NODE(node); |
5313 | 20.9k | table = memory.original_node; |
5314 | 20.9k | lexer->exiled = memory.register_1; |
5315 | 20.9k | DEBUG_LOG_GET_OLD_MODE; |
5316 | 20.9k | mode = memory.mode; |
5317 | 20.9k | DEBUG_LOG_CHANGE_MODE; |
5318 | 20.9k | } |
5319 | 16.7k | else |
5320 | 16.7k | { |
5321 | 16.7k | DEBUG_LOG_ENTER_WITH_NODE(table); |
5322 | 16.7k | TY_(DeferDup)( doc ); |
5323 | 16.7k | } |
5324 | | |
5325 | 37.7k | istackbase = lexer->istackbase; |
5326 | 37.7k | lexer->istackbase = lexer->istacksize; |
5327 | | |
5328 | 44.5k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
5329 | 33.4k | { |
5330 | 33.4k | DEBUG_LOG_GOT_TOKEN(node); |
5331 | 33.4k | if (node->tag == table->tag ) |
5332 | 4.23k | { |
5333 | 4.23k | if (node->type == EndTag) |
5334 | 129 | { |
5335 | 129 | TY_(FreeNode)(doc, node); |
5336 | 129 | } |
5337 | 4.10k | else |
5338 | 4.10k | { |
5339 | | /* Issue #498 - If a <table> in a <table> |
5340 | | * just close the current table, and issue a |
5341 | | * warning. The previous action was to discard |
5342 | | * this second <table> |
5343 | | */ |
5344 | 4.10k | TY_(UngetToken)(doc); |
5345 | 4.10k | TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN); |
5346 | 4.10k | } |
5347 | 4.23k | lexer->istackbase = istackbase; |
5348 | 4.23k | table->closed = yes; |
5349 | | |
5350 | 4.23k | DEBUG_LOG_EXIT; |
5351 | 4.23k | return NULL; |
5352 | 4.23k | } |
5353 | | |
5354 | | /* deal with comments etc. */ |
5355 | 29.2k | if (InsertMisc(table, node)) |
5356 | 763 | continue; |
5357 | | |
5358 | | /* discard unknown tags */ |
5359 | 28.4k | if (node->tag == NULL && node->type != TextNode) |
5360 | 1.88k | { |
5361 | 1.88k | TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED); |
5362 | 1.88k | TY_(FreeNode)( doc, node); |
5363 | 1.88k | continue; |
5364 | 1.88k | } |
5365 | | |
5366 | | /* if TD or TH or text or inline or block then infer <TR> */ |
5367 | | |
5368 | 26.5k | if (node->type != EndTag) |
5369 | 24.3k | { |
5370 | 24.3k | if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) ) |
5371 | 7.33k | { |
5372 | 7.33k | TY_(UngetToken)( doc ); |
5373 | 7.33k | node = TY_(InferredTag)(doc, TidyTag_TR); |
5374 | 7.33k | TY_(Report)(doc, table, node, MISSING_STARTTAG); |
5375 | 7.33k | } |
5376 | 16.9k | else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) ) |
5377 | 5.77k | { |
5378 | 5.77k | TY_(InsertNodeBeforeElement)(table, node); |
5379 | 5.77k | TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN); |
5380 | 5.77k | lexer->exiled = yes; |
5381 | | |
5382 | 5.77k | if (node->type != TextNode) |
5383 | 2.76k | { |
5384 | 2.76k | TidyParserMemory memory = {0}; |
5385 | 2.76k | memory.identity = TY_(ParseTableTag); |
5386 | 2.76k | memory.original_node = table; |
5387 | 2.76k | memory.reentry_node = node; |
5388 | 2.76k | memory.register_1 = no; /* later, lexer->exiled = no */ |
5389 | 2.76k | memory.mode = IgnoreWhitespace; |
5390 | 2.76k | TY_(pushMemory)( doc, memory ); |
5391 | 2.76k | DEBUG_LOG_EXIT_WITH_NODE(node); |
5392 | 2.76k | return node; |
5393 | 2.76k | } |
5394 | | |
5395 | 3.01k | lexer->exiled = no; |
5396 | 3.01k | continue; |
5397 | 5.77k | } |
5398 | 11.2k | else if (node->tag->model & CM_HEAD) |
5399 | 73 | { |
5400 | 73 | MoveToHead(doc, table, node); |
5401 | 73 | continue; |
5402 | 73 | } |
5403 | 24.3k | } |
5404 | | |
5405 | | /* |
5406 | | if this is the end tag for an ancestor element |
5407 | | then infer end tag for this element |
5408 | | */ |
5409 | 20.7k | if (node->type == EndTag) |
5410 | 2.26k | { |
5411 | 2.26k | if ( nodeIsFORM(node) ) |
5412 | 154 | { |
5413 | 154 | BadForm( doc ); |
5414 | 154 | TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED); |
5415 | 154 | TY_(FreeNode)( doc, node); |
5416 | 154 | continue; |
5417 | 154 | } |
5418 | | |
5419 | | /* best to discard unexpected block/inline end tags */ |
5420 | 2.11k | if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) || |
5421 | 1.84k | TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) ) |
5422 | 1.00k | { |
5423 | 1.00k | TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED); |
5424 | 1.00k | TY_(FreeNode)( doc, node); |
5425 | 1.00k | continue; |
5426 | 1.00k | } |
5427 | | |
5428 | 1.10k | for ( parent = table->parent; |
5429 | 6.91k | parent != NULL; |
5430 | 5.80k | parent = parent->parent ) |
5431 | 6.37k | { |
5432 | 6.37k | if (node->tag == parent->tag) |
5433 | 564 | { |
5434 | 564 | TY_(Report)(doc, table, node, MISSING_ENDTAG_BEFORE ); |
5435 | 564 | TY_(UngetToken)( doc ); |
5436 | 564 | lexer->istackbase = istackbase; |
5437 | | |
5438 | 564 | DEBUG_LOG_EXIT; |
5439 | 564 | return NULL; |
5440 | 564 | } |
5441 | 6.37k | } |
5442 | 1.10k | } |
5443 | | |
5444 | 19.0k | if (!(node->tag->model & CM_TABLE)) |
5445 | 730 | { |
5446 | 730 | TY_(UngetToken)( doc ); |
5447 | 730 | TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN); |
5448 | 730 | lexer->istackbase = istackbase; |
5449 | | |
5450 | 730 | DEBUG_LOG_EXIT; |
5451 | 730 | return NULL; |
5452 | 730 | } |
5453 | | |
5454 | 18.2k | if (TY_(nodeIsElement)(node)) |
5455 | 18.2k | { |
5456 | 18.2k | TidyParserMemory memory = {0}; |
5457 | 18.2k | TY_(InsertNodeAtEnd)(table, node); |
5458 | 18.2k | memory.identity = TY_(ParseTableTag); |
5459 | 18.2k | memory.original_node = table; |
5460 | 18.2k | memory.reentry_node = node; |
5461 | 18.2k | memory.register_1 = lexer->exiled; |
5462 | 18.2k | TY_(pushMemory)( doc, memory ); |
5463 | 18.2k | DEBUG_LOG_EXIT_WITH_NODE(node); |
5464 | 18.2k | return node; |
5465 | 18.2k | } |
5466 | | |
5467 | | /* discard unexpected text nodes and end tags */ |
5468 | 0 | TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED); |
5469 | 0 | TY_(FreeNode)( doc, node); |
5470 | 0 | } |
5471 | | |
5472 | 11.1k | TY_(Report)(doc, table, node, MISSING_ENDTAG_FOR); |
5473 | 11.1k | lexer->istackbase = istackbase; |
5474 | | |
5475 | 11.1k | DEBUG_LOG_EXIT; |
5476 | 11.1k | return NULL; |
5477 | 37.7k | } |
5478 | | |
5479 | | |
5480 | | /** MARK: TY_(ParseText) |
5481 | | * Parses the `option` and `textarea` tags. |
5482 | | * |
5483 | | * This is a non-recursing parser. It uses the document's parser memory stack |
5484 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
5485 | | * This parser is also re-enterable, so that post-processing can occur after |
5486 | | * such dispatching. |
5487 | | */ |
5488 | | Node* TY_(ParseText)( TidyDocImpl* doc, Node *field, GetTokenMode mode ) |
5489 | 5.72k | { |
5490 | 5.72k | Lexer* lexer = doc->lexer; |
5491 | 5.72k | Node *node; |
5492 | 5.72k | DEBUG_LOG_COUNTERS; |
5493 | | |
5494 | 5.72k | DEBUG_LOG_ENTER_WITH_NODE(field); |
5495 | | |
5496 | 5.72k | lexer->insert = NULL; /* defer implicit inline start tags */ |
5497 | | |
5498 | 5.72k | DEBUG_LOG_GET_OLD_MODE; |
5499 | 5.72k | if ( nodeIsTEXTAREA(field) ) |
5500 | 621 | mode = Preformatted; |
5501 | 5.10k | else |
5502 | 5.10k | mode = MixedContent; /* kludge for font tags */ |
5503 | 5.72k | DEBUG_LOG_CHANGE_MODE; |
5504 | | |
5505 | 10.5k | while ((node = TY_(GetToken)(doc, mode)) != NULL) |
5506 | 10.0k | { |
5507 | 10.0k | if (node->tag == field->tag && node->type == EndTag) |
5508 | 40 | { |
5509 | 40 | TY_(FreeNode)( doc, node); |
5510 | 40 | field->closed = yes; |
5511 | 40 | TrimSpaces(doc, field); |
5512 | 40 | DEBUG_LOG_EXIT; |
5513 | 40 | return NULL; |
5514 | 40 | } |
5515 | | |
5516 | | /* deal with comments etc. */ |
5517 | 9.97k | if (InsertMisc(field, node)) |
5518 | 1.29k | continue; |
5519 | | |
5520 | 8.67k | if (TY_(nodeIsText)(node)) |
5521 | 2.63k | { |
5522 | | /* only called for 1st child */ |
5523 | 2.63k | if (field->content == NULL && !(mode & Preformatted)) |
5524 | 1.12k | TrimSpaces(doc, field); |
5525 | | |
5526 | 2.63k | if (node->start >= node->end) |
5527 | 2 | { |
5528 | 2 | TY_(FreeNode)( doc, node); |
5529 | 2 | continue; |
5530 | 2 | } |
5531 | | |
5532 | 2.63k | TY_(InsertNodeAtEnd)(field, node); |
5533 | 2.63k | continue; |
5534 | 2.63k | } |
5535 | | |
5536 | | /* for textarea should all cases of < and & be escaped? */ |
5537 | | |
5538 | | /* discard inline tags e.g. font */ |
5539 | 6.04k | if ( node->tag |
5540 | 5.48k | && node->tag->model & CM_INLINE |
5541 | 1.43k | && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */ |
5542 | 895 | { |
5543 | 895 | TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED); |
5544 | 895 | TY_(FreeNode)( doc, node); |
5545 | 895 | continue; |
5546 | 895 | } |
5547 | | |
5548 | | /* terminate element on other tags */ |
5549 | 5.14k | if (!(field->tag->model & CM_OPT)) |
5550 | 588 | TY_(Report)(doc, field, node, MISSING_ENDTAG_BEFORE); |
5551 | | |
5552 | 5.14k | TY_(UngetToken)( doc ); |
5553 | 5.14k | TrimSpaces(doc, field); |
5554 | 5.14k | DEBUG_LOG_EXIT; |
5555 | 5.14k | return NULL; |
5556 | 6.04k | } |
5557 | | |
5558 | 537 | if (!(field->tag->model & CM_OPT)) |
5559 | 33 | TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR); |
5560 | 537 | DEBUG_LOG_EXIT; |
5561 | 537 | return NULL; |
5562 | 5.72k | } |
5563 | | |
5564 | | |
5565 | | /** MARK: TY_(ParseTitle) |
5566 | | * Parses the `title` tag. |
5567 | | * |
5568 | | * This is a non-recursing parser. It uses the document's parser memory stack |
5569 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
5570 | | * This parser is also re-enterable, so that post-processing can occur after |
5571 | | * such dispatching. |
5572 | | */ |
5573 | | Node* TY_(ParseTitle)( TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode) ) |
5574 | 1.64k | { |
5575 | 1.64k | Node *node; |
5576 | 16.7k | while ((node = TY_(GetToken)(doc, MixedContent)) != NULL) |
5577 | 16.4k | { |
5578 | 16.4k | if (node->tag == title->tag && node->type == StartTag |
5579 | 375 | && cfgBool(doc, TidyCoerceEndTags) ) |
5580 | 375 | { |
5581 | 375 | TY_(Report)(doc, title, node, COERCE_TO_ENDTAG); |
5582 | 375 | node->type = EndTag; |
5583 | 375 | TY_(UngetToken)( doc ); |
5584 | 375 | continue; |
5585 | 375 | } |
5586 | 16.0k | else if (node->tag == title->tag && node->type == EndTag) |
5587 | 385 | { |
5588 | 385 | TY_(FreeNode)( doc, node); |
5589 | 385 | title->closed = yes; |
5590 | 385 | TrimSpaces(doc, title); |
5591 | 385 | return NULL; |
5592 | 385 | } |
5593 | | |
5594 | 15.6k | if (TY_(nodeIsText)(node)) |
5595 | 7.05k | { |
5596 | | /* only called for 1st child */ |
5597 | 7.05k | if (title->content == NULL) |
5598 | 540 | TrimInitialSpace(doc, title, node); |
5599 | | |
5600 | 7.05k | if (node->start >= node->end) |
5601 | 83 | { |
5602 | 83 | TY_(FreeNode)( doc, node); |
5603 | 83 | continue; |
5604 | 83 | } |
5605 | | |
5606 | 6.97k | TY_(InsertNodeAtEnd)(title, node); |
5607 | 6.97k | continue; |
5608 | 7.05k | } |
5609 | | |
5610 | | /* deal with comments etc. */ |
5611 | 8.60k | if (InsertMisc(title, node)) |
5612 | 283 | continue; |
5613 | | |
5614 | | /* discard unknown tags */ |
5615 | 8.32k | if (node->tag == NULL) |
5616 | 7.37k | { |
5617 | 7.37k | TY_(Report)(doc, title, node, DISCARDING_UNEXPECTED); |
5618 | 7.37k | TY_(FreeNode)( doc, node); |
5619 | 7.37k | continue; |
5620 | 7.37k | } |
5621 | | |
5622 | | /* pushback unexpected tokens */ |
5623 | 942 | TY_(Report)(doc, title, node, MISSING_ENDTAG_BEFORE); |
5624 | 942 | TY_(UngetToken)( doc ); |
5625 | 942 | TrimSpaces(doc, title); |
5626 | 942 | return NULL; |
5627 | 8.32k | } |
5628 | | |
5629 | 319 | TY_(Report)(doc, title, node, MISSING_ENDTAG_FOR); |
5630 | 319 | return NULL; |
5631 | 1.64k | } |
5632 | | |
5633 | | |
5634 | | /** MARK: ParseXMLElement |
5635 | | * Parses the given XML element. |
5636 | | */ |
5637 | | static Node* ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode) |
5638 | 0 | { |
5639 | 0 | Lexer* lexer = doc->lexer; |
5640 | 0 | Node *node; |
5641 | |
|
5642 | 0 | if ( element == NULL ) |
5643 | 0 | { |
5644 | 0 | TidyParserMemory memory = TY_(popMemory)( doc ); |
5645 | 0 | element = memory.original_node; |
5646 | 0 | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
5647 | 0 | mode = memory.reentry_mode; |
5648 | 0 | TY_(InsertNodeAtEnd)(element, node); /* The only re-entry action needed. */ |
5649 | 0 | } |
5650 | 0 | else |
5651 | 0 | { |
5652 | | /* if node is pre or has xml:space="preserve" then do so */ |
5653 | 0 | if ( TY_(XMLPreserveWhiteSpace)(doc, element) ) |
5654 | 0 | mode = Preformatted; |
5655 | | |
5656 | | /* deal with comments etc. */ |
5657 | 0 | InsertMisc( &doc->root, element); |
5658 | | |
5659 | | /* we shouldn't have plain text at this point. */ |
5660 | 0 | if (TY_(nodeIsText)(element)) |
5661 | 0 | { |
5662 | 0 | TY_(Report)(doc, &doc->root, element, DISCARDING_UNEXPECTED); |
5663 | 0 | TY_(FreeNode)( doc, element); |
5664 | 0 | return NULL; |
5665 | 0 | } |
5666 | 0 | } |
5667 | 0 | while ((node = TY_(GetToken)(doc, mode)) != NULL) |
5668 | 0 | { |
5669 | 0 | if (node->type == EndTag && |
5670 | 0 | node->element && element->element && |
5671 | 0 | TY_(tmbstrcmp)(node->element, element->element) == 0) |
5672 | 0 | { |
5673 | 0 | TY_(FreeNode)( doc, node); |
5674 | 0 | element->closed = yes; |
5675 | 0 | break; |
5676 | 0 | } |
5677 | | |
5678 | | /* discard unexpected end tags */ |
5679 | 0 | if (node->type == EndTag) |
5680 | 0 | { |
5681 | 0 | if (element) |
5682 | 0 | TY_(Report)(doc, element, node, UNEXPECTED_ENDTAG_IN); |
5683 | 0 | else |
5684 | 0 | TY_(Report)(doc, element, node, UNEXPECTED_ENDTAG_ERR); |
5685 | |
|
5686 | 0 | TY_(FreeNode)( doc, node); |
5687 | 0 | continue; |
5688 | 0 | } |
5689 | | |
5690 | | /* parse content on seeing start tag */ |
5691 | 0 | if (node->type == StartTag) |
5692 | 0 | { |
5693 | 0 | TidyParserMemory memory = {0}; |
5694 | 0 | memory.identity = ParseXMLElement; |
5695 | 0 | memory.original_node = element; |
5696 | 0 | memory.reentry_node = node; |
5697 | 0 | memory.reentry_mode = mode; |
5698 | 0 | TY_(pushMemory)( doc, memory ); |
5699 | 0 | return node; |
5700 | 0 | } |
5701 | | |
5702 | 0 | TY_(InsertNodeAtEnd)(element, node); |
5703 | 0 | } /* while */ |
5704 | | |
5705 | | /* |
5706 | | if first child is text then trim initial space and |
5707 | | delete text node if it is empty. |
5708 | | */ |
5709 | | |
5710 | 0 | node = element->content; |
5711 | |
|
5712 | 0 | if (TY_(nodeIsText)(node) && mode != Preformatted) |
5713 | 0 | { |
5714 | 0 | if ( lexer->lexbuf[node->start] == ' ' ) |
5715 | 0 | { |
5716 | 0 | node->start++; |
5717 | |
|
5718 | 0 | if (node->start >= node->end) |
5719 | 0 | TY_(DiscardElement)( doc, node ); |
5720 | 0 | } |
5721 | 0 | } |
5722 | | |
5723 | | /* |
5724 | | if last child is text then trim final space and |
5725 | | delete the text node if it is empty |
5726 | | */ |
5727 | |
|
5728 | 0 | node = element->last; |
5729 | |
|
5730 | 0 | if (TY_(nodeIsText)(node) && mode != Preformatted) |
5731 | 0 | { |
5732 | 0 | if ( lexer->lexbuf[node->end - 1] == ' ' ) |
5733 | 0 | { |
5734 | 0 | node->end--; |
5735 | |
|
5736 | 0 | if (node->start >= node->end) |
5737 | 0 | TY_(DiscardElement)( doc, node ); |
5738 | 0 | } |
5739 | 0 | } |
5740 | 0 | return NULL; |
5741 | 0 | } |
5742 | | |
5743 | | |
5744 | | /***************************************************************************//* |
5745 | | ** MARK: - Post-Parse Operations |
5746 | | ***************************************************************************/ |
5747 | | |
5748 | | |
5749 | | /** |
5750 | | * Performs checking of all attributes recursively starting at `node`. |
5751 | | */ |
5752 | | static void AttributeChecks(TidyDocImpl* doc, Node* node) |
5753 | 1.74M | { |
5754 | 1.74M | Node *next; |
5755 | | |
5756 | 4.39M | while (node) |
5757 | 2.64M | { |
5758 | 2.64M | next = node->next; |
5759 | | |
5760 | 2.64M | if (TY_(nodeIsElement)(node)) |
5761 | 2.07M | { |
5762 | 2.07M | if (node->tag && node->tag->chkattrs) /* [i_a]2 fix crash after adding SVG support with alt/unknown tag subtree insertion there */ |
5763 | 48.8k | node->tag->chkattrs(doc, node); |
5764 | 2.02M | else |
5765 | 2.02M | TY_(CheckAttributes)(doc, node); |
5766 | 2.07M | } |
5767 | | |
5768 | 2.64M | if (node->content) |
5769 | 1.73M | AttributeChecks(doc, node->content); |
5770 | | |
5771 | 2.64M | assert( next != node ); /* http://tidy.sf.net/issue/1603538 */ |
5772 | 2.64M | node = next; |
5773 | 2.64M | } |
5774 | 1.74M | } |
5775 | | |
5776 | | |
5777 | | /** |
5778 | | * Encloses naked text in certain elements within `p` tags. |
5779 | | * |
5780 | | * <form>, <blockquote>, and <noscript> do not allow #PCDATA in |
5781 | | * HTML 4.01 Strict (%block; model instead of %flow;). |
5782 | | */ |
5783 | | static void EncloseBlockText(TidyDocImpl* doc, Node* node) |
5784 | 0 | { |
5785 | 0 | Node *next; |
5786 | 0 | Node *block; |
5787 | |
|
5788 | 0 | while (node) |
5789 | 0 | { |
5790 | 0 | next = node->next; |
5791 | |
|
5792 | 0 | if (node->content) |
5793 | 0 | EncloseBlockText(doc, node->content); |
5794 | |
|
5795 | 0 | if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) || |
5796 | 0 | nodeIsBLOCKQUOTE(node)) |
5797 | 0 | || !node->content) |
5798 | 0 | { |
5799 | 0 | node = next; |
5800 | 0 | continue; |
5801 | 0 | } |
5802 | | |
5803 | 0 | block = node->content; |
5804 | |
|
5805 | 0 | if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) || |
5806 | 0 | (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block))) |
5807 | 0 | { |
5808 | 0 | Node* p = TY_(InferredTag)(doc, TidyTag_P); |
5809 | 0 | TY_(InsertNodeBeforeElement)(block, p); |
5810 | 0 | while (block && |
5811 | 0 | (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block))) |
5812 | 0 | { |
5813 | 0 | Node* tempNext = block->next; |
5814 | 0 | TY_(RemoveNode)(block); |
5815 | 0 | TY_(InsertNodeAtEnd)(p, block); |
5816 | 0 | block = tempNext; |
5817 | 0 | } |
5818 | 0 | TrimSpaces(doc, p); |
5819 | 0 | continue; |
5820 | 0 | } |
5821 | | |
5822 | 0 | node = next; |
5823 | 0 | } |
5824 | 0 | } |
5825 | | |
5826 | | |
5827 | | /** |
5828 | | * Encloses all naked body text within `p` tags. |
5829 | | */ |
5830 | | static void EncloseBodyText(TidyDocImpl* doc) |
5831 | 0 | { |
5832 | 0 | Node* node; |
5833 | 0 | Node* body = TY_(FindBody)(doc); |
5834 | |
|
5835 | 0 | if (!body) |
5836 | 0 | return; |
5837 | | |
5838 | 0 | node = body->content; |
5839 | |
|
5840 | 0 | while (node) |
5841 | 0 | { |
5842 | 0 | if ((TY_(nodeIsText)(node) && !TY_(IsBlank)(doc->lexer, node)) || |
5843 | 0 | (TY_(nodeIsElement)(node) && nodeCMIsOnlyInline(node))) |
5844 | 0 | { |
5845 | 0 | Node* p = TY_(InferredTag)(doc, TidyTag_P); |
5846 | 0 | TY_(InsertNodeBeforeElement)(node, p); |
5847 | 0 | while (node && (!TY_(nodeIsElement)(node) || nodeCMIsOnlyInline(node))) |
5848 | 0 | { |
5849 | 0 | Node* next = node->next; |
5850 | 0 | TY_(RemoveNode)(node); |
5851 | 0 | TY_(InsertNodeAtEnd)(p, node); |
5852 | 0 | node = next; |
5853 | 0 | } |
5854 | 0 | TrimSpaces(doc, p); |
5855 | 0 | continue; |
5856 | 0 | } |
5857 | 0 | node = node->next; |
5858 | 0 | } |
5859 | 0 | } |
5860 | | |
5861 | | |
5862 | | /** |
5863 | | * Replaces elements that are obsolete with appropriate substitute tags. |
5864 | | */ |
5865 | | static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node) |
5866 | 1.74M | { |
5867 | 1.74M | Node *next; |
5868 | | |
5869 | 4.39M | while (node) |
5870 | 2.64M | { |
5871 | 2.64M | next = node->next; |
5872 | | |
5873 | | /* if (nodeIsDIR(node) || nodeIsMENU(node)) */ |
5874 | | /* HTML5 - <menu ... > is no longer obsolete */ |
5875 | 2.64M | if (nodeIsDIR(node)) |
5876 | 573 | TY_(CoerceNode)(doc, node, TidyTag_UL, yes, yes); |
5877 | | |
5878 | 2.64M | if (nodeIsXMP(node) || nodeIsLISTING(node) || |
5879 | 2.63M | (node->tag && node->tag->id == TidyTag_PLAINTEXT)) |
5880 | 13.9k | TY_(CoerceNode)(doc, node, TidyTag_PRE, yes, yes); |
5881 | | |
5882 | 2.64M | if (node->content) |
5883 | 1.73M | ReplaceObsoleteElements(doc, node->content); |
5884 | | |
5885 | 2.64M | node = next; |
5886 | 2.64M | } |
5887 | 1.74M | } |
5888 | | |
5889 | | |
5890 | | /***************************************************************************//* |
5891 | | ** MARK: - Internal API Implementation |
5892 | | ***************************************************************************/ |
5893 | | |
5894 | | |
5895 | | /** MARK: TY_(CheckNodeIntegrity) |
5896 | | * Is used to perform a node integrity check after parsing an HTML or XML |
5897 | | * document. |
5898 | | * @note Actual performance of this check can be disabled by defining the |
5899 | | * macro NO_NODE_INTEGRITY_CHECK. |
5900 | | */ |
5901 | | Bool TY_(CheckNodeIntegrity)(Node *node) |
5902 | 4.39M | { |
5903 | 4.39M | #ifndef NO_NODE_INTEGRITY_CHECK |
5904 | 4.39M | Node *child; |
5905 | | |
5906 | 4.39M | if (node->prev) |
5907 | 1.36M | { |
5908 | 1.36M | if (node->prev->next != node) |
5909 | 0 | return no; |
5910 | 1.36M | } |
5911 | | |
5912 | 4.39M | if (node->next) |
5913 | 1.36M | { |
5914 | 1.36M | if (node->next == node || node->next->prev != node) |
5915 | 0 | return no; |
5916 | 1.36M | } |
5917 | | |
5918 | 4.39M | if (node->parent) |
5919 | 4.35M | { |
5920 | 4.35M | if (node->prev == NULL && node->parent->content != node) |
5921 | 0 | return no; |
5922 | | |
5923 | 4.35M | if (node->next == NULL && node->parent->last != node) |
5924 | 0 | return no; |
5925 | 4.35M | } |
5926 | | |
5927 | 8.75M | for (child = node->content; child; child = child->next) |
5928 | 4.35M | if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) ) |
5929 | 0 | return no; |
5930 | | |
5931 | 4.39M | #endif |
5932 | 4.39M | return yes; |
5933 | 4.39M | } |
5934 | | |
5935 | | |
5936 | | /** MARK: TY_(CoerceNode) |
5937 | | * Transforms a given node to another element, for example, from a <p> |
5938 | | * to a <br>. |
5939 | | */ |
5940 | | void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected) |
5941 | 23.1k | { |
5942 | 23.1k | const Dict* tag = TY_(LookupTagDef)(tid); |
5943 | 23.1k | Node* tmp = TY_(InferredTag)(doc, tag->id); |
5944 | | |
5945 | 23.1k | if (obsolete) |
5946 | 14.5k | TY_(Report)(doc, node, tmp, OBSOLETE_ELEMENT); |
5947 | 8.59k | else if (unexpected) |
5948 | 0 | TY_(Report)(doc, node, tmp, REPLACING_UNEX_ELEMENT); |
5949 | 8.59k | else |
5950 | 8.59k | TY_(Report)(doc, node, tmp, REPLACING_ELEMENT); |
5951 | | |
5952 | 23.1k | TidyDocFree(doc, tmp->element); |
5953 | 23.1k | TidyDocFree(doc, tmp); |
5954 | | |
5955 | 23.1k | node->was = node->tag; |
5956 | 23.1k | node->tag = tag; |
5957 | 23.1k | node->type = StartTag; |
5958 | 23.1k | node->implicit = yes; |
5959 | 23.1k | TidyDocFree(doc, node->element); |
5960 | 23.1k | node->element = TY_(tmbstrdup)(doc->allocator, tag->name); |
5961 | 23.1k | } |
5962 | | |
5963 | | |
5964 | | /** MARK: TY_(DiscardElement) |
5965 | | * Remove node from markup tree and discard it. |
5966 | | */ |
5967 | | Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element ) |
5968 | 416k | { |
5969 | 416k | Node *next = NULL; |
5970 | | |
5971 | 416k | if (element) |
5972 | 416k | { |
5973 | 416k | next = element->next; |
5974 | 416k | TY_(RemoveNode)(element); |
5975 | 416k | TY_(FreeNode)( doc, element); |
5976 | 416k | } |
5977 | | |
5978 | 416k | return next; |
5979 | 416k | } |
5980 | | |
5981 | | |
5982 | | /** MARK: TY_(DropEmptyElements) |
5983 | | * Trims a tree of empty elements recursively, returning the next node. |
5984 | | */ |
5985 | | Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node) |
5986 | 1.74M | { |
5987 | 1.74M | Node* next; |
5988 | | |
5989 | 4.39M | while (node) |
5990 | 2.64M | { |
5991 | 2.64M | next = node->next; |
5992 | | |
5993 | 2.64M | if (node->content) |
5994 | 1.73M | TY_(DropEmptyElements)(doc, node->content); |
5995 | | |
5996 | 2.64M | if (!TY_(nodeIsElement)(node) && |
5997 | 569k | !(TY_(nodeIsText)(node) && !(node->start < node->end))) |
5998 | 557k | { |
5999 | 557k | node = next; |
6000 | 557k | continue; |
6001 | 557k | } |
6002 | | |
6003 | 2.08M | next = TY_(TrimEmptyElement)(doc, node); |
6004 | 2.08M | node = next; |
6005 | 2.08M | } |
6006 | | |
6007 | 1.74M | return node; |
6008 | 1.74M | } |
6009 | | |
6010 | | |
6011 | | /** MARK: TY_(InsertNodeAtStart) |
6012 | | * Insert node into markup tree as the first element of content of element. |
6013 | | */ |
6014 | | void TY_(InsertNodeAtStart)(Node *element, Node *node) |
6015 | 19.8k | { |
6016 | 19.8k | node->parent = element; |
6017 | | |
6018 | 19.8k | if (element->content == NULL) |
6019 | 1.54k | element->last = node; |
6020 | 18.3k | else |
6021 | 18.3k | element->content->prev = node; |
6022 | | |
6023 | 19.8k | node->next = element->content; |
6024 | 19.8k | node->prev = NULL; |
6025 | 19.8k | element->content = node; |
6026 | 19.8k | } |
6027 | | |
6028 | | |
6029 | | /** MARK: TY_(InsertNodeAtEnd) |
6030 | | * Insert node into markup tree as the last element of content of element. |
6031 | | */ |
6032 | | void TY_(InsertNodeAtEnd)(Node *element, Node *node) |
6033 | 2.48M | { |
6034 | 2.48M | node->parent = element; |
6035 | 2.48M | node->prev = element ? element->last : NULL; |
6036 | | |
6037 | 2.48M | if (element && element->last != NULL) |
6038 | 730k | element->last->next = node; |
6039 | 1.75M | else |
6040 | 1.75M | if (element) |
6041 | 1.75M | element->content = node; |
6042 | | |
6043 | 2.48M | if (element) |
6044 | 2.48M | element->last = node; |
6045 | 2.48M | } |
6046 | | |
6047 | | |
6048 | | /** MARK: TY_(InsertNodeBeforeElement) |
6049 | | * Insert node into markup tree before element. |
6050 | | */ |
6051 | | void TY_(InsertNodeBeforeElement)(Node *element, Node *node) |
6052 | 77.7k | { |
6053 | 77.7k | Node *parent; |
6054 | | |
6055 | 77.7k | parent = element ? element->parent : NULL; |
6056 | 77.7k | node->parent = parent; |
6057 | 77.7k | node->next = element; |
6058 | 77.7k | node->prev = element ? element->prev : NULL; |
6059 | 77.7k | if (element) |
6060 | 68.7k | element->prev = node; |
6061 | | |
6062 | 77.7k | if (node->prev) |
6063 | 42.2k | node->prev->next = node; |
6064 | | |
6065 | 77.7k | if (parent && parent->content == element) |
6066 | 26.2k | parent->content = node; |
6067 | 77.7k | } |
6068 | | |
6069 | | |
6070 | | /** MARK: TY_(InsertNodeAfterElement) |
6071 | | * Insert node into markup tree after element. |
6072 | | */ |
6073 | | void TY_(InsertNodeAfterElement)(Node *element, Node *node) |
6074 | 132k | { |
6075 | 132k | Node *parent; |
6076 | | |
6077 | 132k | parent = element->parent; |
6078 | 132k | node->parent = parent; |
6079 | | |
6080 | | /* AQ - 13 Jan 2000 fix for parent == NULL */ |
6081 | 132k | if (parent != NULL && parent->last == element) |
6082 | 14.4k | parent->last = node; |
6083 | 118k | else |
6084 | 118k | { |
6085 | 118k | node->next = element->next; |
6086 | | /* AQ - 13 Jan 2000 fix for node->next == NULL */ |
6087 | 118k | if (node->next != NULL) |
6088 | 117k | node->next->prev = node; |
6089 | 118k | } |
6090 | | |
6091 | 132k | element->next = node; |
6092 | 132k | node->prev = element; |
6093 | 132k | } |
6094 | | |
6095 | | |
6096 | | /** MARK: TY_(IsBlank) |
6097 | | * Indicates whether or not a text node is blank, meaning that it consists |
6098 | | * of nothing, or a single space. |
6099 | | */ |
6100 | | Bool TY_(IsBlank)(Lexer *lexer, Node *node) |
6101 | 414 | { |
6102 | 414 | Bool isBlank = TY_(nodeIsText)(node); |
6103 | 414 | if ( isBlank ) |
6104 | 0 | isBlank = ( node->end == node->start || /* Zero length */ |
6105 | 0 | ( node->end == node->start+1 /* or one blank. */ |
6106 | 0 | && lexer->lexbuf[node->start] == ' ' ) ); |
6107 | | |
6108 | 414 | return isBlank; |
6109 | 414 | } |
6110 | | |
6111 | | |
6112 | | /** MARK: TY_(IsJavaScript) |
6113 | | * Indicates whether or not a node is declared as containing javascript |
6114 | | * code. |
6115 | | */ |
6116 | | Bool TY_(IsJavaScript)(Node *node) |
6117 | 7.76k | { |
6118 | 7.76k | Bool result = no; |
6119 | 7.76k | AttVal *attr; |
6120 | | |
6121 | 7.76k | if (node->attributes == NULL) |
6122 | 5.96k | return yes; |
6123 | | |
6124 | 3.72k | for (attr = node->attributes; attr; attr = attr->next) |
6125 | 2.38k | { |
6126 | 2.38k | if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr)) |
6127 | 1.19k | && AttrContains(attr, "javascript") ) |
6128 | 465 | { |
6129 | 465 | result = yes; |
6130 | 465 | break; |
6131 | 465 | } |
6132 | 2.38k | } |
6133 | | |
6134 | 1.79k | return result; |
6135 | 7.76k | } |
6136 | | |
6137 | | |
6138 | | /** MARK: TY_(IsNewNode) |
6139 | | * Used to check if a node uses CM_NEW, which determines how attributes |
6140 | | * without values should be printed. This was introduced to deal with |
6141 | | * user-defined tags e.g. ColdFusion. |
6142 | | */ |
6143 | | Bool TY_(IsNewNode)(Node *node) |
6144 | 0 | { |
6145 | 0 | if (node && node->tag) |
6146 | 0 | { |
6147 | 0 | return (node->tag->model & CM_NEW); |
6148 | 0 | } |
6149 | 0 | return yes; |
6150 | 0 | } |
6151 | | |
6152 | | |
6153 | | /** MARK: TY_(RemoveNode) |
6154 | | * Extract a node and its children from a markup tree |
6155 | | */ |
6156 | | Node *TY_(RemoveNode)(Node *node) |
6157 | 447k | { |
6158 | 447k | if (node->prev) |
6159 | 175k | node->prev->next = node->next; |
6160 | | |
6161 | 447k | if (node->next) |
6162 | 208k | node->next->prev = node->prev; |
6163 | | |
6164 | 447k | if (node->parent) |
6165 | 440k | { |
6166 | 440k | if (node->parent->content == node) |
6167 | 264k | node->parent->content = node->next; |
6168 | | |
6169 | 440k | if (node->parent->last == node) |
6170 | 232k | node->parent->last = node->prev; |
6171 | 440k | } |
6172 | | |
6173 | 447k | node->parent = node->prev = node->next = NULL; |
6174 | 447k | return node; |
6175 | 447k | } |
6176 | | |
6177 | | |
6178 | | /** MARK: TY_(TrimEmptyElement) |
6179 | | * Trims a single, empty element, returning the next node. |
6180 | | */ |
6181 | | Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element ) |
6182 | 2.08M | { |
6183 | 2.08M | if ( CanPrune(doc, element) ) |
6184 | 413k | { |
6185 | 413k | if (element->type != TextNode) |
6186 | 401k | { |
6187 | 401k | doc->footnotes |= FN_TRIM_EMPTY_ELEMENT; |
6188 | 401k | TY_(Report)(doc, element, NULL, TRIM_EMPTY_ELEMENT); |
6189 | 401k | } |
6190 | | |
6191 | 413k | return TY_(DiscardElement)(doc, element); |
6192 | 413k | } |
6193 | 1.67M | return element->next; |
6194 | 2.08M | } |
6195 | | |
6196 | | |
6197 | | /** MARK: TY_(XMLPreserveWhiteSpace) |
6198 | | * Indicates whether or not whitespace is to be preserved in XHTML/XML |
6199 | | * documents. |
6200 | | */ |
6201 | | Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element) |
6202 | 0 | { |
6203 | 0 | AttVal *attribute; |
6204 | | |
6205 | | /* search attributes for xml:space */ |
6206 | 0 | for (attribute = element->attributes; attribute; attribute = attribute->next) |
6207 | 0 | { |
6208 | 0 | if (attrIsXML_SPACE(attribute)) |
6209 | 0 | { |
6210 | 0 | if (AttrValueIs(attribute, "preserve")) |
6211 | 0 | return yes; |
6212 | | |
6213 | 0 | return no; |
6214 | 0 | } |
6215 | 0 | } |
6216 | | |
6217 | 0 | if (element->element == NULL) |
6218 | 0 | return no; |
6219 | | |
6220 | | /* kludge for html docs without explicit xml:space attribute */ |
6221 | 0 | if (nodeIsPRE(element) || |
6222 | 0 | nodeIsSCRIPT(element) || |
6223 | 0 | nodeIsSTYLE(element) || |
6224 | 0 | TY_(FindParser)(doc, element) == TY_(ParsePre)) |
6225 | 0 | return yes; |
6226 | | |
6227 | | /* kludge for XSL docs */ |
6228 | 0 | if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 ) |
6229 | 0 | return yes; |
6230 | | |
6231 | 0 | return no; |
6232 | 0 | } |
6233 | | |
6234 | | |
6235 | | /***************************************************************************//* |
6236 | | ** MARK: - Internal API Implementation - Main Parsers |
6237 | | ***************************************************************************/ |
6238 | | |
6239 | | |
6240 | | /** MARK: TY_(ParseDocument) |
6241 | | * Parses an HTML document after lexing. It begins by properly configuring |
6242 | | * the overall HTML structure, and subsequently processes all remaining |
6243 | | * nodes. |
6244 | | */ |
6245 | | void TY_(ParseDocument)(TidyDocImpl* doc) |
6246 | 16.9k | { |
6247 | 16.9k | Node *node, *html, *doctype = NULL; |
6248 | | |
6249 | 50.7k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
6250 | 48.7k | { |
6251 | 48.7k | if (node->type == XmlDecl) |
6252 | 717 | { |
6253 | 717 | doc->xmlDetected = yes; |
6254 | | |
6255 | 717 | if (TY_(FindXmlDecl)(doc) && doc->root.content) |
6256 | 632 | { |
6257 | 632 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6258 | 632 | TY_(FreeNode)(doc, node); |
6259 | 632 | continue; |
6260 | 632 | } |
6261 | 85 | if (node->line > 1 || node->column != 1) |
6262 | 41 | { |
6263 | 41 | TY_(Report)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL); |
6264 | 41 | } |
6265 | 85 | } |
6266 | | |
6267 | | /* deal with comments etc. */ |
6268 | 48.0k | if (InsertMisc( &doc->root, node )) |
6269 | 30.9k | continue; |
6270 | | |
6271 | 17.1k | if (node->type == DocTypeTag) |
6272 | 1.91k | { |
6273 | 1.91k | if (doctype == NULL) |
6274 | 1.27k | { |
6275 | 1.27k | TY_(InsertNodeAtEnd)( &doc->root, node); |
6276 | 1.27k | doctype = node; |
6277 | 1.27k | } |
6278 | 641 | else |
6279 | 641 | { |
6280 | 641 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6281 | 641 | TY_(FreeNode)( doc, node); |
6282 | 641 | } |
6283 | 1.91k | continue; |
6284 | 1.91k | } |
6285 | | |
6286 | 15.2k | if (node->type == EndTag) |
6287 | 327 | { |
6288 | 327 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6289 | 327 | TY_(FreeNode)( doc, node); |
6290 | 327 | continue; |
6291 | 327 | } |
6292 | | |
6293 | 14.9k | if (node->type == StartTag && nodeIsHTML(node)) |
6294 | 46 | { |
6295 | 46 | AttVal *xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS); |
6296 | | |
6297 | 46 | if (AttrValueIs(xmlns, XHTML_NAMESPACE)) |
6298 | 0 | { |
6299 | 0 | Bool htmlOut = cfgBool( doc, TidyHtmlOut ); |
6300 | 0 | doc->lexer->isvoyager = yes; /* Unless plain HTML */ |
6301 | 0 | TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/ |
6302 | 0 | TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut ); /* will be XHTML. */ |
6303 | | |
6304 | | /* adjust other config options, just as in config.c */ |
6305 | 0 | if ( !htmlOut ) |
6306 | 0 | { |
6307 | 0 | TY_(SetOptionBool)( doc, TidyUpperCaseTags, no ); |
6308 | 0 | TY_(SetOptionInt)( doc, TidyUpperCaseAttrs, no ); |
6309 | 0 | } |
6310 | 0 | } |
6311 | 46 | } |
6312 | | |
6313 | 14.9k | if ( node->type != StartTag || !nodeIsHTML(node) ) |
6314 | 14.8k | { |
6315 | 14.8k | TY_(UngetToken)( doc ); |
6316 | 14.8k | html = TY_(InferredTag)(doc, TidyTag_HTML); |
6317 | 14.8k | } |
6318 | 46 | else |
6319 | 46 | html = node; |
6320 | | |
6321 | | /*\ |
6322 | | * #72, avoid MISSING_DOCTYPE if show-body-only. |
6323 | | * #191, also if --doctype omit, that is TidyDoctypeOmit |
6324 | | * #342, adjust tags to html4-- if not 'auto' or 'html5' |
6325 | | \*/ |
6326 | 14.9k | if (!TY_(FindDocType)(doc)) |
6327 | 14.2k | { |
6328 | 14.2k | ulong dtmode = cfg( doc, TidyDoctypeMode ); |
6329 | 14.2k | if ((dtmode != TidyDoctypeOmit) && !showingBodyOnly(doc)) |
6330 | 14.2k | TY_(Report)(doc, NULL, NULL, MISSING_DOCTYPE); |
6331 | 14.2k | if ((dtmode != TidyDoctypeAuto) && (dtmode != TidyDoctypeHtml5)) |
6332 | 0 | { |
6333 | | /*\ |
6334 | | * Issue #342 - if not doctype 'auto', or 'html5' |
6335 | | * then reset mode htm4-- parsing |
6336 | | \*/ |
6337 | 0 | TY_(AdjustTags)(doc); /* Dynamically modify the tags table to html4-- mode */ |
6338 | 0 | } |
6339 | 14.2k | } |
6340 | 14.9k | TY_(InsertNodeAtEnd)( &doc->root, html); |
6341 | 14.9k | ParseHTMLWithNode( doc, html ); |
6342 | 14.9k | break; |
6343 | 15.2k | } |
6344 | | |
6345 | | /* do this before any more document fixes */ |
6346 | 16.9k | if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 ) |
6347 | 0 | TY_(AccessibilityChecks)( doc ); |
6348 | | |
6349 | 16.9k | if (!TY_(FindHTML)(doc)) |
6350 | 2.02k | { |
6351 | | /* a later check should complain if <body> is empty */ |
6352 | 2.02k | html = TY_(InferredTag)(doc, TidyTag_HTML); |
6353 | 2.02k | TY_(InsertNodeAtEnd)( &doc->root, html); |
6354 | 2.02k | ParseHTMLWithNode( doc, html ); |
6355 | 2.02k | } |
6356 | | |
6357 | 16.9k | node = TY_(FindTITLE)(doc); |
6358 | 16.9k | if (!node) |
6359 | 16.8k | { |
6360 | 16.8k | Node* head = TY_(FindHEAD)(doc); |
6361 | | /* #72, avoid MISSING_TITLE_ELEMENT if show-body-only (but allow InsertNodeAtEnd to avoid new warning) */ |
6362 | 33.7k | if (!showingBodyOnly(doc)) |
6363 | 16.8k | { |
6364 | 16.8k | TY_(Report)(doc, head, NULL, MISSING_TITLE_ELEMENT); |
6365 | 16.8k | } |
6366 | 16.8k | TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE)); |
6367 | 16.8k | } |
6368 | 102 | else if (!node->content && !showingBodyOnly(doc)) |
6369 | 64 | { |
6370 | | /* Is #839 - warn node is blank in HTML5 */ |
6371 | 64 | if (TY_(IsHTML5Mode)(doc)) |
6372 | 55 | { |
6373 | 55 | TY_(Report)(doc, node, NULL, BLANK_TITLE_ELEMENT); |
6374 | 55 | } |
6375 | 64 | } |
6376 | | |
6377 | 16.9k | AttributeChecks(doc, &doc->root); |
6378 | 16.9k | ReplaceObsoleteElements(doc, &doc->root); |
6379 | 16.9k | TY_(DropEmptyElements)(doc, &doc->root); |
6380 | 16.9k | CleanSpaces(doc, &doc->root); |
6381 | | |
6382 | 16.9k | if (cfgBool(doc, TidyEncloseBodyText)) |
6383 | 0 | EncloseBodyText(doc); |
6384 | 16.9k | if (cfgBool(doc, TidyEncloseBlockText)) |
6385 | 0 | EncloseBlockText(doc, &doc->root); |
6386 | 16.9k | } |
6387 | | |
6388 | | |
6389 | | /** MARK: TY_(ParseXMLDocument) |
6390 | | * Parses the document using Tidy's XML parser. |
6391 | | */ |
6392 | | void TY_(ParseXMLDocument)(TidyDocImpl* doc) |
6393 | 0 | { |
6394 | 0 | Node *node, *doctype = NULL; |
6395 | |
|
6396 | 0 | TY_(SetOptionBool)( doc, TidyXmlTags, yes ); |
6397 | |
|
6398 | 0 | doc->xmlDetected = yes; |
6399 | |
|
6400 | 0 | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
6401 | 0 | { |
6402 | | /* discard unexpected end tags */ |
6403 | 0 | if (node->type == EndTag) |
6404 | 0 | { |
6405 | 0 | TY_(Report)(doc, NULL, node, UNEXPECTED_ENDTAG); |
6406 | 0 | TY_(FreeNode)( doc, node); |
6407 | 0 | continue; |
6408 | 0 | } |
6409 | | |
6410 | | /* deal with comments etc. */ |
6411 | 0 | if (InsertMisc( &doc->root, node)) |
6412 | 0 | continue; |
6413 | | |
6414 | 0 | if (node->type == DocTypeTag) |
6415 | 0 | { |
6416 | 0 | if (doctype == NULL) |
6417 | 0 | { |
6418 | 0 | TY_(InsertNodeAtEnd)( &doc->root, node); |
6419 | 0 | doctype = node; |
6420 | 0 | } |
6421 | 0 | else |
6422 | 0 | { |
6423 | 0 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6424 | 0 | TY_(FreeNode)( doc, node); |
6425 | 0 | } |
6426 | 0 | continue; |
6427 | 0 | } |
6428 | | |
6429 | 0 | if (node->type == StartEndTag) |
6430 | 0 | { |
6431 | 0 | TY_(InsertNodeAtEnd)( &doc->root, node); |
6432 | 0 | continue; |
6433 | 0 | } |
6434 | | |
6435 | | /* if start tag then parse element's content */ |
6436 | 0 | if (node->type == StartTag) |
6437 | 0 | { |
6438 | 0 | TY_(InsertNodeAtEnd)( &doc->root, node ); |
6439 | 0 | ParseHTMLWithNode( doc, node ); |
6440 | 0 | continue; |
6441 | 0 | } |
6442 | | |
6443 | 0 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6444 | 0 | TY_(FreeNode)( doc, node); |
6445 | 0 | } |
6446 | | |
6447 | | /* ensure presence of initial <?xml version="1.0"?> */ |
6448 | 0 | if ( cfgBool(doc, TidyXmlDecl) ) |
6449 | 0 | TY_(FixXmlDecl)( doc ); |
6450 | 0 | } |