/src/tidy-html5/src/parser.c
Line | Count | Source |
1 | | /* parser.c -- HTML Parser |
2 | | |
3 | | (c) 1998-2007 (W3C) MIT, ERCIM, Keio University |
4 | | See tidy.h for the copyright notice. |
5 | | |
6 | | */ |
7 | | |
8 | | #include "tidy-int.h" |
9 | | #include "lexer.h" |
10 | | #include "parser.h" |
11 | | #include "message.h" |
12 | | #include "clean.h" |
13 | | #include "tags.h" |
14 | | #include "tmbstr.h" |
15 | | #include "sprtf.h" |
16 | | |
17 | | |
18 | | /****************************************************************************//* |
19 | | ** MARK: - Configuration Options |
20 | | ***************************************************************************/ |
21 | | |
22 | | |
/**
 * Evaluates to `yes` when the TidyBodyOnly option of `doc` is set to
 * TidyYesState, and to `no` for any other setting.
 *
 * Issue #72 - Need to know to avoid error-reporting - no warning only if
 * --show-body-only yes.
 * Issue #132 - Likewise avoid warning if showing body only.
 *
 * The complete expansion is parenthesised so that operators next to the
 * macro invocation cannot regroup the conditional expression.
 */
#define showingBodyOnly(doc) ((cfgAutoBool((doc), TidyBodyOnly) == TidyYesState) ? yes : no)
29 | | |
30 | | |
31 | | /****************************************************************************//* |
32 | | ** MARK: - Forward Declarations |
33 | | ***************************************************************************/ |
34 | | |
35 | | |
36 | | static Node* ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode); |
37 | | |
38 | | |
39 | | /****************************************************************************//* |
40 | | ** MARK: - Node Operations |
41 | | ***************************************************************************/ |
42 | | |
43 | | |
44 | | /** |
45 | | * Generalised search for duplicate elements. |
46 | | * Issue #166 - repeated <main> element. |
47 | | */ |
48 | | static Bool findNodeWithId( Node *node, TidyTagId tid ) |
49 | 264 | { |
50 | 264 | Node *content; |
51 | 616 | while (node) |
52 | 352 | { |
53 | 352 | if (TagIsId(node,tid)) |
54 | 0 | return yes; |
55 | | /*\ |
56 | | * Issue #459 - Under certain circumstances, with many node this use of |
57 | | * 'for (content = node->content; content; content = content->content)' |
58 | | * would produce a **forever** circle, or at least a very extended loop... |
59 | | * It is sufficient to test the content, if it exists, |
60 | | * to quickly iterate all nodes. Now all nodes are tested only once. |
61 | | \*/ |
62 | 352 | content = node->content; |
63 | 352 | if (content) |
64 | 176 | { |
65 | 176 | if ( findNodeWithId(content,tid) ) |
66 | 0 | return yes; |
67 | 176 | } |
68 | 352 | node = node->next; |
69 | 352 | } |
70 | 264 | return no; |
71 | 264 | } |
72 | | |
73 | | |
74 | | /** |
75 | | * Perform a global search for an element. |
76 | | * Issue #166 - repeated <main> element |
77 | | */ |
78 | | static Bool findNodeById( TidyDocImpl* doc, TidyTagId tid ) |
79 | 88 | { |
80 | 88 | Node *node = (doc ? doc->root.content : NULL); |
81 | 88 | return findNodeWithId( node,tid ); |
82 | 88 | } |
83 | | |
84 | | |
85 | | /** |
86 | | * Inserts node into element at an appropriate location based |
87 | | * on the type of node being inserted. |
88 | | */ |
89 | | static Bool InsertMisc(Node *element, Node *node) |
90 | 855k | { |
91 | 855k | if (node->type == CommentTag || |
92 | 854k | node->type == ProcInsTag || |
93 | 842k | node->type == CDATATag || |
94 | 842k | node->type == SectionTag || |
95 | 839k | node->type == AspTag || |
96 | 839k | node->type == JsteTag || |
97 | 838k | node->type == PhpTag ) |
98 | 16.4k | { |
99 | 16.4k | TY_(InsertNodeAtEnd)(element, node); |
100 | 16.4k | return yes; |
101 | 16.4k | } |
102 | | |
103 | 838k | if ( node->type == XmlDecl ) |
104 | 302 | { |
105 | 302 | Node* root = element; |
106 | 1.97k | while ( root && root->parent ) |
107 | 1.67k | root = root->parent; |
108 | 302 | if ( root && !(root->content && root->content->type == XmlDecl)) |
109 | 97 | { |
110 | 97 | TY_(InsertNodeAtStart)( root, node ); |
111 | 97 | return yes; |
112 | 97 | } |
113 | 302 | } |
114 | | |
115 | | /* Declared empty tags seem to be slipping through |
116 | | ** the cracks. This is an experiment to figure out |
117 | | ** a decent place to pick them up. |
118 | | */ |
119 | 838k | if ( node->tag && |
120 | 756k | TY_(nodeIsElement)(node) && |
121 | 708k | TY_(nodeCMIsEmpty)(node) && TagId(node) == TidyTag_UNKNOWN && |
122 | 0 | (node->tag->versions & VERS_PROPRIETARY) != 0 ) |
123 | 0 | { |
124 | 0 | TY_(InsertNodeAtEnd)(element, node); |
125 | 0 | return yes; |
126 | 0 | } |
127 | | |
128 | 838k | return no; |
129 | 838k | } |
130 | | |
131 | | |
132 | | /** |
133 | | * Insert "node" into markup tree in place of "element" |
134 | | * which is moved to become the child of the node |
135 | | */ |
136 | | static void InsertNodeAsParent(Node *element, Node *node) |
137 | 329 | { |
138 | 329 | node->content = element; |
139 | 329 | node->last = element; |
140 | 329 | node->parent = element->parent; |
141 | 329 | element->parent = node; |
142 | | |
143 | 329 | if (node->parent->content == element) |
144 | 155 | node->parent->content = node; |
145 | | |
146 | 329 | if (node->parent->last == element) |
147 | 157 | node->parent->last = node; |
148 | | |
149 | 329 | node->prev = element->prev; |
150 | 329 | element->prev = NULL; |
151 | | |
152 | 329 | if (node->prev) |
153 | 174 | node->prev->next = node; |
154 | | |
155 | 329 | node->next = element->next; |
156 | 329 | element->next = NULL; |
157 | | |
158 | 329 | if (node->next) |
159 | 172 | node->next->prev = node; |
160 | 329 | } |
161 | | |
162 | | |
163 | | /** |
164 | | * Unexpected content in table row is moved to just before the table in |
165 | | * in accordance with Netscape and IE. This code assumes that node hasn't |
166 | | * been inserted into the row. |
167 | | */ |
168 | | static void MoveBeforeTable( TidyDocImpl* ARG_UNUSED(doc), Node *row, |
169 | | Node *node ) |
170 | 15.3k | { |
171 | 15.3k | Node *table; |
172 | | |
173 | | /* first find the table element */ |
174 | 30.6M | for (table = row->parent; table; table = table->parent) |
175 | 30.6M | { |
176 | 30.6M | if ( nodeIsTABLE(table) ) |
177 | 9.40k | { |
178 | 9.40k | TY_(InsertNodeBeforeElement)( table, node ); |
179 | 9.40k | return; |
180 | 9.40k | } |
181 | 30.6M | } |
182 | | /* No table element */ |
183 | 5.90k | TY_(InsertNodeBeforeElement)( row->parent, node ); |
184 | 5.90k | } |
185 | | |
186 | | |
187 | | /** |
188 | | * Moves given node to end of body element. |
189 | | */ |
190 | | static void MoveNodeToBody( TidyDocImpl* doc, Node* node ) |
191 | 686 | { |
192 | 686 | Node* body = TY_(FindBody)( doc ); |
193 | 686 | if ( body ) |
194 | 663 | { |
195 | 663 | TY_(RemoveNode)( node ); |
196 | 663 | TY_(InsertNodeAtEnd)( body, node ); |
197 | 663 | } |
198 | 686 | } |
199 | | |
200 | | |
201 | | /** |
202 | | * Move node to the head, where element is used as starting |
203 | | * point in hunt for head. Normally called during parsing. |
204 | | */ |
205 | | static void MoveToHead( TidyDocImpl* doc, Node *element, Node *node ) |
206 | 1.03k | { |
207 | 1.03k | Node *head = NULL; |
208 | | |
209 | 1.03k | TY_(RemoveNode)( node ); /* make sure that node is isolated */ |
210 | | |
211 | 1.03k | if ( TY_(nodeIsElement)(node) ) |
212 | 888 | { |
213 | 888 | TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN ); |
214 | | |
215 | 888 | head = TY_(FindHEAD)(doc); |
216 | 888 | assert(head != NULL); |
217 | | |
218 | 888 | TY_(InsertNodeAtEnd)(head, node); |
219 | | |
220 | 888 | if ( node->tag->parser ) |
221 | 888 | { |
222 | | /* Only one of the existing test cases as of 2021-08-14 invoke |
223 | | MoveToHead, and it doesn't go deeper than one level. The |
224 | | parser() call is supposed to return a node if additional |
225 | | parsing is needed. Keep this in mind if we start to get bug |
226 | | reports. |
227 | | */ |
228 | 888 | Parser* parser = node->tag->parser; |
229 | 888 | parser( doc, node, IgnoreWhitespace ); |
230 | 888 | } |
231 | 888 | } |
232 | 142 | else |
233 | 142 | { |
234 | 142 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
235 | 142 | TY_(FreeNode)( doc, node ); |
236 | 142 | } |
237 | 1.03k | } |
238 | | |
239 | | |
240 | | /***************************************************************************//* |
241 | | ** MARK: - Decision Making |
242 | | ***************************************************************************/ |
243 | | |
244 | | |
245 | | /** |
246 | | * Indicates whether or not element can be pruned based on content, |
247 | | * user settings, etc. |
248 | | */ |
249 | | static Bool CanPrune( TidyDocImpl* doc, Node *element ) |
250 | 438k | { |
251 | 438k | if ( !cfgBool(doc, TidyDropEmptyElems) ) |
252 | 0 | return no; |
253 | | |
254 | 438k | if ( TY_(nodeIsText)(element) ) |
255 | 573 | return yes; |
256 | | |
257 | 437k | if ( element->content ) |
258 | 305k | return no; |
259 | | |
260 | 132k | if ( element->tag == NULL ) |
261 | 187 | return no; |
262 | | |
263 | 131k | if ( element->tag->model & CM_BLOCK && element->attributes != NULL ) |
264 | 4.51k | return no; |
265 | | |
266 | 127k | if ( nodeIsA(element) && element->attributes != NULL ) |
267 | 865 | return no; |
268 | | |
269 | 126k | if ( nodeIsP(element) && !cfgBool(doc, TidyDropEmptyParas) ) |
270 | 0 | return no; |
271 | | |
272 | 126k | if ( element->tag->model & CM_ROW ) |
273 | 1.45k | return no; |
274 | | |
275 | 125k | if ( element->tag->model & CM_EMPTY ) |
276 | 7.95k | return no; |
277 | | |
278 | 117k | if ( nodeIsAPPLET(element) ) |
279 | 0 | return no; |
280 | | |
281 | 117k | if ( nodeIsOBJECT(element) ) |
282 | 296 | return no; |
283 | | |
284 | 116k | if ( nodeIsSCRIPT(element) && attrGetSRC(element) ) |
285 | 0 | return no; |
286 | | |
287 | 116k | if ( nodeIsTITLE(element) ) |
288 | 469 | return no; |
289 | | |
290 | | /* #433359 - fix by Randy Waki 12 Mar 01 */ |
291 | 116k | if ( nodeIsIFRAME(element) ) |
292 | 9 | return no; |
293 | | |
294 | | /* fix for bug 770297 */ |
295 | 116k | if (nodeIsTEXTAREA(element)) |
296 | 1.14k | return no; |
297 | | |
298 | | /* fix for ISSUE #7 https://github.com/w3c/tidy-html5/issues/7 */ |
299 | 115k | if (nodeIsCANVAS(element)) |
300 | 0 | return no; |
301 | | |
302 | 115k | if (nodeIsPROGRESS(element)) |
303 | 0 | return no; |
304 | | |
305 | 115k | if ( attrGetID(element) || attrGetNAME(element) ) |
306 | 555 | return no; |
307 | | |
308 | | /* fix for bug 695408; a better fix would look for unknown and */ |
309 | | /* known proprietary attributes that make the element significant */ |
310 | 114k | if (attrGetDATAFLD(element)) |
311 | 0 | return no; |
312 | | |
313 | | /* fix for bug 723772, don't trim new-...-tags */ |
314 | 114k | if (element->tag->id == TidyTag_UNKNOWN) |
315 | 0 | return no; |
316 | | |
317 | 114k | if (nodeIsBODY(element)) |
318 | 1.74k | return no; |
319 | | |
320 | 112k | if (nodeIsCOLGROUP(element)) |
321 | 1.10k | return no; |
322 | | |
323 | | /* HTML5 - do NOT drop empty option if it has attributes */ |
324 | 111k | if ( nodeIsOPTION(element) && element->attributes != NULL ) |
325 | 1 | return no; |
326 | | |
327 | | /* fix for #103 - don't drop empty dd tags lest document not validate */ |
328 | 111k | if (nodeIsDD(element)) |
329 | 734 | return no; |
330 | | |
331 | 111k | return yes; |
332 | 111k | } |
333 | | |
334 | | |
335 | | /** |
336 | | * Indicates whether or not node is a descendant of a tag of the given tid. |
337 | | */ |
338 | | static Bool DescendantOf( Node *element, TidyTagId tid ) |
339 | 18.9k | { |
340 | 18.9k | Node *parent; |
341 | 18.9k | for ( parent = element->parent; |
342 | 6.46M | parent != NULL; |
343 | 6.44M | parent = parent->parent ) |
344 | 6.44M | { |
345 | 6.44M | if ( TagIsId(parent, tid) ) |
346 | 3.75k | return yes; |
347 | 6.44M | } |
348 | 15.2k | return no; |
349 | 18.9k | } |
350 | | |
351 | | |
352 | | /** |
353 | | * Indicates whether or not node is a descendant of a pre tag. |
354 | | */ |
355 | | static Bool IsPreDescendant(Node* node) |
356 | 272k | { |
357 | 272k | Node *parent = node->parent; |
358 | | |
359 | 539M | while (parent) |
360 | 538M | { |
361 | 538M | if (parent->tag && parent->tag->parser == TY_(ParsePre)) |
362 | 4.72k | return yes; |
363 | | |
364 | 538M | parent = parent->parent; |
365 | 538M | } |
366 | | |
367 | 267k | return no; |
368 | 272k | } |
369 | | |
370 | | |
371 | | /** |
372 | | * Indicates whether or not the only content model for the given node |
373 | | * is CM_INLINE. |
374 | | */ |
375 | | static Bool nodeCMIsOnlyInline( Node* node ) |
376 | 0 | { |
377 | 0 | return TY_(nodeHasCM)( node, CM_INLINE ) && !TY_(nodeHasCM)( node, CM_BLOCK ); |
378 | 0 | } |
379 | | |
380 | | |
381 | | /** |
382 | | * Indicates whether or not the content of the given node is acceptable |
383 | | * content for pre elements |
384 | | */ |
385 | | static Bool PreContent( TidyDocImpl* ARG_UNUSED(doc), Node* node ) |
386 | 11.0k | { |
387 | | /* p is coerced to br's, Text OK too */ |
388 | 11.0k | if ( nodeIsP(node) || TY_(nodeIsText)(node) ) |
389 | 611 | return yes; |
390 | | |
391 | 10.4k | if ( node->tag == NULL || |
392 | 10.4k | nodeIsPARAM(node) || |
393 | 10.4k | !TY_(nodeHasCM)(node, CM_INLINE|CM_NEW) ) |
394 | 9.37k | return no; |
395 | | |
396 | 1.05k | return yes; |
397 | 10.4k | } |
398 | | |
399 | | |
400 | | /** |
401 | | * Indicates whether or not leading whitespace should be cleaned. |
402 | | */ |
403 | | static Bool CleanLeadingWhitespace(TidyDocImpl* ARG_UNUSED(doc), Node* node) |
404 | 30.9k | { |
405 | 30.9k | if (!TY_(nodeIsText)(node)) |
406 | 0 | return no; |
407 | | |
408 | 30.9k | if (node->parent->type == DocTypeTag) |
409 | 0 | return no; |
410 | | |
411 | 30.9k | if (IsPreDescendant(node)) |
412 | 1.07k | return no; |
413 | | |
414 | 29.8k | if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) |
415 | 497 | return no; |
416 | | |
417 | | /* #523, prevent blank spaces after script if the next item is script. |
418 | | * This is actually more generalized as, if the preceding element is |
419 | | * a body level script, then indicate that we want to clean leading |
420 | | * whitespace. |
421 | | */ |
422 | 29.3k | if ( node->prev && nodeIsSCRIPT(node->prev) && nodeIsBODY(node->prev->parent) ) |
423 | 132 | return yes; |
424 | | |
425 | | /* <p>...<br> <em>...</em>...</p> */ |
426 | 29.2k | if (nodeIsBR(node->prev)) |
427 | 10 | return yes; |
428 | | |
429 | | /* <p> ...</p> */ |
430 | 29.2k | if (node->prev == NULL && !TY_(nodeHasCM)(node->parent, CM_INLINE)) |
431 | 4.39k | return yes; |
432 | | |
433 | | /* <h4>...</h4> <em>...</em> */ |
434 | 24.8k | if (node->prev && !TY_(nodeHasCM)(node->prev, CM_INLINE) && |
435 | 15.6k | TY_(nodeIsElement)(node->prev)) |
436 | 1.31k | return yes; |
437 | | |
438 | | /* <p><span> ...</span></p> */ |
439 | 23.5k | if (!node->prev && !node->parent->prev && !TY_(nodeHasCM)(node->parent->parent, CM_INLINE)) |
440 | 1.25k | return yes; |
441 | | |
442 | 22.3k | return no; |
443 | 23.5k | } |
444 | | |
445 | | |
446 | | /** |
447 | | * Indicates whether or not trailing whitespace should be cleaned. |
448 | | */ |
449 | | static Bool CleanTrailingWhitespace(TidyDocImpl* doc, Node* node) |
450 | 30.9k | { |
451 | 30.9k | Node* next; |
452 | | |
453 | 30.9k | if (!TY_(nodeIsText)(node)) |
454 | 0 | return no; |
455 | | |
456 | 30.9k | if (node->parent->type == DocTypeTag) |
457 | 0 | return no; |
458 | | |
459 | 30.9k | if (IsPreDescendant(node)) |
460 | 1.07k | return no; |
461 | | |
462 | 29.8k | if (node->parent->tag && node->parent->tag->parser == TY_(ParseScript)) |
463 | 497 | return no; |
464 | | |
465 | | /* #523, prevent blank spaces after script if the next item is script. |
466 | | * This is actually more generalized as, if the next element is |
467 | | * a body level script, then indicate that we want to clean trailing |
468 | | * whitespace. |
469 | | */ |
470 | 29.3k | if ( node->next && nodeIsSCRIPT(node->next) && nodeIsBODY(node->next->parent) ) |
471 | 21 | return yes; |
472 | | |
473 | 29.3k | next = node->next; |
474 | | |
475 | | /* <p>... </p> */ |
476 | 29.3k | if (!next && !TY_(nodeHasCM)(node->parent, CM_INLINE)) |
477 | 3.33k | return yes; |
478 | | |
479 | | /* <div><small>... </small><h3>...</h3></div> */ |
480 | 26.0k | if (!next && node->parent->next && !TY_(nodeHasCM)(node->parent->next, CM_INLINE)) |
481 | 2.00k | return yes; |
482 | | |
483 | 24.0k | if (!next) |
484 | 3.87k | return no; |
485 | | |
486 | 20.1k | if (nodeIsBR(next)) |
487 | 22 | return yes; |
488 | | |
489 | 20.1k | if (TY_(nodeHasCM)(next, CM_INLINE)) |
490 | 3.37k | return no; |
491 | | |
492 | | /* <a href='/'>...</a> <p>...</p> */ |
493 | 16.7k | if (next->type == StartTag) |
494 | 2.20k | return yes; |
495 | | |
496 | | /* <strong>...</strong> <hr /> */ |
497 | 14.5k | if (next->type == StartEndTag) |
498 | 6 | return yes; |
499 | | |
500 | | /* evil adjacent text nodes, Tidy should not generate these :-( */ |
501 | 14.5k | if (TY_(nodeIsText)(next) && next->start < next->end |
502 | 9.70k | && TY_(IsWhite)(doc->lexer->lexbuf[next->start])) |
503 | 2.52k | return yes; |
504 | | |
505 | 12.0k | return no; |
506 | 14.5k | } |
507 | | |
508 | | |
509 | | /***************************************************************************//* |
510 | | ** MARK: - Information Accumulation |
511 | | ***************************************************************************/ |
512 | | |
513 | | |
514 | | /** |
515 | | * Errors in positioning of form start or end tags |
516 | | * generally require human intervention to fix. |
517 | | * Issue #166 - repeated <main> element also uses this flag |
518 | | * to indicate duplicates, discarded. |
519 | | */ |
520 | | static void BadForm( TidyDocImpl* doc ) |
521 | 445 | { |
522 | 445 | doc->badForm |= flg_BadForm; |
523 | 445 | } |
524 | | |
525 | | |
526 | | /***************************************************************************//* |
527 | | ** MARK: - Fixes and Touchup |
528 | | ***************************************************************************/ |
529 | | |
530 | | |
531 | | /** |
532 | | * Adds style information as a class in the document or a property |
533 | | * of the node to prevent indentation of inferred UL tags. |
534 | | */ |
535 | | static void AddClassNoIndent( TidyDocImpl* doc, Node *node ) |
536 | 334 | { |
537 | 334 | ctmbstr sprop = |
538 | 334 | "padding-left: 2ex; margin-left: 0ex" |
539 | 334 | "; margin-top: 0ex; margin-bottom: 0ex"; |
540 | 334 | if ( !cfgBool(doc, TidyDecorateInferredUL) ) |
541 | 334 | return; |
542 | 0 | if ( cfgBool(doc, TidyMakeClean) ) |
543 | 0 | TY_(AddStyleAsClass)( doc, node, sprop ); |
544 | 0 | else |
545 | 0 | TY_(AddStyleProperty)( doc, node, sprop ); |
546 | 0 | } |
547 | | |
548 | | |
549 | | /** |
550 | | * Cleans whitespace from text nodes, and drops such nodes if emptied |
551 | | * completely as a result. |
552 | | */ |
553 | | static void CleanSpaces(TidyDocImpl* doc, Node* node) |
554 | 352 | { |
555 | 352 | Stack *stack = TY_(newStack)(doc, 16); |
556 | 352 | Node *next; |
557 | | |
558 | 373k | while (node) |
559 | 373k | { |
560 | 373k | next = node->next; |
561 | | |
562 | 373k | if (TY_(nodeIsText)(node) && CleanLeadingWhitespace(doc, node)) |
563 | 7.26k | while (node->start < node->end && TY_(IsWhite)(doc->lexer->lexbuf[node->start])) |
564 | 170 | ++(node->start); |
565 | | |
566 | 373k | if (TY_(nodeIsText)(node) && CleanTrailingWhitespace(doc, node)) |
567 | 14.0k | while (node->end > node->start && TY_(IsWhite)(doc->lexer->lexbuf[node->end - 1])) |
568 | 3.93k | --(node->end); |
569 | | |
570 | 373k | if (TY_(nodeIsText)(node) && !(node->start < node->end)) |
571 | 1.35k | { |
572 | 1.35k | TY_(RemoveNode)(node); |
573 | 1.35k | TY_(FreeNode)(doc, node); |
574 | 1.35k | node = next ? next : TY_(pop)(stack); |
575 | 1.35k | continue; |
576 | 1.35k | } |
577 | | |
578 | 371k | if (node->content) |
579 | 305k | { |
580 | 305k | TY_(push)(stack, next); |
581 | 305k | node = node->content; |
582 | 305k | continue; |
583 | 305k | } |
584 | | |
585 | 66.2k | node = next ? next : TY_(pop)(stack); |
586 | 66.2k | } |
587 | 352 | TY_(freeStack)(stack); |
588 | 352 | } |
589 | | |
590 | | |
591 | | /** |
592 | | * If a table row is empty then insert an empty cell. This practice is |
593 | | * consistent with browser behavior and avoids potential problems with |
594 | | * row spanning cells. |
595 | | */ |
596 | | static void FixEmptyRow(TidyDocImpl* doc, Node *row) |
597 | 1.52k | { |
598 | 1.52k | Node *cell; |
599 | | |
600 | 1.52k | if (row->content == NULL) |
601 | 1.27k | { |
602 | 1.27k | cell = TY_(InferredTag)(doc, TidyTag_TD); |
603 | 1.27k | TY_(InsertNodeAtEnd)(row, cell); |
604 | 1.27k | TY_(Report)(doc, row, cell, MISSING_STARTTAG); |
605 | 1.27k | } |
606 | 1.52k | } |
607 | | |
608 | | |
609 | | /** |
610 | | * The doctype has been found after other tags, |
611 | | * and needs moving to before the html element |
612 | | */ |
613 | | static void InsertDocType( TidyDocImpl* doc, Node *element, Node *doctype ) |
614 | 1.13k | { |
615 | 1.13k | Node* existing = TY_(FindDocType)( doc ); |
616 | 1.13k | if ( existing ) |
617 | 2 | { |
618 | 2 | TY_(Report)(doc, element, doctype, DISCARDING_UNEXPECTED ); |
619 | 2 | TY_(FreeNode)( doc, doctype ); |
620 | 2 | } |
621 | 1.13k | else |
622 | 1.13k | { |
623 | 1.13k | TY_(Report)(doc, element, doctype, DOCTYPE_AFTER_TAGS ); |
624 | 2.26k | while ( !nodeIsHTML(element) ) |
625 | 1.13k | element = element->parent; |
626 | 1.13k | TY_(InsertNodeBeforeElement)( element, doctype ); |
627 | 1.13k | } |
628 | 1.13k | } |
629 | | |
630 | | |
631 | | /** |
632 | | * This maps |
633 | | * <p>hello<em> world</em> |
634 | | * to |
635 | | * <p>hello <em>world</em> |
636 | | * |
637 | | * Trims initial space, by moving it before the |
638 | | * start tag, or if this element is the first in |
639 | | * parent's content, then by discarding the space |
640 | | */ |
641 | | static void TrimInitialSpace( TidyDocImpl* doc, Node *element, Node *text ) |
642 | 12.6k | { |
643 | 12.6k | Lexer* lexer = doc->lexer; |
644 | 12.6k | Node *prev, *node; |
645 | | |
646 | 12.6k | if ( TY_(nodeIsText)(text) && |
647 | 12.6k | lexer->lexbuf[text->start] == ' ' && |
648 | 1.03k | text->start < text->end ) |
649 | 1.03k | { |
650 | 1.03k | if ( (element->tag->model & CM_INLINE) && |
651 | 670 | !(element->tag->model & CM_FIELD) ) |
652 | 310 | { |
653 | 310 | prev = element->prev; |
654 | | |
655 | 310 | if (TY_(nodeIsText)(prev)) |
656 | 63 | { |
657 | 63 | if (prev->end == 0 || lexer->lexbuf[prev->end - 1] != ' ') |
658 | 41 | lexer->lexbuf[(prev->end)++] = ' '; |
659 | | |
660 | 63 | ++(element->start); |
661 | 63 | } |
662 | 247 | else /* create new node */ |
663 | 247 | { |
664 | 247 | node = TY_(NewNode)(lexer->allocator, lexer); |
665 | 247 | node->start = (element->start)++; |
666 | 247 | node->end = element->start; |
667 | 247 | lexer->lexbuf[node->start] = ' '; |
668 | 247 | TY_(InsertNodeBeforeElement)(element ,node); |
669 | 247 | DEBUG_LOG(SPRTF("TrimInitialSpace: Created text node, inserted before <%s>\n", |
670 | 247 | (element->element ? element->element : "unknown"))); |
671 | 247 | } |
672 | 310 | } |
673 | | |
674 | | /* discard the space in current node */ |
675 | 1.03k | ++(text->start); |
676 | 1.03k | } |
677 | 12.6k | } |
678 | | |
679 | | |
680 | | /** |
681 | | * This maps |
682 | | * <em>hello </em><strong>world</strong> |
683 | | * to |
684 | | * <em>hello</em> <strong>world</strong> |
685 | | * |
686 | | * If last child of element is a text node |
687 | | * then trim trailing white space character |
688 | | * moving it to after element's end tag. |
689 | | */ |
690 | | static void TrimTrailingSpace( TidyDocImpl* doc, Node *element, Node *last ) |
691 | 10.3k | { |
692 | 10.3k | Lexer* lexer = doc->lexer; |
693 | 10.3k | byte c; |
694 | | |
695 | 10.3k | if (TY_(nodeIsText)(last)) |
696 | 10.3k | { |
697 | 10.3k | if (last->end > last->start) |
698 | 10.3k | { |
699 | 10.3k | c = (byte) lexer->lexbuf[ last->end - 1 ]; |
700 | | |
701 | 10.3k | if ( c == ' ' ) |
702 | 459 | { |
703 | 459 | last->end -= 1; |
704 | 459 | if ( (element->tag->model & CM_INLINE) && |
705 | 445 | !(element->tag->model & CM_FIELD) ) |
706 | 107 | lexer->insertspace = yes; |
707 | 459 | } |
708 | 10.3k | } |
709 | 10.3k | } |
710 | 10.3k | } |
711 | | |
712 | | |
713 | | /** |
714 | | * Move initial and trailing space out. |
715 | | * This routine maps: |
716 | | * hello<em> world</em> |
717 | | * to |
718 | | * hello <em>world</em> |
719 | | * and |
720 | | * <em>hello </em><strong>world</strong> |
721 | | * to |
722 | | * <em>hello</em> <strong>world</strong> |
723 | | */ |
724 | | static void TrimSpaces( TidyDocImpl* doc, Node *element) |
725 | 210k | { |
726 | 210k | Node* text = element->content; |
727 | | |
728 | 210k | if (nodeIsPRE(element) || IsPreDescendant(element)) |
729 | 3.23k | return; |
730 | | |
731 | 207k | if (TY_(nodeIsText)(text)) |
732 | 12.5k | TrimInitialSpace(doc, element, text); |
733 | | |
734 | 207k | text = element->last; |
735 | | |
736 | 207k | if (TY_(nodeIsText)(text)) |
737 | 10.3k | TrimTrailingSpace(doc, element, text); |
738 | 207k | } |
739 | | |
740 | | |
741 | | /***************************************************************************//* |
742 | | ** MARK: - Parsers Support |
743 | | ***************************************************************************/ |
744 | | |
745 | | |
746 | | /** |
747 | | * Structure used by FindDescendant_cb. |
748 | | */ |
749 | | struct MatchingDescendantData |
750 | | { |
751 | | Node *found_node; |
752 | | Bool *passed_marker_node; |
753 | | |
754 | | /* input: */ |
755 | | TidyTagId matching_tagId; |
756 | | Node *node_to_find; |
757 | | Node *marker_node; |
758 | | }; |
759 | | |
760 | | |
761 | | /** |
762 | | * The main engine for FindMatchingDescendant. |
763 | | */ |
764 | | static NodeTraversalSignal FindDescendant_cb(TidyDocImpl* ARG_UNUSED(doc), Node* node, void *propagate) |
765 | 5.19k | { |
766 | 5.19k | struct MatchingDescendantData *cb_data = (struct MatchingDescendantData *)propagate; |
767 | | |
768 | 5.19k | if (TagId(node) == cb_data->matching_tagId) |
769 | 630 | { |
770 | | /* make sure we match up 'unknown' tags exactly! */ |
771 | 630 | if (cb_data->matching_tagId != TidyTag_UNKNOWN || |
772 | 165 | (node->element != NULL && |
773 | 116 | cb_data->node_to_find != NULL && |
774 | 116 | cb_data->node_to_find->element != NULL && |
775 | 116 | 0 == TY_(tmbstrcmp)(cb_data->node_to_find->element, node->element))) |
776 | 467 | { |
777 | 467 | cb_data->found_node = node; |
778 | 467 | return ExitTraversal; |
779 | 467 | } |
780 | 630 | } |
781 | | |
782 | 4.73k | if (cb_data->passed_marker_node && node == cb_data->marker_node) |
783 | 0 | *cb_data->passed_marker_node = yes; |
784 | | |
785 | 4.73k | return VisitParent; |
786 | 5.19k | } |
787 | | |
788 | | |
789 | | /** |
790 | | * Search the parent chain (from `parent` upwards up to the root) for a node |
791 | | * matching the given 'node'. |
792 | | * |
793 | | * When the search passes beyond the `marker_node` (which is assumed to sit |
794 | | * in the parent chain), this will be flagged by setting the boolean |
795 | | * referenced by `is_parent_of_marker` to `yes`. |
796 | | * |
797 | | * 'is_parent_of_marker' and 'marker_node' are optional parameters and may |
798 | | * be NULL. |
799 | | */ |
800 | | static Node *FindMatchingDescendant( Node *parent, Node *node, Node *marker_node, Bool *is_parent_of_marker ) |
801 | 705 | { |
802 | 705 | struct MatchingDescendantData cb_data = { 0 }; |
803 | 705 | cb_data.matching_tagId = TagId(node); |
804 | 705 | cb_data.node_to_find = node; |
805 | 705 | cb_data.marker_node = marker_node; |
806 | | |
807 | 705 | assert(node); |
808 | | |
809 | 705 | if (is_parent_of_marker) |
810 | 705 | *is_parent_of_marker = no; |
811 | | |
812 | 705 | TY_(TraverseNodeTree)(NULL, parent, FindDescendant_cb, &cb_data); |
813 | 705 | return cb_data.found_node; |
814 | 705 | } |
815 | | |
816 | | |
817 | | /** |
818 | | * Finds the last list item for the given list, providing it in the |
819 | | * in-out parameter. Returns yes or no if the item was the last list |
820 | | * item. |
821 | | */ |
822 | | static Bool FindLastLI( Node *list, Node **lastli ) |
823 | 64.4k | { |
824 | 64.4k | Node *node; |
825 | | |
826 | 64.4k | *lastli = NULL; |
827 | 64.5k | for ( node = list->content; node ; node = node->next ) |
828 | 130 | if ( nodeIsLI(node) && node->type == StartTag ) |
829 | 2 | *lastli=node; |
830 | 64.4k | return *lastli ? yes:no; |
831 | 64.4k | } |
832 | | |
833 | | |
834 | | /***************************************************************************//* |
835 | | ** MARK: - Parser Stack |
836 | | ***************************************************************************/ |
837 | | |
838 | | |
839 | | /** |
840 | | * Allocates and initializes the parser's stack. |
841 | | */ |
842 | | void TY_(InitParserStack)( TidyDocImpl* doc ) |
843 | 402 | { |
844 | 402 | enum { default_size = 32 }; |
845 | 402 | TidyParserMemory *content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * default_size ); |
846 | | |
847 | 402 | doc->stack.content = content; |
848 | 402 | doc->stack.size = default_size; |
849 | 402 | doc->stack.top = -1; |
850 | 402 | } |
851 | | |
852 | | |
853 | | /** |
854 | | * Frees the parser's stack when done. |
855 | | */ |
856 | | void TY_(FreeParserStack)( TidyDocImpl* doc ) |
857 | 402 | { |
858 | 402 | TidyFree( doc->allocator, doc->stack.content ); |
859 | | |
860 | 402 | doc->stack.content = NULL; |
861 | 402 | doc->stack.size = 0; |
862 | 402 | doc->stack.top = -1; |
863 | 402 | } |
864 | | |
865 | | |
866 | | /** |
867 | | * Increase the stack size. |
868 | | */ |
869 | | static void growParserStack( TidyDocImpl* doc ) |
870 | 584 | { |
871 | 584 | TidyParserMemory *content; |
872 | 584 | content = (TidyParserMemory *) TidyAlloc( doc->allocator, sizeof(TidyParserMemory) * doc->stack.size * 2 ); |
873 | | |
874 | 584 | memcpy( content, doc->stack.content, sizeof(TidyParserMemory) * (doc->stack.top + 1) ); |
875 | 584 | TidyFree(doc->allocator, doc->stack.content); |
876 | | |
877 | 584 | doc->stack.content = content; |
878 | 584 | doc->stack.size = doc->stack.size * 2; |
879 | 584 | } |
880 | | |
881 | | |
882 | | /** |
883 | | * Indicates whether or not the stack is empty. |
884 | | */ |
885 | | Bool TY_(isEmptyParserStack)( TidyDocImpl* doc ) |
886 | 1.00M | { |
887 | 1.00M | return doc->stack.top < 0; |
888 | 1.00M | } |
889 | | |
890 | | |
891 | | /** |
892 | | * Peek at the parser memory. |
893 | | */ |
894 | | TidyParserMemory TY_(peekMemory)( TidyDocImpl* doc ) |
895 | 0 | { |
896 | 0 | return doc->stack.content[doc->stack.top]; |
897 | 0 | } |
898 | | |
899 | | |
900 | | /** |
901 | | * Peek at the parser memory "identity" field. This is just a convenience |
902 | | * to avoid having to create a new struct instance in the caller. |
903 | | */ |
904 | | Parser* TY_(peekMemoryIdentity)( TidyDocImpl* doc ) |
905 | 501k | { |
906 | 501k | return doc->stack.content[doc->stack.top].identity; |
907 | 501k | } |
908 | | |
909 | | |
910 | | /** |
911 | | * Peek at the parser memory "mode" field. This is just a convenience |
912 | | * to avoid having to create a new struct instance in the caller. |
913 | | */ |
914 | | GetTokenMode TY_(peekMemoryMode)( TidyDocImpl* doc ) |
915 | 1.10k | { |
916 | 1.10k | return doc->stack.content[doc->stack.top].mode; |
917 | 1.10k | } |
918 | | |
919 | | |
920 | | /** |
921 | | * Pop out a parser memory. |
922 | | */ |
923 | | TidyParserMemory TY_(popMemory)( TidyDocImpl* doc ) |
924 | 501k | { |
925 | 501k | if ( !TY_(isEmptyParserStack)( doc ) ) |
926 | 501k | { |
927 | 501k | TidyParserMemory data = doc->stack.content[doc->stack.top]; |
928 | 501k | DEBUG_LOG(SPRTF("\n" |
929 | 501k | "<--POP original: %s @ %p\n" |
930 | 501k | " reentry: %s @ %p\n" |
931 | 501k | " stack depth: %lu @ %p\n" |
932 | 501k | " mode: %u\n" |
933 | 501k | " register 1: %i\n" |
934 | 501k | " register 2: %i\n\n", |
935 | 501k | data.original_node ? data.original_node->element : "none", data.original_node, |
936 | 501k | data.reentry_node ? data.reentry_node->element : "none", data.reentry_node, |
937 | 501k | doc->stack.top, &doc->stack.content[doc->stack.top], |
938 | 501k | data.mode, |
939 | 501k | data.register_1, |
940 | 501k | data.register_2 |
941 | 501k | )); |
942 | 501k | doc->stack.top = doc->stack.top - 1; |
943 | 501k | return data; |
944 | 501k | } |
945 | 0 | TidyParserMemory blank = { NULL }; |
946 | 0 | return blank; |
947 | 501k | } |
948 | | |
949 | | |
950 | | /** |
951 | | * Push the parser memory to the stack. |
952 | | */ |
953 | | void TY_(pushMemory)( TidyDocImpl* doc, TidyParserMemory data ) |
954 | 549k | { |
955 | 549k | if ( doc->stack.top == doc->stack.size - 1 ) |
956 | 584 | growParserStack( doc ); |
957 | | |
958 | 549k | doc->stack.top++; |
959 | | |
960 | 549k | doc->stack.content[doc->stack.top] = data; |
961 | 549k | DEBUG_LOG(SPRTF("\n" |
962 | 549k | "-->PUSH original: %s @ %p\n" |
963 | 549k | " reentry: %s @ %p\n" |
964 | 549k | " stack depth: %lu @ %p\n" |
965 | 549k | " mode: %u\n" |
966 | 549k | " register 1: %i\n" |
967 | 549k | " register 2: %i\n\n", |
968 | 549k | data.original_node ? data.original_node->element : "none", data.original_node, |
969 | 549k | data.reentry_node ? data.reentry_node->element : "none", data.reentry_node, |
970 | 549k | doc->stack.top, &doc->stack.content[doc->stack.top], |
971 | 549k | data.mode, |
972 | 549k | data.register_1, |
973 | 549k | data.register_2 |
974 | 549k | )); |
975 | 549k | } |
976 | | |
977 | | |
978 | | /***************************************************************************//* |
979 | | ** MARK: Convenience Logging Macros |
980 | | ***************************************************************************/ |
981 | | |
982 | | |
/*
 * Tracing helpers used inside the parsers. When ENABLE_DEBUG_LOG is defined
 * they emit messages through SPRTF using printf-style format strings; when
 * it is not defined every macro expands to nothing.
 *
 * The macros refer to a local variable named `mode` in the function that
 * uses them, and to the counters declared by DEBUG_LOG_COUNTERS.
 *
 * Each enabled macro carries its own trailing semicolon, so a use site may
 * be written with or without one.
 *
 * Macro parameters are parenthesized, a NULL node is never handed to `%s`,
 * and arguments printed with `%u` are cast to unsigned so that they match
 * the conversion specifier.
 */
#if defined(ENABLE_DEBUG_LOG)
#  define DEBUG_LOG_COUNTERS \
     static int depth_parser = 0;\
     static int count_parser = 0;\
     int old_mode = IgnoreWhitespace;
#  define DEBUG_LOG_GET_OLD_MODE old_mode = mode;
#  define DEBUG_LOG_REENTER_WITH_NODE(NODE) SPRTF("\n>>>Re-Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, (unsigned)__LINE__, (NODE)->element, (unsigned)mode, ++depth_parser, ++count_parser);
#  define DEBUG_LOG_ENTER_WITH_NODE(NODE) SPRTF("\n>>>Enter %s-%u with '%s', +++mode: %u, depth: %d, cnt: %d\n", __FUNCTION__, (unsigned)__LINE__, (NODE)->element, (unsigned)mode, ++depth_parser, ++count_parser);
#  define DEBUG_LOG_CHANGE_MODE SPRTF("+++%s-%u Changing mode to %u (was %u)\n", __FUNCTION__, (unsigned)__LINE__, (unsigned)mode, (unsigned)old_mode);
#  define DEBUG_LOG_GOT_TOKEN(NODE) SPRTF("---%s-%u got token '%s' with mode '%u'.\n", __FUNCTION__, (unsigned)__LINE__, (NODE) ? (NODE)->element : "(null)", (unsigned)mode);
#  define DEBUG_LOG_EXIT_WITH_NODE(NODE) SPRTF("<<<Exit %s-%u with a node to parse: '%s', depth: %d\n", __FUNCTION__, (unsigned)__LINE__, (NODE)->element, depth_parser--);
#  define DEBUG_LOG_EXIT SPRTF("<<<Exit %s-%u, depth: %d\n", __FUNCTION__, (unsigned)__LINE__, depth_parser--);
#else
#  define DEBUG_LOG_COUNTERS
#  define DEBUG_LOG_GET_OLD_MODE
#  define DEBUG_LOG_REENTER_WITH_NODE(NODE)
#  define DEBUG_LOG_ENTER_WITH_NODE(NODE)
#  define DEBUG_LOG_CHANGE_MODE
#  define DEBUG_LOG_GOT_TOKEN(NODE)
#  define DEBUG_LOG_EXIT_WITH_NODE(NODE)
#  define DEBUG_LOG_EXIT
#endif
1005 | | |
1006 | | |
1007 | | /***************************************************************************//* |
1008 | | ** MARK: - Parser Search and Instantiation |
1009 | | ***************************************************************************/ |
1010 | | |
1011 | | |
1012 | | /** |
1013 | | * Retrieves the correct parser for the given node, accounting for various |
1014 | | * conditions, and readies the lexer for parsing that node. |
1015 | | */ |
1016 | | static Parser* GetParserForNode( TidyDocImpl* doc, Node *node ) |
1017 | 553k | { |
1018 | 553k | Lexer* lexer = doc->lexer; |
1019 | | |
1020 | 553k | if ( cfgBool( doc, TidyXmlTags ) ) |
1021 | 27.4k | return ParseXMLElement; |
1022 | | |
1023 | | /* [i_a]2 prevent crash for active content (php, asp) docs */ |
1024 | 526k | if (!node || node->tag == NULL) |
1025 | 4.45k | return NULL; |
1026 | | |
1027 | | /* |
1028 | | Fix by GLP 2000-12-21. Need to reset insertspace if this is both |
1029 | | a non-inline and empty tag (base, link, meta, isindex, hr, area). |
1030 | | */ |
1031 | 521k | if (node->tag->model & CM_EMPTY) |
1032 | 7.44k | { |
1033 | 7.44k | lexer->waswhite = no; |
1034 | 7.44k | if (node->tag->parser == NULL) |
1035 | 0 | return NULL; |
1036 | 7.44k | } |
1037 | 514k | else if (!(node->tag->model & CM_INLINE)) |
1038 | 208k | lexer->insertspace = no; |
1039 | | |
1040 | 521k | if (node->tag->parser == NULL) |
1041 | 0 | return NULL; |
1042 | | |
1043 | 521k | if (node->type == StartEndTag) |
1044 | 5.72k | return NULL; |
1045 | | |
1046 | | /* [i_a]2 added this - not sure why - CHECKME: */ |
1047 | 515k | lexer->parent = node; |
1048 | | |
1049 | 515k | return (node->tag->parser); |
1050 | 521k | } |
1051 | | |
1052 | | |
1053 | | /** |
1054 | | * This parser controller initiates the parsing process with the document's |
1055 | | * root starting with the provided node, which should be the HTML node after |
1056 | | * the pre-HTML stuff is handled at a higher level. |
1057 | | * |
1058 | | * This controller is responsible for calling each of the individual parsers, |
1059 | | * based on the tokens it pulls from the lexer, or the tokens passed back via |
1060 | | * the parserMemory stack from each of the parsers. Having a main, central |
1061 | | * looping dispatcher in this fashion allows the prevention of recursion. |
1062 | | */ |
1063 | | void ParseHTMLWithNode( TidyDocImpl* doc, Node* node ) |
1064 | 413 | { |
1065 | 413 | GetTokenMode mode = IgnoreWhitespace; |
1066 | 413 | Parser* parser = GetParserForNode( doc, node ); |
1067 | 413 | Bool something_to_do = yes; |
1068 | | |
1069 | | /* |
1070 | | This main loop is only extinguished when all of the parser tokens are |
1071 | | consumed. Ideally, EVERY parser will return nodes to this loop for |
1072 | | dispatch to the appropriate parser, but some of the recursive parsers |
1073 | | still consume some tokens on their own. |
1074 | | */ |
1075 | 1.05M | while (something_to_do) |
1076 | 1.05M | { |
1077 | 1.05M | node = parser ? parser( doc, node, mode ) : NULL; |
1078 | | |
1079 | | /* |
1080 | | We have a node, so anything deferred was already pushed to the stack |
1081 | | to be dealt with later. |
1082 | | */ |
1083 | 1.05M | if ( node ) |
1084 | 548k | { |
1085 | 548k | parser = GetParserForNode( doc, node ); |
1086 | 548k | continue; |
1087 | 548k | } |
1088 | | |
1089 | | /* |
1090 | | We weren't given a node, which means this particular leaf is bottomed |
1091 | | out. We'll re-enter the parsers using information from the stack. |
1092 | | */ |
1093 | 504k | if ( !TY_(isEmptyParserStack)(doc)) |
1094 | 501k | { |
1095 | 501k | parser = TY_(peekMemoryIdentity)(doc); |
1096 | 501k | if (parser) |
1097 | 499k | { |
1098 | 499k | continue; |
1099 | 499k | } |
1100 | 1.10k | else |
1101 | 1.10k | { |
1102 | | /* No parser means we're only passing back a parsing mode. */ |
1103 | 1.10k | mode = TY_(peekMemoryMode)( doc ); |
1104 | 1.10k | TY_(popMemory)( doc ); |
1105 | 1.10k | } |
1106 | 501k | } |
1107 | | |
1108 | | /* |
1109 | | At this point, there's nothing being returned from parsers, and |
1110 | | nothing on the stack, so we can draw a new node from the lexer. |
1111 | | */ |
1112 | 4.63k | node = TY_(GetToken)( doc, mode ); |
1113 | 4.63k | DEBUG_LOG_GOT_TOKEN(node); |
1114 | | |
1115 | 4.63k | if (node) |
1116 | 4.22k | parser = GetParserForNode( doc, node ); |
1117 | 413 | else |
1118 | 413 | something_to_do = no; |
1119 | 4.63k | } |
1120 | 413 | } |
1121 | | |
1122 | | |
1123 | | /***************************************************************************//* |
1124 | | ** MARK: - Parsers |
1125 | | ***************************************************************************/ |
1126 | | |
1127 | | |
1128 | | /** MARK: TY_(ParseBlock) |
1129 | | * `element` is a node created by the lexer upon seeing the start tag, or |
1130 | | * by the parser when the start tag is inferred |
1131 | | * |
1132 | | * This is a non-recursing parser. It uses the document's parser memory stack |
1133 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
1134 | | * This parser is also re-enterable, so that post-processing can occur after |
1135 | | * such dispatching. |
1136 | | */ |
1137 | | Node* TY_(ParseBlock)( TidyDocImpl* doc, Node *element, GetTokenMode mode ) |
1138 | 35.0k | { |
1139 | 35.0k | Lexer* lexer = doc->lexer; |
1140 | 35.0k | Node *node = NULL; |
1141 | 35.0k | Bool checkstack = yes; |
1142 | 35.0k | uint istackbase = 0; |
1143 | 35.0k | DEBUG_LOG_COUNTERS; |
1144 | | |
1145 | 35.0k | if ( element == NULL ) |
1146 | 18.6k | { |
1147 | 18.6k | TidyParserMemory memory = TY_(popMemory)( doc ); |
1148 | 18.6k | node = memory.reentry_node; /* Throwaway, because the loop overwrites this immediately. */ |
1149 | 18.6k | DEBUG_LOG_REENTER_WITH_NODE(node); |
1150 | 18.6k | element = memory.original_node; |
1151 | 18.6k | DEBUG_LOG_GET_OLD_MODE; |
1152 | 18.6k | mode = memory.reentry_mode; |
1153 | 18.6k | DEBUG_LOG_CHANGE_MODE; |
1154 | 18.6k | } |
1155 | 16.4k | else |
1156 | 16.4k | { |
1157 | 16.4k | DEBUG_LOG_ENTER_WITH_NODE(element); |
1158 | | |
1159 | 16.4k | if ( element->tag->model & CM_EMPTY ) |
1160 | 0 | { |
1161 | 0 | DEBUG_LOG_EXIT; |
1162 | 0 | return NULL; |
1163 | 0 | } |
1164 | | |
1165 | 16.4k | if ( nodeIsDIV(element) && nodeIsDL(element->parent) && TY_(IsHTML5Mode)(doc) ) |
1166 | 229 | { |
1167 | 229 | DEBUG_LOG_EXIT; |
1168 | 229 | return TY_(ParseDefList)(doc, element, mode); /* @warning: possible recursion! */ |
1169 | 229 | } |
1170 | | |
1171 | 16.1k | if ( nodeIsFORM(element) && DescendantOf(element, TidyTag_FORM) ) |
1172 | 548 | { |
1173 | 548 | TY_(Report)(doc, element, NULL, ILLEGAL_NESTING ); |
1174 | 548 | } |
1175 | | |
1176 | | /* |
1177 | | InlineDup() asks the lexer to insert inline emphasis tags |
1178 | | currently pushed on the istack, but take care to avoid |
1179 | | propagating inline emphasis inside OBJECT or APPLET. |
1180 | | For these elements a fresh inline stack context is created |
1181 | | and disposed of upon reaching the end of the element. |
1182 | | They thus behave like table cells in this respect. |
1183 | | */ |
1184 | 16.1k | if (element->tag->model & CM_OBJECT) |
1185 | 2.50k | { |
1186 | 2.50k | istackbase = lexer->istackbase; |
1187 | 2.50k | lexer->istackbase = lexer->istacksize; |
1188 | 2.50k | } |
1189 | | |
1190 | 16.1k | if (!(element->tag->model & CM_MIXED)) |
1191 | 15.0k | { |
1192 | 15.0k | TY_(InlineDup)( doc, NULL ); |
1193 | 15.0k | } |
1194 | | |
1195 | | /*\ |
1196 | | * Issue #212 - If it is likely that it may be necessary |
1197 | | * to move a leading space into a text node before this |
1198 | | * element, then keep the mode MixedContent to keep any |
1199 | | * leading space |
1200 | | \*/ |
1201 | 16.1k | if ( !(element->tag->model & CM_INLINE) || |
1202 | 7.73k | (element->tag->model & CM_FIELD ) ) |
1203 | 8.45k | { |
1204 | 8.45k | DEBUG_LOG_GET_OLD_MODE; |
1205 | 8.45k | mode = IgnoreWhitespace; |
1206 | 8.45k | DEBUG_LOG_CHANGE_MODE; |
1207 | 8.45k | } |
1208 | 7.73k | else if (mode == IgnoreWhitespace) |
1209 | 7.73k | { |
1210 | | /* Issue #212 - Further fix in case ParseBlock() is called with 'IgnoreWhitespace' |
1211 | | when such a leading space may need to be inserted before this element to |
1212 | | preserve the browser view */ |
1213 | 7.73k | DEBUG_LOG_GET_OLD_MODE; |
1214 | 7.73k | mode = MixedContent; |
1215 | 7.73k | DEBUG_LOG_CHANGE_MODE; |
1216 | 7.73k | } |
1217 | 16.1k | } /* Re-Entering */ |
1218 | | |
1219 | | /* |
1220 | | Main Loop |
1221 | | */ |
1222 | | |
1223 | 49.3k | while ((node = TY_(GetToken)(doc, mode /*MixedContent*/)) != NULL) |
1224 | 43.1k | { |
1225 | 43.1k | DEBUG_LOG_GOT_TOKEN(node); |
1226 | | /* end tag for this element */ |
1227 | 43.1k | if (node->type == EndTag && node->tag && |
1228 | 6.44k | (node->tag == element->tag || element->was == node->tag)) |
1229 | 3.07k | { |
1230 | 3.07k | TY_(FreeNode)( doc, node ); |
1231 | | |
1232 | 3.07k | if (element->tag->model & CM_OBJECT) |
1233 | 1.38k | { |
1234 | | /* pop inline stack */ |
1235 | 3.52k | while (lexer->istacksize > lexer->istackbase) |
1236 | 2.14k | TY_(PopInline)( doc, NULL ); |
1237 | 1.38k | lexer->istackbase = istackbase; |
1238 | 1.38k | } |
1239 | | |
1240 | 3.07k | element->closed = yes; |
1241 | 3.07k | TrimSpaces( doc, element ); |
1242 | 3.07k | DEBUG_LOG_EXIT; |
1243 | 3.07k | return NULL; |
1244 | 3.07k | } |
1245 | | |
1246 | 40.0k | if ( nodeIsHTML(node) || nodeIsHEAD(node) || nodeIsBODY(node) ) |
1247 | 639 | { |
1248 | 639 | if ( TY_(nodeIsElement)(node) ) |
1249 | 400 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1250 | 639 | TY_(FreeNode)( doc, node ); |
1251 | 639 | continue; |
1252 | 639 | } |
1253 | | |
1254 | | |
1255 | 39.4k | if (node->type == EndTag) |
1256 | 3.41k | { |
1257 | 3.41k | if (node->tag == NULL) |
1258 | 286 | { |
1259 | 286 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1260 | 286 | TY_(FreeNode)( doc, node ); |
1261 | 286 | continue; |
1262 | 286 | } |
1263 | 3.12k | else if ( nodeIsBR(node) ) |
1264 | 0 | { |
1265 | 0 | node->type = StartTag; |
1266 | 0 | } |
1267 | 3.12k | else if ( nodeIsP(node) ) |
1268 | 192 | { |
1269 | | /* Cannot have a block inside a paragraph, so no checking |
1270 | | for an ancestor is necessary -- but we _can_ have |
1271 | | paragraphs inside a block, so change it to an implicit |
1272 | | empty paragraph, to be dealt with according to the user's |
1273 | | options |
1274 | | */ |
1275 | 192 | node->type = StartEndTag; |
1276 | 192 | node->implicit = yes; |
1277 | 192 | } |
1278 | 2.93k | else if (DescendantOf( element, node->tag->id )) |
1279 | 544 | { |
1280 | | /* |
1281 | | if this is the end tag for an ancestor element |
1282 | | then infer end tag for this element |
1283 | | */ |
1284 | 544 | TY_(UngetToken)( doc ); |
1285 | 544 | break; |
1286 | 544 | } |
1287 | 2.39k | else |
1288 | 2.39k | { |
1289 | | /* special case </tr> etc. for stuff moved in front of table */ |
1290 | 2.39k | if ( lexer->exiled |
1291 | 849 | && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) |
1292 | 545 | { |
1293 | 545 | TY_(UngetToken)( doc ); |
1294 | 545 | TrimSpaces( doc, element ); |
1295 | 545 | DEBUG_LOG_EXIT; |
1296 | 545 | return NULL; |
1297 | 545 | } |
1298 | 2.39k | } |
1299 | 3.41k | } |
1300 | | |
1301 | | /* mixed content model permits text */ |
1302 | 38.0k | if (TY_(nodeIsText)(node)) |
1303 | 5.26k | { |
1304 | 5.26k | if ( checkstack ) |
1305 | 3.11k | { |
1306 | 3.11k | checkstack = no; |
1307 | 3.11k | if (!(element->tag->model & CM_MIXED)) |
1308 | 2.74k | { |
1309 | 2.74k | if ( TY_(InlineDup)(doc, node) > 0 ) |
1310 | 647 | continue; |
1311 | 2.74k | } |
1312 | 3.11k | } |
1313 | | |
1314 | 4.62k | TY_(InsertNodeAtEnd)(element, node); |
1315 | 4.62k | DEBUG_LOG_GET_OLD_MODE |
1316 | 4.62k | mode = MixedContent; |
1317 | 4.62k | DEBUG_LOG_CHANGE_MODE; |
1318 | | /* |
1319 | | HTML4 strict doesn't allow mixed content for |
1320 | | elements with %block; as their content model |
1321 | | */ |
1322 | | /* |
1323 | | But only body, map, blockquote, form and |
1324 | | noscript have content model %block; |
1325 | | */ |
1326 | 4.62k | if ( nodeIsBODY(element) || |
1327 | 4.62k | nodeIsMAP(element) || |
1328 | 4.62k | nodeIsBLOCKQUOTE(element) || |
1329 | 4.62k | nodeIsFORM(element) || |
1330 | 4.34k | nodeIsNOSCRIPT(element) ) |
1331 | 273 | TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT ); |
1332 | 4.62k | continue; |
1333 | 5.26k | } |
1334 | | |
1335 | 32.8k | if ( InsertMisc(element, node) ) |
1336 | 610 | continue; |
1337 | | |
1338 | | /* allow PARAM elements? */ |
1339 | 32.1k | if ( nodeIsPARAM(node) ) |
1340 | 584 | { |
1341 | 584 | if ( TY_(nodeHasCM)(element, CM_PARAM) && TY_(nodeIsElement)(node) ) |
1342 | 584 | { |
1343 | 584 | TY_(InsertNodeAtEnd)(element, node); |
1344 | 584 | continue; |
1345 | 584 | } |
1346 | | |
1347 | | /* otherwise discard it */ |
1348 | 0 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1349 | 0 | TY_(FreeNode)( doc, node ); |
1350 | 0 | continue; |
1351 | 584 | } |
1352 | | |
1353 | | /* allow AREA elements? */ |
1354 | 31.6k | if ( nodeIsAREA(node) ) |
1355 | 0 | { |
1356 | 0 | if ( nodeIsMAP(element) && TY_(nodeIsElement)(node) ) |
1357 | 0 | { |
1358 | 0 | TY_(InsertNodeAtEnd)(element, node); |
1359 | 0 | continue; |
1360 | 0 | } |
1361 | | |
1362 | | /* otherwise discard it */ |
1363 | 0 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1364 | 0 | TY_(FreeNode)( doc, node ); |
1365 | 0 | continue; |
1366 | 0 | } |
1367 | | |
1368 | | /* ignore unknown start/end tags */ |
1369 | 31.6k | if ( node->tag == NULL ) |
1370 | 3.78k | { |
1371 | 3.78k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1372 | 3.78k | TY_(FreeNode)( doc, node ); |
1373 | 3.78k | continue; |
1374 | 3.78k | } |
1375 | | |
1376 | | /* |
1377 | | Allow CM_INLINE elements here. |
1378 | | |
1379 | | Allow CM_BLOCK elements here unless |
1380 | | lexer->excludeBlocks is yes. |
1381 | | |
1382 | | LI and DD are special cased. |
1383 | | |
1384 | | Otherwise infer end tag for this element. |
1385 | | */ |
1386 | | |
1387 | 27.8k | if ( !TY_(nodeHasCM)(node, CM_INLINE) ) |
1388 | 13.9k | { |
1389 | 13.9k | if ( !TY_(nodeIsElement)(node) ) |
1390 | 991 | { |
1391 | 991 | if ( nodeIsFORM(node) ) |
1392 | 258 | BadForm( doc ); |
1393 | | |
1394 | 991 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1395 | 991 | TY_(FreeNode)( doc, node ); |
1396 | 991 | continue; |
1397 | 991 | } |
1398 | | |
1399 | | /* #427671 - Fix by Randy Waki - 10 Aug 00 */ |
1400 | | /* |
1401 | | If an LI contains an illegal FRAME, FRAMESET, OPTGROUP, or OPTION |
1402 | | start tag, discard the start tag and let the subsequent content get |
1403 | | parsed as content of the enclosing LI. This seems to mimic IE and |
1404 | | Netscape, and avoids an infinite loop: without this check, |
1405 | | ParseBlock (which is parsing the LI's content) and ParseList (which |
1406 | | is parsing the LI's parent's content) repeatedly defer to each |
1407 | | other to parse the illegal start tag, each time inferring a missing |
1408 | | </li> or <li> respectively. |
1409 | | |
1410 | | NOTE: This check is a bit fragile. It specifically checks for the |
1411 | | four tags that happen to weave their way through the current series |
1412 | | of tests performed by ParseBlock and ParseList to trigger the |
1413 | | infinite loop. |
1414 | | */ |
1415 | 12.9k | if ( nodeIsLI(element) ) |
1416 | 1.90k | { |
1417 | 1.90k | if ( nodeIsFRAME(node) || |
1418 | 1.90k | nodeIsFRAMESET(node) || |
1419 | 1.90k | nodeIsOPTGROUP(node) || |
1420 | 1.45k | nodeIsOPTION(node) ) |
1421 | 451 | { |
1422 | 451 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1423 | 451 | TY_(FreeNode)( doc, node ); /* DSR - 27Apr02 avoid memory leak */ |
1424 | 451 | continue; |
1425 | 451 | } |
1426 | 1.90k | } |
1427 | | |
1428 | 12.5k | if ( nodeIsTD(element) || nodeIsTH(element) ) |
1429 | 806 | { |
1430 | | /* if parent is a table cell, avoid inferring the end of the cell */ |
1431 | | |
1432 | 806 | if ( TY_(nodeHasCM)(node, CM_HEAD) ) |
1433 | 61 | { |
1434 | 61 | MoveToHead( doc, element, node ); |
1435 | 61 | continue; |
1436 | 61 | } |
1437 | | |
1438 | 745 | if ( TY_(nodeHasCM)(node, CM_LIST) ) |
1439 | 13 | { |
1440 | 13 | TY_(UngetToken)( doc ); |
1441 | 13 | node = TY_(InferredTag)(doc, TidyTag_UL); |
1442 | 13 | AddClassNoIndent(doc, node); |
1443 | 13 | lexer->excludeBlocks = yes; |
1444 | 13 | } |
1445 | 732 | else if ( TY_(nodeHasCM)(node, CM_DEFLIST) ) |
1446 | 15 | { |
1447 | 15 | TY_(UngetToken)( doc ); |
1448 | 15 | node = TY_(InferredTag)(doc, TidyTag_DL); |
1449 | 15 | lexer->excludeBlocks = yes; |
1450 | 15 | } |
1451 | | |
1452 | | /* infer end of current table cell */ |
1453 | 745 | if ( !TY_(nodeHasCM)(node, CM_BLOCK) ) |
1454 | 503 | { |
1455 | 503 | TY_(UngetToken)( doc ); |
1456 | 503 | TrimSpaces( doc, element ); |
1457 | 503 | DEBUG_LOG_EXIT; |
1458 | 503 | return NULL; |
1459 | 503 | } |
1460 | 745 | } |
1461 | 11.7k | else if ( TY_(nodeHasCM)(node, CM_BLOCK) ) |
1462 | 7.35k | { |
1463 | 7.35k | if ( lexer->excludeBlocks ) |
1464 | 1.20k | { |
1465 | 1.20k | if ( !TY_(nodeHasCM)(element, CM_OPT) ) |
1466 | 624 | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); |
1467 | | |
1468 | 1.20k | TY_(UngetToken)( doc ); |
1469 | | |
1470 | 1.20k | if ( TY_(nodeHasCM)(element, CM_OBJECT) ) |
1471 | 6 | lexer->istackbase = istackbase; |
1472 | | |
1473 | 1.20k | TrimSpaces( doc, element ); |
1474 | 1.20k | DEBUG_LOG_EXIT; |
1475 | 1.20k | return NULL; |
1476 | 1.20k | } |
1477 | 7.35k | } |
1478 | 4.36k | else if ( ! nodeIsTEMPLATE( element ) )/* things like list items */ |
1479 | 4.36k | { |
1480 | 4.36k | if (node->tag->model & CM_HEAD) |
1481 | 12 | { |
1482 | 12 | MoveToHead( doc, element, node ); |
1483 | 12 | continue; |
1484 | 12 | } |
1485 | | |
1486 | | /* |
1487 | | special case where a form start tag |
1488 | | occurs in a tr and is followed by td or th |
1489 | | */ |
1490 | | |
1491 | 4.35k | if ( nodeIsFORM(element) && |
1492 | 4.35k | nodeIsTD(element->parent) && |
1493 | 0 | element->parent->implicit ) |
1494 | 0 | { |
1495 | 0 | if ( nodeIsTD(node) ) |
1496 | 0 | { |
1497 | 0 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1498 | 0 | TY_(FreeNode)( doc, node ); |
1499 | 0 | continue; |
1500 | 0 | } |
1501 | | |
1502 | 0 | if ( nodeIsTH(node) ) |
1503 | 0 | { |
1504 | 0 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1505 | 0 | TY_(FreeNode)( doc, node ); |
1506 | 0 | node = element->parent; |
1507 | 0 | TidyDocFree(doc, node->element); |
1508 | 0 | node->element = TY_(tmbstrdup)(doc->allocator, "th"); |
1509 | 0 | node->tag = TY_(LookupTagDef)( TidyTag_TH ); |
1510 | 0 | continue; |
1511 | 0 | } |
1512 | 0 | } |
1513 | | |
1514 | 4.35k | if ( !TY_(nodeHasCM)(element, CM_OPT) && !element->implicit ) |
1515 | 2.60k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); |
1516 | | |
1517 | | /* #521, warn on missing optional end-tags if not omitting them. */ |
1518 | 4.35k | if ( cfgBool( doc, TidyOmitOptionalTags ) == no && TY_(nodeHasCM)(element, CM_OPT) ) |
1519 | 1.60k | TY_(Report)(doc, element, node, MISSING_ENDTAG_OPTIONAL ); |
1520 | | |
1521 | | |
1522 | 4.35k | TY_(UngetToken)( doc ); |
1523 | | |
1524 | 4.35k | if ( TY_(nodeHasCM)(node, CM_LIST) ) |
1525 | 290 | { |
1526 | 290 | if ( element->parent && element->parent->tag && |
1527 | 289 | element->parent->tag->parser == TY_(ParseList) ) |
1528 | 286 | { |
1529 | 286 | TrimSpaces( doc, element ); |
1530 | 286 | DEBUG_LOG_EXIT; |
1531 | 286 | return NULL; |
1532 | 286 | } |
1533 | | |
1534 | 4 | node = TY_(InferredTag)(doc, TidyTag_UL); |
1535 | 4 | AddClassNoIndent(doc, node); |
1536 | 4 | } |
1537 | 4.06k | else if ( TY_(nodeHasCM)(node, CM_DEFLIST) ) |
1538 | 512 | { |
1539 | 512 | if ( nodeIsDL(element->parent) ) |
1540 | 17 | { |
1541 | 17 | TrimSpaces( doc, element ); |
1542 | 17 | DEBUG_LOG_EXIT; |
1543 | 17 | return NULL; |
1544 | 17 | } |
1545 | | |
1546 | 495 | node = TY_(InferredTag)(doc, TidyTag_DL); |
1547 | 495 | } |
1548 | 3.54k | else if ( TY_(nodeHasCM)(node, CM_TABLE) || TY_(nodeHasCM)(node, CM_ROW) ) |
1549 | 2.12k | { |
1550 | | /* http://tidy.sf.net/issue/1316307 */ |
1551 | | /* In exiled mode, return so table processing can |
1552 | | continue. */ |
1553 | 2.12k | if (lexer->exiled) |
1554 | 1.32k | { |
1555 | 1.32k | DEBUG_LOG_EXIT; |
1556 | 1.32k | return NULL; |
1557 | 1.32k | } |
1558 | 803 | node = TY_(InferredTag)(doc, TidyTag_TABLE); |
1559 | 803 | } |
1560 | 1.42k | else if ( TY_(nodeHasCM)(element, CM_OBJECT) ) |
1561 | 65 | { |
1562 | | /* pop inline stack */ |
1563 | 899 | while ( lexer->istacksize > lexer->istackbase ) |
1564 | 834 | TY_(PopInline)( doc, NULL ); |
1565 | 65 | lexer->istackbase = istackbase; |
1566 | 65 | TrimSpaces( doc, element ); |
1567 | 65 | DEBUG_LOG_EXIT; |
1568 | 65 | return NULL; |
1569 | | |
1570 | 65 | } |
1571 | 1.36k | else |
1572 | 1.36k | { |
1573 | 1.36k | TrimSpaces( doc, element ); |
1574 | 1.36k | DEBUG_LOG_EXIT; |
1575 | 1.36k | return NULL; |
1576 | 1.36k | } |
1577 | 4.35k | } |
1578 | 12.5k | } |
1579 | | |
1580 | | /*\ |
1581 | | * Issue #307 - an <A> tag to ends any open <A> element |
1582 | | * Like #427827 - fixed by Randy Waki and Bjoern Hoehrmann 23 Aug 00 |
1583 | | * in ParseInline(), fix copied HERE to ParseBlock() |
1584 | | * href: http://www.w3.org/TR/html-markup/a.html |
1585 | | * The interactive element a must not appear as a descendant of the a element. |
1586 | | \*/ |
1587 | 21.5k | if ( nodeIsA(node) && !node->implicit && |
1588 | 2.94k | (nodeIsA(element) || DescendantOf(element, TidyTag_A)) ) |
1589 | 1.27k | { |
1590 | 1.27k | if (node->type != EndTag && node->attributes == NULL |
1591 | 912 | && cfgBool(doc, TidyCoerceEndTags) ) |
1592 | 912 | { |
1593 | 912 | node->type = EndTag; |
1594 | 912 | TY_(Report)(doc, element, node, COERCE_TO_ENDTAG); |
1595 | 912 | TY_(UngetToken)( doc ); |
1596 | 912 | continue; |
1597 | 912 | } |
1598 | | |
1599 | 361 | if (nodeIsA(element)) |
1600 | 46 | { |
1601 | 46 | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); |
1602 | 46 | TY_(UngetToken)( doc ); |
1603 | 46 | } |
1604 | 315 | else |
1605 | 315 | { |
1606 | | /* Issue #597 - if we not 'UngetToken' then it is being discarded. |
1607 | | Add message, and 'FreeNode' - thanks @ralfjunker */ |
1608 | 315 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
1609 | 315 | TY_(FreeNode)(doc, node); |
1610 | 315 | } |
1611 | | |
1612 | 361 | if (!(mode & Preformatted)) |
1613 | 361 | TrimSpaces(doc, element); |
1614 | | |
1615 | 361 | DEBUG_LOG_EXIT; |
1616 | 361 | return NULL; |
1617 | 1.27k | } |
1618 | | |
1619 | | /* parse known element */ |
1620 | 20.2k | if (TY_(nodeIsElement)(node)) |
1621 | 19.4k | { |
1622 | 19.4k | if (node->tag->model & CM_INLINE) |
1623 | 11.7k | { |
1624 | 11.7k | if (checkstack && !node->implicit) |
1625 | 5.97k | { |
1626 | 5.97k | checkstack = no; |
1627 | | |
1628 | 5.97k | if (!(element->tag->model & CM_MIXED)) /* #431731 - fix by Randy Waki 25 Dec 00 */ |
1629 | 5.31k | { |
1630 | 5.31k | if ( TY_(InlineDup)(doc, node) > 0 ) |
1631 | 124 | continue; |
1632 | 5.31k | } |
1633 | 5.97k | } |
1634 | | |
1635 | 11.6k | DEBUG_LOG_GET_OLD_MODE; |
1636 | 11.6k | mode = MixedContent; |
1637 | 11.6k | DEBUG_LOG_CHANGE_MODE; |
1638 | 11.6k | } |
1639 | 7.69k | else |
1640 | 7.69k | { |
1641 | 7.69k | checkstack = yes; |
1642 | 7.69k | DEBUG_LOG_GET_OLD_MODE; |
1643 | 7.69k | mode = IgnoreWhitespace; |
1644 | 7.69k | DEBUG_LOG_CHANGE_MODE; |
1645 | 7.69k | } |
1646 | | |
1647 | | /* trim white space before <br> */ |
1648 | 19.3k | if ( nodeIsBR(node) ) |
1649 | 4 | TrimSpaces( doc, element ); |
1650 | | |
1651 | 19.3k | TY_(InsertNodeAtEnd)(element, node); |
1652 | | |
1653 | 19.3k | if (node->implicit) |
1654 | 6.06k | TY_(Report)(doc, element, node, INSERTING_TAG ); |
1655 | | |
1656 | | /* Issue #212 - WHY is this hard coded to 'IgnoreWhitespace' while an |
1657 | | effort has been made above to set a 'MixedContent' mode in some cases? |
1658 | | WHY IS THE 'mode' VARIABLE NOT USED HERE???? */ |
1659 | | |
1660 | 19.3k | { |
1661 | 19.3k | TidyParserMemory memory = {0}; |
1662 | 19.3k | memory.identity = TY_(ParseBlock); |
1663 | 19.3k | memory.reentry_node = node; |
1664 | 19.3k | memory.reentry_mode = mode; |
1665 | 19.3k | memory.original_node = element; |
1666 | 19.3k | TY_(pushMemory)(doc, memory); |
1667 | 19.3k | DEBUG_LOG_EXIT_WITH_NODE(node); |
1668 | 19.3k | } |
1669 | 19.3k | return node; |
1670 | 19.4k | } |
1671 | | |
1672 | | /* discard unexpected tags */ |
1673 | 854 | if (node->type == EndTag) |
1674 | 854 | TY_(PopInline)( doc, node ); /* if inline end tag */ |
1675 | | |
1676 | 854 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
1677 | 854 | TY_(FreeNode)( doc, node ); |
1678 | 854 | continue; |
1679 | 20.2k | } |
1680 | | |
1681 | 6.77k | if (!(element->tag->model & CM_OPT)) |
1682 | 3.98k | TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR); |
1683 | | |
1684 | 6.77k | if (element->tag->model & CM_OBJECT) |
1685 | 913 | { |
1686 | | /* pop inline stack */ |
1687 | 1.34k | while ( lexer->istacksize > lexer->istackbase ) |
1688 | 433 | TY_(PopInline)( doc, NULL ); |
1689 | 913 | lexer->istackbase = istackbase; |
1690 | 913 | } |
1691 | | |
1692 | 6.77k | TrimSpaces( doc, element ); |
1693 | | |
1694 | 6.77k | DEBUG_LOG_EXIT; |
1695 | 6.77k | return NULL; |
1696 | 34.8k | } |
1697 | | |
1698 | | |
1699 | | /** MARK: TY_(ParseBody) |
1700 | | * Parses the `body` tag. |
1701 | | * |
1702 | | * This is a non-recursing parser. It uses the document's parser memory stack |
1703 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
1704 | | * This parser is also re-enterable, so that post-processing can occur after |
1705 | | * such dispatching. |
1706 | | */ |
1707 | | Node* TY_(ParseBody)( TidyDocImpl* doc, Node *body, GetTokenMode mode ) |
1708 | 33.7k | { |
1709 | 33.7k | Lexer* lexer = doc->lexer; |
1710 | 33.7k | Node *node = NULL; |
1711 | 33.7k | Bool checkstack = no; |
1712 | 33.7k | Bool iswhitenode = no; |
1713 | 33.7k | DEBUG_LOG_COUNTERS; |
1714 | | |
1715 | 33.7k | mode = IgnoreWhitespace; |
1716 | 33.7k | checkstack = yes; |
1717 | | |
1718 | | /* |
1719 | | If we're re-entering, then we need to setup from a previous state, |
1720 | | instead of starting fresh. We can pull what we need from the document's |
1721 | | stack. |
1722 | | */ |
1723 | 33.7k | if ( body == NULL ) |
1724 | 12.3k | { |
1725 | 12.3k | TidyParserMemory memory = TY_(popMemory)( doc ); |
1726 | 12.3k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
1727 | 12.3k | DEBUG_LOG_REENTER_WITH_NODE(node); |
1728 | 12.3k | body = memory.original_node; |
1729 | 12.3k | checkstack = memory.register_1; |
1730 | 12.3k | iswhitenode = memory.register_2; |
1731 | 12.3k | DEBUG_LOG_GET_OLD_MODE; |
1732 | 12.3k | mode = memory.mode; |
1733 | 12.3k | DEBUG_LOG_CHANGE_MODE; |
1734 | 12.3k | } |
1735 | 21.3k | else |
1736 | 21.3k | { |
1737 | 21.3k | DEBUG_LOG_ENTER_WITH_NODE(body); |
1738 | 21.3k | TY_(BumpObject)( doc, body->parent ); |
1739 | 21.3k | } |
1740 | | |
1741 | 55.6k | while ((node = TY_(GetToken)(doc, mode)) != NULL) |
1742 | 51.8k | { |
1743 | 51.8k | DEBUG_LOG_GOT_TOKEN(node); |
1744 | | /* find and discard multiple <body> elements */ |
1745 | 51.8k | if (node->tag == body->tag && node->type == StartTag) |
1746 | 1.04k | { |
1747 | 1.04k | TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED); |
1748 | 1.04k | TY_(FreeNode)(doc, node); |
1749 | 1.04k | continue; |
1750 | 1.04k | } |
1751 | | |
1752 | | /* #538536 Extra endtags not detected */ |
1753 | 50.8k | if ( nodeIsHTML(node) ) |
1754 | 351 | { |
1755 | 351 | if (TY_(nodeIsElement)(node) || lexer->seenEndHtml) |
1756 | 351 | TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED); |
1757 | 0 | else |
1758 | 0 | lexer->seenEndHtml = 1; |
1759 | | |
1760 | 351 | TY_(FreeNode)( doc, node); |
1761 | 351 | continue; |
1762 | 351 | } |
1763 | | |
1764 | 50.4k | if ( lexer->seenEndBody && |
1765 | 2.67k | ( node->type == StartTag || |
1766 | 1.28k | node->type == EndTag || |
1767 | 464 | node->type == StartEndTag ) ) |
1768 | 2.21k | { |
1769 | 2.21k | TY_(Report)(doc, body, node, CONTENT_AFTER_BODY ); |
1770 | 2.21k | } |
1771 | | |
1772 | 50.4k | if ( node->tag == body->tag && node->type == EndTag ) |
1773 | 643 | { |
1774 | 643 | body->closed = yes; |
1775 | 643 | TrimSpaces(doc, body); |
1776 | 643 | TY_(FreeNode)( doc, node); |
1777 | 643 | lexer->seenEndBody = 1; |
1778 | 643 | DEBUG_LOG_GET_OLD_MODE; |
1779 | 643 | mode = IgnoreWhitespace; |
1780 | 643 | DEBUG_LOG_CHANGE_MODE; |
1781 | | |
1782 | 643 | if ( nodeIsNOFRAMES(body->parent) ) |
1783 | 480 | break; |
1784 | | |
1785 | 163 | continue; |
1786 | 643 | } |
1787 | | |
1788 | 49.8k | if ( nodeIsNOFRAMES(node) ) |
1789 | 1.82k | { |
1790 | 1.82k | if (node->type == StartTag) |
1791 | 1.66k | { |
1792 | 1.66k | TidyParserMemory memory = {0}; |
1793 | | |
1794 | 1.66k | TY_(InsertNodeAtEnd)(body, node); |
1795 | | |
1796 | 1.66k | memory.identity = TY_(ParseBody); |
1797 | 1.66k | memory.original_node = body; |
1798 | 1.66k | memory.reentry_node = node; |
1799 | 1.66k | memory.register_1 = checkstack; |
1800 | 1.66k | memory.register_2 = iswhitenode; |
1801 | 1.66k | memory.mode = mode; |
1802 | 1.66k | TY_(pushMemory)( doc, memory ); |
1803 | 1.66k | DEBUG_LOG_EXIT_WITH_NODE(node); |
1804 | 1.66k | return node; |
1805 | 1.66k | } |
1806 | | |
1807 | 163 | if (node->type == EndTag && nodeIsNOFRAMES(body->parent) ) |
1808 | 163 | { |
1809 | 163 | TrimSpaces(doc, body); |
1810 | 163 | TY_(UngetToken)( doc ); |
1811 | 163 | break; |
1812 | 163 | } |
1813 | 163 | } |
1814 | | |
1815 | 48.0k | if ( (nodeIsFRAME(node) || nodeIsFRAMESET(node)) |
1816 | 2.91k | && nodeIsNOFRAMES(body->parent) ) |
1817 | 1.13k | { |
1818 | 1.13k | TrimSpaces(doc, body); |
1819 | 1.13k | TY_(UngetToken)( doc ); |
1820 | 1.13k | break; |
1821 | 1.13k | } |
1822 | | |
1823 | 46.8k | iswhitenode = no; |
1824 | | |
1825 | 46.8k | if ( TY_(nodeIsText)(node) && |
1826 | 7.31k | node->end <= node->start + 1 && |
1827 | 1.80k | lexer->lexbuf[node->start] == ' ' ) |
1828 | 731 | iswhitenode = yes; |
1829 | | |
1830 | | /* deal with comments etc. */ |
1831 | 46.8k | if (InsertMisc(body, node)) |
1832 | 2.77k | continue; |
1833 | | |
1834 | | /* mixed content model permits text */ |
1835 | 44.1k | if (TY_(nodeIsText)(node)) |
1836 | 7.31k | { |
1837 | 7.31k | if (iswhitenode && mode == IgnoreWhitespace) |
1838 | 0 | { |
1839 | 0 | TY_(FreeNode)( doc, node); |
1840 | 0 | continue; |
1841 | 0 | } |
1842 | | |
1843 | | /* HTML 2 and HTML4 strict don't allow text here */ |
1844 | 7.31k | TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT | VERS_HTML20)); |
1845 | | |
1846 | 7.31k | if (checkstack) |
1847 | 3.30k | { |
1848 | 3.30k | checkstack = no; |
1849 | | |
1850 | 3.30k | if ( TY_(InlineDup)(doc, node) > 0 ) |
1851 | 562 | continue; |
1852 | 3.30k | } |
1853 | | |
1854 | 6.75k | TY_(InsertNodeAtEnd)(body, node); |
1855 | 6.75k | DEBUG_LOG_GET_OLD_MODE; |
1856 | 6.75k | mode = MixedContent; |
1857 | 6.75k | DEBUG_LOG_CHANGE_MODE; |
1858 | 6.75k | continue; |
1859 | 7.31k | } |
1860 | | |
1861 | 36.8k | if (node->type == DocTypeTag) |
1862 | 850 | { |
1863 | 850 | InsertDocType(doc, body, node); |
1864 | 850 | continue; |
1865 | 850 | } |
1866 | | /* discard unknown and PARAM tags */ |
1867 | 35.9k | if ( node->tag == NULL || nodeIsPARAM(node) ) |
1868 | 7.20k | { |
1869 | 7.20k | TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED); |
1870 | 7.20k | TY_(FreeNode)( doc, node); |
1871 | 7.20k | continue; |
1872 | 7.20k | } |
1873 | | |
1874 | | /* |
1875 | | Netscape allows LI and DD directly in BODY |
1876 | | We infer UL or DL respectively and use this |
1877 | | Bool to exclude block-level elements so as |
1878 | | to match Netscape's observed behaviour. |
1879 | | */ |
1880 | 28.7k | lexer->excludeBlocks = no; |
1881 | | |
1882 | 28.7k | if ((( nodeIsINPUT(node) || |
1883 | 28.7k | (!TY_(nodeHasCM)(node, CM_BLOCK) && !TY_(nodeHasCM)(node, CM_INLINE)) |
1884 | 28.7k | ) && !TY_(IsHTML5Mode)(doc)) || nodeIsLI(node) ) |
1885 | 2.67k | { |
1886 | | /* avoid this error message being issued twice */ |
1887 | 2.67k | if (!(node->tag->model & CM_HEAD)) |
1888 | 2.40k | TY_(Report)(doc, body, node, TAG_NOT_ALLOWED_IN); |
1889 | | |
1890 | 2.67k | if (node->tag->model & CM_HTML) |
1891 | 378 | { |
1892 | | /* copy body attributes if current body was inferred */ |
1893 | 378 | if ( nodeIsBODY(node) && body->implicit |
1894 | 14 | && body->attributes == NULL ) |
1895 | 14 | { |
1896 | 14 | body->attributes = node->attributes; |
1897 | 14 | node->attributes = NULL; |
1898 | 14 | } |
1899 | | |
1900 | 378 | TY_(FreeNode)( doc, node); |
1901 | 378 | continue; |
1902 | 378 | } |
1903 | | |
1904 | 2.29k | if (node->tag->model & CM_HEAD) |
1905 | 268 | { |
1906 | 268 | MoveToHead(doc, body, node); |
1907 | 268 | continue; |
1908 | 268 | } |
1909 | | |
1910 | 2.03k | if (node->tag->model & CM_LIST) |
1911 | 317 | { |
1912 | 317 | TY_(UngetToken)( doc ); |
1913 | 317 | node = TY_(InferredTag)(doc, TidyTag_UL); |
1914 | 317 | AddClassNoIndent(doc, node); |
1915 | 317 | lexer->excludeBlocks = yes; |
1916 | 317 | } |
1917 | 1.71k | else if (node->tag->model & CM_DEFLIST) |
1918 | 52 | { |
1919 | 52 | TY_(UngetToken)( doc ); |
1920 | 52 | node = TY_(InferredTag)(doc, TidyTag_DL); |
1921 | 52 | lexer->excludeBlocks = yes; |
1922 | 52 | } |
1923 | 1.66k | else if (node->tag->model & (CM_TABLE | CM_ROWGRP | CM_ROW)) |
1924 | 321 | { |
1925 | | /* http://tidy.sf.net/issue/2855621 */ |
1926 | 321 | if (node->type != EndTag) { |
1927 | 320 | TY_(UngetToken)( doc ); |
1928 | 320 | node = TY_(InferredTag)(doc, TidyTag_TABLE); |
1929 | 320 | } |
1930 | 321 | lexer->excludeBlocks = yes; |
1931 | 321 | } |
1932 | 1.34k | else if ( nodeIsINPUT(node) ) |
1933 | 8 | { |
1934 | 8 | TY_(UngetToken)( doc ); |
1935 | 8 | node = TY_(InferredTag)(doc, TidyTag_FORM); |
1936 | 8 | lexer->excludeBlocks = yes; |
1937 | 8 | } |
1938 | 1.33k | else |
1939 | 1.33k | { |
1940 | 1.33k | if ( !TY_(nodeHasCM)(node, CM_ROW | CM_FIELD) ) |
1941 | 1.33k | { |
1942 | 1.33k | TY_(UngetToken)( doc ); |
1943 | 1.33k | DEBUG_LOG_EXIT; |
1944 | 1.33k | return NULL; |
1945 | 1.33k | } |
1946 | | |
1947 | | /* ignore </td> </th> <option> etc. */ |
1948 | 0 | TY_(FreeNode)( doc, node ); |
1949 | 0 | continue; |
1950 | 1.33k | } |
1951 | 2.03k | } |
1952 | | |
1953 | 26.7k | if (node->type == EndTag) |
1954 | 1.21k | { |
1955 | 1.21k | if ( nodeIsBR(node) ) |
1956 | 1 | { |
1957 | 1 | node->type = StartTag; |
1958 | 1 | } |
1959 | 1.21k | else if ( nodeIsP(node) ) |
1960 | 8 | { |
1961 | 8 | node->type = StartEndTag; |
1962 | 8 | node->implicit = yes; |
1963 | 8 | } |
1964 | 1.20k | else if ( TY_(nodeHasCM)(node, CM_INLINE) ) |
1965 | 581 | TY_(PopInline)( doc, node ); |
1966 | 1.21k | } |
1967 | | |
1968 | 26.7k | if (TY_(nodeIsElement)(node)) |
1969 | 25.5k | { |
1970 | 25.5k | if (nodeIsMAIN(node)) |
1971 | 88 | { |
1972 | | /*\ Issue #166 - repeated <main> element |
1973 | | * How to efficiently search for a previous main element? |
1974 | | \*/ |
1975 | 88 | if ( findNodeById(doc, TidyTag_MAIN) ) |
1976 | 0 | { |
1977 | 0 | doc->badForm |= flg_BadMain; /* this is an ERROR in format */ |
1978 | 0 | TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED); |
1979 | 0 | TY_(FreeNode)( doc, node); |
1980 | 0 | continue; |
1981 | 0 | } |
1982 | 88 | } |
1983 | | /* Issue #20 - merging from Ger Hobbelt fork put back CM_MIXED, which had been |
1984 | | removed to fix this issue - reverting to fix 880221e |
1985 | | */ |
1986 | 25.5k | if ( TY_(nodeHasCM)(node, CM_INLINE) ) |
1987 | 4.97k | { |
1988 | | /* HTML4 strict doesn't allow inline content here */ |
1989 | | /* but HTML2 does allow img elements as children of body */ |
1990 | 4.97k | if ( nodeIsIMG(node) ) |
1991 | 479 | TY_(ConstrainVersion)(doc, ~VERS_HTML40_STRICT); |
1992 | 4.49k | else |
1993 | 4.49k | TY_(ConstrainVersion)(doc, ~(VERS_HTML40_STRICT|VERS_HTML20)); |
1994 | | |
1995 | 4.97k | if (checkstack && !node->implicit) |
1996 | 398 | { |
1997 | 398 | checkstack = no; |
1998 | | |
1999 | 398 | if ( TY_(InlineDup)(doc, node) > 0 ) |
2000 | 358 | continue; |
2001 | 398 | } |
2002 | | |
2003 | 4.61k | DEBUG_LOG_GET_OLD_MODE; |
2004 | 4.61k | mode = MixedContent; |
2005 | 4.61k | DEBUG_LOG_CHANGE_MODE; |
2006 | 4.61k | } |
2007 | 20.5k | else |
2008 | 20.5k | { |
2009 | 20.5k | checkstack = yes; |
2010 | 20.5k | DEBUG_LOG_GET_OLD_MODE; |
2011 | 20.5k | mode = IgnoreWhitespace; |
2012 | 20.5k | DEBUG_LOG_CHANGE_MODE; |
2013 | 20.5k | } |
2014 | | |
2015 | 25.2k | if (node->implicit) |
2016 | 1.84k | { |
2017 | 1.84k | TY_(Report)(doc, body, node, INSERTING_TAG); |
2018 | 1.84k | } |
2019 | | |
2020 | 25.2k | TY_(InsertNodeAtEnd)(body, node); |
2021 | | |
2022 | 25.2k | { |
2023 | 25.2k | TidyParserMemory memory = {0}; |
2024 | 25.2k | memory.identity = TY_(ParseBody); |
2025 | 25.2k | memory.original_node = body; |
2026 | 25.2k | memory.reentry_node = node; |
2027 | 25.2k | memory.register_1 = checkstack; |
2028 | 25.2k | memory.register_2 = iswhitenode; |
2029 | 25.2k | memory.mode = mode; |
2030 | 25.2k | TY_(pushMemory)( doc, memory ); |
2031 | 25.2k | } |
2032 | 25.2k | DEBUG_LOG_EXIT_WITH_NODE(node); |
2033 | 25.2k | return node; |
2034 | 25.5k | } |
2035 | | |
2036 | | /* discard unexpected tags */ |
2037 | 1.20k | TY_(Report)(doc, body, node, DISCARDING_UNEXPECTED); |
2038 | 1.20k | TY_(FreeNode)( doc, node); |
2039 | 1.20k | } |
2040 | 5.51k | DEBUG_LOG_EXIT; |
2041 | 5.51k | return NULL; |
2042 | 33.7k | } |
2043 | | |
2044 | | |
2045 | | /** MARK: TY_(ParseColGroup) |
2046 | | * Parses the `colgroup` tag. |
2047 | | * |
2048 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2049 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2050 | | * This parser is also re-enterable, so that post-processing can occur after |
2051 | | * such dispatching. |
2052 | | */ |
2053 | | Node* TY_(ParseColGroup)( TidyDocImpl* doc, Node *colgroup, GetTokenMode ARG_UNUSED(mode) ) |
2054 | 1.57k | { |
2055 | 1.57k | Node *node, *parent; |
2056 | 1.57k | DEBUG_LOG_COUNTERS; |
2057 | | |
2058 | | /* |
2059 | | If we're re-entering, then we need to setup from a previous state, |
2060 | | instead of starting fresh. We can pull what we need from the document's |
2061 | | stack. |
2062 | | */ |
2063 | 1.57k | if ( colgroup == NULL ) |
2064 | 247 | { |
2065 | 247 | TidyParserMemory memory = TY_(popMemory)( doc ); |
2066 | 247 | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
2067 | 247 | DEBUG_LOG_REENTER_WITH_NODE(node); |
2068 | 247 | colgroup = memory.original_node; |
2069 | 247 | DEBUG_LOG_GET_OLD_MODE; |
2070 | 247 | mode = memory.mode; |
2071 | 247 | DEBUG_LOG_CHANGE_MODE; |
2072 | 247 | } |
2073 | 1.32k | else |
2074 | 1.32k | { |
2075 | 1.32k | DEBUG_LOG_ENTER_WITH_NODE(colgroup); |
2076 | 1.32k | if (colgroup->tag->model & CM_EMPTY) |
2077 | 0 | return NULL; |
2078 | 1.32k | } |
2079 | | |
2080 | 1.80k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
2081 | 1.66k | { |
2082 | 1.66k | DEBUG_LOG_GOT_TOKEN(node); |
2083 | | |
2084 | 1.66k | if (node->tag == colgroup->tag && node->type == EndTag) |
2085 | 1 | { |
2086 | 1 | TY_(FreeNode)( doc, node); |
2087 | 1 | colgroup->closed = yes; |
2088 | 1 | return NULL; |
2089 | 1 | } |
2090 | | |
2091 | | /* |
2092 | | if this is the end tag for an ancestor element |
2093 | | then infer end tag for this element |
2094 | | */ |
2095 | 1.66k | if (node->type == EndTag) |
2096 | 433 | { |
2097 | 433 | if ( nodeIsFORM(node) ) |
2098 | 30 | { |
2099 | 30 | BadForm( doc ); |
2100 | 30 | TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED); |
2101 | 30 | TY_(FreeNode)( doc, node); |
2102 | 30 | continue; |
2103 | 30 | } |
2104 | | |
2105 | 403 | for ( parent = colgroup->parent; |
2106 | 1.71k | parent != NULL; |
2107 | 1.31k | parent = parent->parent ) |
2108 | 1.71k | { |
2109 | 1.71k | if (node->tag == parent->tag) |
2110 | 403 | { |
2111 | 403 | TY_(UngetToken)( doc ); |
2112 | 403 | DEBUG_LOG_EXIT; |
2113 | 403 | return NULL; |
2114 | 403 | } |
2115 | 1.71k | } |
2116 | 403 | } |
2117 | | |
2118 | 1.23k | if (TY_(nodeIsText)(node)) |
2119 | 216 | { |
2120 | 216 | TY_(UngetToken)( doc ); |
2121 | 216 | DEBUG_LOG_EXIT; |
2122 | 216 | return NULL; |
2123 | 216 | } |
2124 | | |
2125 | | /* deal with comments etc. */ |
2126 | 1.01k | if (InsertMisc(colgroup, node)) |
2127 | 3 | continue; |
2128 | | |
2129 | | /* discard unknown tags */ |
2130 | 1.01k | if (node->tag == NULL) |
2131 | 200 | { |
2132 | 200 | TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED); |
2133 | 200 | TY_(FreeNode)( doc, node); |
2134 | 200 | continue; |
2135 | 200 | } |
2136 | | |
2137 | 816 | if ( !nodeIsCOL(node) ) |
2138 | 569 | { |
2139 | 569 | TY_(UngetToken)( doc ); |
2140 | 569 | DEBUG_LOG_EXIT; |
2141 | 569 | return NULL; |
2142 | 569 | } |
2143 | | |
2144 | 247 | if (node->type == EndTag) |
2145 | 0 | { |
2146 | 0 | TY_(Report)(doc, colgroup, node, DISCARDING_UNEXPECTED); |
2147 | 0 | TY_(FreeNode)( doc, node); |
2148 | 0 | continue; |
2149 | 0 | } |
2150 | | |
2151 | | /* node should be <COL> */ |
2152 | 247 | TY_(InsertNodeAtEnd)(colgroup, node); |
2153 | | |
2154 | 247 | { |
2155 | 247 | TidyParserMemory memory = {0}; |
2156 | 247 | memory.identity = TY_(ParseColGroup); |
2157 | 247 | memory.original_node = colgroup; |
2158 | 247 | memory.reentry_node = node; |
2159 | 247 | memory.mode = mode; |
2160 | 247 | TY_(pushMemory)( doc, memory ); |
2161 | 247 | DEBUG_LOG_EXIT_WITH_NODE(node); |
2162 | 247 | } |
2163 | 247 | DEBUG_LOG_EXIT; |
2164 | 247 | return node; |
2165 | 247 | } |
2166 | 135 | DEBUG_LOG_EXIT; |
2167 | 135 | return NULL; |
2168 | 1.57k | } |
2169 | | |
2170 | | |
2171 | | /** MARK: TY_(ParseDatalist) |
2172 | | * Parses the `datalist` tag. |
2173 | | * |
2174 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2175 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2176 | | * This parser is also re-enterable, so that post-processing can occur after |
2177 | | * such dispatching. |
2178 | | */ |
2179 | | Node* TY_(ParseDatalist)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) ) |
2180 | 401 | { |
2181 | 401 | Lexer* lexer = doc->lexer; |
2182 | 401 | Node *node; |
2183 | 401 | DEBUG_LOG_COUNTERS; |
2184 | | |
2185 | 401 | if ( field == NULL ) |
2186 | 204 | { |
2187 | 204 | TidyParserMemory memory = TY_(popMemory)( doc ); |
2188 | 204 | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
2189 | 204 | DEBUG_LOG_REENTER_WITH_NODE(node); |
2190 | 204 | field = memory.original_node; |
2191 | 204 | DEBUG_LOG_GET_OLD_MODE; |
2192 | 204 | mode = memory.mode; |
2193 | 204 | DEBUG_LOG_CHANGE_MODE; |
2194 | 204 | } |
2195 | 197 | else |
2196 | 197 | { |
2197 | 197 | DEBUG_LOG_ENTER_WITH_NODE(field); |
2198 | 197 | } |
2199 | | |
2200 | 401 | lexer->insert = NULL; /* defer implicit inline start tags */ |
2201 | | |
2202 | 424 | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
2203 | 407 | { |
2204 | 407 | if (node->tag == field->tag && node->type == EndTag) |
2205 | 180 | { |
2206 | 180 | TY_(FreeNode)( doc, node); |
2207 | 180 | field->closed = yes; |
2208 | 180 | TrimSpaces(doc, field); |
2209 | | |
2210 | 180 | DEBUG_LOG_EXIT; |
2211 | 180 | return NULL; |
2212 | 180 | } |
2213 | | |
2214 | | /* deal with comments etc. */ |
2215 | 227 | if (InsertMisc(field, node)) |
2216 | 0 | continue; |
2217 | | |
2218 | 227 | if ( node->type == StartTag && |
2219 | 211 | ( nodeIsOPTION(node) || |
2220 | 211 | nodeIsOPTGROUP(node) || |
2221 | 211 | nodeIsDATALIST(node) || |
2222 | 211 | nodeIsSCRIPT(node)) |
2223 | 227 | ) |
2224 | 204 | { |
2225 | 204 | TidyParserMemory memory = {0}; |
2226 | 204 | memory.identity = TY_(ParseDatalist); |
2227 | 204 | memory.original_node = field; |
2228 | 204 | memory.reentry_node = node; |
2229 | 204 | memory.reentry_mode = IgnoreWhitespace; |
2230 | | |
2231 | 204 | TY_(InsertNodeAtEnd)(field, node); |
2232 | 204 | TY_(pushMemory)(doc, memory); |
2233 | 204 | DEBUG_LOG_EXIT_WITH_NODE(node); |
2234 | 204 | return node; |
2235 | 204 | } |
2236 | | |
2237 | | /* discard unexpected tags */ |
2238 | 23 | TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED); |
2239 | 23 | TY_(FreeNode)( doc, node); |
2240 | 23 | } |
2241 | | |
2242 | 17 | TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR); |
2243 | | |
2244 | 17 | DEBUG_LOG_EXIT; |
2245 | 17 | return NULL; |
2246 | 401 | } |
2247 | | |
2248 | | |
2249 | | /** MARK: TY_(ParseDefList) |
2250 | | * Parses the `dl` tag. |
2251 | | * |
2252 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2253 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2254 | | * This parser is also re-enterable, so that post-processing can occur after |
2255 | | * such dispatching. |
2256 | | */ |
2257 | | Node* TY_(ParseDefList)( TidyDocImpl* doc, Node *list, GetTokenMode mode ) |
2258 | 18.1k | { |
2259 | 18.1k | Lexer* lexer = doc->lexer; |
2260 | 18.1k | Node *node = NULL; |
2261 | 18.1k | Node *parent = NULL; |
2262 | 18.1k | DEBUG_LOG_COUNTERS; |
2263 | | |
2264 | 18.1k | enum parserState { |
2265 | 18.1k | STATE_INITIAL, /* This is the initial state for every parser. */ |
2266 | 18.1k | STATE_POST_NODEISCENTER, /* To-do after re-entering after checks. */ |
2267 | 18.1k | STATE_COMPLETE, /* Done with the switch. */ |
2268 | 18.1k | } state = STATE_INITIAL; |
2269 | | |
2270 | 18.1k | if ( list == NULL ) |
2271 | 12.7k | { |
2272 | 12.7k | TidyParserMemory memory = TY_(popMemory)( doc ); |
2273 | 12.7k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
2274 | 12.7k | DEBUG_LOG_REENTER_WITH_NODE(node); |
2275 | 12.7k | list = memory.original_node; |
2276 | 12.7k | state = memory.reentry_state; |
2277 | 12.7k | DEBUG_LOG_GET_OLD_MODE; |
2278 | 12.7k | mode = memory.mode; |
2279 | 12.7k | DEBUG_LOG_CHANGE_MODE; |
2280 | 12.7k | } |
2281 | 5.34k | else |
2282 | 5.34k | { |
2283 | 5.34k | DEBUG_LOG_ENTER_WITH_NODE(list); |
2284 | 5.34k | } |
2285 | | |
2286 | 18.1k | if (list->tag->model & CM_EMPTY) |
2287 | 0 | return NULL; |
2288 | | |
2289 | 18.1k | lexer->insert = NULL; /* defer implicit inline start tags */ |
2290 | | |
2291 | 26.7k | while ( state != STATE_COMPLETE ) |
2292 | 24.1k | { |
2293 | 24.1k | if ( state == STATE_INITIAL ) |
2294 | 20.6k | node = TY_(GetToken)( doc, IgnoreWhitespace); |
2295 | | |
2296 | 24.1k | switch ( state) |
2297 | 24.1k | { |
2298 | 20.6k | case STATE_INITIAL: |
2299 | 20.6k | { |
2300 | 20.6k | if ( node == NULL) |
2301 | 2.59k | { |
2302 | 2.59k | state = STATE_COMPLETE; |
2303 | 2.59k | continue; |
2304 | 2.59k | } |
2305 | | |
2306 | 18.0k | if (node->tag == list->tag && node->type == EndTag) |
2307 | 0 | { |
2308 | 0 | TY_(FreeNode)( doc, node); |
2309 | 0 | list->closed = yes; |
2310 | 0 | DEBUG_LOG_EXIT; |
2311 | 0 | return NULL; |
2312 | 0 | } |
2313 | | |
2314 | | /* deal with comments etc. */ |
2315 | 18.0k | if (InsertMisc(list, node)) |
2316 | 696 | continue; |
2317 | | |
2318 | 17.3k | if (TY_(nodeIsText)(node)) |
2319 | 3.22k | { |
2320 | 3.22k | TY_(UngetToken)( doc ); |
2321 | 3.22k | node = TY_(InferredTag)(doc, TidyTag_DT); |
2322 | 3.22k | TY_(Report)(doc, list, node, MISSING_STARTTAG); |
2323 | 3.22k | } |
2324 | | |
2325 | 17.3k | if (node->tag == NULL) |
2326 | 1.68k | { |
2327 | 1.68k | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
2328 | 1.68k | TY_(FreeNode)( doc, node); |
2329 | 1.68k | continue; |
2330 | 1.68k | } |
2331 | | |
2332 | | /* |
2333 | | if this is the end tag for an ancestor element |
2334 | | then infer end tag for this element |
2335 | | */ |
2336 | 15.6k | if (node->type == EndTag) |
2337 | 126 | { |
2338 | 126 | Bool discardIt = no; |
2339 | 126 | if ( nodeIsFORM(node) ) |
2340 | 0 | { |
2341 | 0 | BadForm( doc ); |
2342 | 0 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
2343 | 0 | TY_(FreeNode)( doc, node ); |
2344 | 0 | continue; |
2345 | 0 | } |
2346 | | |
2347 | 126 | for (parent = list->parent; |
2348 | 1.01k | parent != NULL; parent = parent->parent) |
2349 | 1.01k | { |
2350 | | /* Do not match across BODY to avoid infinite loop |
2351 | | between ParseBody and this parser, |
2352 | | See http://tidy.sf.net/bug/1098012. */ |
2353 | 1.01k | if (nodeIsBODY(parent)) |
2354 | 89 | { |
2355 | 89 | discardIt = yes; |
2356 | 89 | break; |
2357 | 89 | } |
2358 | 926 | if (node->tag == parent->tag) |
2359 | 36 | { |
2360 | 36 | TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE); |
2361 | 36 | TY_(UngetToken)( doc ); |
2362 | | |
2363 | 36 | DEBUG_LOG_EXIT; |
2364 | 36 | return NULL; |
2365 | 36 | } |
2366 | 926 | } |
2367 | 90 | if (discardIt) |
2368 | 89 | { |
2369 | 89 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
2370 | 89 | TY_(FreeNode)( doc, node); |
2371 | 89 | continue; |
2372 | 89 | } |
2373 | 90 | } |
2374 | | |
2375 | | /* center in a dt or a dl breaks the dl list in two */ |
2376 | 15.4k | if ( nodeIsCENTER(node) ) |
2377 | 3.51k | { |
2378 | 3.51k | if (list->content) |
2379 | 3.51k | TY_(InsertNodeAfterElement)(list, node); |
2380 | 2 | else /* trim empty dl list */ |
2381 | 2 | { |
2382 | 2 | TY_(InsertNodeBeforeElement)(list, node); |
2383 | 2 | } |
2384 | | |
2385 | | /* #426885 - fix by Glenn Carroll 19 Apr 00, and |
2386 | | Gary Dechaines 11 Aug 00 */ |
2387 | | /* ParseTag can destroy node, if it finds that |
2388 | | * this <center> is followed immediately by </center>. |
2389 | | * It's awkward but necessary to determine if this |
2390 | | * has happened. |
2391 | | */ |
2392 | 3.51k | parent = node->parent; |
2393 | | |
2394 | | /* and parse contents of center */ |
2395 | 3.51k | lexer->excludeBlocks = no; |
2396 | | |
2397 | 3.51k | { |
2398 | 3.51k | TidyParserMemory memory = {0}; |
2399 | 3.51k | memory.identity = TY_(ParseDefList); |
2400 | 3.51k | memory.original_node = list; |
2401 | 3.51k | memory.reentry_node = node; |
2402 | 3.51k | memory.reentry_state = STATE_POST_NODEISCENTER; |
2403 | 3.51k | TY_(pushMemory)( doc, memory ); |
2404 | 3.51k | DEBUG_LOG_EXIT_WITH_NODE(node); |
2405 | 3.51k | return node; |
2406 | 3.51k | } |
2407 | 3.51k | } |
2408 | | |
2409 | 11.9k | if ( !( nodeIsDT(node) || nodeIsDD(node) || ( nodeIsDIV(node) && TY_(IsHTML5Mode)(doc) ) ) ) |
2410 | 6.12k | { |
2411 | 6.12k | TY_(UngetToken)( doc ); |
2412 | | |
2413 | 6.12k | if (!(node->tag->model & (CM_BLOCK | CM_INLINE))) |
2414 | 1.49k | { |
2415 | 1.49k | TY_(Report)(doc, list, node, TAG_NOT_ALLOWED_IN); |
2416 | 1.49k | DEBUG_LOG_EXIT; |
2417 | 1.49k | return NULL; |
2418 | 1.49k | } |
2419 | | |
2420 | | /* if DD appeared directly in BODY then exclude blocks */ |
2421 | 4.63k | if (!(node->tag->model & CM_INLINE) && lexer->excludeBlocks) |
2422 | 1.18k | { |
2423 | 1.18k | DEBUG_LOG_EXIT; |
2424 | 1.18k | return NULL; |
2425 | 1.18k | } |
2426 | | |
2427 | 3.45k | node = TY_(InferredTag)(doc, TidyTag_DD); |
2428 | 3.45k | TY_(Report)(doc, list, node, MISSING_STARTTAG); |
2429 | 3.45k | } |
2430 | | |
2431 | 9.30k | if (node->type == EndTag) |
2432 | 0 | { |
2433 | 0 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
2434 | 0 | TY_(FreeNode)( doc, node); |
2435 | 0 | continue; |
2436 | 0 | } |
2437 | | |
2438 | | /* node should be <DT> or <DD> or <DIV>*/ |
2439 | 9.30k | TY_(InsertNodeAtEnd)(list, node); |
2440 | 9.30k | { |
2441 | 9.30k | TidyParserMemory memory = {0}; |
2442 | 9.30k | memory.identity = TY_(ParseDefList); |
2443 | 9.30k | memory.original_node = list; |
2444 | 9.30k | memory.reentry_node = node; |
2445 | 9.30k | memory.reentry_state = STATE_INITIAL; |
2446 | 9.30k | TY_(pushMemory)( doc, memory ); |
2447 | 9.30k | DEBUG_LOG_EXIT; |
2448 | 9.30k | return node; |
2449 | 9.30k | } |
2450 | 9.30k | } break; |
2451 | | |
2452 | | |
2453 | 3.51k | case STATE_POST_NODEISCENTER: |
2454 | 3.51k | { |
2455 | 3.51k | lexer->excludeBlocks = yes; |
2456 | | |
2457 | | /* now create a new dl element, |
2458 | | * unless node has been blown away because the |
2459 | | * center was empty, as above. |
2460 | | */ |
2461 | 3.51k | if (parent && parent->last == node) |
2462 | 0 | { |
2463 | 0 | list = TY_(InferredTag)(doc, TidyTag_DL); |
2464 | 0 | TY_(InsertNodeAfterElement)(node, list); |
2465 | 0 | } |
2466 | 3.51k | state = STATE_INITIAL; |
2467 | 3.51k | continue; |
2468 | 9.30k | } break; |
2469 | | |
2470 | | |
2471 | 0 | default: |
2472 | 0 | break; |
2473 | 24.1k | } /* switch */ |
2474 | 24.1k | } /* while */ |
2475 | | |
2476 | 2.59k | TY_(Report)(doc, list, node, MISSING_ENDTAG_FOR); |
2477 | 2.59k | DEBUG_LOG_EXIT; |
2478 | 2.59k | return NULL; |
2479 | 18.1k | } |
2480 | | |
2481 | | |
2482 | | /** MARK: TY_(ParseEmpty) |
2483 | | * Parse empty element nodes. |
2484 | | * |
2485 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2486 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2487 | | * This parser is also re-enterable, so that post-processing can occur after |
2488 | | * such dispatching. |
2489 | | */ |
2490 | | Node* TY_(ParseEmpty)( TidyDocImpl* doc, Node *element, GetTokenMode mode ) |
2491 | 7.46k | { |
2492 | 7.46k | Lexer* lexer = doc->lexer; |
2493 | 7.46k | if ( lexer->isvoyager ) |
2494 | 1.99k | { |
2495 | 1.99k | Node *node = TY_(GetToken)( doc, mode); |
2496 | 1.99k | if ( node ) |
2497 | 1.84k | { |
2498 | 1.84k | if ( !(node->type == EndTag && node->tag == element->tag) ) |
2499 | 1.66k | { |
2500 | | /* TY_(Report)(doc, element, node, ELEMENT_NOT_EMPTY); */ |
2501 | 1.66k | TY_(UngetToken)( doc ); |
2502 | 1.66k | } |
2503 | 186 | else |
2504 | 186 | { |
2505 | 186 | TY_(FreeNode)( doc, node ); |
2506 | 186 | } |
2507 | 1.84k | } |
2508 | 1.99k | } |
2509 | 7.46k | return NULL; |
2510 | 7.46k | } |
2511 | | |
2512 | | |
2513 | | /** MARK: TY_(ParseFrameSet) |
2514 | | * Parses the `frameset` tag. |
2515 | | * |
2516 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2517 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2518 | | * This parser is also re-enterable, so that post-processing can occur after |
2519 | | * such dispatching. |
2520 | | */ |
2521 | | Node* TY_(ParseFrameSet)( TidyDocImpl* doc, Node *frameset, GetTokenMode ARG_UNUSED(mode) ) |
2522 | 23.2k | { |
2523 | 23.2k | Lexer* lexer = doc->lexer; |
2524 | 23.2k | Node *node; |
2525 | 23.2k | DEBUG_LOG_COUNTERS; |
2526 | | |
2527 | | /* |
2528 | | If we're re-entering, then we need to setup from a previous state, |
2529 | | instead of starting fresh. We can pull what we need from the document's |
2530 | | stack. |
2531 | | */ |
2532 | 23.2k | if ( frameset == NULL ) |
2533 | 10.8k | { |
2534 | 10.8k | TidyParserMemory memory = TY_(popMemory)( doc ); |
2535 | 10.8k | node = memory.reentry_node; /* Throwaway, because we replace it entering the loop. */ |
2536 | 10.8k | DEBUG_LOG_REENTER_WITH_NODE(node); |
2537 | 10.8k | frameset = memory.original_node; |
2538 | 10.8k | DEBUG_LOG_GET_OLD_MODE; |
2539 | 10.8k | mode = memory.mode; |
2540 | 10.8k | DEBUG_LOG_CHANGE_MODE; |
2541 | 10.8k | } |
2542 | 12.4k | else |
2543 | 12.4k | { |
2544 | 12.4k | DEBUG_LOG_ENTER_WITH_NODE(frameset); |
2545 | 12.4k | if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) |
2546 | 12.4k | { |
2547 | 12.4k | doc->badAccess |= BA_USING_FRAMES; |
2548 | 12.4k | } |
2549 | 12.4k | } |
2550 | | |
2551 | 38.2k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
2552 | 28.4k | { |
2553 | 28.4k | if (node->tag == frameset->tag && node->type == EndTag) |
2554 | 2.51k | { |
2555 | 2.51k | TY_(FreeNode)( doc, node); |
2556 | 2.51k | frameset->closed = yes; |
2557 | 2.51k | TrimSpaces(doc, frameset); |
2558 | 2.51k | DEBUG_LOG_EXIT; |
2559 | 2.51k | return NULL; |
2560 | 2.51k | } |
2561 | | |
2562 | | /* deal with comments etc. */ |
2563 | 25.9k | if (InsertMisc(frameset, node)) |
2564 | 269 | continue; |
2565 | | |
2566 | 25.6k | if (node->tag == NULL) |
2567 | 6.23k | { |
2568 | 6.23k | TY_(Report)(doc, frameset, node, DISCARDING_UNEXPECTED); |
2569 | 6.23k | TY_(FreeNode)( doc, node); |
2570 | 6.23k | continue; |
2571 | 6.23k | } |
2572 | | |
2573 | 19.4k | if (TY_(nodeIsElement)(node)) |
2574 | 19.0k | { |
2575 | 19.0k | if (node->tag && node->tag->model & CM_HEAD) |
2576 | 665 | { |
2577 | 665 | MoveToHead(doc, frameset, node); |
2578 | 665 | continue; |
2579 | 665 | } |
2580 | 19.0k | } |
2581 | | |
2582 | 18.7k | if ( nodeIsBODY(node) ) |
2583 | 1.42k | { |
2584 | 1.42k | TY_(UngetToken)( doc ); |
2585 | 1.42k | node = TY_(InferredTag)(doc, TidyTag_NOFRAMES); |
2586 | 1.42k | TY_(Report)(doc, frameset, node, INSERTING_TAG); |
2587 | 1.42k | } |
2588 | | |
2589 | 18.7k | if (node->type == StartTag && (node->tag && node->tag->model & CM_FRAMES)) |
2590 | 10.9k | { |
2591 | 10.9k | TY_(InsertNodeAtEnd)(frameset, node); |
2592 | 10.9k | lexer->excludeBlocks = no; |
2593 | | |
2594 | | /* |
2595 | | * We don't really have to do anything when re-entering, except |
2596 | | * setting up the state when we left. No post-processing means |
2597 | | * this stays simple. |
2598 | | */ |
2599 | 10.9k | TidyParserMemory memory = {0}; |
2600 | 10.9k | memory.identity = TY_(ParseFrameSet); |
2601 | 10.9k | memory.original_node = frameset; |
2602 | 10.9k | memory.reentry_node = node; |
2603 | 10.9k | memory.mode = MixedContent; |
2604 | 10.9k | TY_(pushMemory)( doc, memory ); |
2605 | 10.9k | DEBUG_LOG_EXIT_WITH_NODE(node); |
2606 | 10.9k | return node; |
2607 | 10.9k | } |
2608 | 7.84k | else if (node->type == StartEndTag && (node->tag && node->tag->model & CM_FRAMES)) |
2609 | 599 | { |
2610 | 599 | TY_(InsertNodeAtEnd)(frameset, node); |
2611 | 599 | continue; |
2612 | 599 | } |
2613 | | |
2614 | | /* discard unexpected tags */ |
2615 | | /* WAI [6.5.1.4] link is being discarded outside of NOFRAME */ |
2616 | 7.25k | if ( nodeIsA(node) ) |
2617 | 390 | doc->badAccess |= BA_INVALID_LINK_NOFRAMES; |
2618 | | |
2619 | 7.25k | TY_(Report)(doc, frameset, node, DISCARDING_UNEXPECTED); |
2620 | 7.25k | TY_(FreeNode)( doc, node); |
2621 | 7.25k | } |
2622 | | |
2623 | 9.74k | TY_(Report)(doc, frameset, node, MISSING_ENDTAG_FOR); |
2624 | 9.74k | DEBUG_LOG_EXIT; |
2625 | 9.74k | return NULL; |
2626 | 23.2k | } |
2627 | | |
2628 | | |
2629 | | /** MARK: TY_(ParseHead) |
2630 | | * Parses the `head` tag. |
2631 | | * |
2632 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2633 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2634 | | * This parser is also re-enterable, so that post-processing can occur after |
2635 | | * such dispatching. On re-entry (`head` == NULL) the head node and the title/base counts are restored from that stack; returns the next child to dispatch, or NULL when the head is done. |
2636 | | */ |
2637 | | Node* TY_(ParseHead)( TidyDocImpl* doc, Node *head, GetTokenMode ARG_UNUSED(mode) ) |
2638 | 17.5k | { |
2639 | 17.5k | Lexer* lexer = doc->lexer; |
2640 | 17.5k | Node *node; |
2641 | 17.5k | int HasTitle = 0; |
2642 | 17.5k | int HasBase = 0; |
2643 | 17.5k | DEBUG_LOG_COUNTERS; |
2644 | | |
2645 | 17.5k | if ( head == NULL ) |
2646 | 511 | { |
2647 | 511 | TidyParserMemory memory = TY_(popMemory)( doc ); |
2648 | 511 | node = memory.reentry_node; /* Throwaway, as main loop overwrites anyway. */ |
2649 | 511 | DEBUG_LOG_REENTER_WITH_NODE(node); |
2650 | 511 | head = memory.original_node; |
2651 | 511 | HasTitle = memory.register_1; /* <title> count saved before yielding to the controller */ |
2652 | 511 | HasBase = memory.register_2; /* <base> count saved before yielding to the controller */ |
2653 | 511 | DEBUG_LOG_GET_OLD_MODE; |
2654 | 511 | mode = memory.mode; |
2655 | 511 | DEBUG_LOG_CHANGE_MODE; |
2656 | 511 | } |
2657 | 17.0k | else |
2658 | 17.0k | { |
2659 | 17.0k | DEBUG_LOG_ENTER_WITH_NODE(head); |
2660 | 17.0k | } |
2661 | | |
2662 | 20.2k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
2663 | 20.1k | { |
2664 | 20.1k | if (node->tag == head->tag && node->type == EndTag) |
2665 | 136 | { |
2666 | 136 | TY_(FreeNode)( doc, node); |
2667 | 136 | head->closed = yes; |
2668 | 136 | break; |
2669 | 136 | } |
2670 | | |
2671 | | /* find and discard multiple <head> elements */ |
2672 | | /* find and discard <html> in <head> elements */ |
2673 | 20.0k | if ((node->tag == head->tag || nodeIsHTML(node)) && node->type == StartTag) |
2674 | 962 | { |
2675 | 962 | TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED); |
2676 | 962 | TY_(FreeNode)(doc, node); |
2677 | 962 | continue; |
2678 | 962 | } |
2679 | | |
2680 | 19.0k | if (TY_(nodeIsText)(node)) |
2681 | 1.06k | { |
2682 | | /*\ Issue #132 - avoid warning for missing body tag, |
2683 | | * if configured to --omit-optional-tags yes |
2684 | | * Issue #314 - and if --show-body-only |
2685 | | \*/ |
2686 | 1.06k | if (!cfgBool( doc, TidyOmitOptionalTags ) && |
2687 | 973 | !showingBodyOnly(doc) ) |
2688 | 973 | { |
2689 | 973 | TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN); |
2690 | 973 | } |
2691 | 1.06k | TY_(UngetToken)( doc ); |
2692 | 1.06k | break; |
2693 | 1.06k | } |
2694 | | |
2695 | 18.0k | if (node->type == ProcInsTag && node->element && |
2696 | 0 | TY_(tmbstrcmp)(node->element, "xml-stylesheet") == 0) |
2697 | 0 | { |
2698 | 0 | TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN); |
2699 | 0 | TY_(InsertNodeBeforeElement)(TY_(FindHTML)(doc), node); |
2700 | 0 | continue; |
2701 | 0 | } |
2702 | | |
2703 | | /* deal with comments etc. */ |
2704 | 18.0k | if (InsertMisc(head, node)) |
2705 | 0 | continue; |
2706 | | |
2707 | 18.0k | if (node->type == DocTypeTag) |
2708 | 283 | { |
2709 | 283 | InsertDocType(doc, head, node); |
2710 | 283 | continue; |
2711 | 283 | } |
2712 | | |
2713 | | /* discard unknown tags */ |
2714 | 17.7k | if (node->tag == NULL) |
2715 | 1.40k | { |
2716 | 1.40k | TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED); |
2717 | 1.40k | TY_(FreeNode)( doc, node); |
2718 | 1.40k | continue; |
2719 | 1.40k | } |
2720 | | |
2721 | | /* |
2722 | | if it doesn't belong in the head then |
2723 | | treat as implicit end of head and deal |
2724 | | with as part of the body |
2725 | | */ |
2726 | 16.3k | if (!(node->tag->model & CM_HEAD)) |
2727 | 15.8k | { |
2728 | | /* #545067 Implicit closing of head broken - warn only for XHTML input */ |
2729 | 15.8k | if ( lexer->isvoyager ) |
2730 | 2 | TY_(Report)(doc, head, node, TAG_NOT_ALLOWED_IN ); |
2731 | 15.8k | TY_(UngetToken)( doc ); |
2732 | 15.8k | break; |
2733 | 15.8k | } |
2734 | | |
2735 | 516 | if (TY_(nodeIsElement)(node)) |
2736 | 511 | { |
2737 | 511 | if ( nodeIsTITLE(node) ) |
2738 | 12 | { |
2739 | 12 | ++HasTitle; |
2740 | | |
2741 | 12 | if (HasTitle > 1) |
2742 | 11 | TY_(Report)(doc, head, node, |
2743 | 11 | head ? |
2744 | 11 | TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS); |
2745 | 12 | } |
2746 | 499 | else if ( nodeIsBASE(node) ) |
2747 | 0 | { |
2748 | 0 | ++HasBase; |
2749 |

2750 | 0 | if (HasBase > 1) |
2751 | 0 | TY_(Report)(doc, head, node, |
2752 | 0 | head ? |
2753 | 0 | TOO_MANY_ELEMENTS_IN : TOO_MANY_ELEMENTS); |
2754 | 0 | } |
2755 | | |
2756 | 511 | TY_(InsertNodeAtEnd)(head, node); |
2757 | | |
2758 | 511 | { |
2759 | 511 | TidyParserMemory memory = {0}; |
2760 | 511 | memory.identity = TY_(ParseHead); |
2761 | 511 | memory.original_node = head; |
2762 | 511 | memory.reentry_node = node; |
2763 | 511 | memory.register_1 = HasTitle; |
2764 | 511 | memory.register_2 = HasBase; |
2765 | 511 | TY_(pushMemory)( doc, memory ); |
2766 | 511 | DEBUG_LOG_EXIT_WITH_NODE(node); |
2767 | 511 | return node; |
2768 | 511 | } |
2769 | 511 | } |
2770 | | |
2771 | | /* discard unexpected text nodes and end tags */ |
2772 | 5 | TY_(Report)(doc, head, node, DISCARDING_UNEXPECTED); |
2773 | 5 | TY_(FreeNode)( doc, node); |
2774 | 5 | } |
2775 | 17.0k | DEBUG_LOG_EXIT; |
2776 | 17.0k | return NULL; |
2777 | 17.5k | } |
2778 | | |
2779 | | |
2780 | | /** MARK: TY_(ParseHTML) |
2781 | | * Parses the `html` tag. At this point, other root-level stuff (doctype, |
2782 | | * comments) are already set up, and here we handle all of the complexities |
2783 | | * of things such as frameset documents, etc. |
2784 | | * |
2785 | | * This is a non-recursing parser. It uses the document's parser memory stack |
2786 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
2787 | | * This parser is also re-enterable, so that post-processing can occur after |
2788 | | * such dispatching. On re-entry (`html` == NULL) the html node, reentry node, state and mode are restored from that stack; returns the node to be parsed next, or NULL when complete. |
2789 | | */ |
2790 | | Node* TY_(ParseHTML)( TidyDocImpl *doc, Node *html, GetTokenMode mode ) |
2791 | 39.5k | { |
2792 | 39.5k | Node *node = NULL; |
2793 | 39.5k | Node *head = NULL; |
2794 | 39.5k | Node *frameset = NULL; |
2795 | 39.5k | Node *noframes = NULL; |
2796 | 39.5k | DEBUG_LOG_COUNTERS; |
2797 | | |
2798 | 39.5k | enum parserState { |
2799 | 39.5k | STATE_INITIAL, /* This is the initial state for every parser. */ |
2800 | 39.5k | STATE_COMPLETE, /* Complete! */ |
2801 | 39.5k | STATE_PRE_BODY, /* In this state, we'll consider frames vs. body. */ |
2802 | 39.5k | STATE_PARSE_BODY, /* In this state, we can parse the body. */ |
2803 | 39.5k | STATE_PARSE_HEAD, /* In this state, we will setup head for parsing. */ |
2804 | 39.5k | STATE_PARSE_HEAD_REENTER, /* Resume here after parsing head. */ |
2805 | 39.5k | STATE_PARSE_NOFRAMES, /* In this state, we can parse noframes content. */ |
2806 | 39.5k | STATE_PARSE_NOFRAMES_REENTER, /* In this state, we can restore more state. */ |
2807 | 39.5k | STATE_PARSE_FRAMESET, /* In this state, we will parse frameset content. */ |
2808 | 39.5k | STATE_PARSE_FRAMESET_REENTER, /* We need to cleanup some things after parsing frameset. */ |
2809 | 39.5k | } state = STATE_INITIAL; |
2810 | | |
2811 | 39.5k | TY_(SetOptionBool)( doc, TidyXmlTags, no ); |
2812 | | |
2813 | 39.5k | if ( html == NULL ) |
2814 | 22.6k | { |
2815 | 22.6k | TidyParserMemory memory = TY_(popMemory)( doc ); |
2816 | 22.6k | node = memory.reentry_node; |
2817 | 22.6k | DEBUG_LOG_REENTER_WITH_NODE(node); |
2818 | 22.6k | html = memory.original_node; |
2819 | 22.6k | state = memory.reentry_state; |
2820 | 22.6k | DEBUG_LOG_GET_OLD_MODE; |
2821 | 22.6k | mode = memory.reentry_mode; |
2822 | 22.6k | DEBUG_LOG_CHANGE_MODE; |
2823 | 22.6k | } |
2824 | 16.9k | else |
2825 | 16.9k | { |
2826 | 16.9k | DEBUG_LOG_ENTER_WITH_NODE(html); |
2827 | 16.9k | } |
2828 | | |
2829 | | /* |
2830 | | This main loop pulls tokens from the lexer until we're out of tokens, |
2831 | | or until there's no more work to do. |
2832 | | */ |
2833 | 105k | while ( state != STATE_COMPLETE ) |
2834 | 104k | { |
2835 | 104k | if ( state == STATE_INITIAL || state == STATE_PRE_BODY ) |
2836 | 43.2k | { |
2837 | 43.2k | node = TY_(GetToken)( doc, IgnoreWhitespace ); |
2838 | 43.2k | DEBUG_LOG_GOT_TOKEN(node); |
2839 | 43.2k | } |
2840 | | |
2841 | 104k | switch ( state ) |
2842 | 104k | { |
2843 | | /************************************************************** |
2844 | | This case is all about finding a head tag and dealing with |
2845 | | cases where we don't, so that we can move on to parsing a head |
2846 | | tag. |
2847 | | **************************************************************/ |
2848 | 17.0k | case STATE_INITIAL: |
2849 | 17.0k | { |
2850 | | /* |
2851 | | The only way we can possibly be here is if the lexer |
2852 | | had nothing to give us. Thus we'll create our own |
2853 | | head, and set the signal to start parsing it. |
2854 | | */ |
2855 | 17.0k | if (node == NULL) |
2856 | 442 | { |
2857 | 442 | node = TY_(InferredTag)(doc, TidyTag_HEAD); |
2858 | 442 | state = STATE_PARSE_HEAD; |
2859 | 442 | continue; |
2860 | 442 | } |
2861 | | |
2862 | | /* We found exactly what we expected: head. */ |
2863 | 16.6k | if ( nodeIsHEAD(node) ) |
2864 | 2 | { |
2865 | 2 | state = STATE_PARSE_HEAD; |
2866 | 2 | continue; |
2867 | 2 | } |
2868 | | |
2869 | | /* We did not expect to find an html closing tag here! */ |
2870 | 16.6k | if (html && (node->tag == html->tag) && (node->type == EndTag)) |
2871 | 0 | { |
2872 | 0 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2873 | 0 | TY_(FreeNode)( doc, node); |
2874 | 0 | continue; |
2875 | 0 | } |
2876 | | |
2877 | | /* Find and discard multiple <html> elements. */ |
2878 | 16.6k | if (html && (node->tag == html->tag) && (node->type == StartTag)) |
2879 | 91 | { |
2880 | 91 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2881 | 91 | TY_(FreeNode)(doc, node); |
2882 | 91 | continue; |
2883 | 91 | } |
2884 | | |
2885 | | /* Deal with comments, etc. */ |
2886 | 16.5k | if (InsertMisc(html, node)) |
2887 | 84 | continue; |
2888 | | |
2889 | | /* |
2890 | | At this point, we didn't find a head tag, so put the |
2891 | | token back and create our own head tag, so we can |
2892 | | move on. |
2893 | | */ |
2894 | 16.4k | TY_(UngetToken)( doc ); |
2895 | 16.4k | node = TY_(InferredTag)(doc, TidyTag_HEAD); |
2896 | 16.4k | state = STATE_PARSE_HEAD; |
2897 | 16.4k | continue; |
2898 | 16.5k | } break; |
2899 | | |
2900 | | |
2901 | | /************************************************************** |
2902 | | This case determines whether we're dealing with body or |
2903 | | frameset + noframes, and sets things up accordingly. |
2904 | | **************************************************************/ |
2905 | 26.1k | case STATE_PRE_BODY: |
2906 | 26.1k | { |
2907 | 26.1k | if (node == NULL ) |
2908 | 836 | { |
2909 | 836 | if (frameset == NULL) /* Implied body. */ |
2910 | 9 | { |
2911 | 9 | node = TY_(InferredTag)(doc, TidyTag_BODY); |
2912 | 9 | state = STATE_PARSE_BODY; |
2913 | 827 | } else { |
2914 | 827 | state = STATE_COMPLETE; |
2915 | 827 | } |
2916 | | |
2917 | 836 | continue; |
2918 | 836 | } |
2919 | | |
2920 | | /* Robustly handle html tags. */ |
2921 | 25.3k | if (node->tag == html->tag) |
2922 | 22 | { |
2923 | 22 | if (node->type != StartTag && frameset == NULL) |
2924 | 16 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2925 | | |
2926 | 22 | TY_(FreeNode)( doc, node); |
2927 | 22 | continue; |
2928 | 22 | } |
2929 | | |
2930 | | /* Deal with comments, etc. */ |
2931 | 25.2k | if (InsertMisc(html, node)) |
2932 | 3.22k | continue; |
2933 | | |
2934 | | /* If frameset document, coerce <body> to <noframes> */ |
2935 | 22.0k | if ( nodeIsBODY(node) ) |
2936 | 2.04k | { |
2937 | 2.04k | if (node->type != StartTag) |
2938 | 11 | { |
2939 | 11 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2940 | 11 | TY_(FreeNode)( doc, node); |
2941 | 11 | continue; |
2942 | 11 | } |
2943 | | |
2944 | 2.02k | if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) |
2945 | 2.02k | { |
2946 | 2.02k | if (frameset != NULL) |
2947 | 2.01k | { |
2948 | 2.01k | TY_(UngetToken)( doc ); |
2949 | | |
2950 | 2.01k | if (noframes == NULL) |
2951 | 2.01k | { |
2952 | 2.01k | noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES); |
2953 | 2.01k | TY_(InsertNodeAtEnd)(frameset, noframes); |
2954 | 2.01k | TY_(Report)(doc, html, noframes, INSERTING_TAG); |
2955 | 2.01k | } |
2956 | 2 | else |
2957 | 2 | { |
2958 | 2 | if (noframes->type == StartEndTag) |
2959 | 0 | noframes->type = StartTag; |
2960 | 2 | } |
2961 | | |
2962 | 2.01k | state = STATE_PARSE_NOFRAMES; |
2963 | 2.01k | continue; |
2964 | 2.01k | } |
2965 | 2.02k | } |
2966 | | |
2967 | 17 | TY_(ConstrainVersion)(doc, ~VERS_FRAMESET); |
2968 | 17 | state = STATE_PARSE_BODY; |
2969 | 17 | continue; |
2970 | 2.02k | } |
2971 | | |
2972 | | /* Flag an error if we see more than one frameset. */ |
2973 | 20.0k | if ( nodeIsFRAMESET(node) ) |
2974 | 3.63k | { |
2975 | 3.63k | if (node->type != StartTag) |
2976 | 24 | { |
2977 | 24 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2978 | 24 | TY_(FreeNode)( doc, node); |
2979 | 24 | continue; |
2980 | 24 | } |
2981 | | |
2982 | 3.60k | if (frameset != NULL) |
2983 | 312 | TY_(Report)(doc, html, node, DUPLICATE_FRAMESET); |
2984 | 3.29k | else |
2985 | 3.29k | frameset = node; |
2986 | | |
2987 | 3.60k | state = STATE_PARSE_FRAMESET; |
2988 | 3.60k | continue; |
2989 | 3.63k | } |
2990 | | |
2991 | | /* If not a frameset document coerce <noframes> to <body>. */ |
2992 | 16.3k | if ( nodeIsNOFRAMES(node) ) |
2993 | 707 | { |
2994 | 707 | if (node->type != StartTag) |
2995 | 0 | { |
2996 | 0 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
2997 | 0 | TY_(FreeNode)( doc, node); |
2998 | 0 | continue; |
2999 | 0 | } |
3000 | | |
3001 | 707 | if (frameset == NULL) |
3002 | 13 | { |
3003 | 13 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
3004 | 13 | TY_(FreeNode)( doc, node); |
3005 | 13 | node = TY_(InferredTag)(doc, TidyTag_BODY); |
3006 | 13 | state = STATE_PARSE_BODY; |
3007 | 13 | continue; |
3008 | 13 | } |
3009 | | |
3010 | 694 | if (noframes == NULL) |
3011 | 688 | { |
3012 | 688 | noframes = node; |
3013 | 688 | TY_(InsertNodeAtEnd)(frameset, noframes); |
3014 | 688 | state = STATE_PARSE_NOFRAMES; |
3015 | 688 | } |
3016 | 6 | else |
3017 | 6 | { |
3018 | 6 | TY_(FreeNode)( doc, node); |
3019 | 6 | } |
3020 | | |
3021 | 694 | continue; |
3022 | 707 | } |
3023 | | |
3024 | | /* Deal with some other element that we're not expecting. */ |
3025 | 15.6k | if (TY_(nodeIsElement)(node)) |
3026 | 13.5k | { |
3027 | 13.5k | if (node->tag && node->tag->model & CM_HEAD) |
3028 | 12 | { |
3029 | 12 | MoveToHead(doc, html, node); |
3030 | 12 | continue; |
3031 | 12 | } |
3032 | | |
3033 | | /* Discard illegal frame element following a frameset. */ |
3034 | 13.5k | if ( frameset != NULL && nodeIsFRAME(node) ) |
3035 | 239 | { |
3036 | 239 | TY_(Report)(doc, html, node, DISCARDING_UNEXPECTED); |
3037 | 239 | TY_(FreeNode)(doc, node); |
3038 | 239 | continue; |
3039 | 239 | } |
3040 | 13.5k | } |
3041 | | |
3042 | 15.4k | TY_(UngetToken)( doc ); |
3043 | | |
3044 | | /* Insert other content into noframes element. */ |
3045 | 15.4k | if (frameset) |
3046 | 1.84k | { |
3047 | 1.84k | if (noframes == NULL) |
3048 | 1.32k | { |
3049 | 1.32k | noframes = TY_(InferredTag)(doc, TidyTag_NOFRAMES); |
3050 | 1.32k | TY_(InsertNodeAtEnd)(frameset, noframes); |
3051 | 1.32k | } |
3052 | 524 | else |
3053 | 524 | { |
3054 | 524 | TY_(Report)(doc, html, node, NOFRAMES_CONTENT); |
3055 | 524 | if (noframes->type == StartEndTag) |
3056 | 221 | noframes->type = StartTag; |
3057 | 524 | } |
3058 | | |
3059 | 1.84k | TY_(ConstrainVersion)(doc, VERS_FRAMESET); |
3060 | 1.84k | state = STATE_PARSE_NOFRAMES; |
3061 | 1.84k | continue; |
3062 | 1.84k | } |
3063 | | |
3064 | 13.5k | node = TY_(InferredTag)(doc, TidyTag_BODY); |
3065 | | |
3066 | | /* Issue #132 - disable inserting BODY tag warning |
3067 | | BUT only if NOT --show-body-only yes */ |
3068 | 27.1k | if (!showingBodyOnly(doc)) |
3069 | 13.5k | TY_(Report)(doc, html, node, INSERTING_TAG ); |
3070 | | |
3071 | 13.5k | TY_(ConstrainVersion)(doc, ~VERS_FRAMESET); |
3072 | 13.5k | state = STATE_PARSE_BODY; |
3073 | 13.5k | continue; |
3074 | 15.4k | } break; |
3075 | | |
3076 | | |
3077 | | /************************************************************** |
3078 | | In this case, we're ready to parse the head, and move on to |
3079 | | look for the body or body alternative. |
3080 | | **************************************************************/ |
3081 | 16.9k | case STATE_PARSE_HEAD: |
3082 | 16.9k | { |
3083 | 16.9k | TidyParserMemory memory = {0}; |
3084 | 16.9k | memory.identity = TY_(ParseHTML); |
3085 | 16.9k | memory.mode = mode; |
3086 | 16.9k | memory.original_node = html; |
3087 | 16.9k | memory.reentry_node = node; |
3088 | 16.9k | memory.reentry_mode = mode; |
3089 | 16.9k | memory.reentry_state = STATE_PARSE_HEAD_REENTER; |
3090 | 16.9k | TY_(InsertNodeAtEnd)(html, node); |
3091 | 16.9k | TY_(pushMemory)( doc, memory ); |
3092 | 16.9k | DEBUG_LOG_EXIT_WITH_NODE(node); |
3093 | 16.9k | return node; |
3094 | 15.4k | } break; |
3095 | | |
3096 | 16.9k | case STATE_PARSE_HEAD_REENTER: |
3097 | 16.9k | { |
3098 | 16.9k | head = node; /* node is the reentry_node stored by STATE_PARSE_HEAD, i.e. the head */ |
3099 | 16.9k | state = STATE_PRE_BODY; |
3100 | 16.9k | } break; |
3101 | | |
3102 | | |
3103 | | /************************************************************** |
3104 | | In this case, we can finally parse a body. |
3105 | | **************************************************************/ |
3106 | 13.6k | case STATE_PARSE_BODY: |
3107 | 13.6k | { |
3108 | 13.6k | TidyParserMemory memory = {0}; |
3109 | 13.6k | memory.identity = NULL; /* we don't need to reenter */ |
3110 | 13.6k | memory.mode = mode; |
3111 | 13.6k | memory.original_node = html; |
3112 | 13.6k | memory.reentry_node = NULL; |
3113 | 13.6k | memory.reentry_mode = mode; |
3114 | 13.6k | memory.reentry_state = STATE_COMPLETE; |
3115 | 13.6k | TY_(InsertNodeAtEnd)(html, node); |
3116 | 13.6k | TY_(pushMemory)( doc, memory ); |
3117 | 13.6k | DEBUG_LOG_EXIT_WITH_NODE(node); |
3118 | 13.6k | return node; |
3119 | 15.4k | } break; |
3120 | | |
3121 | | |
3122 | | /************************************************************** |
3123 | | In this case, we will parse noframes. If necessary, the |
3124 | | node is already inserted in the proper spot. |
3125 | | **************************************************************/ |
3126 | 4.54k | case STATE_PARSE_NOFRAMES: |
3127 | 4.54k | { |
3128 | 4.54k | TidyParserMemory memory = {0}; |
3129 | 4.54k | memory.identity = TY_(ParseHTML); |
3130 | 4.54k | memory.mode = mode; |
3131 | 4.54k | memory.original_node = html; |
3132 | 4.54k | memory.reentry_node = frameset; |
3133 | 4.54k | memory.reentry_mode = mode; |
3134 | 4.54k | memory.reentry_state = STATE_PARSE_NOFRAMES_REENTER; |
3135 | 4.54k | TY_(pushMemory)( doc, memory ); |
3136 | 4.54k | DEBUG_LOG_EXIT_WITH_NODE(node); /* NOTE(review): logs `node`, but `noframes` is what is returned below */ |
3137 | 4.54k | return noframes; |
3138 | 15.4k | } break; |
3139 | | |
3140 | 2.22k | case STATE_PARSE_NOFRAMES_REENTER: |
3141 | 2.22k | { |
3142 | 2.22k | frameset = node; /* node is the reentry_node stored by STATE_PARSE_NOFRAMES, i.e. the frameset */ |
3143 | 2.22k | state = STATE_PRE_BODY; /* NOTE(review): `noframes` is not restored on this path and is NULL here - confirm this is intended */ |
3144 | 2.22k | } break; |
3145 | | |
3146 | | |
3147 | | /************************************************************** |
3148 | | In this case, we parse the frameset, and look for noframes |
3149 | | content to merge later if necessary. |
3150 | | **************************************************************/ |
3151 | 3.60k | case STATE_PARSE_FRAMESET: |
3152 | 3.60k | { |
3153 | 3.60k | TidyParserMemory memory = {0}; |
3154 | 3.60k | memory.identity = TY_(ParseHTML); |
3155 | 3.60k | memory.mode = mode; |
3156 | 3.60k | memory.original_node = html; |
3157 | 3.60k | memory.reentry_node = frameset; |
3158 | 3.60k | memory.reentry_mode = mode; |
3159 | 3.60k | memory.reentry_state = STATE_PARSE_FRAMESET_REENTER; |
3160 | 3.60k | TY_(InsertNodeAtEnd)(html, node); |
3161 | 3.60k | TY_(pushMemory)( doc, memory ); |
3162 | 3.60k | DEBUG_LOG_EXIT_WITH_NODE(node); |
3163 | 3.60k | return node; |
3164 | 15.4k | } break; |
3165 | | |
3166 | 3.46k | case (STATE_PARSE_FRAMESET_REENTER): |
3167 | 3.46k | { |
3168 | 3.46k | frameset = node; /* node is the reentry_node stored by STATE_PARSE_FRAMESET, i.e. the frameset */ |
3169 | | /* |
3170 | | See if it includes a noframes element so that |
3171 | | we can merge subsequent noframes elements. |
3172 | | */ |
3173 | 4.18k | for (node = frameset->content; node; node = node->next) |
3174 | 724 | { |
3175 | 724 | if ( nodeIsNOFRAMES(node) ) |
3176 | 567 | noframes = node; |
3177 | 724 | } |
3178 | 3.46k | state = STATE_PRE_BODY; |
3179 | 3.46k | } break; |
3180 | | |
3181 | | |
3182 | | /************************************************************** |
3183 | | We really shouldn't get here, but if we do, finish nicely. |
3184 | | **************************************************************/ |
3185 | 0 | default: |
3186 | 0 | { |
3187 | 0 | state = STATE_COMPLETE; |
3188 | 0 | } |
3189 | 104k | } /* switch */ |
3190 | 104k | } /* while */ |
3191 | | |
3192 | 827 | DEBUG_LOG_EXIT; |
3193 | 827 | return NULL; |
3194 | 39.5k | } |
3195 | | |
3196 | | |
3197 | | /** MARK: TY_(ParseInline) |
3198 | | * Parse inline element nodes. |
3199 | | * |
3200 | | * This is a non-recursing parser. It uses the document's parser memory stack |
3201 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
3202 | | * This parser is also re-enterable, so that post-processing can occur after |
3203 | | * such dispatching. |
3204 | | */ |
3205 | | Node* TY_(ParseInline)( TidyDocImpl *doc, Node *element, GetTokenMode mode ) |
3206 | 591k | { |
3207 | 591k | Lexer* lexer = doc->lexer; |
3208 | 591k | Node *node = NULL; |
3209 | 591k | Node *parent = NULL; |
3210 | 591k | DEBUG_LOG_COUNTERS; |
3211 | | |
3212 | 591k | if ( element == NULL ) |
3213 | 288k | { |
3214 | 288k | TidyParserMemory memory = TY_(popMemory)( doc ); |
3215 | 288k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
3216 | 288k | DEBUG_LOG_REENTER_WITH_NODE(node); |
3217 | 288k | element = memory.original_node; |
3218 | 288k | DEBUG_LOG_GET_OLD_MODE; |
3219 | 288k | mode = memory.reentry_mode; |
3220 | 288k | DEBUG_LOG_CHANGE_MODE; |
3221 | 288k | } |
3222 | 302k | else |
3223 | 302k | { |
3224 | 302k | DEBUG_LOG_ENTER_WITH_NODE(element); |
3225 | | |
3226 | 302k | if (element->tag->model & CM_EMPTY) |
3227 | 0 | { |
3228 | 0 | DEBUG_LOG_EXIT; |
3229 | 0 | return NULL; |
3230 | 0 | } |
3231 | | |
3232 | | /* |
3233 | | ParseInline is used for some block level elements like H1 to H6 |
3234 | | For such elements we need to insert inline emphasis tags currently |
3235 | | on the inline stack. For Inline elements, we normally push them |
3236 | | onto the inline stack provided they aren't implicit or OBJECT/APPLET. |
3237 | | This test is carried out in PushInline and PopInline, see istack.c |
3238 | | |
3239 | | InlineDup(...) is not called for elements with a CM_MIXED (inline and |
3240 | | block) content model, e.g. <del> or <ins>, otherwise constructs like |
3241 | | |
3242 | | <p>111<a name='foo'>222<del>333</del>444</a>555</p> |
3243 | | <p>111<span>222<del>333</del>444</span>555</p> |
3244 | | <p>111<em>222<del>333</del>444</em>555</p> |
3245 | | |
3246 | | will get corrupted. |
3247 | | */ |
3248 | 302k | if ((TY_(nodeHasCM)(element, CM_BLOCK) || nodeIsDT(element)) && |
3249 | 10.0k | !TY_(nodeHasCM)(element, CM_MIXED)) |
3250 | 10.0k | TY_(InlineDup)(doc, NULL); |
3251 | 292k | else if (TY_(nodeHasCM)(element, CM_INLINE)) |
3252 | 292k | TY_(PushInline)(doc, element); |
3253 | | |
3254 | 302k | if ( nodeIsNOBR(element) ) |
3255 | 0 | doc->badLayout |= USING_NOBR; |
3256 | 302k | else if ( nodeIsFONT(element) ) |
3257 | 275k | doc->badLayout |= USING_FONT; |
3258 | | |
3259 | | /* Inline elements may or may not be within a preformatted element */ |
3260 | 302k | if (mode != Preformatted) |
3261 | 302k | { |
3262 | 302k | DEBUG_LOG_GET_OLD_MODE; |
3263 | 302k | mode = MixedContent; |
3264 | 302k | DEBUG_LOG_CHANGE_MODE; |
3265 | 302k | } |
3266 | 302k | } |
3267 | | |
3268 | 630k | while ((node = TY_(GetToken)(doc, mode)) != NULL) |
3269 | 495k | { |
3270 | | /* end tag for current element */ |
3271 | 495k | if (node->tag == element->tag && node->type == EndTag) |
3272 | 5.42k | { |
3273 | 5.42k | if (element->tag->model & CM_INLINE) |
3274 | 5.29k | TY_(PopInline)( doc, node ); |
3275 | | |
3276 | 5.42k | TY_(FreeNode)( doc, node ); |
3277 | | |
3278 | 5.42k | if (!(mode & Preformatted)) |
3279 | 5.42k | TrimSpaces(doc, element); |
3280 | | |
3281 | | /* |
3282 | | if a font element wraps an anchor and nothing else |
3283 | | then move the font element inside the anchor since |
3284 | | otherwise it won't alter the anchor text color |
3285 | | */ |
3286 | 5.42k | if ( nodeIsFONT(element) && |
3287 | 1.94k | element->content && element->content == element->last ) |
3288 | 1.14k | { |
3289 | 1.14k | Node *child = element->content; |
3290 | | |
3291 | 1.14k | if ( nodeIsA(child) ) |
3292 | 15 | { |
3293 | 15 | child->parent = element->parent; |
3294 | 15 | child->next = element->next; |
3295 | 15 | child->prev = element->prev; |
3296 | | |
3297 | 15 | element->next = NULL; |
3298 | 15 | element->prev = NULL; |
3299 | 15 | element->parent = child; |
3300 | | |
3301 | 15 | element->content = child->content; |
3302 | 15 | element->last = child->last; |
3303 | 15 | child->content = element; |
3304 | | |
3305 | 15 | TY_(FixNodeLinks)(child); |
3306 | 15 | TY_(FixNodeLinks)(element); |
3307 | 15 | } |
3308 | 1.14k | } |
3309 | | |
3310 | 5.42k | element->closed = yes; |
3311 | 5.42k | TrimSpaces( doc, element ); |
3312 | | |
3313 | 5.42k | DEBUG_LOG_EXIT; |
3314 | 5.42k | return NULL; |
3315 | 5.42k | } |
3316 | | |
3317 | | /* <u>...<u> map 2nd <u> to </u> if 1st is explicit */ |
3318 | | /* (see additional conditions below) */ |
3319 | | /* otherwise emphasis nesting is probably unintentional */ |
3320 | | /* big, small, sub, sup have cumulative effect to leave them alone */ |
3321 | 490k | if ( node->type == StartTag |
3322 | 422k | && node->tag == element->tag |
3323 | 279k | && TY_(IsPushed)( doc, node ) |
3324 | 275k | && !node->implicit |
3325 | 26.6k | && !element->implicit |
3326 | 25.4k | && node->tag && (node->tag->model & CM_INLINE) |
3327 | 490k | && !nodeIsA(node) |
3328 | 490k | && !nodeIsFONT(node) |
3329 | 490k | && !nodeIsBIG(node) |
3330 | 490k | && !nodeIsSMALL(node) |
3331 | 490k | && !nodeIsSUB(node) |
3332 | 490k | && !nodeIsSUP(node) |
3333 | 490k | && !nodeIsQ(node) |
3334 | 490k | && !nodeIsSPAN(node) |
3335 | 1.04k | && cfgBool(doc, TidyCoerceEndTags) |
3336 | 490k | ) |
3337 | 1.04k | { |
3338 | | /* proceeds only if "node" does not have any attribute and |
3339 | | follows a text node not finishing with a space */ |
3340 | 1.04k | if (element->content != NULL && node->attributes == NULL |
3341 | 85 | && TY_(nodeIsText)(element->last) |
3342 | 52 | && !TY_(TextNodeEndWithSpace)(doc->lexer, element->last) ) |
3343 | 52 | { |
3344 | 52 | TY_(Report)(doc, element, node, COERCE_TO_ENDTAG); |
3345 | 52 | node->type = EndTag; |
3346 | 52 | TY_(UngetToken)(doc); |
3347 | 52 | continue; |
3348 | 52 | } |
3349 | | |
3350 | 996 | if (node->attributes == NULL || element->attributes == NULL) |
3351 | 842 | TY_(Report)(doc, element, node, NESTED_EMPHASIS); |
3352 | 996 | } |
3353 | 488k | else if ( TY_(IsPushed)(doc, node) && node->type == StartTag && |
3354 | 282k | nodeIsQ(node) ) |
3355 | 1.70k | { |
3356 | | /*\ |
3357 | | * Issue #215 - such nested quotes are NOT a problem if HTML5, so |
3358 | | * only issue this warning if NOT HTML5 mode. |
3359 | | \*/ |
3360 | 1.70k | if (TY_(HTMLVersion)(doc) != HT50) |
3361 | 1.67k | { |
3362 | 1.67k | TY_(Report)(doc, element, node, NESTED_QUOTATION); |
3363 | 1.67k | } |
3364 | 1.70k | } |
3365 | | |
3366 | 489k | if ( TY_(nodeIsText)(node) ) |
3367 | 16.8k | { |
3368 | | /* only called for 1st child */ |
3369 | 16.8k | if ( element->content == NULL && !(mode & Preformatted) ) |
3370 | 10.4k | TrimSpaces( doc, element ); |
3371 | | |
3372 | 16.8k | if ( node->start >= node->end ) |
3373 | 0 | { |
3374 | 0 | TY_(FreeNode)( doc, node ); |
3375 | 0 | continue; |
3376 | 0 | } |
3377 | | |
3378 | 16.8k | TY_(InsertNodeAtEnd)(element, node); |
3379 | 16.8k | continue; |
3380 | 16.8k | } |
3381 | | |
3382 | | /* mixed content model so allow text */ |
3383 | 473k | if (InsertMisc(element, node)) |
3384 | 3.06k | continue; |
3385 | | |
3386 | | /* deal with HTML tags */ |
3387 | 470k | if ( nodeIsHTML(node) ) |
3388 | 959 | { |
3389 | 959 | if ( TY_(nodeIsElement)(node) ) |
3390 | 959 | { |
3391 | 959 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED ); |
3392 | 959 | TY_(FreeNode)( doc, node ); |
3393 | 959 | continue; |
3394 | 959 | } |
3395 | | |
3396 | | /* otherwise infer end of inline element */ |
3397 | 0 | TY_(UngetToken)( doc ); |
3398 | |
|
3399 | 0 | if (!(mode & Preformatted)) |
3400 | 0 | TrimSpaces(doc, element); |
3401 | |
|
3402 | 0 | DEBUG_LOG_EXIT; |
3403 | 0 | return NULL; |
3404 | 959 | } |
3405 | | |
3406 | | /* within <dt> or <pre> map <p> to <br> */ |
3407 | 469k | if ( nodeIsP(node) && |
3408 | 9.35k | node->type == StartTag && |
3409 | 9.21k | ( (mode & Preformatted) || |
3410 | 9.21k | nodeIsDT(element) || |
3411 | 8.39k | DescendantOf(element, TidyTag_DT ) |
3412 | 9.21k | ) |
3413 | 469k | ) |
3414 | 856 | { |
3415 | 856 | node->tag = TY_(LookupTagDef)( TidyTag_BR ); |
3416 | 856 | TidyDocFree(doc, node->element); |
3417 | 856 | node->element = TY_(tmbstrdup)(doc->allocator, "br"); |
3418 | 856 | TrimSpaces(doc, element); |
3419 | 856 | TY_(InsertNodeAtEnd)(element, node); |
3420 | 856 | continue; |
3421 | 856 | } |
3422 | | |
3423 | | /* <p> allowed within <address> in HTML 4.01 Transitional */ |
3424 | 468k | if ( nodeIsP(node) && |
3425 | 8.50k | node->type == StartTag && |
3426 | 8.35k | nodeIsADDRESS(element) ) |
3427 | 0 | { |
3428 | 0 | TY_(ConstrainVersion)( doc, ~VERS_HTML40_STRICT ); |
3429 | 0 | TY_(InsertNodeAtEnd)(element, node); |
3430 | 0 | (*node->tag->parser)( doc, node, mode ); |
3431 | 0 | continue; |
3432 | 0 | } |
3433 | | |
3434 | | /* ignore unknown and PARAM tags */ |
3435 | 468k | if ( node->tag == NULL || nodeIsPARAM(node) ) |
3436 | 11.8k | { |
3437 | 11.8k | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3438 | 11.8k | TY_(FreeNode)( doc, node ); |
3439 | 11.8k | continue; |
3440 | 11.8k | } |
3441 | | |
3442 | 456k | if ( nodeIsBR(node) && node->type == EndTag ) |
3443 | 429 | node->type = StartTag; |
3444 | | |
3445 | 456k | if ( node->type == EndTag ) |
3446 | 41.5k | { |
3447 | | /* coerce </br> to <br> */ |
3448 | 41.5k | if ( nodeIsBR(node) ) |
3449 | 0 | node->type = StartTag; |
3450 | 41.5k | else if ( nodeIsP(node) ) |
3451 | 115 | { |
3452 | | /* coerce unmatched </p> to <br><br> */ |
3453 | 115 | if ( !DescendantOf(element, TidyTag_P) ) |
3454 | 8 | { |
3455 | 8 | TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); |
3456 | 8 | TrimSpaces( doc, element ); |
3457 | 8 | TY_(InsertNodeAtEnd)( element, node ); |
3458 | 8 | node = TY_(InferredTag)(doc, TidyTag_BR); |
3459 | 8 | TY_(InsertNodeAtEnd)( element, node ); /* todo: check this */ |
3460 | 8 | continue; |
3461 | 8 | } |
3462 | 115 | } |
3463 | 41.4k | else if ( TY_(nodeHasCM)(node, CM_INLINE) |
3464 | 41.4k | && !nodeIsA(node) |
3465 | 5.69k | && !TY_(nodeHasCM)(node, CM_OBJECT) |
3466 | 1.67k | && TY_(nodeHasCM)(element, CM_INLINE) ) |
3467 | 1.66k | { |
3468 | | /* allow any inline end tag to end current element */ |
3469 | | |
3470 | | /* http://tidy.sf.net/issue/1426419 */ |
3471 | | /* but, like the browser, retain an earlier inline element. |
3472 | | This is implemented by setting the lexer into a mode |
3473 | | where it gets tokens from the inline stack rather than |
3474 | | from the input stream. Check if the scenario fits. */ |
3475 | 1.66k | if ( !nodeIsA(element) |
3476 | 1.22k | && (node->tag != element->tag) |
3477 | 1.22k | && TY_(IsPushed)( doc, node ) |
3478 | 1.00k | && TY_(IsPushed)( doc, element ) ) |
3479 | 1.00k | { |
3480 | | /* we have something like |
3481 | | <b>bold <i>bold and italic</b> italics</i> */ |
3482 | 1.00k | if ( TY_(SwitchInline)( doc, element, node ) ) |
3483 | 760 | { |
3484 | 760 | TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG); |
3485 | 760 | TY_(UngetToken)( doc ); /* put this back */ |
3486 | 760 | TY_(InlineDup1)( doc, NULL, element ); /* dupe the <i>, after </b> */ |
3487 | 760 | if (!(mode & Preformatted)) |
3488 | 760 | TrimSpaces( doc, element ); |
3489 | | |
3490 | 760 | DEBUG_LOG_EXIT; |
3491 | 760 | return NULL; /* close <i>, but will re-open it, after </b> */ |
3492 | 760 | } |
3493 | 1.00k | } |
3494 | 901 | TY_(PopInline)( doc, element ); |
3495 | | |
3496 | 901 | if ( !nodeIsA(element) ) |
3497 | 466 | { |
3498 | 466 | if ( nodeIsA(node) && node->tag != element->tag ) |
3499 | 0 | { |
3500 | 0 | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); |
3501 | 0 | TY_(UngetToken)( doc ); |
3502 | 0 | } |
3503 | 466 | else |
3504 | 466 | { |
3505 | 466 | TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG); |
3506 | 466 | TY_(FreeNode)( doc, node); |
3507 | 466 | } |
3508 | | |
3509 | 466 | if (!(mode & Preformatted)) |
3510 | 466 | TrimSpaces(doc, element); |
3511 | | |
3512 | 466 | DEBUG_LOG_EXIT; |
3513 | 466 | return NULL; |
3514 | 466 | } |
3515 | | |
3516 | | /* if parent is <a> then discard unexpected inline end tag */ |
3517 | 435 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3518 | 435 | TY_(FreeNode)( doc, node); |
3519 | 435 | continue; |
3520 | 901 | } /* special case </tr> etc. for stuff moved in front of table */ |
3521 | 39.8k | else if ( lexer->exiled |
3522 | 34.4k | && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) |
3523 | 33.3k | { |
3524 | 33.3k | TY_(UngetToken)( doc ); |
3525 | 33.3k | TrimSpaces(doc, element); |
3526 | | |
3527 | 33.3k | DEBUG_LOG_EXIT; |
3528 | 33.3k | return NULL; |
3529 | 33.3k | } |
3530 | 41.5k | } |
3531 | | |
3532 | | /* allow any header tag to end current header */ |
3533 | 421k | if ( TY_(nodeHasCM)(node, CM_HEADING) && TY_(nodeHasCM)(element, CM_HEADING) ) |
3534 | 727 | { |
3535 | | |
3536 | 727 | if ( node->tag == element->tag ) |
3537 | 427 | { |
3538 | 427 | TY_(Report)(doc, element, node, NON_MATCHING_ENDTAG ); |
3539 | 427 | TY_(FreeNode)( doc, node); |
3540 | 427 | } |
3541 | 300 | else |
3542 | 300 | { |
3543 | 300 | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE ); |
3544 | 300 | TY_(UngetToken)( doc ); |
3545 | 300 | } |
3546 | | |
3547 | 727 | if (!(mode & Preformatted)) |
3548 | 727 | TrimSpaces(doc, element); |
3549 | | |
3550 | 727 | DEBUG_LOG_EXIT; |
3551 | 727 | return NULL; |
3552 | 727 | } |
3553 | | |
3554 | | /* |
3555 | | an <A> tag to ends any open <A> element |
3556 | | but <A href=...> is mapped to </A><A href=...> |
3557 | | */ |
3558 | | /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ |
3559 | | /* if (node->tag == doc->tags.tag_a && !node->implicit && TY_(IsPushed)(doc, node)) */ |
3560 | 420k | if ( nodeIsA(node) && !node->implicit && |
3561 | 7.82k | (nodeIsA(element) || DescendantOf(element, TidyTag_A)) ) |
3562 | 5.30k | { |
3563 | | /* coerce <a> to </a> unless it has some attributes */ |
3564 | | /* #427827 - fix by Randy Waki and Bjoern Hoehrmann 23 Aug 00 */ |
3565 | | /* other fixes by Dave Raggett */ |
3566 | | /* if (node->attributes == NULL) */ |
3567 | 5.30k | if (node->type != EndTag && node->attributes == NULL |
3568 | 3.16k | && cfgBool(doc, TidyCoerceEndTags) ) |
3569 | 3.16k | { |
3570 | 3.16k | node->type = EndTag; |
3571 | 3.16k | TY_(Report)(doc, element, node, COERCE_TO_ENDTAG); |
3572 | | /* TY_(PopInline)( doc, node ); */ |
3573 | 3.16k | TY_(UngetToken)( doc ); |
3574 | 3.16k | continue; |
3575 | 3.16k | } |
3576 | | |
3577 | 2.13k | TY_(UngetToken)( doc ); |
3578 | 2.13k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); |
3579 | | /* TY_(PopInline)( doc, element ); */ |
3580 | | |
3581 | 2.13k | if (!(mode & Preformatted)) |
3582 | 2.13k | TrimSpaces(doc, element); |
3583 | | |
3584 | 2.13k | DEBUG_LOG_EXIT; |
3585 | 2.13k | return NULL; |
3586 | 5.30k | } |
3587 | | |
3588 | 415k | if (element->tag->model & CM_HEADING) |
3589 | 2.63k | { |
3590 | 2.63k | if ( nodeIsCENTER(node) || nodeIsDIV(node) ) |
3591 | 511 | { |
3592 | 511 | if (!TY_(nodeIsElement)(node)) |
3593 | 152 | { |
3594 | 152 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3595 | 152 | TY_(FreeNode)( doc, node); |
3596 | 152 | continue; |
3597 | 152 | } |
3598 | | |
3599 | 359 | TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN); |
3600 | | |
3601 | | /* insert center as parent if heading is empty */ |
3602 | 359 | if (element->content == NULL) |
3603 | 329 | { |
3604 | 329 | InsertNodeAsParent(element, node); |
3605 | 329 | continue; |
3606 | 329 | } |
3607 | | |
3608 | | /* split heading and make center parent of 2nd part */ |
3609 | 30 | TY_(InsertNodeAfterElement)(element, node); |
3610 | | |
3611 | 30 | if (!(mode & Preformatted)) |
3612 | 30 | TrimSpaces(doc, element); |
3613 | | |
3614 | 30 | element = TY_(CloneNode)( doc, element ); |
3615 | 30 | TY_(InsertNodeAtEnd)(node, element); |
3616 | 30 | continue; |
3617 | 359 | } |
3618 | | |
3619 | 2.12k | if ( nodeIsHR(node) ) |
3620 | 566 | { |
3621 | 566 | if ( !TY_(nodeIsElement)(node) ) |
3622 | 0 | { |
3623 | 0 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3624 | 0 | TY_(FreeNode)( doc, node); |
3625 | 0 | continue; |
3626 | 0 | } |
3627 | | |
3628 | 566 | TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN); |
3629 | | |
3630 | | /* insert hr before heading if heading is empty */ |
3631 | 566 | if (element->content == NULL) |
3632 | 256 | { |
3633 | 256 | TY_(InsertNodeBeforeElement)(element, node); |
3634 | 256 | continue; |
3635 | 256 | } |
3636 | | |
3637 | | /* split heading and insert hr before 2nd part */ |
3638 | 310 | TY_(InsertNodeAfterElement)(element, node); |
3639 | | |
3640 | 310 | if (!(mode & Preformatted)) |
3641 | 310 | TrimSpaces(doc, element); |
3642 | | |
3643 | 310 | element = TY_(CloneNode)( doc, element ); |
3644 | 310 | TY_(InsertNodeAfterElement)(node, element); |
3645 | 310 | continue; |
3646 | 566 | } |
3647 | 2.12k | } |
3648 | | |
3649 | 414k | if ( nodeIsDT(element) ) |
3650 | 6.00k | { |
3651 | 6.00k | if ( nodeIsHR(node) ) |
3652 | 521 | { |
3653 | 521 | Node *dd; |
3654 | 521 | if ( !TY_(nodeIsElement)(node) ) |
3655 | 0 | { |
3656 | 0 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3657 | 0 | TY_(FreeNode)( doc, node); |
3658 | 0 | continue; |
3659 | 0 | } |
3660 | | |
3661 | 521 | TY_(Report)(doc, element, node, TAG_NOT_ALLOWED_IN); |
3662 | 521 | dd = TY_(InferredTag)(doc, TidyTag_DD); |
3663 | | |
3664 | | /* insert hr within dd before dt if dt is empty */ |
3665 | 521 | if (element->content == NULL) |
3666 | 0 | { |
3667 | 0 | TY_(InsertNodeBeforeElement)(element, dd); |
3668 | 0 | TY_(InsertNodeAtEnd)(dd, node); |
3669 | 0 | continue; |
3670 | 0 | } |
3671 | | |
3672 | | /* split dt and insert hr within dd before 2nd part */ |
3673 | 521 | TY_(InsertNodeAfterElement)(element, dd); |
3674 | 521 | TY_(InsertNodeAtEnd)(dd, node); |
3675 | | |
3676 | 521 | if (!(mode & Preformatted)) |
3677 | 521 | TrimSpaces(doc, element); |
3678 | | |
3679 | 521 | element = TY_(CloneNode)( doc, element ); |
3680 | 521 | TY_(InsertNodeAfterElement)(dd, element); |
3681 | 521 | continue; |
3682 | 521 | } |
3683 | 6.00k | } |
3684 | | |
3685 | | |
3686 | | /* |
3687 | | if this is the end tag for an ancestor element |
3688 | | then infer end tag for this element |
3689 | | */ |
3690 | 413k | if (node->type == EndTag) |
3691 | 5.47k | { |
3692 | 5.47k | for (parent = element->parent; |
3693 | 515k | parent != NULL; parent = parent->parent) |
3694 | 514k | { |
3695 | 514k | if (node->tag == parent->tag) |
3696 | 4.61k | { |
3697 | 4.61k | if (!(element->tag->model & CM_OPT) && !element->implicit) |
3698 | 1.28k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); |
3699 | | |
3700 | 4.61k | if( TY_(IsPushedLast)( doc, element, node ) ) |
3701 | 0 | TY_(PopInline)( doc, element ); |
3702 | 4.61k | TY_(UngetToken)( doc ); |
3703 | | |
3704 | 4.61k | if (!(mode & Preformatted)) |
3705 | 4.61k | TrimSpaces(doc, element); |
3706 | | |
3707 | 4.61k | DEBUG_LOG_EXIT; |
3708 | 4.61k | return NULL; |
3709 | 4.61k | } |
3710 | 514k | } |
3711 | 5.47k | } |
3712 | | |
3713 | | /*\ |
3714 | | * block level tags end this element |
3715 | | * Issue #333 - There seems an exception if the element is a 'span', |
3716 | | * and the node just collected is a 'meta'. The 'meta' can not have |
3717 | | * CM_INLINE added, nor can the 'span' have CM_MIXED added without |
3718 | | * big consequences. |
3719 | | * There may be other exceptions to be added... |
3720 | | \*/ |
3721 | 409k | if (!(node->tag->model & CM_INLINE) && |
3722 | 120k | !(element->tag->model & CM_MIXED) && |
3723 | 120k | !(nodeIsSPAN(element) && nodeIsMETA(node)) ) |
3724 | 120k | { |
3725 | 120k | if ( !TY_(nodeIsElement)(node) ) |
3726 | 602 | { |
3727 | 602 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3728 | 602 | TY_(FreeNode)( doc, node); |
3729 | 602 | continue; |
3730 | 602 | } |
3731 | | /* HTML5 */ |
3732 | 120k | if (nodeIsDATALIST(element)) { |
3733 | 0 | TY_(ConstrainVersion)( doc, ~VERS_HTML5 ); |
3734 | 0 | } else |
3735 | 120k | if (!(element->tag->model & CM_OPT)) |
3736 | 113k | TY_(Report)(doc, element, node, MISSING_ENDTAG_BEFORE); |
3737 | | |
3738 | 120k | if (node->tag->model & CM_HEAD && !(node->tag->model & CM_BLOCK)) |
3739 | 1 | { |
3740 | 1 | MoveToHead(doc, element, node); |
3741 | 1 | continue; |
3742 | 1 | } |
3743 | | |
3744 | | /* |
3745 | | prevent anchors from propagating into block tags |
3746 | | except for headings h1 to h6 |
3747 | | */ |
3748 | 120k | if ( nodeIsA(element) ) |
3749 | 1.51k | { |
3750 | 1.51k | if (node->tag && !(node->tag->model & CM_HEADING)) |
3751 | 979 | TY_(PopInline)( doc, element ); |
3752 | 535 | else if (!(element->content)) |
3753 | 463 | { |
3754 | 463 | TY_(DiscardElement)( doc, element ); |
3755 | 463 | TY_(UngetToken)( doc ); |
3756 | | |
3757 | 463 | DEBUG_LOG_EXIT; |
3758 | 463 | return NULL; |
3759 | 463 | } |
3760 | 1.51k | } |
3761 | | |
3762 | 119k | TY_(UngetToken)( doc ); |
3763 | | |
3764 | 119k | if (!(mode & Preformatted)) |
3765 | 119k | TrimSpaces(doc, element); |
3766 | | |
3767 | 119k | DEBUG_LOG_EXIT; |
3768 | 119k | return NULL; |
3769 | 120k | } |
3770 | | |
3771 | | /* parse inline element */ |
3772 | 288k | if (TY_(nodeIsElement)(node)) |
3773 | 288k | { |
3774 | 288k | if (node->implicit) |
3775 | 256k | TY_(Report)(doc, element, node, INSERTING_TAG); |
3776 | | |
3777 | | /* trim white space before <br> */ |
3778 | 288k | if ( nodeIsBR(node) ) |
3779 | 1.91k | TrimSpaces(doc, element); |
3780 | | |
3781 | 288k | TY_(InsertNodeAtEnd)(element, node); |
3782 | | |
3783 | 288k | { |
3784 | 288k | TidyParserMemory memory = {0}; |
3785 | 288k | memory.identity = TY_(ParseInline); |
3786 | 288k | memory.original_node = element; |
3787 | 288k | memory.reentry_node = node; |
3788 | 288k | memory.mode = mode; |
3789 | 288k | memory.reentry_mode = mode; |
3790 | 288k | TY_(pushMemory)( doc, memory ); |
3791 | 288k | DEBUG_LOG_EXIT_WITH_NODE(node); |
3792 | 288k | return node; |
3793 | 288k | } |
3794 | 288k | } |
3795 | | |
3796 | | /* discard unexpected tags */ |
3797 | 249 | TY_(Report)(doc, element, node, DISCARDING_UNEXPECTED); |
3798 | 249 | TY_(FreeNode)( doc, node ); |
3799 | 249 | continue; |
3800 | 288k | } |
3801 | | |
3802 | 135k | if (!(element->tag->model & CM_OPT)) |
3803 | 134k | TY_(Report)(doc, element, node, MISSING_ENDTAG_FOR); |
3804 | | |
3805 | 135k | DEBUG_LOG_EXIT; |
3806 | 135k | return NULL; |
3807 | 591k | } |
3808 | | |
3809 | | |
3810 | | /** MARK: TY_(ParseList) |
3811 | | * Parses list tags. |
3812 | | * |
3813 | | * This is a non-recursing parser. It uses the document's parser memory stack |
3814 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
3815 | | * This parser is also re-enterable, so that post-processing can occur after |
3816 | | * such dispatching. |
3817 | | */ |
3818 | | Node* TY_(ParseList)( TidyDocImpl* doc, Node *list, GetTokenMode ARG_UNUSED(mode) ) |
3819 | 129k | { |
3820 | 129k | Lexer* lexer = doc->lexer; |
3821 | 129k | Node *node = NULL; |
3822 | 129k | Node *parent = NULL; |
3823 | 129k | Node *lastli = NULL;; |
3824 | 129k | Bool wasblock = no; |
3825 | 129k | Bool nodeisOL = nodeIsOL(list); |
3826 | 129k | DEBUG_LOG_COUNTERS; |
3827 | | |
3828 | 129k | if ( list == NULL ) |
3829 | 60.9k | { |
3830 | 60.9k | TidyParserMemory memory = TY_(popMemory)( doc ); |
3831 | 60.9k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
3832 | 60.9k | DEBUG_LOG_REENTER_WITH_NODE(node); |
3833 | 60.9k | list = memory.original_node; |
3834 | 60.9k | DEBUG_LOG_GET_OLD_MODE; |
3835 | 60.9k | mode = memory.mode; |
3836 | 60.9k | DEBUG_LOG_CHANGE_MODE; |
3837 | 60.9k | } |
3838 | 68.2k | else |
3839 | 68.2k | { |
3840 | 68.2k | DEBUG_LOG_ENTER_WITH_NODE(list); |
3841 | | |
3842 | 68.2k | if (list->tag->model & CM_EMPTY) |
3843 | 0 | { |
3844 | 0 | DEBUG_LOG_EXIT; |
3845 | 0 | return NULL; |
3846 | 0 | } |
3847 | 68.2k | } |
3848 | | |
3849 | 129k | lexer->insert = NULL; /* defer implicit inline start tags */ |
3850 | | |
3851 | 134k | while ((node = TY_(GetToken)( doc, IgnoreWhitespace)) != NULL) |
3852 | 82.2k | { |
3853 | 82.2k | Bool foundLI = no; |
3854 | 82.2k | if (node->tag == list->tag && node->type == EndTag) |
3855 | 0 | { |
3856 | 0 | TY_(FreeNode)( doc, node); |
3857 | 0 | list->closed = yes; |
3858 | 0 | DEBUG_LOG_EXIT; |
3859 | 0 | return NULL; |
3860 | 0 | } |
3861 | | |
3862 | | /* deal with comments etc. */ |
3863 | 82.2k | if (InsertMisc(list, node)) |
3864 | 143 | continue; |
3865 | | |
3866 | 82.0k | if (node->type != TextNode && node->tag == NULL) |
3867 | 4.87k | { |
3868 | 4.87k | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
3869 | 4.87k | TY_(FreeNode)( doc, node); |
3870 | 4.87k | continue; |
3871 | 4.87k | } |
3872 | 77.1k | if (lexer && (node->type == TextNode)) |
3873 | 4.15k | { |
3874 | 4.15k | uint ch, ix = node->start; |
3875 | | /* Issue #572 - Skip whitespace. */ |
3876 | 4.15k | while (ix < node->end && (ch = (lexer->lexbuf[ix] & 0xff)) |
3877 | 4.03k | && (ch == ' ' || ch == '\t' || ch == '\r' || ch == '\n')) |
3878 | 0 | ++ix; |
3879 | 4.15k | if (ix >= node->end) |
3880 | 0 | { |
3881 | | /* Issue #572 - Discard if ALL whitespace. */ |
3882 | 0 | TY_(FreeNode)(doc, node); |
3883 | 0 | continue; |
3884 | 0 | } |
3885 | 4.15k | } |
3886 | | |
3887 | | |
3888 | | /* |
3889 | | if this is the end tag for an ancestor element |
3890 | | then infer end tag for this element |
3891 | | */ |
3892 | 77.1k | if (node->type == EndTag) |
3893 | 218 | { |
3894 | 218 | if ( nodeIsFORM(node) ) |
3895 | 0 | { |
3896 | 0 | BadForm( doc ); |
3897 | 0 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
3898 | 0 | TY_(FreeNode)( doc, node ); |
3899 | 0 | continue; |
3900 | 0 | } |
3901 | | |
3902 | 218 | if (TY_(nodeHasCM)(node,CM_INLINE)) |
3903 | 143 | { |
3904 | 143 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
3905 | 143 | TY_(PopInline)( doc, node ); |
3906 | 143 | TY_(FreeNode)( doc, node); |
3907 | 143 | continue; |
3908 | 143 | } |
3909 | | |
3910 | 75 | for ( parent = list->parent; |
3911 | 888 | parent != NULL; parent = parent->parent ) |
3912 | 888 | { |
3913 | | /* Do not match across BODY to avoid infinite loop |
3914 | | between ParseBody and this parser, |
3915 | | See http://tidy.sf.net/bug/1053626. */ |
3916 | 888 | if (nodeIsBODY(parent)) |
3917 | 39 | break; |
3918 | 849 | if (node->tag == parent->tag) |
3919 | 36 | { |
3920 | 36 | TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE); |
3921 | 36 | TY_(UngetToken)( doc ); |
3922 | 36 | DEBUG_LOG_EXIT; |
3923 | 36 | return NULL; |
3924 | 36 | } |
3925 | 849 | } |
3926 | | |
3927 | 39 | TY_(Report)(doc, list, node, DISCARDING_UNEXPECTED); |
3928 | 39 | TY_(FreeNode)( doc, node); |
3929 | 39 | continue; |
3930 | 75 | } |
3931 | | |
3932 | 76.9k | if ( !nodeIsLI(node) && nodeisOL ) |
3933 | 64.2k | { |
3934 | | /* Issue #572 - A <ol><li> can have nested <ol> elements */ |
3935 | 64.2k | foundLI = FindLastLI(list, &lastli); /* find last <li> */ |
3936 | 64.2k | } |
3937 | | |
3938 | 76.9k | if ( nodeIsLI(node) || (TY_(IsHTML5Mode)(doc) && !foundLI) ) |
3939 | 75.7k | { |
3940 | | /* node is <LI> OR |
3941 | | Issue #396 - A <ul> can have Zero or more <li> elements |
3942 | | */ |
3943 | 75.7k | TY_(InsertNodeAtEnd)(list,node); |
3944 | 75.7k | } |
3945 | 1.23k | else |
3946 | 1.23k | { |
3947 | 1.23k | TY_(UngetToken)( doc ); |
3948 | | |
3949 | 1.23k | if (TY_(nodeHasCM)(node,CM_BLOCK) && lexer->excludeBlocks) |
3950 | 310 | { |
3951 | 310 | TY_(Report)(doc, list, node, MISSING_ENDTAG_BEFORE); |
3952 | 310 | DEBUG_LOG_EXIT; |
3953 | 310 | return NULL; |
3954 | 310 | } |
3955 | | /* http://tidy.sf.net/issue/1316307 */ |
3956 | | /* In exiled mode, return so table processing can continue. */ |
3957 | 921 | else if ( lexer->exiled |
3958 | 684 | && (TY_(nodeHasCM)(node, CM_TABLE|CM_ROWGRP|CM_ROW) |
3959 | 684 | || nodeIsTABLE(node)) ) |
3960 | 243 | { |
3961 | 243 | DEBUG_LOG_EXIT; |
3962 | 243 | return NULL; |
3963 | 243 | } |
3964 | | /* http://tidy.sf.net/issue/836462 |
3965 | | If "list" is an unordered list, insert the next tag within |
3966 | | the last <li> to preserve the numbering to match the visual |
3967 | | rendering of most browsers. */ |
3968 | 678 | if ( nodeIsOL(list) && FindLastLI(list, &lastli) ) |
3969 | 2 | { |
3970 | | /* Create a node for error reporting */ |
3971 | 2 | node = TY_(InferredTag)(doc, TidyTag_LI); |
3972 | 2 | TY_(Report)(doc, list, node, MISSING_STARTTAG ); |
3973 | 2 | TY_(FreeNode)( doc, node); |
3974 | 2 | node = lastli; |
3975 | 2 | } |
3976 | 676 | else |
3977 | 676 | { |
3978 | | /* Add an inferred <li> */ |
3979 | 676 | wasblock = TY_(nodeHasCM)(node,CM_BLOCK); |
3980 | 676 | node = TY_(InferredTag)(doc, TidyTag_LI); |
3981 | | /* Add "display: inline" to avoid a blank line after <li> with |
3982 | | Internet Explorer. See http://tidy.sf.net/issue/836462 */ |
3983 | 676 | TY_(AddStyleProperty)( doc, node, |
3984 | 676 | wasblock |
3985 | 676 | ? "list-style: none; display: inline" |
3986 | 676 | : "list-style: none" |
3987 | 676 | ); |
3988 | 676 | TY_(Report)(doc, list, node, MISSING_STARTTAG ); |
3989 | 676 | TY_(InsertNodeAtEnd)(list,node); |
3990 | 676 | } |
3991 | 678 | } |
3992 | | |
3993 | 76.4k | { |
3994 | 76.4k | TidyParserMemory memory = {0}; |
3995 | 76.4k | memory.identity = TY_(ParseList); |
3996 | 76.4k | memory.original_node = list; |
3997 | 76.4k | memory.reentry_node = node; |
3998 | 76.4k | memory.mode = IgnoreWhitespace; |
3999 | 76.4k | TY_(pushMemory)( doc, memory ); |
4000 | 76.4k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4001 | 76.4k | return node; |
4002 | 76.9k | } |
4003 | 76.9k | } |
4004 | | |
4005 | 52.2k | TY_(Report)(doc, list, node, MISSING_ENDTAG_FOR); |
4006 | 52.2k | DEBUG_LOG_EXIT; |
4007 | 52.2k | return NULL; |
4008 | 129k | } |
4009 | | |
4010 | | |
4011 | | /** MARK: TY_(ParseNamespace) |
4012 | | * Act as a generic XML (sub)tree parser: collect each node and add it |
4013 | | * to the DOM, without any further validation. It's useful for tags that |
4014 | | * have XML-like content, such as `svg` and `math`. |
4015 | | * |
4016 | | * @note Perhaps this is poorly named, as we're not parsing the namespace |
4017 | | * of a particular tag, but a tag with XML-like content. |
4018 | | * |
4019 | | * @todo Add schema- or other-hierarchy-definition-based validation |
4020 | | * of the subtree here. |
4021 | | * |
4022 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4023 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4024 | | * This parser is also re-enterable, so that post-processing can occur after |
4025 | | * such dispatching. |
4026 | | */ |
4027 | | Node* TY_(ParseNamespace)( TidyDocImpl* doc, Node *basenode, GetTokenMode mode ) |
4028 | 628 | { |
4029 | 628 | Lexer* lexer = doc->lexer; |
4030 | 628 | Node *node; |
4031 | 628 | Node *parent = basenode; |
4032 | 628 | uint istackbase; |
4033 | 628 | AttVal* av; /* #130 MathML attr and entity fix! */ |
4034 | | |
4035 | | /* a la <table>: defer popping elements off the inline stack */ |
4036 | 628 | TY_(DeferDup)( doc ); |
4037 | 628 | istackbase = lexer->istackbase; |
4038 | 628 | lexer->istackbase = lexer->istacksize; |
4039 | | |
4040 | 628 | mode = OtherNamespace; /* Preformatted; IgnoreWhitespace; */ |
4041 | | |
4042 | 3.58k | while ((node = TY_(GetToken)(doc, mode)) != NULL) |
4043 | 2.96k | { |
4044 | | /* |
4045 | | fix check to skip action in InsertMisc for regular/empty |
4046 | | nodes, which we don't want here... |
4047 | | |
4048 | | The way we do it here is by checking and processing everything |
4049 | | and only what remains goes into InsertMisc() |
4050 | | */ |
4051 | | |
4052 | | /* is this a close tag? And does it match the current parent node? */ |
4053 | 2.96k | if (node->type == EndTag) |
4054 | 705 | { |
4055 | | /* |
4056 | | to prevent end tags flowing from one 'alternate namespace' we |
4057 | | check this in two phases: first we check if the tag is a |
4058 | | descendant of the current node, and when it is, we check whether |
4059 | | it is the end tag for a node /within/ or /outside/ the basenode. |
4060 | | */ |
4061 | 705 | Bool outside; |
4062 | 705 | Node *mp = FindMatchingDescendant(parent, node, basenode, &outside); |
4063 | | |
4064 | 705 | if (mp != NULL) |
4065 | 467 | { |
4066 | | /* |
4067 | | when mp != parent as we might expect, |
4068 | | infer end tags until we 'hit' the matched |
4069 | | parent or the basenode |
4070 | | */ |
4071 | 467 | Node *n; |
4072 | | |
4073 | 467 | for (n = parent; |
4074 | 1.26k | n != NULL && n != basenode->parent && n != mp; |
4075 | 799 | n = n->parent) |
4076 | 799 | { |
4077 | | /* n->implicit = yes; */ |
4078 | 799 | n->closed = yes; |
4079 | 799 | TY_(Report)(doc, n->parent, n, MISSING_ENDTAG_BEFORE); |
4080 | 799 | } |
4081 | | |
4082 | | /* Issue #369 - Since 'assert' is DEBUG only, and there are |
4083 | | simple cases where these can be fired, removing them |
4084 | | pending feedback from the original author! |
4085 | | assert(outside == no ? n == mp : 1); |
4086 | | assert(outside == yes ? n == basenode->parent : 1); |
4087 | | =================================================== */ |
4088 | | |
4089 | 467 | if (outside == no) |
4090 | 467 | { |
4091 | | /* EndTag for a node within the basenode subtree. Roll on... */ |
4092 | 467 | if (n) |
4093 | 467 | n->closed = yes; |
4094 | 467 | TY_(FreeNode)(doc, node); |
4095 | | |
4096 | 467 | node = n; |
4097 | 467 | parent = node ? node->parent : NULL; |
4098 | 467 | } |
4099 | 0 | else |
4100 | 0 | { |
4101 | | /* EndTag for a node outside the basenode subtree: let the caller handle that. */ |
4102 | 0 | TY_(UngetToken)( doc ); |
4103 | 0 | node = basenode; |
4104 | 0 | parent = node->parent; |
4105 | 0 | } |
4106 | | |
4107 | | /* when we've arrived at the end-node for the base node, it's quitting time */ |
4108 | 467 | if (node == basenode) |
4109 | 11 | { |
4110 | 11 | lexer->istackbase = istackbase; |
4111 | 11 | assert(basenode && basenode->closed == yes); |
4112 | 11 | return NULL; |
4113 | 11 | } |
4114 | 467 | } |
4115 | 238 | else |
4116 | 238 | { |
4117 | | /* unmatched close tag: report an error and discard */ |
4118 | | /* TY_(Report)(doc, parent, node, NON_MATCHING_ENDTAG); Issue #308 - Seems wrong warning! */ |
4119 | 238 | TY_(Report)(doc, parent, node, DISCARDING_UNEXPECTED); |
4120 | 238 | assert(parent); |
4121 | | /* assert(parent->tag != node->tag); Issue #308 - Seems would always be true! */ |
4122 | 238 | TY_(FreeNode)( doc, node); /* Issue #308 - Discard unexpected end tag memory */ |
4123 | 238 | } |
4124 | 705 | } |
4125 | 2.26k | else if (node->type == StartTag) |
4126 | 1.61k | { |
4127 | | /* #130 MathML attr and entity fix! |
4128 | | care if it has attributes, and 'accidently' any of those attributes match known */ |
4129 | 1.73k | for ( av = node->attributes; av; av = av->next ) |
4130 | 119 | { |
4131 | 119 | av->dict = 0; /* does something need to be freed? */ |
4132 | 119 | } |
4133 | | /* add another child to the current parent */ |
4134 | 1.61k | TY_(InsertNodeAtEnd)(parent, node); |
4135 | 1.61k | parent = node; |
4136 | 1.61k | } |
4137 | 645 | else |
4138 | 645 | { |
4139 | | /* #130 MathML attr and entity fix! |
4140 | | care if it has attributes, and 'accidently' any of those attributes match known */ |
4141 | 647 | for ( av = node->attributes; av; av = av->next ) |
4142 | 2 | { |
4143 | 2 | av->dict = 0; /* does something need to be freed? */ |
4144 | 2 | } |
4145 | 645 | TY_(InsertNodeAtEnd)(parent, node); |
4146 | 645 | } |
4147 | 2.96k | } |
4148 | | |
4149 | 617 | TY_(Report)(doc, basenode->parent, basenode, MISSING_ENDTAG_FOR); |
4150 | 617 | return NULL; |
4151 | 628 | } |
4152 | | |
4153 | | |
4154 | | /** MARK: TY_(ParseNoFrames) |
4155 | | * Parses the `noframes` tag. |
4156 | | * |
4157 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4158 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4159 | | * This parser is also re-enterable, so that post-processing can occur after |
4160 | | * such dispatching. |
4161 | | */ |
4162 | | Node* TY_(ParseNoFrames)( TidyDocImpl* doc, Node *noframes, GetTokenMode mode ) |
4163 | 14.1k | { |
4164 | 14.1k | Lexer* lexer = doc->lexer; |
4165 | 14.1k | Node *node = NULL; |
4166 | 14.1k | Bool body_seen = no; |
4167 | 14.1k | DEBUG_LOG_COUNTERS; |
4168 | | |
4169 | 14.1k | enum parserState { |
4170 | 14.1k | STATE_INITIAL, /* This is the initial state for every parser. */ |
4171 | 14.1k | STATE_POST_NODEISBODY, /* To-do after re-entering after checks. */ |
4172 | 14.1k | STATE_COMPLETE, /* Done with the switch. */ |
4173 | 14.1k | } state = STATE_INITIAL; |
4174 | | |
4175 | | /* |
4176 | | If we're re-entering, then we need to setup from a previous state, |
4177 | | instead of starting fresh. We can pull what we need from the document's |
4178 | | stack. |
4179 | | */ |
4180 | 14.1k | if ( noframes == NULL ) |
4181 | 6.20k | { |
4182 | 6.20k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4183 | 6.20k | node = memory.reentry_node; /* Throwaway, because we replace it entering the loop anyway.*/ |
4184 | 6.20k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4185 | 6.20k | noframes = memory.original_node; |
4186 | 6.20k | state = memory.reentry_state; |
4187 | 6.20k | body_seen = memory.register_1; |
4188 | 6.20k | DEBUG_LOG_GET_OLD_MODE; |
4189 | 6.20k | mode = memory.mode; |
4190 | 6.20k | DEBUG_LOG_CHANGE_MODE; |
4191 | 6.20k | } |
4192 | 7.94k | else |
4193 | 7.94k | { |
4194 | 7.94k | DEBUG_LOG_ENTER_WITH_NODE(noframes); |
4195 | 7.94k | if ( cfg(doc, TidyAccessibilityCheckLevel) == 0 ) |
4196 | 7.94k | { |
4197 | 7.94k | doc->badAccess |= BA_USING_NOFRAMES; |
4198 | 7.94k | } |
4199 | 7.94k | } |
4200 | | |
4201 | 14.1k | mode = IgnoreWhitespace; |
4202 | | |
4203 | 24.2k | while ( state != STATE_COMPLETE ) |
4204 | 20.6k | { |
4205 | 20.6k | if ( state == STATE_INITIAL ) |
4206 | 18.7k | { |
4207 | 18.7k | node = TY_(GetToken)(doc, mode); |
4208 | 18.7k | DEBUG_LOG_GOT_TOKEN(node); |
4209 | 18.7k | } |
4210 | | |
4211 | 20.6k | switch ( state ) |
4212 | 20.6k | { |
4213 | 18.7k | case STATE_INITIAL: |
4214 | 18.7k | { |
4215 | 18.7k | if ( node == NULL ) |
4216 | 3.52k | { |
4217 | 3.52k | state = STATE_COMPLETE; |
4218 | 3.52k | continue; |
4219 | 3.52k | } |
4220 | | |
4221 | 15.2k | if ( node->tag == noframes->tag && node->type == EndTag ) |
4222 | 165 | { |
4223 | 165 | TY_(FreeNode)( doc, node); |
4224 | 165 | noframes->closed = yes; |
4225 | 165 | TrimSpaces(doc, noframes); |
4226 | 165 | DEBUG_LOG_EXIT; |
4227 | 165 | return NULL; |
4228 | 165 | } |
4229 | | |
4230 | 15.0k | if ( nodeIsFRAME(node) || nodeIsFRAMESET(node) ) |
4231 | 1.72k | { |
4232 | 1.72k | TrimSpaces(doc, noframes); |
4233 | 1.72k | if (node->type == EndTag) |
4234 | 276 | { |
4235 | 276 | TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED); |
4236 | 276 | TY_(FreeNode)( doc, node); /* Throw it away */ |
4237 | 276 | } |
4238 | 1.44k | else |
4239 | 1.44k | { |
4240 | 1.44k | TY_(Report)(doc, noframes, node, MISSING_ENDTAG_BEFORE); |
4241 | 1.44k | TY_(UngetToken)( doc ); |
4242 | 1.44k | } |
4243 | 1.72k | DEBUG_LOG_EXIT; |
4244 | 1.72k | return NULL; |
4245 | 1.72k | } |
4246 | | |
4247 | 13.3k | if ( nodeIsHTML(node) ) |
4248 | 22 | { |
4249 | 22 | if (TY_(nodeIsElement)(node)) |
4250 | 22 | TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED); |
4251 | | |
4252 | 22 | TY_(FreeNode)( doc, node); |
4253 | 22 | continue; |
4254 | 22 | } |
4255 | | |
4256 | | /* deal with comments etc. */ |
4257 | 13.3k | if (InsertMisc(noframes, node)) |
4258 | 352 | continue; |
4259 | | |
4260 | 12.9k | if ( nodeIsBODY(node) && node->type == StartTag ) |
4261 | 3.88k | { |
4262 | 3.88k | TidyParserMemory memory = {0}; |
4263 | 3.88k | memory.identity = TY_(ParseNoFrames); |
4264 | 3.88k | memory.original_node = noframes; |
4265 | 3.88k | memory.reentry_node = node; |
4266 | 3.88k | memory.reentry_state = STATE_POST_NODEISBODY; |
4267 | 3.88k | memory.register_1 = lexer->seenEndBody; |
4268 | 3.88k | memory.mode = IgnoreWhitespace; |
4269 | | |
4270 | 3.88k | TY_(InsertNodeAtEnd)(noframes, node); |
4271 | 3.88k | TY_(pushMemory)( doc, memory ); |
4272 | 3.88k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4273 | 3.88k | return node; |
4274 | 3.88k | } |
4275 | | |
4276 | | /* implicit body element inferred */ |
4277 | 9.07k | if (TY_(nodeIsText)(node) || (node->tag && node->type != EndTag)) |
4278 | 4.88k | { |
4279 | 4.88k | Node *body = TY_(FindBody)( doc ); |
4280 | 4.88k | if ( body || lexer->seenEndBody ) |
4281 | 2.80k | { |
4282 | 2.80k | if ( body == NULL ) |
4283 | 30 | { |
4284 | 30 | TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED); |
4285 | 30 | TY_(FreeNode)( doc, node); |
4286 | 30 | continue; |
4287 | 30 | } |
4288 | 2.77k | if ( TY_(nodeIsText)(node) ) |
4289 | 532 | { |
4290 | 532 | TY_(UngetToken)( doc ); |
4291 | 532 | node = TY_(InferredTag)(doc, TidyTag_P); |
4292 | 532 | TY_(Report)(doc, noframes, node, CONTENT_AFTER_BODY ); |
4293 | 532 | } |
4294 | 2.77k | TY_(InsertNodeAtEnd)( body, node ); |
4295 | 2.77k | } |
4296 | 2.07k | else |
4297 | 2.07k | { |
4298 | 2.07k | TY_(UngetToken)( doc ); |
4299 | 2.07k | node = TY_(InferredTag)(doc, TidyTag_BODY); |
4300 | 2.07k | if ( cfgBool(doc, TidyXmlOut) ) |
4301 | 1.73k | TY_(Report)(doc, noframes, node, INSERTING_TAG); |
4302 | 2.07k | TY_(InsertNodeAtEnd)( noframes, node ); |
4303 | 2.07k | } |
4304 | | |
4305 | 4.85k | { |
4306 | 4.85k | TidyParserMemory memory = {0}; |
4307 | 4.85k | memory.identity = TY_(ParseNoFrames); |
4308 | 4.85k | memory.original_node = noframes; |
4309 | 4.85k | memory.reentry_node = node; |
4310 | 4.85k | memory.mode = IgnoreWhitespace; /*MixedContent*/ |
4311 | 4.85k | memory.reentry_state = STATE_INITIAL; |
4312 | 4.85k | TY_(pushMemory)( doc, memory ); |
4313 | 4.85k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4314 | 4.85k | return node; |
4315 | 4.88k | } |
4316 | 4.88k | } |
4317 | | |
4318 | | /* discard unexpected end tags */ |
4319 | 4.18k | TY_(Report)(doc, noframes, node, DISCARDING_UNEXPECTED); |
4320 | 4.18k | TY_(FreeNode)( doc, node); |
4321 | 4.18k | } break; |
4322 | | |
4323 | | |
4324 | 1.94k | case STATE_POST_NODEISBODY: |
4325 | 1.94k | { |
4326 | | /* fix for bug http://tidy.sf.net/bug/887259 */ |
4327 | 1.94k | if (body_seen && TY_(FindBody)(doc) != node) |
4328 | 686 | { |
4329 | 686 | TY_(CoerceNode)(doc, node, TidyTag_DIV, no, no); |
4330 | 686 | MoveNodeToBody(doc, node); |
4331 | 686 | } |
4332 | 1.94k | state = STATE_INITIAL; |
4333 | 1.94k | continue; |
4334 | | |
4335 | 9.07k | } break; |
4336 | | |
4337 | | |
4338 | 0 | default: |
4339 | 0 | break; |
4340 | 20.6k | } /* switch */ |
4341 | 20.6k | } /* while */ |
4342 | | |
4343 | 3.52k | TY_(Report)(doc, noframes, node, MISSING_ENDTAG_FOR); |
4344 | 3.52k | DEBUG_LOG_EXIT; |
4345 | 3.52k | return NULL; |
4346 | 14.1k | } |
4347 | | |
4348 | | |
4349 | | /** MARK: TY_(ParseOptGroup) |
4350 | | * Parses the content of an `optgroup` element, passed in as `field`. |
4351 | | * |
4352 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4353 | | * to send subsequent nodes back to the controller for dispatching to parsers: |
4354 | | * an `option` or `optgroup` start tag is appended to `field` and returned after a memory record is pushed. |
4355 | | * This parser is also re-enterable: a call with `field == NULL` pops that record to resume. Returns NULL on the matching end tag or when no tokens remain. |
4356 | | */ |
4357 | | Node* TY_(ParseOptGroup)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) ) |
4358 | 16.8k | { |
4359 | 16.8k | Lexer* lexer = doc->lexer; |
4360 | 16.8k | Node *node; |
4361 | 16.8k | DEBUG_LOG_COUNTERS; |
4362 | | |
4363 | 16.8k | if ( field == NULL ) |
4364 | 8.39k | { |
4365 | 8.39k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4366 | 8.39k | node = memory.reentry_node; /* Throwaway, as the main loop overwrites it anyway. */ |
4367 | 8.39k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4368 | 8.39k | field = memory.original_node; |
4369 | 8.39k | DEBUG_LOG_GET_OLD_MODE; |
4370 | 8.39k | mode = memory.mode; |
4371 | 8.39k | DEBUG_LOG_CHANGE_MODE; |
4372 | 8.39k | } |
4373 | 8.42k | else |
4374 | 8.42k | { |
4375 | 8.42k | DEBUG_LOG_ENTER_WITH_NODE(field); |
4376 | 8.42k | } |
4377 | | |
4378 | 16.8k | lexer->insert = NULL; /* defer implicit inline start tags */ |
4379 | | |
4380 | 17.9k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
4381 | 9.67k | { |
4382 | 9.67k | if (node->tag == field->tag && node->type == EndTag) |
4383 | 157 | { |
4384 | 157 | TY_(FreeNode)( doc, node); |
4385 | 157 | field->closed = yes; |
4386 | 157 | TrimSpaces(doc, field); |
4387 | 157 | DEBUG_LOG_EXIT; |
4388 | 157 | return NULL; |
4389 | 157 | } |
4390 | | |
4391 | | /* deal with comments etc.; when InsertMisc reports success the token is not freed here */ |
4392 | 9.51k | if (InsertMisc(field, node)) |
4393 | 0 | continue; |
4394 | | |
4395 | 9.51k | if ( node->type == StartTag && |
4396 | 9.15k | (nodeIsOPTION(node) || nodeIsOPTGROUP(node)) ) |
4397 | 8.39k | { |
4398 | 8.39k | TidyParserMemory memory = {0}; |
4399 | | |
4400 | 8.39k | if ( nodeIsOPTGROUP(node) ) |
4401 | 8.39k | TY_(Report)(doc, field, node, CANT_BE_NESTED); |
4402 | | |
4403 | 8.39k | TY_(InsertNodeAtEnd)(field, node); |
4404 | | |
4405 | 8.39k | memory.identity = TY_(ParseOptGroup); |
4406 | 8.39k | memory.original_node = field; |
4407 | 8.39k | memory.reentry_node = node; |
4408 | 8.39k | TY_(pushMemory)( doc, memory ); |
4409 | 8.39k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4410 | 8.39k | return node; |
4411 | 8.39k | } |
4412 | | |
4413 | | /* discard unexpected tags: anything that reaches this point is reported and freed */ |
4414 | 1.12k | TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED ); |
4415 | 1.12k | TY_(FreeNode)( doc, node); |
4416 | 1.12k | } |
4417 | 8.26k | DEBUG_LOG_EXIT; |
4418 | 8.26k | return NULL; |
4419 | 16.8k | } |
4420 | | |
4421 | | |
4422 | | /** MARK: TY_(ParsePre) |
4423 | | * Parses the content of a `pre` element, reading tokens in Preformatted mode. |
4424 | | * |
4425 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4426 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4427 | | * This parser is also re-enterable (call with `pre == NULL`), so that post-processing can occur after |
4428 | | * such dispatching; the state saved in the memory record selects where parsing resumes. Returns the node to dispatch, or NULL when done. |
4429 | | */ |
4430 | | Node* TY_(ParsePre)( TidyDocImpl* doc, Node *pre, GetTokenMode ARG_UNUSED(mode) ) |
4431 | 11.2k | { |
4432 | 11.2k | Node *node = NULL; |
4433 | 11.2k | DEBUG_LOG_COUNTERS; |
4434 | | |
4435 | 11.2k | enum parserState { |
4436 | 11.2k | STATE_INITIAL, /* This is the initial state for every parser. */ |
4437 | 11.2k | STATE_RENTRY_ACTION, /* Re-entry after dispatching a node that was moved after the <pre>: open a new inferred <pre> after it. */ |
4438 | 11.2k | STATE_COMPLETE, /* Done with the switch. */ |
4439 | 11.2k | } state = STATE_INITIAL; |
4440 | | |
4441 | | |
4442 | 11.2k | if ( pre == NULL ) |
4443 | 7.52k | { |
4444 | 7.52k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4445 | 7.52k | node = memory.reentry_node; /* Overwritten by the main loop when resuming in STATE_INITIAL; used as the insertion anchor in STATE_RENTRY_ACTION. */ |
4446 | 7.52k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4447 | 7.52k | pre = memory.original_node; |
4448 | 7.52k | state = memory.reentry_state; |
4449 | 7.52k | DEBUG_LOG_GET_OLD_MODE; |
4450 | 7.52k | mode = memory.mode; |
4451 | 7.52k | DEBUG_LOG_CHANGE_MODE; |
4452 | 7.52k | } |
4453 | 3.68k | else |
4454 | 3.68k | { |
4455 | 3.68k | DEBUG_LOG_ENTER_WITH_NODE(pre); |
4456 | 3.68k | if (pre->tag->model & CM_EMPTY) |
4457 | 0 | { |
4458 | 0 | DEBUG_LOG_EXIT; |
4459 | 0 | return NULL; |
4460 | 0 | } |
4461 | 3.68k | } |
4462 | | |
4463 | 11.2k | TY_(InlineDup)( doc, NULL ); /* tell lexer to insert inlines if needed */ |
4464 | | |
4465 | 24.4k | while ( state != STATE_COMPLETE ) |
4466 | 23.5k | { |
4467 | 23.5k | if ( state == STATE_INITIAL ) |
4468 | 17.0k | node = TY_(GetToken)(doc, Preformatted); |
4469 | | |
4470 | 23.5k | switch ( state ) |
4471 | 23.5k | { |
4472 | 17.0k | case STATE_INITIAL: |
4473 | 17.0k | { |
4474 | 17.0k | if ( node == NULL ) |
4475 | 861 | { |
4476 | 861 | state = STATE_COMPLETE; |
4477 | 861 | continue; |
4478 | 861 | } |
4479 | | |
4480 | 16.1k | if ( node->type == EndTag && |
4481 | 250 | (node->tag == pre->tag || DescendantOf(pre, TagId(node))) ) |
4482 | 69 | { |
4483 | 69 | if (nodeIsBODY(node) || nodeIsHTML(node)) |
4484 | 3 | { |
4485 | 3 | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4486 | 3 | TY_(FreeNode)(doc, node); |
4487 | 3 | continue; |
4488 | 3 | } |
4489 | 66 | if (node->tag == pre->tag) |
4490 | 6 | { |
4491 | 6 | TY_(FreeNode)(doc, node); |
4492 | 6 | } |
4493 | 60 | else |
4494 | 60 | { |
4495 | 60 | TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE ); |
4496 | 60 | TY_(UngetToken)( doc ); |
4497 | 60 | } |
4498 | 66 | pre->closed = yes; |
4499 | 66 | TrimSpaces(doc, pre); |
4500 | 66 | DEBUG_LOG_EXIT; |
4501 | 66 | return NULL; |
4502 | 69 | } |
4503 | | |
4504 | 16.1k | if (TY_(nodeIsText)(node)) |
4505 | 891 | { |
4506 | 891 | TY_(InsertNodeAtEnd)(pre, node); |
4507 | 891 | continue; |
4508 | 891 | } |
4509 | | |
4510 | | /* deal with comments etc. */ |
4511 | 15.2k | if (InsertMisc(pre, node)) |
4512 | 745 | continue; |
4513 | | |
4514 | 14.4k | if (node->tag == NULL) |
4515 | 3.43k | { |
4516 | 3.43k | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4517 | 3.43k | TY_(FreeNode)(doc, node); |
4518 | 3.43k | continue; |
4519 | 3.43k | } |
4520 | | |
4521 | | /* strip unexpected tags: anything PreContent() rejects is handled in this branch */ |
4522 | 11.0k | if ( !PreContent(doc, node) ) |
4523 | 9.37k | { |
4524 | | /* fix for http://tidy.sf.net/bug/772205 */ |
4525 | 9.37k | if (node->type == EndTag) |
4526 | 120 | { |
4527 | | /* http://tidy.sf.net/issue/1590220 */ |
4528 | 120 | if ( doc->lexer->exiled |
4529 | 6 | && (TY_(nodeHasCM)(node, CM_TABLE) || nodeIsTABLE(node)) ) |
4530 | 1 | { |
4531 | 1 | TY_(UngetToken)(doc); |
4532 | 1 | TrimSpaces(doc, pre); |
4533 | 1 | DEBUG_LOG_EXIT; |
4534 | 1 | return NULL; |
4535 | 1 | } |
4536 | | |
4537 | 119 | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4538 | 119 | TY_(FreeNode)(doc, node); |
4539 | 119 | continue; |
4540 | 120 | } |
4541 | | /* http://tidy.sf.net/issue/1590220 */ |
4542 | 9.25k | else if (TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) |
4543 | 6.50k | || nodeIsTABLE(node) ) |
4544 | 2.74k | { |
4545 | 2.74k | if (!doc->lexer->exiled) |
4546 | | /* No missing close warning if exiled. */ |
4547 | 83 | TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE); |
4548 | | |
4549 | 2.74k | TY_(UngetToken)(doc); |
4550 | 2.74k | DEBUG_LOG_EXIT; |
4551 | 2.74k | return NULL; |
4552 | 2.74k | } |
4553 | | |
4554 | | /* |
4555 | | This is basically what Tidy 04 August 2000 did and far more accurate |
4556 | | with respect to browser behaviour than the code commented out above. |
4557 | | Tidy could try to propagate the <pre> into each disallowed child where |
4558 | | <pre> is allowed in order to replicate some browsers' behaviour, but |
4559 | | there are a lot of exceptions, e.g. Internet Explorer does not propagate |
4560 | | <pre> into table cells while Mozilla does. Opera 6 never propagates |
4561 | | <pre> into block-level elements while Opera 7 behaves much like Mozilla. |
4562 | | |
4563 | | Tidy thus behaves mostly like Opera 6 except for nested <pre> elements |
4564 | | which are handled the way Mozilla takes them (Opera 6 closes all <pre> after |
4565 | | the first </pre>). |
4566 | | |
4567 | | There are similar issues like replacing <p> in <pre> with <br>, for |
4568 | | example |
4569 | | |
4570 | | <pre>...<p>...</pre> (Input) |
4571 | | <pre>...<br>...</pre> (Tidy) |
4572 | | <pre>...<br>...</pre> (Opera 7 and Internet Explorer) |
4573 | | <pre>...<br><br>...</pre> (Opera 6 and Mozilla) |
4574 | | |
4575 | | <pre>...<p>...</p>...</pre> (Input) |
4576 | | <pre>...<br>......</pre> (Tidy, BUG!) |
4577 | | <pre>...<br>...<br>...</pre> (Internet Explorer) |
4578 | | <pre>...<br><br>...<br><br>...</pre> (Mozilla, Opera 6) |
4579 | | <pre>...<br>...<br><br>...</pre> (Opera 7) |
4580 | | |
4581 | | or something similar, they could also be closing the <pre> and propagating |
4582 | | the <pre> into the newly opened <p>. |
4583 | | |
4584 | | Todo: IMG, OBJECT, APPLET, BIG, SMALL, SUB, SUP, FONT, and BASEFONT are |
4585 | | disallowed in <pre>, Tidy neither detects this nor does it perform any |
4586 | | cleanup operation. Tidy should at least issue a warning if it encounters |
4587 | | such constructs. |
4588 | | |
4589 | | Todo: discarding </p> is obviously a bug, it should be replaced by <br>. |
4590 | | */ |
4591 | 6.50k | TY_(InsertNodeAfterElement)(pre, node); |
4592 | 6.50k | TY_(Report)(doc, pre, node, MISSING_ENDTAG_BEFORE); |
4593 | | |
4594 | 6.50k | { |
4595 | 6.50k | TidyParserMemory memory = {0}; |
4596 | 6.50k | memory.identity = TY_(ParsePre); |
4597 | 6.50k | memory.original_node = pre; |
4598 | 6.50k | memory.reentry_node = node; |
4599 | 6.50k | memory.reentry_state = STATE_RENTRY_ACTION; |
4600 | 6.50k | TY_(pushMemory)( doc, memory ); |
4601 | 6.50k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4602 | 6.50k | return node; |
4603 | 9.37k | } |
4604 | 9.37k | } |
4605 | | |
4606 | 1.66k | if ( nodeIsP(node) ) |
4607 | 611 | { |
4608 | 611 | if (node->type == StartTag) |
4609 | 584 | { |
4610 | 584 | TY_(Report)(doc, pre, node, USING_BR_INPLACE_OF); |
4611 | | |
4612 | | /* trim white space before <p> in <pre> */ |
4613 | 584 | TrimSpaces(doc, pre); |
4614 | | |
4615 | | /* coerce the <p> start tag to <br>; any other <p> token (e.g. </p>) is discarded in the else branch */ |
4616 | 584 | TY_(CoerceNode)(doc, node, TidyTag_BR, no, no); |
4617 | 584 | TY_(FreeAttrs)( doc, node ); /* discard align attribute etc. */ |
4618 | 584 | TY_(InsertNodeAtEnd)( pre, node ); |
4619 | 584 | } |
4620 | 27 | else |
4621 | 27 | { |
4622 | 27 | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4623 | 27 | TY_(FreeNode)( doc, node); |
4624 | 27 | } |
4625 | 611 | continue; |
4626 | 611 | } |
4627 | | |
4628 | 1.05k | if ( TY_(nodeIsElement)(node) ) |
4629 | 1.02k | { |
4630 | | /* trim white space before <br> */ |
4631 | 1.02k | if ( nodeIsBR(node) ) |
4632 | 0 | TrimSpaces(doc, pre); |
4633 | | |
4634 | 1.02k | TY_(InsertNodeAtEnd)(pre, node); |
4635 | | |
4636 | 1.02k | { |
4637 | 1.02k | TidyParserMemory memory = {0}; |
4638 | 1.02k | memory.identity = TY_(ParsePre); |
4639 | 1.02k | memory.original_node = pre; |
4640 | 1.02k | memory.reentry_node = node; |
4641 | 1.02k | memory.reentry_state = STATE_INITIAL; |
4642 | 1.02k | TY_(pushMemory)( doc, memory ); |
4643 | 1.02k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4644 | 1.02k | return node; |
4645 | 1.02k | } |
4646 | 1.02k | } |
4647 | | |
4648 | | /* discard unexpected tags */ |
4649 | 31 | TY_(Report)(doc, pre, node, DISCARDING_UNEXPECTED); |
4650 | 31 | TY_(FreeNode)( doc, node); |
4651 | 31 | } break; |
4652 | | |
4653 | 6.50k | case STATE_RENTRY_ACTION: |
4654 | 6.50k | { |
4655 | 6.50k | Node* newnode = TY_(InferredTag)(doc, TidyTag_PRE); |
4656 | 6.50k | TY_(Report)(doc, pre, newnode, INSERTING_TAG); |
4657 | 6.50k | pre = newnode; |
4658 | 6.50k | TY_(InsertNodeAfterElement)(node, pre); |
4659 | 6.50k | state = STATE_INITIAL; |
4660 | 6.50k | continue; |
4661 | 1.05k | } break; |
4662 | | |
4663 | 0 | default: |
4664 | 0 | break; |
4665 | | |
4666 | 23.5k | } /* switch */ |
4667 | 23.5k | } /* while */ |
4668 | | |
4669 | 861 | TY_(Report)(doc, pre, node, MISSING_ENDTAG_FOR); |
4670 | 861 | DEBUG_LOG_EXIT; |
4671 | 861 | return NULL; |
4672 | 11.2k | } |
4673 | | |
4674 | | |
4675 | | /** MARK: TY_(ParseRow) |
4676 | | * Parses the content of a table row element (`row`); only `td` and `th` cells are kept as children. |
4677 | | * |
4678 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4679 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4680 | | * This parser is also re-enterable (call with `row == NULL`), so that post-processing can occur after |
4681 | | * such dispatching. NOTE(review): the CM_EMPTY early return below has no DEBUG_LOG_EXIT, unlike the same path in ParsePre and ParseRowGroup - confirm whether that is intended. |
4682 | | */ |
4683 | | Node* TY_(ParseRow)( TidyDocImpl* doc, Node *row, GetTokenMode ARG_UNUSED(mode) ) |
4684 | 10.3k | { |
4685 | 10.3k | Lexer* lexer = doc->lexer; |
4686 | 10.3k | Node *node = NULL; |
4687 | 10.3k | Bool exclude_state = no; |
4688 | 10.3k | DEBUG_LOG_COUNTERS; |
4689 | | |
4690 | 10.3k | enum parserState { |
4691 | 10.3k | STATE_INITIAL, /* This is the initial state for every parser. */ |
4692 | 10.3k | STATE_POST_NOT_ENDTAG, /* Re-entry after dispatching a node moved before the table: clear exiled, restore excludeBlocks. */ |
4693 | 10.3k | STATE_POST_TD_TH, /* Re-entry after dispatching a TD/TH cell: restore excludeBlocks, pop the inline stack. */ |
4694 | 10.3k | STATE_COMPLETE, /* Done with the switch. */ |
4695 | 10.3k | } state = STATE_INITIAL; |
4696 | | |
4697 | 10.3k | if ( row == NULL ) |
4698 | 6.03k | { |
4699 | 6.03k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4700 | 6.03k | node = memory.reentry_node; /* Throwaway, as the main loop overwrites it anyway. */ |
4701 | 6.03k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4702 | 6.03k | row = memory.original_node; |
4703 | 6.03k | state = memory.reentry_state; |
4704 | 6.03k | exclude_state = memory.register_1; |
4705 | 6.03k | DEBUG_LOG_GET_OLD_MODE; |
4706 | 6.03k | mode = memory.mode; |
4707 | 6.03k | DEBUG_LOG_CHANGE_MODE; |
4708 | 6.03k | } |
4709 | 4.34k | else |
4710 | 4.34k | { |
4711 | 4.34k | DEBUG_LOG_ENTER_WITH_NODE(row); |
4712 | | |
4713 | 4.34k | if (row->tag->model & CM_EMPTY) |
4714 | 0 | return NULL; |
4715 | 4.34k | } |
4716 | | |
4717 | 25.0k | while ( state != STATE_COMPLETE ) |
4718 | 22.6k | { |
4719 | 22.6k | if ( state == STATE_INITIAL ) |
4720 | 16.6k | { |
4721 | 16.6k | node = TY_(GetToken)( doc, IgnoreWhitespace ); |
4722 | 16.6k | DEBUG_LOG_GOT_TOKEN(node); |
4723 | 16.6k | } |
4724 | | |
4725 | 22.6k | switch (state) |
4726 | 22.6k | { |
4727 | 16.6k | case STATE_INITIAL: |
4728 | 16.6k | { |
4729 | 16.6k | if ( node == NULL) |
4730 | 2.35k | { |
4731 | 2.35k | state = STATE_COMPLETE; |
4732 | 2.35k | continue; |
4733 | 2.35k | } |
4734 | | |
4735 | 14.3k | if (node->tag == row->tag) |
4736 | 1.52k | { |
4737 | 1.52k | if (node->type == EndTag) |
4738 | 509 | { |
4739 | 509 | TY_(FreeNode)( doc, node); |
4740 | 509 | row->closed = yes; |
4741 | 509 | FixEmptyRow( doc, row); |
4742 | 509 | DEBUG_LOG_EXIT; |
4743 | 509 | return NULL; |
4744 | 509 | } |
4745 | | |
4746 | | /* New row start implies end of current row */ |
4747 | 1.01k | TY_(UngetToken)( doc ); |
4748 | 1.01k | FixEmptyRow( doc, row); |
4749 | 1.01k | DEBUG_LOG_EXIT; |
4750 | 1.01k | return NULL; |
4751 | 1.52k | } |
4752 | | |
4753 | | /* |
4754 | | if this is the end tag for an ancestor element |
4755 | | then infer end tag for this element |
4756 | | */ |
4757 | 12.7k | if ( node->type == EndTag ) |
4758 | 764 | { |
4759 | 764 | if ( (TY_(nodeHasCM)(node, CM_HTML|CM_TABLE) || nodeIsTABLE(node)) |
4760 | 202 | && DescendantOf(row, TagId(node)) ) |
4761 | 202 | { |
4762 | 202 | TY_(UngetToken)( doc ); |
4763 | 202 | DEBUG_LOG_EXIT; |
4764 | 202 | return NULL; |
4765 | 202 | } |
4766 | | |
4767 | 562 | if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) ) |
4768 | 408 | { |
4769 | 408 | if ( nodeIsFORM(node) ) |
4770 | 0 | BadForm( doc ); |
4771 | | |
4772 | 408 | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4773 | 408 | TY_(FreeNode)( doc, node); |
4774 | 408 | continue; |
4775 | 408 | } |
4776 | | |
4777 | 154 | if ( nodeIsTD(node) || nodeIsTH(node) ) |
4778 | 119 | { |
4779 | 119 | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4780 | 119 | TY_(FreeNode)( doc, node); |
4781 | 119 | continue; |
4782 | 119 | } |
4783 | 154 | } |
4784 | | |
4785 | | /* deal with comments etc. */ |
4786 | 12.0k | if (InsertMisc(row, node)) |
4787 | 11 | continue; |
4788 | | |
4789 | | /* discard unknown tags */ |
4790 | 12.0k | if (node->tag == NULL && node->type != TextNode) |
4791 | 1.57k | { |
4792 | 1.57k | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4793 | 1.57k | TY_(FreeNode)( doc, node); |
4794 | 1.57k | continue; |
4795 | 1.57k | } |
4796 | | |
4797 | | /* discard unexpected <table> element */ |
4798 | 10.4k | if ( nodeIsTABLE(node) ) |
4799 | 41 | { |
4800 | 41 | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4801 | 41 | TY_(FreeNode)( doc, node); |
4802 | 41 | continue; |
4803 | 41 | } |
4804 | | |
4805 | | /* THEAD, TFOOT or TBODY: put the token back and end this row */ |
4806 | 10.4k | if ( TY_(nodeHasCM)(node, CM_ROWGRP) ) |
4807 | 265 | { |
4808 | 265 | TY_(UngetToken)( doc ); |
4809 | 265 | DEBUG_LOG_EXIT; |
4810 | 265 | return NULL; |
4811 | 265 | } |
4812 | | |
4813 | 10.1k | if (node->type == EndTag) |
4814 | 12 | { |
4815 | 12 | TY_(Report)(doc, row, node, DISCARDING_UNEXPECTED); |
4816 | 12 | TY_(FreeNode)( doc, node); |
4817 | 12 | continue; |
4818 | 12 | } |
4819 | | |
4820 | | /* |
4821 | | if text or inline or block move before table |
4822 | | if head content move to head |
4823 | | (every end tag was already consumed above, so the EndTag test below is always true here) |
4824 | | */ |
4825 | 10.1k | if (node->type != EndTag) |
4826 | 10.1k | { |
4827 | 10.1k | if ( nodeIsFORM(node) ) |
4828 | 0 | { |
4829 | 0 | TY_(UngetToken)( doc ); |
4830 | 0 | node = TY_(InferredTag)(doc, TidyTag_TD); |
4831 | 0 | TY_(Report)(doc, row, node, MISSING_STARTTAG); |
4832 | 0 | } |
4833 | 10.1k | else if ( TY_(nodeIsText)(node) |
4834 | 8.32k | || TY_(nodeHasCM)(node, CM_BLOCK | CM_INLINE) ) |
4835 | 6.86k | { |
4836 | 6.86k | MoveBeforeTable( doc, row, node ); |
4837 | 6.86k | TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN); |
4838 | 6.86k | lexer->exiled = yes; |
4839 | 6.86k | exclude_state = lexer->excludeBlocks; |
4840 | 6.86k | lexer->excludeBlocks = no; |
4841 | | |
4842 | 6.86k | if (node->type != TextNode) |
4843 | 5.03k | { |
4844 | 5.03k | TidyParserMemory memory = {0}; |
4845 | 5.03k | memory.identity = TY_(ParseRow); |
4846 | 5.03k | memory.original_node = row; |
4847 | 5.03k | memory.reentry_node = node; |
4848 | 5.03k | memory.reentry_state = STATE_POST_NOT_ENDTAG; |
4849 | 5.03k | memory.register_1 = exclude_state; |
4850 | 5.03k | TY_(pushMemory)( doc, memory ); |
4851 | 5.03k | DEBUG_LOG_EXIT_WITH_NODE(node); |
4852 | 5.03k | return node; |
4853 | 5.03k | } |
4854 | | |
4855 | 1.82k | lexer->exiled = no; |
4856 | 1.82k | lexer->excludeBlocks = exclude_state; |
4857 | 1.82k | continue; |
4858 | 6.86k | } |
4859 | 3.29k | else if (node->tag->model & CM_HEAD) |
4860 | 7 | { |
4861 | 7 | TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN); |
4862 | 7 | MoveToHead( doc, row, node); |
4863 | 7 | continue; |
4864 | 7 | } |
4865 | 10.1k | } |
4866 | | |
4867 | 3.28k | if ( !(nodeIsTD(node) || nodeIsTH(node)) ) |
4868 | 2.29k | { |
4869 | 2.29k | TY_(Report)(doc, row, node, TAG_NOT_ALLOWED_IN); |
4870 | 2.29k | TY_(FreeNode)( doc, node); |
4871 | 2.29k | continue; |
4872 | 2.29k | } |
4873 | | |
4874 | | /* node should be <TD> or <TH> */ |
4875 | 993 | TY_(InsertNodeAtEnd)(row, node); |
4876 | 993 | exclude_state = lexer->excludeBlocks; |
4877 | 993 | lexer->excludeBlocks = no; |
4878 | 993 | { |
4879 | 993 | TidyParserMemory memory = {0}; |
4880 | 993 | memory.identity = TY_(ParseRow); |
4881 | 993 | memory.original_node = row; |
4882 | 993 | memory.reentry_node = node; |
4883 | 993 | memory.reentry_state = STATE_POST_TD_TH; |
4884 | 993 | memory.register_1 = exclude_state; |
4885 | 993 | TY_(pushMemory)( doc, memory ); |
4886 | 993 | DEBUG_LOG_EXIT_WITH_NODE(node); |
4887 | 993 | return node; |
4888 | 3.28k | } |
4889 | 3.28k | } break; |
4890 | | |
4891 | | |
4892 | 5.03k | case STATE_POST_NOT_ENDTAG: |
4893 | 5.03k | { |
4894 | 5.03k | lexer->exiled = no; |
4895 | 5.03k | lexer->excludeBlocks = exclude_state; /* restore the value that was saved in memory.register_1 before dispatch */ |
4896 | 5.03k | state = STATE_INITIAL; |
4897 | 5.03k | continue; |
4898 | 3.28k | } break; |
4899 | | |
4900 | | |
4901 | 993 | case STATE_POST_TD_TH: |
4902 | 993 | { |
4903 | 993 | lexer->excludeBlocks = exclude_state; /* restore the value that was saved in memory.register_1 before dispatch */ |
4904 | | |
4905 | | /* pop inline stack down to the current istackbase */ |
4906 | 5.75k | while ( lexer->istacksize > lexer->istackbase ) |
4907 | 4.76k | TY_(PopInline)( doc, NULL ); |
4908 | | |
4909 | 993 | state = STATE_INITIAL; |
4910 | 993 | continue; |
4911 | 3.28k | } break; |
4912 | | |
4913 | | |
4914 | 0 | default: |
4915 | 0 | break; |
4916 | | |
4917 | 22.6k | } /* switch */ |
4918 | 22.6k | } /* while */ |
4919 | 2.35k | DEBUG_LOG_EXIT; |
4920 | 2.35k | return NULL; |
4921 | 10.3k | } |
4922 | | |
4923 | | |
4924 | | /** MARK: TY_(ParseRowGroup) |
4925 | | * Parses the content of a row-group element (`rowgroup`); rows are kept as children and a `tr` is inferred where one is missing. |
4926 | | * |
4927 | | * This is a non-recursing parser. It uses the document's parser memory stack |
4928 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
4929 | | * This parser is also re-enterable (call with `rowgroup == NULL`), so that post-processing can occur after |
4930 | | * such dispatching. Returns the node to dispatch next, or NULL when this row group is finished. |
4931 | | */ |
4932 | | Node* TY_(ParseRowGroup)( TidyDocImpl* doc, Node *rowgroup, GetTokenMode ARG_UNUSED(mode) ) |
4933 | 19.1k | { |
4934 | 19.1k | Lexer* lexer = doc->lexer; |
4935 | 19.1k | Node *node = NULL; |
4936 | 19.1k | Node *parent = NULL; |
4937 | 19.1k | DEBUG_LOG_COUNTERS; |
4938 | | |
4939 | 19.1k | enum parserState { |
4940 | 19.1k | STATE_INITIAL, /* This is the initial state for every parser. */ |
4941 | 19.1k | STATE_POST_NOT_TEXTNODE, /* After a node was moved before the table (on re-entry, or directly for text): clear lexer->exiled. */ |
4942 | 19.1k | STATE_COMPLETE, /* Done with the switch. */ |
4943 | 19.1k | } state = STATE_INITIAL; |
4944 | | |
4945 | 19.1k | if ( rowgroup == NULL ) |
4946 | 8.51k | { |
4947 | 8.51k | TidyParserMemory memory = TY_(popMemory)( doc ); |
4948 | 8.51k | node = memory.reentry_node; /* Throwaway, as the main loop overwrites it anyway. */ |
4949 | 8.51k | DEBUG_LOG_REENTER_WITH_NODE(node); |
4950 | 8.51k | rowgroup = memory.original_node; |
4951 | 8.51k | state = memory.reentry_state; |
4952 | 8.51k | DEBUG_LOG_GET_OLD_MODE; |
4953 | 8.51k | mode = memory.mode; |
4954 | 8.51k | DEBUG_LOG_CHANGE_MODE; |
4955 | 8.51k | } |
4956 | 10.5k | else |
4957 | 10.5k | { |
4958 | 10.5k | DEBUG_LOG_ENTER_WITH_NODE(rowgroup); |
4959 | 10.5k | if (rowgroup->tag->model & CM_EMPTY) |
4960 | 0 | { |
4961 | 0 | DEBUG_LOG_EXIT; |
4962 | 0 | return NULL; |
4963 | 0 | } |
4964 | 10.5k | } |
4965 | | |
4966 | 35.9k | while ( state != STATE_COMPLETE ) |
4967 | 29.3k | { |
4968 | 29.3k | if ( state == STATE_INITIAL ) |
4969 | 20.9k | node = TY_(GetToken)(doc, IgnoreWhitespace); |
4970 | | |
4971 | 29.3k | switch (state) |
4972 | 29.3k | { |
4973 | 20.9k | case STATE_INITIAL: |
4974 | 20.9k | { |
4975 | 20.9k | TidyParserMemory memory = {0}; |
4976 | | |
4977 | 20.9k | if (node == NULL) |
4978 | 6.54k | { |
4979 | 6.54k | state = STATE_COMPLETE; |
4980 | 6.54k | continue; |
4981 | 6.54k | } |
4982 | | |
4983 | 14.3k | if (node->tag == rowgroup->tag) |
4984 | 3.81k | { |
4985 | 3.81k | if (node->type == EndTag) |
4986 | 0 | { |
4987 | 0 | rowgroup->closed = yes; |
4988 | 0 | TY_(FreeNode)( doc, node); |
4989 | 0 | DEBUG_LOG_EXIT; |
4990 | 0 | return NULL; |
4991 | 0 | } |
4992 | | |
4993 | 3.81k | TY_(UngetToken)( doc ); |
4994 | 3.81k | DEBUG_LOG_EXIT; |
4995 | 3.81k | return NULL; |
4996 | 3.81k | } |
4997 | | |
4998 | | /* if </table> infer end tag: put the token back and finish this row group */ |
4999 | 10.5k | if ( nodeIsTABLE(node) && node->type == EndTag ) |
5000 | 230 | { |
5001 | 230 | TY_(UngetToken)( doc ); |
5002 | 230 | DEBUG_LOG_EXIT; |
5003 | 230 | return NULL; |
5004 | 230 | } |
5005 | | |
5006 | | /* deal with comments etc. */ |
5007 | 10.3k | if (InsertMisc(rowgroup, node)) |
5008 | 123 | continue; |
5009 | | |
5010 | | /* discard unknown tags */ |
5011 | 10.2k | if (node->tag == NULL && node->type != TextNode) |
5012 | 754 | { |
5013 | 754 | TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED); |
5014 | 754 | TY_(FreeNode)( doc, node); |
5015 | 754 | continue; |
5016 | 754 | } |
5017 | | |
5018 | | /* |
5019 | | if TD or TH then infer <TR> |
5020 | | if text or inline or block move before table |
5021 | | if head content move to head |
5022 | | */ |
5023 | | |
5024 | 9.46k | if (node->type != EndTag) |
5025 | 9.12k | { |
5026 | 9.12k | if ( nodeIsTD(node) || nodeIsTH(node) ) |
5027 | 231 | { |
5028 | 231 | TY_(UngetToken)( doc ); |
5029 | 231 | node = TY_(InferredTag)(doc, TidyTag_TR); |
5030 | 231 | TY_(Report)(doc, rowgroup, node, MISSING_STARTTAG); |
5031 | 231 | } |
5032 | 8.89k | else if ( TY_(nodeIsText)(node) |
5033 | 8.28k | || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) ) |
5034 | 8.44k | { |
5035 | 8.44k | MoveBeforeTable( doc, rowgroup, node ); |
5036 | 8.44k | TY_(Report)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN); |
5037 | 8.44k | lexer->exiled = yes; |
5038 | | |
5039 | 8.44k | if (node->type != TextNode) |
5040 | 7.83k | { |
5041 | 7.83k | memory.identity = TY_(ParseRowGroup); |
5042 | 7.83k | memory.original_node = rowgroup; |
5043 | 7.83k | memory.reentry_node = node; |
5044 | 7.83k | memory.reentry_state = STATE_POST_NOT_TEXTNODE; |
5045 | 7.83k | TY_(pushMemory)( doc, memory ); |
5046 | 7.83k | DEBUG_LOG_EXIT_WITH_NODE(node); |
5047 | 7.83k | return node; |
5048 | 7.83k | } |
5049 | | |
5050 | 608 | state = STATE_POST_NOT_TEXTNODE; |
5051 | 608 | continue; |
5052 | 8.44k | } |
5053 | 450 | else if (node->tag->model & CM_HEAD) |
5054 | 0 | { |
5055 | 0 | TY_(Report)(doc, rowgroup, node, TAG_NOT_ALLOWED_IN); |
5056 | 0 | MoveToHead(doc, rowgroup, node); |
5057 | 0 | continue; |
5058 | 0 | } |
5059 | 9.12k | } |
5060 | | |
5061 | | /* |
5062 | | if this is the end tag for an ancestor element |
5063 | | then infer end tag for this element |
5064 | | */ |
5065 | 1.02k | if (node->type == EndTag) |
5066 | 340 | { |
5067 | 340 | if ( nodeIsFORM(node) || TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) ) |
5068 | 37 | { |
5069 | 37 | if ( nodeIsFORM(node) ) |
5070 | 4 | BadForm( doc ); |
5071 | | |
5072 | 37 | TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED); |
5073 | 37 | TY_(FreeNode)( doc, node); |
5074 | 37 | continue; |
5075 | 37 | } |
5076 | | |
5077 | 303 | if ( nodeIsTR(node) || nodeIsTD(node) || nodeIsTH(node) ) |
5078 | 303 | { |
5079 | 303 | TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED); |
5080 | 303 | TY_(FreeNode)( doc, node); |
5081 | 303 | continue; |
5082 | 303 | } |
5083 | | |
5084 | 0 | for ( parent = rowgroup->parent; |
5085 | 0 | parent != NULL; |
5086 | 0 | parent = parent->parent ) |
5087 | 0 | { |
5088 | 0 | if (node->tag == parent->tag) |
5089 | 0 | { |
5090 | 0 | TY_(UngetToken)( doc ); |
5091 | 0 | DEBUG_LOG_EXIT; |
5092 | 0 | return NULL; |
5093 | 0 | } |
5094 | 0 | } |
5095 | 0 | } |
5096 | | |
5097 | | /* |
5098 | | if THEAD, TFOOT or TBODY then implied end tag |
5099 | | (only a start tag ends this group here; an end tag falls through to the discard below) |
5100 | | */ |
5101 | 681 | if (node->tag->model & CM_ROWGRP) |
5102 | 0 | { |
5103 | 0 | if (node->type != EndTag) |
5104 | 0 | { |
5105 | 0 | TY_(UngetToken)( doc ); |
5106 | 0 | DEBUG_LOG_EXIT; |
5107 | 0 | return NULL; |
5108 | 0 | } |
5109 | 0 | } |
5110 | | |
5111 | 681 | if (node->type == EndTag) |
5112 | 0 | { |
5113 | 0 | TY_(Report)(doc, rowgroup, node, DISCARDING_UNEXPECTED); |
5114 | 0 | TY_(FreeNode)( doc, node); |
5115 | 0 | continue; |
5116 | 0 | } |
5117 | | |
5118 | 681 | if ( !nodeIsTR(node) ) |
5119 | 286 | { |
5120 | 286 | node = TY_(InferredTag)(doc, TidyTag_TR); |
5121 | 286 | TY_(Report)(doc, rowgroup, node, MISSING_STARTTAG); |
5122 | 286 | TY_(UngetToken)( doc ); |
5123 | 286 | } |
5124 | | |
5125 | | /* node should be <TR> (either the token itself or the row inferred just above) */ |
5126 | 681 | TY_(InsertNodeAtEnd)(rowgroup, node); |
5127 | 681 | memory.identity = TY_(ParseRowGroup); |
5128 | 681 | memory.original_node = rowgroup; |
5129 | 681 | memory.reentry_node = node; |
5130 | 681 | memory.reentry_state = STATE_INITIAL; |
5131 | 681 | TY_(pushMemory)( doc, memory ); |
5132 | 681 | DEBUG_LOG_EXIT_WITH_NODE(node); |
5133 | 681 | return node; |
5134 | 681 | } break; |
5135 | | |
5136 | | |
5137 | 8.44k | case STATE_POST_NOT_TEXTNODE: |
5138 | 8.44k | { |
5139 | 8.44k | lexer->exiled = no; |
5140 | 8.44k | state = STATE_INITIAL; |
5141 | 8.44k | continue; |
5142 | 681 | } break; |
5143 | | |
5144 | | |
5145 | 0 | default: |
5146 | 0 | break; |
5147 | 29.3k | } /* switch */ |
5148 | 29.3k | } /* while */ |
5149 | 6.54k | DEBUG_LOG_EXIT; |
5150 | 6.54k | return NULL; |
5151 | 19.1k | } |
5152 | | |
5153 | | |
5154 | | /** MARK: TY_(ParseScript) |
5155 | | * Parses the content of a `script` element: one CdataContent token, then the expected end tag. |
5156 | | * |
5157 | | * @todo This isn't quite right for CDATA content as it recognises tags |
5158 | | * within the content and parses them accordingly. This will unfortunately |
5159 | | * screw up scripts which include: |
5160 | | * < + letter |
5161 | | * < + ! |
5162 | | * < + ? |
5163 | | * < + / + letter |
5164 | | * |
5165 | | * This parser does not recurse. Unlike its sibling parsers it never pushes or |
5166 | | * pops parser memory: both tokens are read in a single call and it always returns NULL. |
5167 | | * `script` is dereferenced unconditionally, so there is no `script == NULL` re-entry path here. |
5168 | | * A token that is not the matching end tag is reported (MISSING_ENDTAG_FOR) and put back for the caller. |
5169 | | */ |
5170 | | Node* TY_(ParseScript)( TidyDocImpl* doc, Node *script, GetTokenMode ARG_UNUSED(mode) ) |
5171 | 2.02k | { |
5172 | 2.02k | Node *node = NULL; |
5173 | | #if defined(ENABLE_DEBUG_LOG) |
5174 | | static int depth_parser = 0; |
5175 | | static int count_parser = 0; |
5176 | | #endif |
5177 | | |
5178 | 2.02k | DEBUG_LOG_ENTER_WITH_NODE(script); |
5179 | | |
5180 | 2.02k | doc->lexer->parent = script; |
5181 | 2.02k | node = TY_(GetToken)(doc, CdataContent); |
5182 | 2.02k | doc->lexer->parent = NULL; |
5183 | | |
5184 | 2.02k | if (node) |
5185 | 2.02k | { |
5186 | 2.02k | TY_(InsertNodeAtEnd)(script, node); |
5187 | 2.02k | } |
5188 | 0 | else |
5189 | 0 | { |
5190 | | /* handle e.g. a document like "<script>": no content token at all */ |
5191 | 0 | TY_(Report)(doc, script, NULL, MISSING_ENDTAG_FOR); |
5192 | 0 | DEBUG_LOG_EXIT; |
5193 | 0 | return NULL; |
5194 | 0 | } |
5195 | | |
5196 | 2.02k | node = TY_(GetToken)(doc, IgnoreWhitespace); |
5197 | 2.02k | DEBUG_LOG_GOT_TOKEN(node); |
5198 | | |
5199 | 2.02k | if (!(node && node->type == EndTag && node->tag && |
5200 | 587 | node->tag->id == script->tag->id)) |
5201 | 1.69k | { |
5202 | 1.69k | TY_(Report)(doc, script, node, MISSING_ENDTAG_FOR); |
5203 | | |
5204 | 1.69k | if (node) |
5205 | 1.01k | TY_(UngetToken)(doc); |
5206 | 1.69k | } |
5207 | 321 | else |
5208 | 321 | { |
5209 | 321 | TY_(FreeNode)(doc, node); |
5210 | 321 | } |
5211 | 2.02k | DEBUG_LOG_EXIT; |
5212 | 2.02k | return NULL; |
5213 | 2.02k | } |
5214 | | |
5215 | | |
5216 | | /** MARK: TY_(ParseSelect) |
5217 | | * Parses the content of a `select` element, passed in as `field`. |
5218 | | * |
5219 | | * This is a non-recursing parser. It uses the document's parser memory stack |
5220 | | * to send subsequent nodes back to the controller for dispatching to parsers: |
5221 | | * an `option`, `optgroup`, `datalist` or `script` start tag is appended to `field` and returned after a memory record is pushed. |
5222 | | * This parser is also re-enterable: a call with `field == NULL` pops that record to resume. Returns NULL on the matching end tag or when no tokens remain. |
5223 | | */ |
5224 | | Node* TY_(ParseSelect)( TidyDocImpl* doc, Node *field, GetTokenMode ARG_UNUSED(mode) ) |
5225 | 275 | { |
5226 | 275 | Lexer* lexer = doc->lexer; |
5227 | 275 | Node *node; |
5228 | 275 | DEBUG_LOG_COUNTERS; |
5229 | | |
5230 | 275 | if ( field == NULL ) |
5231 | 130 | { |
5232 | 130 | TidyParserMemory memory = TY_(popMemory)( doc ); |
5233 | 130 | node = memory.reentry_node; /* Throwaway, as the main loop overwrites it anyway. */ |
5234 | 130 | DEBUG_LOG_REENTER_WITH_NODE(node); |
5235 | 130 | field = memory.original_node; |
5236 | 130 | DEBUG_LOG_GET_OLD_MODE; |
5237 | 130 | mode = memory.mode; |
5238 | 130 | DEBUG_LOG_CHANGE_MODE; |
5239 | 130 | } |
5240 | 145 | else |
5241 | 145 | { |
5242 | 145 | DEBUG_LOG_ENTER_WITH_NODE(field); |
5243 | 145 | } |
5244 | | |
5245 | 275 | lexer->insert = NULL; /* defer implicit inline start tags */ |
5246 | | |
5247 | 794 | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
5248 | 782 | { |
5249 | 782 | if (node->tag == field->tag && node->type == EndTag) |
5250 | 133 | { |
5251 | 133 | TY_(FreeNode)( doc, node); |
5252 | 133 | field->closed = yes; |
5253 | 133 | TrimSpaces(doc, field); |
5254 | | |
5255 | 133 | DEBUG_LOG_EXIT; |
5256 | 133 | return NULL; |
5257 | 133 | } |
5258 | | |
5259 | | /* deal with comments etc.; when InsertMisc reports success the token is not freed here */ |
5260 | 649 | if (InsertMisc(field, node)) |
5261 | 0 | continue; |
5262 | | |
5263 | 649 | if ( node->type == StartTag && |
5264 | 343 | ( nodeIsOPTION(node) || |
5265 | 343 | nodeIsOPTGROUP(node) || |
5266 | 343 | nodeIsDATALIST(node) || |
5267 | 343 | nodeIsSCRIPT(node)) |
5268 | 649 | ) |
5269 | 130 | { |
5270 | 130 | TidyParserMemory memory = {0}; |
5271 | 130 | memory.identity = TY_(ParseSelect); |
5272 | 130 | memory.original_node = field; |
5273 | 130 | memory.reentry_node = node; |
5274 | | |
5275 | 130 | TY_(InsertNodeAtEnd)(field, node); |
5276 | 130 | TY_(pushMemory)( doc, memory ); |
5277 | 130 | DEBUG_LOG_EXIT_WITH_NODE(node); |
5278 | 130 | return node; |
5279 | 130 | } |
5280 | | |
5281 | | /* discard unexpected tags: anything that reaches this point is reported and freed */ |
5282 | 519 | TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED); |
5283 | 519 | TY_(FreeNode)( doc, node); |
5284 | 519 | } |
5285 | | |
5286 | 12 | TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR); |
5287 | | |
5288 | 12 | DEBUG_LOG_EXIT; |
5289 | 12 | return NULL; |
5290 | 275 | } |
5291 | | |
5292 | | |
5293 | | /** MARK: TY_(ParseTableTag) |
5294 | | * Parses the content of a `table` element. |
5295 | | * |
5296 | | * This is a non-recursing parser: child elements are not parsed here. Instead |
5297 | | * the parser's state is pushed onto the document's parser memory stack and the |
5298 | | * child node is returned to the caller. A call with `table == NULL` is a |
5299 | | * re-entry that pops that state and resumes; NULL is returned when finished. |
5300 | | */ |
5301 | | Node* TY_(ParseTableTag)( TidyDocImpl* doc, Node *table, GetTokenMode ARG_UNUSED(mode) ) |
5302 | 15.5k | { |
5303 | 15.5k | Lexer* lexer = doc->lexer; |
5304 | 15.5k | Node *node, *parent; |
5305 | 15.5k | uint istackbase; |
5306 | 15.5k | DEBUG_LOG_COUNTERS; |
5307 | | |
5308 | 15.5k | if ( table == NULL ) /* re-entry: resume from saved parser memory */ |
5309 | 8.75k | { |
5310 | 8.75k | TidyParserMemory memory = TY_(popMemory)( doc ); |
5311 | 8.75k | node = memory.reentry_node; /* Throwaway, as the main loop overwrites it anyway. */ |
5312 | 8.75k | DEBUG_LOG_REENTER_WITH_NODE(node); |
5313 | 8.75k | table = memory.original_node; |
5314 | 8.75k | lexer->exiled = memory.register_1; /* restore the exiled flag recorded at push time */ |
5315 | 8.75k | DEBUG_LOG_GET_OLD_MODE; |
5316 | 8.75k | mode = memory.mode; /* restored, but the loop below always reads with IgnoreWhitespace */ |
5317 | 8.75k | DEBUG_LOG_CHANGE_MODE; |
5318 | 8.75k | } |
5319 | 6.82k | else |
5320 | 6.82k | { |
5321 | 6.82k | DEBUG_LOG_ENTER_WITH_NODE(table); |
5322 | 6.82k | TY_(DeferDup)( doc ); |
5323 | 6.82k | } |
5324 | | |
5325 | 15.5k | istackbase = lexer->istackbase; /* NOTE(review): the paths that return a node do not restore lexer->istackbase, so on re-entry this may capture a base set by an earlier entry - TODO confirm intended */ |
5326 | 15.5k | lexer->istackbase = lexer->istacksize; /* put back from the local copy on every path that returns NULL */ |
5327 | | |
5328 | 26.9k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
5329 | 21.2k | { |
5330 | 21.2k | DEBUG_LOG_GOT_TOKEN(node); |
5331 | 21.2k | if (node->tag == table->tag ) /* a table tag of any type ends this table */ |
5332 | 694 | { |
5333 | 694 | if (node->type == EndTag) |
5334 | 104 | { |
5335 | 104 | TY_(FreeNode)(doc, node); |
5336 | 104 | } |
5337 | 590 | else |
5338 | 590 | { |
5339 | | /* Issue #498 - If a <table> in a <table> |
5340 | | * just close the current table, and issue a |
5341 | | * warning. The previous action was to discard |
5342 | | * this second <table> |
5343 | | */ |
5344 | 590 | TY_(UngetToken)(doc); |
5345 | 590 | TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN); |
5346 | 590 | } |
5347 | 694 | lexer->istackbase = istackbase; |
5348 | 694 | table->closed = yes; |
5349 | | |
5350 | 694 | DEBUG_LOG_EXIT; |
5351 | 694 | return NULL; |
5352 | 694 | } |
5353 | | |
5354 | | /* deal with comments etc. */ |
5355 | 20.5k | if (InsertMisc(table, node)) |
5356 | 3.37k | continue; |
5357 | | |
5358 | | /* discard unknown tags; text nodes also have no tag but are handled below */ |
5359 | 17.1k | if (node->tag == NULL && node->type != TextNode) |
5360 | 2.32k | { |
5361 | 2.32k | TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED); |
5362 | 2.32k | TY_(FreeNode)( doc, node); |
5363 | 2.32k | continue; |
5364 | 2.32k | } |
5365 | | |
5366 | | /* if TD or TH or text or inline or block then infer <TR> */ |
5367 | | |
5368 | 14.8k | if (node->type != EndTag) |
5369 | 14.2k | { |
5370 | 14.2k | if ( nodeIsTD(node) || nodeIsTH(node) || nodeIsTABLE(node) ) |
5371 | 61 | { |
5372 | 61 | TY_(UngetToken)( doc ); /* the pushed-back token is read again after the inferred TR is returned */ |
5373 | 61 | node = TY_(InferredTag)(doc, TidyTag_TR); |
5374 | 61 | TY_(Report)(doc, table, node, MISSING_STARTTAG); |
5375 | 61 | } |
5376 | 14.1k | else if ( TY_(nodeIsText)(node) ||TY_(nodeHasCM)(node,CM_BLOCK|CM_INLINE) ) |
5377 | 6.03k | { |
5378 | 6.03k | TY_(InsertNodeBeforeElement)(table, node); /* text/block/inline content is moved in front of the table */ |
5379 | 6.03k | TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN); |
5380 | 6.03k | lexer->exiled = yes; |
5381 | | |
5382 | 6.03k | if (node->type != TextNode) |
5383 | 576 | { |
5384 | 576 | TidyParserMemory memory = {0}; |
5385 | 576 | memory.identity = TY_(ParseTableTag); |
5386 | 576 | memory.original_node = table; |
5387 | 576 | memory.reentry_node = node; |
5388 | 576 | memory.register_1 = no; /* later, lexer->exiled = no */ |
5389 | 576 | memory.mode = IgnoreWhitespace; |
5390 | 576 | TY_(pushMemory)( doc, memory ); |
5391 | 576 | DEBUG_LOG_EXIT_WITH_NODE(node); |
5392 | 576 | return node; |
5393 | 576 | } |
5394 | | |
5395 | 5.45k | lexer->exiled = no; |
5396 | 5.45k | continue; |
5397 | 6.03k | } |
5398 | 8.15k | else if (node->tag->model & CM_HEAD) |
5399 | 4 | { |
5400 | 4 | MoveToHead(doc, table, node); |
5401 | 4 | continue; |
5402 | 4 | } |
5403 | 14.2k | } |
5404 | | |
5405 | | /* |
5406 | | if this is the end tag for an ancestor element |
5407 | | then infer end tag for this element |
5408 | | */ |
5409 | 8.81k | if (node->type == EndTag) |
5410 | 599 | { |
5411 | 599 | if ( nodeIsFORM(node) ) |
5412 | 153 | { |
5413 | 153 | BadForm( doc ); |
5414 | 153 | TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED); |
5415 | 153 | TY_(FreeNode)( doc, node); |
5416 | 153 | continue; |
5417 | 153 | } |
5418 | | |
5419 | | /* best to discard unexpected block/inline end tags */ |
5420 | 446 | if ( TY_(nodeHasCM)(node, CM_TABLE|CM_ROW) || |
5421 | 430 | TY_(nodeHasCM)(node, CM_BLOCK|CM_INLINE) ) |
5422 | 28 | { |
5423 | 28 | TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED); |
5424 | 28 | TY_(FreeNode)( doc, node); |
5425 | 28 | continue; |
5426 | 28 | } |
5427 | | |
5428 | 418 | for ( parent = table->parent; |
5429 | 973 | parent != NULL; |
5430 | 555 | parent = parent->parent ) |
5431 | 694 | { |
5432 | 694 | if (node->tag == parent->tag) /* end tag belongs to an ancestor: leave it for that ancestor */ |
5433 | 139 | { |
5434 | 139 | TY_(Report)(doc, table, node, MISSING_ENDTAG_BEFORE ); |
5435 | 139 | TY_(UngetToken)( doc ); |
5436 | 139 | lexer->istackbase = istackbase; |
5437 | | |
5438 | 139 | DEBUG_LOG_EXIT; |
5439 | 139 | return NULL; |
5440 | 139 | } |
5441 | 694 | } |
5442 | 418 | } |
5443 | | |
5444 | 8.49k | if (!(node->tag->model & CM_TABLE)) /* not table content: push it back and end the table */ |
5445 | 310 | { |
5446 | 310 | TY_(UngetToken)( doc ); |
5447 | 310 | TY_(Report)(doc, table, node, TAG_NOT_ALLOWED_IN); |
5448 | 310 | lexer->istackbase = istackbase; |
5449 | | |
5450 | 310 | DEBUG_LOG_EXIT; |
5451 | 310 | return NULL; |
5452 | 310 | } |
5453 | | |
5454 | 8.18k | if (TY_(nodeIsElement)(node)) |
5455 | 8.18k | { |
5456 | 8.18k | TidyParserMemory memory = {0}; /* memory.mode is left zeroed on this path */ |
5457 | 8.18k | TY_(InsertNodeAtEnd)(table, node); |
5458 | 8.18k | memory.identity = TY_(ParseTableTag); |
5459 | 8.18k | memory.original_node = table; |
5460 | 8.18k | memory.reentry_node = node; |
5461 | 8.18k | memory.register_1 = lexer->exiled; /* carried across the re-entry */ |
5462 | 8.18k | TY_(pushMemory)( doc, memory ); |
5463 | 8.18k | DEBUG_LOG_EXIT_WITH_NODE(node); |
5464 | 8.18k | return node; |
5465 | 8.18k | } |
5466 | | |
5467 | | /* discard unexpected text nodes and end tags */ |
5468 | 0 | TY_(Report)(doc, table, node, DISCARDING_UNEXPECTED); |
5469 | 0 | TY_(FreeNode)( doc, node); |
5470 | 0 | } |
5471 | | |
5472 | 5.67k | TY_(Report)(doc, table, node, MISSING_ENDTAG_FOR); /* tokens exhausted; node is NULL here */ |
5473 | 5.67k | lexer->istackbase = istackbase; |
5474 | | |
5475 | 5.67k | DEBUG_LOG_EXIT; |
5476 | 5.67k | return NULL; |
5477 | 15.5k | } |
5478 | | |
5479 | | |
5480 | | /** MARK: TY_(ParseText) |
5481 | | * Parses the `option` and `textarea` tags. |
5482 | | * |
5483 | | * The whole element is handled in a single call: this function neither pushes |
5484 | | * nor pops parser memory and always returns NULL. Tokens are read in |
5485 | | * Preformatted mode for `textarea` and in MixedContent mode otherwise, so the |
5486 | | * `mode` value passed in by the caller is overwritten before it is used. |
5487 | | */ |
5488 | | Node* TY_(ParseText)( TidyDocImpl* doc, Node *field, GetTokenMode mode ) |
5489 | 2.48k | { |
5490 | 2.48k | Lexer* lexer = doc->lexer; |
5491 | 2.48k | Node *node; |
5492 | 2.48k | DEBUG_LOG_COUNTERS; |
5493 | | |
5494 | 2.48k | DEBUG_LOG_ENTER_WITH_NODE(field); |
5495 | | |
5496 | 2.48k | lexer->insert = NULL; /* defer implicit inline start tags */ |
5497 | | |
5498 | 2.48k | DEBUG_LOG_GET_OLD_MODE; |
5499 | 2.48k | if ( nodeIsTEXTAREA(field) ) |
5500 | 2.33k | mode = Preformatted; |
5501 | 151 | else |
5502 | 151 | mode = MixedContent; /* kludge for font tags */ |
5503 | 2.48k | DEBUG_LOG_CHANGE_MODE; |
5504 | | |
5505 | 4.90k | while ((node = TY_(GetToken)(doc, mode)) != NULL) |
5506 | 4.69k | { |
5507 | 4.69k | if (node->tag == field->tag && node->type == EndTag) /* matching end tag closes the field */ |
5508 | 0 | { |
5509 | 0 | TY_(FreeNode)( doc, node); |
5510 | 0 | field->closed = yes; |
5511 | 0 | TrimSpaces(doc, field); |
5512 | 0 | DEBUG_LOG_EXIT; |
5513 | 0 | return NULL; |
5514 | 0 | } |
5515 | | |
5516 | | /* deal with comments etc. */ |
5517 | 4.69k | if (InsertMisc(field, node)) |
5518 | 342 | continue; |
5519 | | |
5520 | 4.35k | if (TY_(nodeIsText)(node)) |
5521 | 1.77k | { |
5522 | | /* only done while the field has no content yet, and never in Preformatted mode */ |
5523 | 1.77k | if (field->content == NULL && !(mode & Preformatted)) |
5524 | 133 | TrimSpaces(doc, field); |
5525 | | |
5526 | 1.77k | if (node->start >= node->end) /* empty text node: drop it */ |
5527 | 0 | { |
5528 | 0 | TY_(FreeNode)( doc, node); |
5529 | 0 | continue; |
5530 | 0 | } |
5531 | | |
5532 | 1.77k | TY_(InsertNodeAtEnd)(field, node); |
5533 | 1.77k | continue; |
5534 | 1.77k | } |
5535 | | |
5536 | | /* for textarea should all cases of < and & be escaped? */ |
5537 | | |
5538 | | /* discard inline tags e.g. font, unless the tag is also flagged CM_FIELD */ |
5539 | 2.57k | if ( node->tag |
5540 | 2.26k | && node->tag->model & CM_INLINE |
5541 | 2.17k | && !(node->tag->model & CM_FIELD)) /* #487283 - fix by Lee Passey 25 Jan 02 */ |
5542 | 304 | { |
5543 | 304 | TY_(Report)(doc, field, node, DISCARDING_UNEXPECTED); |
5544 | 304 | TY_(FreeNode)( doc, node); |
5545 | 304 | continue; |
5546 | 304 | } |
5547 | | |
5548 | | /* terminate element on other tags; the report is skipped for CM_OPT fields */ |
5549 | 2.27k | if (!(field->tag->model & CM_OPT)) |
5550 | 2.24k | TY_(Report)(doc, field, node, MISSING_ENDTAG_BEFORE); |
5551 | | |
5552 | 2.27k | TY_(UngetToken)( doc ); |
5553 | 2.27k | TrimSpaces(doc, field); |
5554 | 2.27k | DEBUG_LOG_EXIT; |
5555 | 2.27k | return NULL; |
5556 | 2.57k | } |
5557 | | |
5558 | 214 | if (!(field->tag->model & CM_OPT)) |
5559 | 88 | TY_(Report)(doc, field, node, MISSING_ENDTAG_FOR); /* tokens exhausted; node is NULL here */ |
5560 | 214 | DEBUG_LOG_EXIT; |
5561 | 214 | return NULL; |
5562 | 2.48k | } |
5563 | | |
5564 | | |
5565 | | /** MARK: TY_(ParseTitle) |
5566 | | * Parses the `title` tag. |
5567 | | * |
5568 | | * This is a non-recursing parser. It uses the document's parser memory stack |
5569 | | * to send subsequent nodes back to the controller for dispatching to parsers. |
5570 | | * This parser is also re-enterable, so that post-processing can occur after |
5571 | | * such dispatching. |
5572 | | */ |
5573 | | Node* TY_(ParseTitle)( TidyDocImpl* doc, Node *title, GetTokenMode ARG_UNUSED(mode) ) |
5574 | 203 | { |
5575 | 203 | Node *node; |
5576 | 779 | while ((node = TY_(GetToken)(doc, MixedContent)) != NULL) |
5577 | 754 | { |
5578 | 754 | if (node->tag == title->tag && node->type == StartTag |
5579 | 101 | && cfgBool(doc, TidyCoerceEndTags) ) |
5580 | 101 | { |
5581 | 101 | TY_(Report)(doc, title, node, COERCE_TO_ENDTAG); |
5582 | 101 | node->type = EndTag; |
5583 | 101 | TY_(UngetToken)( doc ); |
5584 | 101 | continue; |
5585 | 101 | } |
5586 | 653 | else if (node->tag == title->tag && node->type == EndTag) |
5587 | 156 | { |
5588 | 156 | TY_(FreeNode)( doc, node); |
5589 | 156 | title->closed = yes; |
5590 | 156 | TrimSpaces(doc, title); |
5591 | 156 | return NULL; |
5592 | 156 | } |
5593 | | |
5594 | 497 | if (TY_(nodeIsText)(node)) |
5595 | 219 | { |
5596 | | /* only called for 1st child */ |
5597 | 219 | if (title->content == NULL) |
5598 | 127 | TrimInitialSpace(doc, title, node); |
5599 | | |
5600 | 219 | if (node->start >= node->end) |
5601 | 50 | { |
5602 | 50 | TY_(FreeNode)( doc, node); |
5603 | 50 | continue; |
5604 | 50 | } |
5605 | | |
5606 | 169 | TY_(InsertNodeAtEnd)(title, node); |
5607 | 169 | continue; |
5608 | 219 | } |
5609 | | |
5610 | | /* deal with comments etc. */ |
5611 | 278 | if (InsertMisc(title, node)) |
5612 | 33 | continue; |
5613 | | |
5614 | | /* discard unknown tags */ |
5615 | 245 | if (node->tag == NULL) |
5616 | 223 | { |
5617 | 223 | TY_(Report)(doc, title, node, DISCARDING_UNEXPECTED); |
5618 | 223 | TY_(FreeNode)( doc, node); |
5619 | 223 | continue; |
5620 | 223 | } |
5621 | | |
5622 | | /* pushback unexpected tokens */ |
5623 | 22 | TY_(Report)(doc, title, node, MISSING_ENDTAG_BEFORE); |
5624 | 22 | TY_(UngetToken)( doc ); |
5625 | 22 | TrimSpaces(doc, title); |
5626 | 22 | return NULL; |
5627 | 245 | } |
5628 | | |
5629 | 25 | TY_(Report)(doc, title, node, MISSING_ENDTAG_FOR); |
5630 | 25 | return NULL; |
5631 | 203 | } |
5632 | | |
5633 | | |
5634 | | /** MARK: ParseXMLElement |
5635 | | * Parses the given XML element. |
5636 | | */ |
5637 | | static Node* ParseXMLElement(TidyDocImpl* doc, Node *element, GetTokenMode mode) |
5638 | 54.4k | { |
5639 | 54.4k | Lexer* lexer = doc->lexer; |
5640 | 54.4k | Node *node; |
5641 | | |
5642 | 54.4k | if ( element == NULL ) |
5643 | 27.0k | { |
5644 | 27.0k | TidyParserMemory memory = TY_(popMemory)( doc ); |
5645 | 27.0k | element = memory.original_node; |
5646 | 27.0k | node = memory.reentry_node; /* Throwaway, as main loop overrwrites anyway. */ |
5647 | 27.0k | mode = memory.reentry_mode; |
5648 | 27.0k | TY_(InsertNodeAtEnd)(element, node); /* The only re-entry action needed. */ |
5649 | 27.0k | } |
5650 | 27.4k | else |
5651 | 27.4k | { |
5652 | | /* if node is pre or has xml:space="preserve" then do so */ |
5653 | 27.4k | if ( TY_(XMLPreserveWhiteSpace)(doc, element) ) |
5654 | 543 | mode = Preformatted; |
5655 | | |
5656 | | /* deal with comments etc. */ |
5657 | 27.4k | InsertMisc( &doc->root, element); |
5658 | | |
5659 | | /* we shouldn't have plain text at this point. */ |
5660 | 27.4k | if (TY_(nodeIsText)(element)) |
5661 | 144 | { |
5662 | 144 | TY_(Report)(doc, &doc->root, element, DISCARDING_UNEXPECTED); |
5663 | 144 | TY_(FreeNode)( doc, element); |
5664 | 144 | return NULL; |
5665 | 144 | } |
5666 | 27.4k | } |
5667 | 66.4k | while ((node = TY_(GetToken)(doc, mode)) != NULL) |
5668 | 40.1k | { |
5669 | 40.1k | if (node->type == EndTag && |
5670 | 1.73k | node->element && element->element && |
5671 | 1.73k | TY_(tmbstrcmp)(node->element, element->element) == 0) |
5672 | 940 | { |
5673 | 940 | TY_(FreeNode)( doc, node); |
5674 | 940 | element->closed = yes; |
5675 | 940 | break; |
5676 | 940 | } |
5677 | | |
5678 | | /* discard unexpected end tags */ |
5679 | 39.1k | if (node->type == EndTag) |
5680 | 795 | { |
5681 | 795 | if (element) |
5682 | 795 | TY_(Report)(doc, element, node, UNEXPECTED_ENDTAG_IN); |
5683 | 0 | else |
5684 | 0 | TY_(Report)(doc, element, node, UNEXPECTED_ENDTAG_ERR); |
5685 | | |
5686 | 795 | TY_(FreeNode)( doc, node); |
5687 | 795 | continue; |
5688 | 795 | } |
5689 | | |
5690 | | /* parse content on seeing start tag */ |
5691 | 38.3k | if (node->type == StartTag) |
5692 | 27.0k | { |
5693 | 27.0k | TidyParserMemory memory = {0}; |
5694 | 27.0k | memory.identity = ParseXMLElement; |
5695 | 27.0k | memory.original_node = element; |
5696 | 27.0k | memory.reentry_node = node; |
5697 | 27.0k | memory.reentry_mode = mode; |
5698 | 27.0k | TY_(pushMemory)( doc, memory ); |
5699 | 27.0k | return node; |
5700 | 27.0k | } |
5701 | | |
5702 | 11.3k | TY_(InsertNodeAtEnd)(element, node); |
5703 | 11.3k | } /* while */ |
5704 | | |
5705 | | /* |
5706 | | if first child is text then trim initial space and |
5707 | | delete text node if it is empty. |
5708 | | */ |
5709 | | |
5710 | 27.2k | node = element->content; |
5711 | | |
5712 | 27.2k | if (TY_(nodeIsText)(node) && mode != Preformatted) |
5713 | 5.39k | { |
5714 | 5.39k | if ( lexer->lexbuf[node->start] == ' ' ) |
5715 | 980 | { |
5716 | 980 | node->start++; |
5717 | | |
5718 | 980 | if (node->start >= node->end) |
5719 | 972 | TY_(DiscardElement)( doc, node ); |
5720 | 980 | } |
5721 | 5.39k | } |
5722 | | |
5723 | | /* |
5724 | | if last child is text then trim final space and |
5725 | | delete the text node if it is empty |
5726 | | */ |
5727 | | |
5728 | 27.2k | node = element->last; |
5729 | | |
5730 | 27.2k | if (TY_(nodeIsText)(node) && mode != Preformatted) |
5731 | 664 | { |
5732 | 664 | if ( lexer->lexbuf[node->end - 1] == ' ' ) |
5733 | 287 | { |
5734 | 287 | node->end--; |
5735 | | |
5736 | 287 | if (node->start >= node->end) |
5737 | 8 | TY_(DiscardElement)( doc, node ); |
5738 | 287 | } |
5739 | 664 | } |
5740 | 27.2k | return NULL; |
5741 | 54.3k | } |
5742 | | |
5743 | | |
5744 | | /***************************************************************************//* |
5745 | | ** MARK: - Post-Parse Operations |
5746 | | ***************************************************************************/ |
5747 | | |
5748 | | |
5749 | | /** |
5750 | | * Performs checking of all attributes recursively starting at `node`. |
5751 | | */ |
5752 | | static void AttributeChecks(TidyDocImpl* doc, Node* node) |
5753 | 379k | { |
5754 | 379k | Node *next; |
5755 | | |
5756 | 864k | while (node) |
5757 | 484k | { |
5758 | 484k | next = node->next; |
5759 | | |
5760 | 484k | if (TY_(nodeIsElement)(node)) |
5761 | 437k | { |
5762 | 437k | if (node->tag && node->tag->chkattrs) /* [i_a]2 fix crash after adding SVG support with alt/unknown tag subtree insertion there */ |
5763 | 19.4k | node->tag->chkattrs(doc, node); |
5764 | 418k | else |
5765 | 418k | TY_(CheckAttributes)(doc, node); |
5766 | 437k | } |
5767 | | |
5768 | 484k | if (node->content) |
5769 | 378k | AttributeChecks(doc, node->content); |
5770 | | |
5771 | 484k | assert( next != node ); /* http://tidy.sf.net/issue/1603538 */ |
5772 | 484k | node = next; |
5773 | 484k | } |
5774 | 379k | } |
5775 | | |
5776 | | |
5777 | | /** |
5778 | | * Encloses naked text in certain elements within `p` tags. |
5779 | | * |
5780 | | * <form>, <blockquote>, and <noscript> do not allow #PCDATA in |
5781 | | * HTML 4.01 Strict (%block; model instead of %flow;). |
5782 | | */ |
5783 | | static void EncloseBlockText(TidyDocImpl* doc, Node* node) |
5784 | 0 | { |
5785 | 0 | Node *next; |
5786 | 0 | Node *block; |
5787 | |
|
5788 | 0 | while (node) |
5789 | 0 | { |
5790 | 0 | next = node->next; |
5791 | |
|
5792 | 0 | if (node->content) |
5793 | 0 | EncloseBlockText(doc, node->content); |
5794 | |
|
5795 | 0 | if (!(nodeIsFORM(node) || nodeIsNOSCRIPT(node) || |
5796 | 0 | nodeIsBLOCKQUOTE(node)) |
5797 | 0 | || !node->content) |
5798 | 0 | { |
5799 | 0 | node = next; |
5800 | 0 | continue; |
5801 | 0 | } |
5802 | | |
5803 | 0 | block = node->content; |
5804 | |
|
5805 | 0 | if ((TY_(nodeIsText)(block) && !TY_(IsBlank)(doc->lexer, block)) || |
5806 | 0 | (TY_(nodeIsElement)(block) && nodeCMIsOnlyInline(block))) |
5807 | 0 | { |
5808 | 0 | Node* p = TY_(InferredTag)(doc, TidyTag_P); |
5809 | 0 | TY_(InsertNodeBeforeElement)(block, p); |
5810 | 0 | while (block && |
5811 | 0 | (!TY_(nodeIsElement)(block) || nodeCMIsOnlyInline(block))) |
5812 | 0 | { |
5813 | 0 | Node* tempNext = block->next; |
5814 | 0 | TY_(RemoveNode)(block); |
5815 | 0 | TY_(InsertNodeAtEnd)(p, block); |
5816 | 0 | block = tempNext; |
5817 | 0 | } |
5818 | 0 | TrimSpaces(doc, p); |
5819 | 0 | continue; |
5820 | 0 | } |
5821 | | |
5822 | 0 | node = next; |
5823 | 0 | } |
5824 | 0 | } |
5825 | | |
5826 | | |
5827 | | /** |
5828 | | * Encloses all naked body text within `p` tags. |
5829 | | */ |
5830 | | static void EncloseBodyText(TidyDocImpl* doc) |
5831 | 0 | { |
5832 | 0 | Node* node; |
5833 | 0 | Node* body = TY_(FindBody)(doc); |
5834 | |
|
5835 | 0 | if (!body) |
5836 | 0 | return; |
5837 | | |
5838 | 0 | node = body->content; |
5839 | |
|
5840 | 0 | while (node) |
5841 | 0 | { |
5842 | 0 | if ((TY_(nodeIsText)(node) && !TY_(IsBlank)(doc->lexer, node)) || |
5843 | 0 | (TY_(nodeIsElement)(node) && nodeCMIsOnlyInline(node))) |
5844 | 0 | { |
5845 | 0 | Node* p = TY_(InferredTag)(doc, TidyTag_P); |
5846 | 0 | TY_(InsertNodeBeforeElement)(node, p); |
5847 | 0 | while (node && (!TY_(nodeIsElement)(node) || nodeCMIsOnlyInline(node))) |
5848 | 0 | { |
5849 | 0 | Node* next = node->next; |
5850 | 0 | TY_(RemoveNode)(node); |
5851 | 0 | TY_(InsertNodeAtEnd)(p, node); |
5852 | 0 | node = next; |
5853 | 0 | } |
5854 | 0 | TrimSpaces(doc, p); |
5855 | 0 | continue; |
5856 | 0 | } |
5857 | 0 | node = node->next; |
5858 | 0 | } |
5859 | 0 | } |
5860 | | |
5861 | | |
5862 | | /** |
5863 | | * Replaces elements that are obsolete with appropriate substitute tags. |
5864 | | */ |
5865 | | static void ReplaceObsoleteElements(TidyDocImpl* doc, Node* node) |
5866 | 379k | { |
5867 | 379k | Node *next; |
5868 | | |
5869 | 864k | while (node) |
5870 | 484k | { |
5871 | 484k | next = node->next; |
5872 | | |
5873 | | /* if (nodeIsDIR(node) || nodeIsMENU(node)) */ |
5874 | | /* HTML5 - <menu ... > is no longer obsolete */ |
5875 | 484k | if (nodeIsDIR(node)) |
5876 | 33 | TY_(CoerceNode)(doc, node, TidyTag_UL, yes, yes); |
5877 | | |
5878 | 484k | if (nodeIsXMP(node) || nodeIsLISTING(node) || |
5879 | 484k | (node->tag && node->tag->id == TidyTag_PLAINTEXT)) |
5880 | 253 | TY_(CoerceNode)(doc, node, TidyTag_PRE, yes, yes); |
5881 | | |
5882 | 484k | if (node->content) |
5883 | 378k | ReplaceObsoleteElements(doc, node->content); |
5884 | | |
5885 | 484k | node = next; |
5886 | 484k | } |
5887 | 379k | } |
5888 | | |
5889 | | |
5890 | | /***************************************************************************//* |
5891 | | ** MARK: - Internal API Implementation |
5892 | | ***************************************************************************/ |
5893 | | |
5894 | | |
5895 | | /** MARK: TY_(CheckNodeIntegrity) |
5896 | | * Is used to perform a node integrity check after parsing an HTML or XML |
5897 | | * document. |
5898 | | * @note Actual performance of this check can be disabled by defining the |
5899 | | * macro NO_NODE_INTEGRITY_CHECK. |
5900 | | */ |
5901 | | Bool TY_(CheckNodeIntegrity)(Node *node) |
5902 | 408k | { |
5903 | 408k | #ifndef NO_NODE_INTEGRITY_CHECK |
5904 | 408k | Node *child; |
5905 | | |
5906 | 408k | if (node->prev) |
5907 | 76.3k | { |
5908 | 76.3k | if (node->prev->next != node) |
5909 | 0 | return no; |
5910 | 76.3k | } |
5911 | | |
5912 | 408k | if (node->next) |
5913 | 76.3k | { |
5914 | 76.3k | if (node->next == node || node->next->prev != node) |
5915 | 0 | return no; |
5916 | 76.3k | } |
5917 | | |
5918 | 408k | if (node->parent) |
5919 | 408k | { |
5920 | 408k | if (node->prev == NULL && node->parent->content != node) |
5921 | 0 | return no; |
5922 | | |
5923 | 408k | if (node->next == NULL && node->parent->last != node) |
5924 | 0 | return no; |
5925 | 408k | } |
5926 | | |
5927 | 816k | for (child = node->content; child; child = child->next) |
5928 | 408k | if ( child->parent != node || !TY_(CheckNodeIntegrity)(child) ) |
5929 | 0 | return no; |
5930 | | |
5931 | 408k | #endif |
5932 | 408k | return yes; |
5933 | 408k | } |
5934 | | |
5935 | | |
5936 | | /** MARK: TY_(CoerceNode) |
5937 | | * Transforms a given node to another element, for example, from a <p> |
5938 | | * to a <br>. |
5939 | | */ |
5940 | | void TY_(CoerceNode)(TidyDocImpl* doc, Node *node, TidyTagId tid, Bool obsolete, Bool unexpected) |
5941 | 1.56k | { |
5942 | 1.56k | const Dict* tag = TY_(LookupTagDef)(tid); |
5943 | 1.56k | Node* tmp = TY_(InferredTag)(doc, tag->id); |
5944 | | |
5945 | 1.56k | if (obsolete) |
5946 | 286 | TY_(Report)(doc, node, tmp, OBSOLETE_ELEMENT); |
5947 | 1.27k | else if (unexpected) |
5948 | 0 | TY_(Report)(doc, node, tmp, REPLACING_UNEX_ELEMENT); |
5949 | 1.27k | else |
5950 | 1.27k | TY_(Report)(doc, node, tmp, REPLACING_ELEMENT); |
5951 | | |
5952 | 1.56k | TidyDocFree(doc, tmp->element); |
5953 | 1.56k | TidyDocFree(doc, tmp); |
5954 | | |
5955 | 1.56k | node->was = node->tag; |
5956 | 1.56k | node->tag = tag; |
5957 | 1.56k | node->type = StartTag; |
5958 | 1.56k | node->implicit = yes; |
5959 | 1.56k | TidyDocFree(doc, node->element); |
5960 | 1.56k | node->element = TY_(tmbstrdup)(doc->allocator, tag->name); |
5961 | 1.56k | } |
5962 | | |
5963 | | |
5964 | | /** MARK: TY_(DiscardElement) |
5965 | | * Remove node from markup tree and discard it. |
5966 | | */ |
5967 | | Node *TY_(DiscardElement)( TidyDocImpl* doc, Node *element ) |
5968 | 113k | { |
5969 | 113k | Node *next = NULL; |
5970 | | |
5971 | 113k | if (element) |
5972 | 113k | { |
5973 | 113k | next = element->next; |
5974 | 113k | TY_(RemoveNode)(element); |
5975 | 113k | TY_(FreeNode)( doc, element); |
5976 | 113k | } |
5977 | | |
5978 | 113k | return next; |
5979 | 113k | } |
5980 | | |
5981 | | |
5982 | | /** MARK: TY_(DropEmptyElements) |
5983 | | * Trims a tree of empty elements recursively, returning the next node. |
5984 | | */ |
5985 | | Node* TY_(DropEmptyElements)(TidyDocImpl* doc, Node* node) |
5986 | 379k | { |
5987 | 379k | Node* next; |
5988 | | |
5989 | 864k | while (node) |
5990 | 484k | { |
5991 | 484k | next = node->next; |
5992 | | |
5993 | 484k | if (node->content) |
5994 | 378k | TY_(DropEmptyElements)(doc, node->content); |
5995 | | |
5996 | 484k | if (!TY_(nodeIsElement)(node) && |
5997 | 47.4k | !(TY_(nodeIsText)(node) && !(node->start < node->end))) |
5998 | 46.8k | { |
5999 | 46.8k | node = next; |
6000 | 46.8k | continue; |
6001 | 46.8k | } |
6002 | | |
6003 | 438k | next = TY_(TrimEmptyElement)(doc, node); |
6004 | 438k | node = next; |
6005 | 438k | } |
6006 | | |
6007 | 379k | return node; |
6008 | 379k | } |
6009 | | |
6010 | | |
6011 | | /** MARK: TY_(InsertNodeAtStart) |
6012 | | * Insert node into markup tree as the first element of content of element. |
6013 | | */ |
6014 | | void TY_(InsertNodeAtStart)(Node *element, Node *node) |
6015 | 452 | { |
6016 | 452 | node->parent = element; |
6017 | | |
6018 | 452 | if (element->content == NULL) |
6019 | 57 | element->last = node; |
6020 | 395 | else |
6021 | 395 | element->content->prev = node; |
6022 | | |
6023 | 452 | node->next = element->content; |
6024 | 452 | node->prev = NULL; |
6025 | 452 | element->content = node; |
6026 | 452 | } |
6027 | | |
6028 | | |
6029 | | /** MARK: TY_(InsertNodeAtEnd) |
6030 | | * Insert node into markup tree as the last element of content of element. |
6031 | | */ |
6032 | | void TY_(InsertNodeAtEnd)(Node *element, Node *node) |
6033 | 595k | { |
6034 | 595k | node->parent = element; |
6035 | 595k | node->prev = element ? element->last : NULL; |
6036 | | |
6037 | 595k | if (element && element->last != NULL) |
6038 | 110k | element->last->next = node; |
6039 | 484k | else |
6040 | 484k | if (element) |
6041 | 484k | element->content = node; |
6042 | | |
6043 | 595k | if (element) |
6044 | 595k | element->last = node; |
6045 | 595k | } |
6046 | | |
6047 | | |
6048 | | /** MARK: TY_(InsertNodeBeforeElement) |
6049 | | * Insert node into markup tree before element. |
6050 | | */ |
6051 | | void TY_(InsertNodeBeforeElement)(Node *element, Node *node) |
6052 | 22.9k | { |
6053 | 22.9k | Node *parent; |
6054 | | |
6055 | 22.9k | parent = element ? element->parent : NULL; |
6056 | 22.9k | node->parent = parent; |
6057 | 22.9k | node->next = element; |
6058 | 22.9k | node->prev = element ? element->prev : NULL; |
6059 | 22.9k | if (element) |
6060 | 22.7k | element->prev = node; |
6061 | | |
6062 | 22.9k | if (node->prev) |
6063 | 14.1k | node->prev->next = node; |
6064 | | |
6065 | 22.9k | if (parent && parent->content == element) |
6066 | 8.38k | parent->content = node; |
6067 | 22.9k | } |
6068 | | |
6069 | | |
6070 | | /** MARK: TY_(InsertNodeAfterElement) |
6071 | | * Insert node into markup tree after element. |
6072 | | */ |
6073 | | void TY_(InsertNodeAfterElement)(Node *element, Node *node) |
6074 | 18.2k | { |
6075 | 18.2k | Node *parent; |
6076 | | |
6077 | 18.2k | parent = element->parent; |
6078 | 18.2k | node->parent = parent; |
6079 | | |
6080 | | /* AQ - 13 Jan 2000 fix for parent == NULL */ |
6081 | 18.2k | if (parent != NULL && parent->last == element) |
6082 | 3.33k | parent->last = node; |
6083 | 14.8k | else |
6084 | 14.8k | { |
6085 | 14.8k | node->next = element->next; |
6086 | | /* AQ - 13 Jan 2000 fix for node->next == NULL */ |
6087 | 14.8k | if (node->next != NULL) |
6088 | 12.5k | node->next->prev = node; |
6089 | 14.8k | } |
6090 | | |
6091 | 18.2k | element->next = node; |
6092 | 18.2k | node->prev = element; |
6093 | 18.2k | } |
6094 | | |
6095 | | |
6096 | | /** MARK: TY_(IsBlank) |
6097 | | * Indicates whether or not a text node is blank, meaning that it consists |
6098 | | * of nothing, or a single space. |
6099 | | */ |
6100 | | Bool TY_(IsBlank)(Lexer *lexer, Node *node) |
6101 | 175 | { |
6102 | 175 | Bool isBlank = TY_(nodeIsText)(node); |
6103 | 175 | if ( isBlank ) |
6104 | 0 | isBlank = ( node->end == node->start || /* Zero length */ |
6105 | 0 | ( node->end == node->start+1 /* or one blank. */ |
6106 | 0 | && lexer->lexbuf[node->start] == ' ' ) ); |
6107 | | |
6108 | 175 | return isBlank; |
6109 | 175 | } |
6110 | | |
6111 | | |
6112 | | /** MARK: TY_(IsJavaScript) |
6113 | | * Indicates whether or not a node is declared as containing javascript |
6114 | | * code. |
6115 | | */ |
6116 | | Bool TY_(IsJavaScript)(Node *node) |
6117 | 100 | { |
6118 | 100 | Bool result = no; |
6119 | 100 | AttVal *attr; |
6120 | | |
6121 | 100 | if (node->attributes == NULL) |
6122 | 65 | return yes; |
6123 | | |
6124 | 102 | for (attr = node->attributes; attr; attr = attr->next) |
6125 | 67 | { |
6126 | 67 | if ( (attrIsLANGUAGE(attr) || attrIsTYPE(attr)) |
6127 | 31 | && AttrContains(attr, "javascript") ) |
6128 | 0 | { |
6129 | 0 | result = yes; |
6130 | 0 | break; |
6131 | 0 | } |
6132 | 67 | } |
6133 | | |
6134 | 35 | return result; |
6135 | 100 | } |
6136 | | |
6137 | | |
6138 | | /** MARK: TY_(IsNewNode) |
6139 | | * Used to check if a node uses CM_NEW, which determines how attributes |
6140 | | * without values should be printed. This was introduced to deal with |
6141 | | * user-defined tags e.g. ColdFusion. |
6142 | | */ |
6143 | | Bool TY_(IsNewNode)(Node *node) |
6144 | 0 | { |
6145 | 0 | if (node && node->tag) |
6146 | 0 | { |
6147 | 0 | return (node->tag->model & CM_NEW); |
6148 | 0 | } |
6149 | 0 | return yes; |
6150 | 0 | } |
6151 | | |
6152 | | |
6153 | | /** MARK: TY_(RemoveNode) |
6154 | | * Extract a node and its children from a markup tree |
6155 | | */ |
6156 | | Node *TY_(RemoveNode)(Node *node) |
6157 | 116k | { |
6158 | 116k | if (node->prev) |
6159 | 20.9k | node->prev->next = node->next; |
6160 | | |
6161 | 116k | if (node->next) |
6162 | 39.4k | node->next->prev = node->prev; |
6163 | | |
6164 | 116k | if (node->parent) |
6165 | 115k | { |
6166 | 115k | if (node->parent->content == node) |
6167 | 94.5k | node->parent->content = node->next; |
6168 | | |
6169 | 115k | if (node->parent->last == node) |
6170 | 76.0k | node->parent->last = node->prev; |
6171 | 115k | } |
6172 | | |
6173 | 116k | node->parent = node->prev = node->next = NULL; |
6174 | 116k | return node; |
6175 | 116k | } |
6176 | | |
6177 | | |
6178 | | /** MARK: TY_(TrimEmptyElement) |
6179 | | * Trims a single, empty element, returning the next node. |
6180 | | */ |
6181 | | Node *TY_(TrimEmptyElement)( TidyDocImpl* doc, Node *element ) |
6182 | 438k | { |
6183 | 438k | if ( CanPrune(doc, element) ) |
6184 | 111k | { |
6185 | 111k | if (element->type != TextNode) |
6186 | 111k | { |
6187 | 111k | doc->footnotes |= FN_TRIM_EMPTY_ELEMENT; |
6188 | 111k | TY_(Report)(doc, element, NULL, TRIM_EMPTY_ELEMENT); |
6189 | 111k | } |
6190 | | |
6191 | 111k | return TY_(DiscardElement)(doc, element); |
6192 | 111k | } |
6193 | 326k | return element->next; |
6194 | 438k | } |
6195 | | |
6196 | | |
6197 | | /** MARK: TY_(XMLPreserveWhiteSpace) |
6198 | | * Indicates whether or not whitespace is to be preserved in XHTML/XML |
6199 | | * documents. |
6200 | | */ |
6201 | | Bool TY_(XMLPreserveWhiteSpace)( TidyDocImpl* doc, Node *element) |
6202 | 27.4k | { |
6203 | 27.4k | AttVal *attribute; |
6204 | | |
6205 | | /* search attributes for xml:space */ |
6206 | 30.0k | for (attribute = element->attributes; attribute; attribute = attribute->next) |
6207 | 3.29k | { |
6208 | 3.29k | if (attrIsXML_SPACE(attribute)) |
6209 | 667 | { |
6210 | 667 | if (AttrValueIs(attribute, "preserve")) |
6211 | 55 | return yes; |
6212 | | |
6213 | 612 | return no; |
6214 | 667 | } |
6215 | 3.29k | } |
6216 | | |
6217 | 26.7k | if (element->element == NULL) |
6218 | 144 | return no; |
6219 | | |
6220 | | /* kludge for html docs without explicit xml:space attribute */ |
6221 | 26.5k | if (nodeIsPRE(element) || |
6222 | 26.5k | nodeIsSCRIPT(element) || |
6223 | 26.5k | nodeIsSTYLE(element) || |
6224 | 26.5k | TY_(FindParser)(doc, element) == TY_(ParsePre)) |
6225 | 488 | return yes; |
6226 | | |
6227 | | /* kludge for XSL docs */ |
6228 | 26.1k | if ( TY_(tmbstrcasecmp)(element->element, "xsl:text") == 0 ) |
6229 | 0 | return yes; |
6230 | | |
6231 | 26.1k | return no; |
6232 | 26.1k | } |
6233 | | |
6234 | | |
6235 | | /***************************************************************************//* |
6236 | | ** MARK: - Internal API Implementation - Main Parsers |
6237 | | ***************************************************************************/ |
6238 | | |
6239 | | |
6240 | | /** MARK: TY_(ParseDocument) |
6241 | | * Parses an HTML document after lexing. It begins by properly configuring |
6242 | | * the overall HTML structure, and subsequently processes all remaining |
6243 | | * nodes. |
6244 | | */ |
6245 | | void TY_(ParseDocument)(TidyDocImpl* doc) |
6246 | 352 | { |
6247 | 352 | Node *node, *html, *doctype = NULL; |
6248 | | |
6249 | 1.08k | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
6250 | 1.08k | { |
6251 | 1.08k | if (node->type == XmlDecl) |
6252 | 30 | { |
6253 | 30 | doc->xmlDetected = yes; |
6254 | | |
6255 | 30 | if (TY_(FindXmlDecl)(doc) && doc->root.content) |
6256 | 29 | { |
6257 | 29 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6258 | 29 | TY_(FreeNode)(doc, node); |
6259 | 29 | continue; |
6260 | 29 | } |
6261 | 1 | if (node->line > 1 || node->column != 1) |
6262 | 1 | { |
6263 | 1 | TY_(Report)(doc, &doc->root, node, SPACE_PRECEDING_XMLDECL); |
6264 | 1 | } |
6265 | 1 | } |
6266 | | |
6267 | | /* deal with comments etc. */ |
6268 | 1.05k | if (InsertMisc( &doc->root, node )) |
6269 | 685 | continue; |
6270 | | |
6271 | 370 | if (node->type == DocTypeTag) |
6272 | 11 | { |
6273 | 11 | if (doctype == NULL) |
6274 | 11 | { |
6275 | 11 | TY_(InsertNodeAtEnd)( &doc->root, node); |
6276 | 11 | doctype = node; |
6277 | 11 | } |
6278 | 0 | else |
6279 | 0 | { |
6280 | 0 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6281 | 0 | TY_(FreeNode)( doc, node); |
6282 | 0 | } |
6283 | 11 | continue; |
6284 | 11 | } |
6285 | | |
6286 | 359 | if (node->type == EndTag) |
6287 | 9 | { |
6288 | 9 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6289 | 9 | TY_(FreeNode)( doc, node); |
6290 | 9 | continue; |
6291 | 9 | } |
6292 | | |
6293 | 350 | if (node->type == StartTag && nodeIsHTML(node)) |
6294 | 2 | { |
6295 | 2 | AttVal *xmlns = TY_(AttrGetById)(node, TidyAttr_XMLNS); |
6296 | | |
6297 | 2 | if (AttrValueIs(xmlns, XHTML_NAMESPACE)) |
6298 | 2 | { |
6299 | 2 | Bool htmlOut = cfgBool( doc, TidyHtmlOut ); |
6300 | 2 | doc->lexer->isvoyager = yes; /* Unless plain HTML */ |
6301 | 2 | TY_(SetOptionBool)( doc, TidyXhtmlOut, !htmlOut ); /* is specified, output*/ |
6302 | 2 | TY_(SetOptionBool)( doc, TidyXmlOut, !htmlOut ); /* will be XHTML. */ |
6303 | | |
6304 | | /* adjust other config options, just as in config.c */ |
6305 | 2 | if ( !htmlOut ) |
6306 | 0 | { |
6307 | 0 | TY_(SetOptionBool)( doc, TidyUpperCaseTags, no ); |
6308 | 0 | TY_(SetOptionInt)( doc, TidyUpperCaseAttrs, no ); |
6309 | 0 | } |
6310 | 2 | } |
6311 | 2 | } |
6312 | | |
6313 | 350 | if ( node->type != StartTag || !nodeIsHTML(node) ) |
6314 | 348 | { |
6315 | 348 | TY_(UngetToken)( doc ); |
6316 | 348 | html = TY_(InferredTag)(doc, TidyTag_HTML); |
6317 | 348 | } |
6318 | 2 | else |
6319 | 2 | html = node; |
6320 | | |
6321 | | /*\ |
6322 | | * #72, avoid MISSING_DOCTYPE if show-body-only. |
6323 | | * #191, also if --doctype omit, that is TidyDoctypeOmit |
6324 | | * #342, adjust tags to html4-- if not 'auto' or 'html5' |
6325 | | \*/ |
6326 | 350 | if (!TY_(FindDocType)(doc)) |
6327 | 339 | { |
6328 | 339 | ulong dtmode = cfg( doc, TidyDoctypeMode ); |
6329 | 339 | if ((dtmode != TidyDoctypeOmit) && !showingBodyOnly(doc)) |
6330 | 339 | TY_(Report)(doc, NULL, NULL, MISSING_DOCTYPE); |
6331 | 339 | if ((dtmode != TidyDoctypeAuto) && (dtmode != TidyDoctypeHtml5)) |
6332 | 0 | { |
6333 | | /*\ |
6334 | | * Issue #342 - if not doctype 'auto', or 'html5' |
6335 | | * then reset mode htm4-- parsing |
6336 | | \*/ |
6337 | 0 | TY_(AdjustTags)(doc); /* Dynamically modify the tags table to html4-- mode */ |
6338 | 0 | } |
6339 | 339 | } |
6340 | 350 | TY_(InsertNodeAtEnd)( &doc->root, html); |
6341 | 350 | ParseHTMLWithNode( doc, html ); |
6342 | 350 | break; |
6343 | 359 | } |
6344 | | |
6345 | | /* do this before any more document fixes */ |
6346 | 352 | if ( cfg( doc, TidyAccessibilityCheckLevel ) > 0 ) |
6347 | 0 | TY_(AccessibilityChecks)( doc ); |
6348 | | |
6349 | 352 | if (!TY_(FindHTML)(doc)) |
6350 | 2 | { |
6351 | | /* a later check should complain if <body> is empty */ |
6352 | 2 | html = TY_(InferredTag)(doc, TidyTag_HTML); |
6353 | 2 | TY_(InsertNodeAtEnd)( &doc->root, html); |
6354 | 2 | ParseHTMLWithNode( doc, html ); |
6355 | 2 | } |
6356 | | |
6357 | 352 | node = TY_(FindTITLE)(doc); |
6358 | 352 | if (!node) |
6359 | 343 | { |
6360 | 343 | Node* head = TY_(FindHEAD)(doc); |
6361 | | /* #72, avoid MISSING_TITLE_ELEMENT if show-body-only (but allow InsertNodeAtEnd to avoid new warning) */ |
6362 | 686 | if (!showingBodyOnly(doc)) |
6363 | 343 | { |
6364 | 343 | TY_(Report)(doc, head, NULL, MISSING_TITLE_ELEMENT); |
6365 | 343 | } |
6366 | 343 | TY_(InsertNodeAtEnd)(head, TY_(InferredTag)(doc, TidyTag_TITLE)); |
6367 | 343 | } |
6368 | 9 | else if (!node->content && !showingBodyOnly(doc)) |
6369 | 6 | { |
6370 | | /* Is #839 - warn node is blank in HTML5 */ |
6371 | 6 | if (TY_(IsHTML5Mode)(doc)) |
6372 | 3 | { |
6373 | 3 | TY_(Report)(doc, node, NULL, BLANK_TITLE_ELEMENT); |
6374 | 3 | } |
6375 | 6 | } |
6376 | | |
6377 | 352 | AttributeChecks(doc, &doc->root); |
6378 | 352 | ReplaceObsoleteElements(doc, &doc->root); |
6379 | 352 | TY_(DropEmptyElements)(doc, &doc->root); |
6380 | 352 | CleanSpaces(doc, &doc->root); |
6381 | | |
6382 | 352 | if (cfgBool(doc, TidyEncloseBodyText)) |
6383 | 0 | EncloseBodyText(doc); |
6384 | 352 | if (cfgBool(doc, TidyEncloseBlockText)) |
6385 | 0 | EncloseBlockText(doc, &doc->root); |
6386 | 352 | } |
6387 | | |
6388 | | |
6389 | | /** MARK: TY_(ParseXMLDocument) |
6390 | | * Parses the document using Tidy's XML parser. |
6391 | | */ |
6392 | | void TY_(ParseXMLDocument)(TidyDocImpl* doc) |
6393 | 50 | { |
6394 | 50 | Node *node, *doctype = NULL; |
6395 | | |
6396 | 50 | TY_(SetOptionBool)( doc, TidyXmlTags, yes ); |
6397 | | |
6398 | 50 | doc->xmlDetected = yes; |
6399 | | |
6400 | 195 | while ((node = TY_(GetToken)(doc, IgnoreWhitespace)) != NULL) |
6401 | 145 | { |
6402 | | /* discard unexpected end tags */ |
6403 | 145 | if (node->type == EndTag) |
6404 | 0 | { |
6405 | 0 | TY_(Report)(doc, NULL, node, UNEXPECTED_ENDTAG); |
6406 | 0 | TY_(FreeNode)( doc, node); |
6407 | 0 | continue; |
6408 | 0 | } |
6409 | | |
6410 | | /* deal with comments etc. */ |
6411 | 145 | if (InsertMisc( &doc->root, node)) |
6412 | 5 | continue; |
6413 | | |
6414 | 140 | if (node->type == DocTypeTag) |
6415 | 0 | { |
6416 | 0 | if (doctype == NULL) |
6417 | 0 | { |
6418 | 0 | TY_(InsertNodeAtEnd)( &doc->root, node); |
6419 | 0 | doctype = node; |
6420 | 0 | } |
6421 | 0 | else |
6422 | 0 | { |
6423 | 0 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6424 | 0 | TY_(FreeNode)( doc, node); |
6425 | 0 | } |
6426 | 0 | continue; |
6427 | 0 | } |
6428 | | |
6429 | 140 | if (node->type == StartEndTag) |
6430 | 1 | { |
6431 | 1 | TY_(InsertNodeAtEnd)( &doc->root, node); |
6432 | 1 | continue; |
6433 | 1 | } |
6434 | | |
6435 | | /* if start tag then parse element's content */ |
6436 | 139 | if (node->type == StartTag) |
6437 | 61 | { |
6438 | 61 | TY_(InsertNodeAtEnd)( &doc->root, node ); |
6439 | 61 | ParseHTMLWithNode( doc, node ); |
6440 | 61 | continue; |
6441 | 61 | } |
6442 | | |
6443 | 78 | TY_(Report)(doc, &doc->root, node, DISCARDING_UNEXPECTED); |
6444 | 78 | TY_(FreeNode)( doc, node); |
6445 | 78 | } |
6446 | | |
6447 | | /* ensure presence of initial <?xml version="1.0"?> */ |
6448 | 50 | if ( cfgBool(doc, TidyXmlDecl) ) |
6449 | 0 | TY_(FixXmlDecl)( doc ); |
6450 | 50 | } |