1from __future__ import absolute_import, division, unicode_literals
2from six import viewkeys
3
4from . import _inputstream
5from . import _tokenizer
6
7from . import treebuilders
8from .treebuilders.base import Marker
9
10from . import _utils
11from .constants import (
12 spaceCharacters, asciiUpper2Lower,
13 specialElements, headingElements, cdataElements, rcdataElements,
14 tokenTypes,
15 namespaces,
16 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
17 adjustForeignAttributes as adjustForeignAttributesMap,
18 adjustMathMLAttributes, adjustSVGAttributes,
19 E,
20 _ReparseException
21)
22
23
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Build a tree from a complete HTML document

    Looks up the tree builder registered under ``treebuilder``, wraps it in
    an :class:`HTMLParser` and forwards ``doc`` plus any extra keyword
    arguments to :meth:`HTMLParser.parse`.

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    return HTMLParser(
        treebuilders.getTreeBuilder(treebuilder),
        namespaceHTMLElements=namespaceHTMLElements
    ).parse(doc, **kwargs)
45
46
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    Looks up the tree builder registered under ``treebuilder``, wraps it in
    an :class:`HTMLParser` and forwards ``doc``, ``container`` and any extra
    keyword arguments to :meth:`HTMLParser.parseFragment`.

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, **kwargs)
70
71
class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned, or the name of a treebuilder as a string. Built in
            treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType)

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()                     # generates parser with etree builder
        >>> parser = HTMLParser('lxml', strict=True)  # generates parser with lxml builder which is strict

        """
        # string_types also matches unicode names on Python 2, where a bare
        # ``str`` check would let them fall through and be called as a class.
        from six import string_types

        # Raise an exception on the first error encountered
        self.strict = strict
        self.debug = debug

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        elif isinstance(tree, string_types):
            tree = treebuilders.getTreeBuilder(tree)

        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        # One instance of every phase class, keyed by phase name.
        self.phases = {name: cls(self, self.tree) for name, cls in
                       _phases.items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Tokenize ``stream`` and run the main loop over the tokens

        :arg stream: the input handed to the tokenizer

        :arg innerHTML: whether to parse in fragment (innerHTML) mode

        :arg container: name of the fragment's context element

        :arg scripting: stored on the parser as ``self.scripting``

        Remaining keyword arguments are passed to the tokenizer.
        """
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            # Start over once from a clean state.
            # NOTE(review): presumably raised when the input has to be
            # re-read (e.g. after an encoding change) -- confirm.
            self.reset()
            self.mainLoop()

    def reset(self):
        """Reset the tree and all per-parse state, then pick the first phase"""
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            # parseFragment() documents that a container of None means 'div'.
            container = "div" if self.container is None else self.container
            self.innerHTML = container.lower()

            # NOTE(review): cdataElements selects the RCDATA state and
            # rcdataElements the RAWTEXT state; presumably the constant names
            # are historical -- confirm against the constants module.
            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0].name

    def isHTMLIntegrationPoint(self, element):
        """Return whether ``element`` is an HTML integration point

        A MathML ``annotation-xml`` element qualifies when its ``encoding``
        attribute, ASCII-lowercased, is ``text/html`` or
        ``application/xhtml+xml``; any other element qualifies when its
        (namespace, name) pair is in ``htmlIntegrationPointElements``.
        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether ``element`` is in ``mathmlTextIntegrationPointElements``"""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Run every tokenizer token through a phase, then process EOF

        A phase handler may return a token, in which case that token is
        processed again (by whatever phase is then current); returning
        ``None`` moves on to the next tokenizer token.
        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        type_names = {value: key for key, value in tokenTypes.items()}
        debug = self.debug
        # Built once here instead of once per processed token.
        mathmlExcludedStartTags = frozenset(["mglyph", "malignmark"])

        for token in self.tokenizer:
            prev_token = None
            new_token = token
            while new_token is not None:
                prev_token = new_token
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                token_type = new_token["type"]

                if token_type == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # Decide between the current phase and the foreign
                    # content phase. The tag-name tests look at new_token,
                    # the token actually being processed, to agree with
                    # token_type above.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((token_type == StartTagToken and
                           new_token["name"] not in mathmlExcludedStartTags) or
                          token_type in (CharactersToken, SpaceCharactersToken))) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         token_type == StartTagToken and
                         new_token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         token_type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if debug:
                        info = {"type": type_names[token_type]}
                        if token_type in (StartTagToken, EndTagToken):
                            info["name"] = new_token['name']

                        self.log.append((self.tokenizer.state.__name__,
                                         self.phase.__class__.__name__,
                                         phase.__class__.__name__,
                                         "process" + info["type"],
                                         info))

                    if token_type == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif token_type == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif token_type == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif token_type == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif token_type == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif token_type == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

            # A start tag written as self-closing that no handler
            # acknowledged is reported as an error.
            if (token_type == StartTagToken and prev_token["selfClosing"] and
                    not prev_token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # A phase asking for reprocessing must have switched to a
                # phase that has not handled EOF yet.
                assert self.phase not in phases

    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment

        :arg container: name of the element we're setting the innerHTML
            property if set to None, default to 'div'

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('<b>this is a fragment</b>')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error and, in strict mode, raise it

        Appends ``(stream position, errorcode, datavars)`` to ``self.errors``.
        When ``self.strict`` is set, raises ``ParseError`` with the message
        ``E[errorcode] % datavars``.
        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def adjustMathMLAttributes(self, token):
        """Apply the ``adjustMathMLAttributes`` table to ``token``"""
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        """Apply the ``adjustSVGAttributes`` table to ``token``"""
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        """Apply the ``adjustForeignAttributes`` table to ``token``"""
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # pylint:disable=unused-argument
        # NOTE(review): nothing visible in this class assigns self.parser, so
        # this call looks like it would raise AttributeError -- confirm
        # whether the method is still used before relying on it.
        self.parser.phase()

    def resetInsertionMode(self):
        """Choose ``self.phase`` from the stack of open elements

        Walks the open elements from the innermost outwards and switches to
        the phase mapped to the first element name found in ``newModes``.
        The outermost element is treated as the fragment's context element
        (``self.innerHTML``) and falls back to the ``inBody`` phase.
        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            # Elements outside the default namespace never select a phase.
            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Insert ``token`` and switch to the text phase for its content

        :arg token: the start tag token to insert

        :arg contentType: ``"RAWTEXT"`` or ``"RCDATA"``; selects the
            tokenizer state

        The phase in effect on entry is saved as ``self.originalPhase``.
        """
        # Generic RCDATA/RAWTEXT Parsing algorithm
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        self.originalPhase = self.phase

        self.phase = self.phases["text"]
398
399
class Phase(object):
    """Common behaviour shared by the objects implementing each parsing phase

    Subclasses supply ``startTagHandler`` and ``endTagHandler`` lookups that
    map a tag name to the callable handling it.
    """
    __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        # Per-instance memo of tag name -> handler, one for each tag kind.
        self.__startTagCache = {}
        self.__endTagCache = {}

    def processEOF(self):
        """Handle end of input; every concrete phase has to override this"""
        raise NotImplementedError

    def processComment(self, token):
        """Insert the comment under the current (innermost open) element"""
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        self.tree.insertComment(token, self.tree.openElements[-1])

    def processDoctype(self, token):
        """Report a doctype token as a parse error"""
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        """Insert the token's character data as text"""
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        """Insert the token's whitespace as text"""
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        """Dispatch a start tag token to its handler and return the result"""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tagName = token["name"]
        cache = self.__startTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tagName in cache:
            return cache[tagName](token)

        handlers = self.startTagHandler
        handler = cache[tagName] = handlers[tagName]
        # bound the cache size in case we get loads of unknown tags
        sizeLimit = len(handlers) * 1.1
        while len(cache) > sizeLimit:
            # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
            cache.pop(next(iter(cache)))
        return handler(token)

    def startTagHtml(self, token):
        """Merge the attributes of an ``html`` start tag into the root element

        Attributes already present on the first open element are kept; only
        missing ones are copied from the token.
        """
        if not self.parser.firstStartTag and token["name"] == "html":
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        for attrName, attrValue in token["data"].items():
            if attrName not in self.tree.openElements[0].attributes:
                self.tree.openElements[0].attributes[attrName] = attrValue
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        """Dispatch an end tag token to its handler and return the result"""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tagName = token["name"]
        cache = self.__endTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tagName in cache:
            return cache[tagName](token)

        handlers = self.endTagHandler
        handler = cache[tagName] = handlers[tagName]
        # bound the cache size in case we get loads of unknown tags
        sizeLimit = len(handlers) * 1.1
        while len(cache) > sizeLimit:
            # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
            cache.pop(next(iter(cache)))
        return handler(token)
471
472
class InitialPhase(Phase):
    """Phase the parser starts in for whole documents

    A doctype token sets the compatibility mode; characters, tags and EOF
    are reported as errors, force quirks mode and are handed on to the
    ``beforeHtml`` phase.
    """
    __slots__ = tuple()

    # Public identifier prefixes (compared after ASCII-lowercasing) that
    # select quirks mode.
    _quirksPublicIdPrefixes = (
        "+//silmaril//dtd html pro v0r11 19970101//",
        "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
        "-//as//dtd html 3.0 aswedit + extensions//",
        "-//ietf//dtd html 2.0 level 1//",
        "-//ietf//dtd html 2.0 level 2//",
        "-//ietf//dtd html 2.0 strict level 1//",
        "-//ietf//dtd html 2.0 strict level 2//",
        "-//ietf//dtd html 2.0 strict//",
        "-//ietf//dtd html 2.0//",
        "-//ietf//dtd html 2.1e//",
        "-//ietf//dtd html 3.0//",
        "-//ietf//dtd html 3.2 final//",
        "-//ietf//dtd html 3.2//",
        "-//ietf//dtd html 3//",
        "-//ietf//dtd html level 0//",
        "-//ietf//dtd html level 1//",
        "-//ietf//dtd html level 2//",
        "-//ietf//dtd html level 3//",
        "-//ietf//dtd html strict level 0//",
        "-//ietf//dtd html strict level 1//",
        "-//ietf//dtd html strict level 2//",
        "-//ietf//dtd html strict level 3//",
        "-//ietf//dtd html strict//",
        "-//ietf//dtd html//",
        "-//metrius//dtd metrius presentational//",
        "-//microsoft//dtd internet explorer 2.0 html strict//",
        "-//microsoft//dtd internet explorer 2.0 html//",
        "-//microsoft//dtd internet explorer 2.0 tables//",
        "-//microsoft//dtd internet explorer 3.0 html strict//",
        "-//microsoft//dtd internet explorer 3.0 html//",
        "-//microsoft//dtd internet explorer 3.0 tables//",
        "-//netscape comm. corp.//dtd html//",
        "-//netscape comm. corp.//dtd strict html//",
        "-//o'reilly and associates//dtd html 2.0//",
        "-//o'reilly and associates//dtd html extended 1.0//",
        "-//o'reilly and associates//dtd html extended relaxed 1.0//",
        "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
        "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
        "-//spyglass//dtd html 2.0 extended//",
        "-//sq//dtd html 2.0 hotmetal + extensions//",
        "-//sun microsystems corp.//dtd hotjava html//",
        "-//sun microsystems corp.//dtd hotjava strict html//",
        "-//w3c//dtd html 3 1995-03-24//",
        "-//w3c//dtd html 3.2 draft//",
        "-//w3c//dtd html 3.2 final//",
        "-//w3c//dtd html 3.2//",
        "-//w3c//dtd html 3.2s draft//",
        "-//w3c//dtd html 4.0 frameset//",
        "-//w3c//dtd html 4.0 transitional//",
        "-//w3c//dtd html experimental 19960712//",
        "-//w3c//dtd html experimental 970421//",
        "-//w3c//dtd w3 html//",
        "-//w3o//dtd w3 html 3.0//",
        "-//webtechs//dtd mozilla html 2.0//",
        "-//webtechs//dtd mozilla html//",
    )

    # Public identifiers that select quirks mode only on an exact match.
    _quirksPublicIds = (
        "-//w3o//dtd w3 html strict 3.0//en//",
        "-/w3c/dtd html 4.0 transitional/en",
        "html",
    )

    # Prefixes whose mode depends on whether a system identifier is present:
    # quirks without one, limited quirks with one.
    _html401PublicIdPrefixes = (
        "-//w3c//dtd html 4.01 frameset//",
        "-//w3c//dtd html 4.01 transitional//",
    )

    # Prefixes that always select limited quirks mode.
    _limitedQuirksPublicIdPrefixes = (
        "-//w3c//dtd xhtml 1.0 frameset//",
        "-//w3c//dtd xhtml 1.0 transitional//",
    )

    # System identifier (compared lowercased) that selects quirks mode.
    _quirksSystemId = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

    def processSpaceCharacters(self, token):
        """Ignore whitespace seen before the doctype"""
        pass

    def processComment(self, token):
        """Attach the comment directly to the document"""
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        """Insert the doctype, set the compat mode and move to ``beforeHtml``"""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        systemIdAcceptable = (systemId is None or
                              systemId == "about:legacy-compat")
        if name != "html" or publicId is not None or not systemIdAcceptable:
            self.parser.parseError("unknown-doctype")

        self.tree.insertDoctype(token)

        # Missing and empty identifiers both compare as "", anything else is
        # compared ASCII-lowercased.
        publicId = publicId.translate(asciiUpper2Lower) if publicId else ""

        isHtml401 = publicId.startswith(self._html401PublicIdPrefixes)

        if (not correct or token["name"] != "html" or
                publicId.startswith(self._quirksPublicIdPrefixes) or
                publicId in self._quirksPublicIds or
                (isHtml401 and systemId is None) or
                (systemId and systemId.lower() == self._quirksSystemId)):
            self.parser.compatMode = "quirks"
        elif (publicId.startswith(self._limitedQuirksPublicIdPrefixes) or
              (isHtml401 and systemId is not None)):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        """Fall back to quirks mode and hand over to ``beforeHtml``"""
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        """Report the missing doctype and return the token for reprocessing"""
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        return token

    def processStartTag(self, token):
        """Report the missing doctype and return the token for reprocessing"""
        self.parser.parseError("expected-doctype-but-got-start-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEndTag(self, token):
        """Report the missing doctype and return the token for reprocessing"""
        self.parser.parseError("expected-doctype-but-got-end-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEOF(self):
        """Report the missing doctype and ask for EOF to be reprocessed"""
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True
602
603
class BeforeHtmlPhase(Phase):
    """Phase in effect until the root ``html`` element has been created"""
    __slots__ = tuple()

    # helper methods
    def insertHtmlElement(self):
        """Insert an implied ``html`` root and switch to ``beforeHead``"""
        rootToken = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(rootToken)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        """Create the root element and ask for EOF to be reprocessed"""
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        """Attach the comment directly to the document"""
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        """Ignore whitespace seen before the root element"""
        pass

    def processCharacters(self, token):
        """Create the root element and return the token for reprocessing"""
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        """Create the root element and return the token for reprocessing

        An explicit ``html`` start tag is remembered in
        ``parser.firstStartTag``.
        """
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        """Imply the root for head/body/html/br end tags, reject the rest"""
        if token["name"] in ("head", "body", "html", "br"):
            self.insertHtmlElement()
            return token
        # Any other end tag is an error and is dropped (nothing returned).
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": token["name"]})
640
641
class BeforeHeadPhase(Phase):
    """Phase in effect after the root element and before ``head`` exists"""
    __slots__ = tuple()

    def _insertImpliedHead(self):
        """Process an implied ``head`` start tag"""
        self.startTagHead(impliedTagToken("head", "StartTag"))

    def processEOF(self):
        """Imply ``head`` and ask for EOF to be reprocessed"""
        self._insertImpliedHead()
        return True

    def processSpaceCharacters(self, token):
        """Ignore whitespace seen before ``head``"""
        pass

    def processCharacters(self, token):
        """Imply ``head`` and return the token for reprocessing"""
        self._insertImpliedHead()
        return token

    def startTagHtml(self, token):
        """Let the ``inBody`` phase handle an ``html`` start tag"""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagHead(self, token):
        """Insert ``head``, remember it as the head pointer, enter ``inHead``"""
        self.tree.insertElement(token)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        """Imply ``head`` and return the token for reprocessing"""
        self._insertImpliedHead()
        return token

    def endTagImplyHead(self, token):
        """Imply ``head`` and return the token for reprocessing"""
        self._insertImpliedHead()
        return token

    def endTagOther(self, token):
        """Report any other end tag as an error and drop it"""
        self.parser.parseError("end-tag-after-implied-root",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("head", "body", "html", "br"), endTagImplyHead)
    ])
    endTagHandler.default = endTagOther
686
687
class InHeadPhase(Phase):
    """Phase in effect while the ``head`` element is open"""
    __slots__ = tuple()

    # the real thing
    def processEOF(self):
        """Close ``head`` and ask for EOF to be reprocessed"""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Close ``head`` and return the token for reprocessing"""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Let the ``inBody`` phase handle an ``html`` start tag"""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagHead(self, token):
        """Report a second ``head`` start tag as an error"""
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        """Insert the element and close it again straight away"""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        """Insert ``meta`` and, if the encoding is tentative, act on it

        The encoding is taken from a ``charset`` attribute, or otherwise
        from ``content`` when ``http-equiv`` is ``content-type``.
        """
        # Same insert-and-close handling as the other childless head elements.
        self.startTagBaseLinkCommand(token)

        stream = self.parser.tokenizer.stream
        if stream.charEncoding[1] != "tentative":
            return

        attributes = token["data"]
        if "charset" in attributes:
            stream.changeEncoding(attributes["charset"])
        elif ("content" in attributes and
              "http-equiv" in attributes and
              attributes["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            contentBytes = attributes["content"].encode("utf-8")
            data = _inputstream.EncodingBytes(contentBytes)
            codec = _inputstream.ContentAttrParser(data).parse()
            stream.changeEncoding(codec)

    def startTagTitle(self, token):
        """Parse the element's content as RCDATA"""
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        """Parse the element's content as RAWTEXT"""
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        """Treat ``noscript`` as RAWTEXT with scripting, as markup without"""
        if not self.parser.scripting:
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inHeadNoscript"]
            return
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagScript(self, token):
        """Insert ``script`` and switch tokenizer and parser to script text"""
        self.tree.insertElement(token)
        tokenizer = self.parser.tokenizer
        tokenizer.state = tokenizer.scriptDataState
        self.parser.originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["text"]

    def startTagOther(self, token):
        """Close ``head`` and return the token for reprocessing"""
        self.anythingElse()
        return token

    def endTagHead(self, token):
        """Pop ``head`` off the open elements and enter ``afterHead``"""
        node = self.parser.tree.openElements.pop()
        assert node.name == "head", "Expected head got %s" % node.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        """Close ``head`` and return the token for reprocessing"""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report any other end tag as an error and drop it"""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a ``head`` end tag had been seen"""
        self.endTagHead(impliedTagToken("head"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("title", startTagTitle),
        (("noframes", "style"), startTagNoFramesStyle),
        ("noscript", startTagNoscript),
        ("script", startTagScript),
        (("base", "basefont", "bgsound", "command", "link"),
         startTagBaseLinkCommand),
        ("meta", startTagMeta),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("head", endTagHead),
        (("br", "html", "body"), endTagHtmlBodyBr)
    ])
    endTagHandler.default = endTagOther
789
790
class InHeadNoscriptPhase(Phase):
    """Phase in effect inside a ``noscript`` element within ``head``"""
    __slots__ = tuple()

    def processEOF(self):
        """Report the error, close ``noscript`` and reprocess EOF"""
        self.parser.parseError("eof-in-head-noscript")
        self.anythingElse()
        return True

    def processComment(self, token):
        """Handle comments the way the ``inHead`` phase does"""
        inHead = self.parser.phases["inHead"]
        return inHead.processComment(token)

    def processCharacters(self, token):
        """Report the error, close ``noscript`` and reprocess the token"""
        self.parser.parseError("char-in-head-noscript")
        self.anythingElse()
        return token

    def processSpaceCharacters(self, token):
        """Handle whitespace the way the ``inHead`` phase does"""
        inHead = self.parser.phases["inHead"]
        return inHead.processSpaceCharacters(token)

    def startTagHtml(self, token):
        """Let the ``inBody`` phase handle an ``html`` start tag"""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        """Let the ``inHead`` phase handle the start tag"""
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagHeadNoscript(self, token):
        """Report a nested ``head`` or ``noscript`` start tag as an error"""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Report the error, close ``noscript`` and reprocess the token"""
        self.parser.parseError("unexpected-inhead-noscript-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagNoscript(self, token):
        """Pop ``noscript`` off the open elements and return to ``inHead``"""
        node = self.parser.tree.openElements.pop()
        assert node.name == "noscript", "Expected noscript got %s" % node.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        """Report the error, close ``noscript`` and reprocess the token"""
        self.parser.parseError("unexpected-inhead-noscript-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report any other end tag as an error and drop it"""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a ``noscript`` end tag had been seen"""
        # Caller must raise parse error first!
        self.endTagNoscript(impliedTagToken("noscript"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"),
         startTagBaseLinkCommand),
        (("head", "noscript"), startTagHeadNoscript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("noscript", endTagNoscript),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther
853
854
class AfterHeadPhase(Phase):
    """Phase in effect after ``head`` is closed and before ``body`` opens"""
    __slots__ = tuple()

    def processEOF(self):
        """Imply ``body`` and ask for EOF to be reprocessed"""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Imply ``body`` and return the token for reprocessing"""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Let the ``inBody`` phase handle an ``html`` start tag"""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagBody(self, token):
        """Insert an explicit ``body`` and enter ``inBody``

        An explicit ``body`` clears ``parser.framesetOK``.
        """
        self.parser.framesetOK = False
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, token):
        """Insert ``frameset`` and enter ``inFrameset``"""
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        """Process a head-only element that turned up after ``head``

        The head element is pushed back onto the open elements, the token is
        processed by the ``inHead`` phase, and the head element is then
        removed from the open elements again.
        """
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        openElements = self.tree.openElements
        openElements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        # Search from the innermost element; stop at the first head found.
        for node in self.tree.openElements[::-1]:
            if node.name != "head":
                continue
            self.tree.openElements.remove(node)
            break

    def startTagHead(self, token):
        """Report a ``head`` start tag as an error and drop it"""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Imply ``body`` and return the token for reprocessing"""
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        """Imply ``body`` and return the token for reprocessing"""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report any other end tag as an error and drop it"""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Insert an implied ``body`` and enter ``inBody``

        Unlike an explicit ``body`` tag this leaves ``framesetOK`` set.
        """
        bodyToken = impliedTagToken("body", "StartTag")
        self.tree.insertElement(bodyToken)
        self.parser.phase = self.parser.phases["inBody"]
        self.parser.framesetOK = True

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"),
         startTagFromHead),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("body", "html", "br"), endTagHtmlBodyBr)
    ])
    endTagHandler.default = endTagOther
920
921
922class InBodyPhase(Phase):
923 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
924 # the really-really-really-very crazy mode
925 __slots__ = ("processSpaceCharacters",)
926
927 def __init__(self, *args, **kwargs):
928 super(InBodyPhase, self).__init__(*args, **kwargs)
929 # Set this to the default handler
930 self.processSpaceCharacters = self.processSpaceCharactersNonPre
931
932 def isMatchingFormattingElement(self, node1, node2):
933 return (node1.name == node2.name and
934 node1.namespace == node2.namespace and
935 node1.attributes == node2.attributes)
936
937 # helper
938 def addFormattingElement(self, token):
939 self.tree.insertElement(token)
940 element = self.tree.openElements[-1]
941
942 matchingElements = []
943 for node in self.tree.activeFormattingElements[::-1]:
944 if node is Marker:
945 break
946 elif self.isMatchingFormattingElement(node, element):
947 matchingElements.append(node)
948
949 assert len(matchingElements) <= 3
950 if len(matchingElements) == 3:
951 self.tree.activeFormattingElements.remove(matchingElements[-1])
952 self.tree.activeFormattingElements.append(element)
953
954 # the real deal
955 def processEOF(self):
956 allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
957 "tfoot", "th", "thead", "tr", "body",
958 "html"))
959 for node in self.tree.openElements[::-1]:
960 if node.name not in allowed_elements:
961 self.parser.parseError("expected-closing-tag-but-got-eof")
962 break
963 # Stop parsing
964
965 def processSpaceCharactersDropNewline(self, token):
966 # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
967 # want to drop leading newlines
968 data = token["data"]
969 self.processSpaceCharacters = self.processSpaceCharactersNonPre
970 if (data.startswith("\n") and
971 self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
972 not self.tree.openElements[-1].hasContent()):
973 data = data[1:]
974 if data:
975 self.tree.reconstructActiveFormattingElements()
976 self.tree.insertText(data)
977
978 def processCharacters(self, token):
979 if token["data"] == "\u0000":
980 # The tokenizer should always emit null on its own
981 return
982 self.tree.reconstructActiveFormattingElements()
983 self.tree.insertText(token["data"])
984 # This must be bad for performance
985 if (self.parser.framesetOK and
986 any(char not in spaceCharacters
987 for char in token["data"])):
988 self.parser.framesetOK = False
989
990 def processSpaceCharactersNonPre(self, token):
991 self.tree.reconstructActiveFormattingElements()
992 self.tree.insertText(token["data"])
993
994 def startTagProcessInHead(self, token):
995 return self.parser.phases["inHead"].processStartTag(token)
996
997 def startTagBody(self, token):
998 self.parser.parseError("unexpected-start-tag", {"name": "body"})
999 if (len(self.tree.openElements) == 1 or
1000 self.tree.openElements[1].name != "body"):
1001 assert self.parser.innerHTML
1002 else:
1003 self.parser.framesetOK = False
1004 for attr, value in token["data"].items():
1005 if attr not in self.tree.openElements[1].attributes:
1006 self.tree.openElements[1].attributes[attr] = value
1007
1008 def startTagFrameset(self, token):
1009 self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
1010 if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
1011 assert self.parser.innerHTML
1012 elif not self.parser.framesetOK:
1013 pass
1014 else:
1015 if self.tree.openElements[1].parent:
1016 self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
1017 while self.tree.openElements[-1].name != "html":
1018 self.tree.openElements.pop()
1019 self.tree.insertElement(token)
1020 self.parser.phase = self.parser.phases["inFrameset"]
1021
1022 def startTagCloseP(self, token):
1023 if self.tree.elementInScope("p", variant="button"):
1024 self.endTagP(impliedTagToken("p"))
1025 self.tree.insertElement(token)
1026
1027 def startTagPreListing(self, token):
1028 if self.tree.elementInScope("p", variant="button"):
1029 self.endTagP(impliedTagToken("p"))
1030 self.tree.insertElement(token)
1031 self.parser.framesetOK = False
1032 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1033
1034 def startTagForm(self, token):
1035 if self.tree.formPointer:
1036 self.parser.parseError("unexpected-start-tag", {"name": "form"})
1037 else:
1038 if self.tree.elementInScope("p", variant="button"):
1039 self.endTagP(impliedTagToken("p"))
1040 self.tree.insertElement(token)
1041 self.tree.formPointer = self.tree.openElements[-1]
1042
1043 def startTagListItem(self, token):
1044 self.parser.framesetOK = False
1045
1046 stopNamesMap = {"li": ["li"],
1047 "dt": ["dt", "dd"],
1048 "dd": ["dt", "dd"]}
1049 stopNames = stopNamesMap[token["name"]]
1050 for node in reversed(self.tree.openElements):
1051 if node.name in stopNames:
1052 self.parser.phase.processEndTag(
1053 impliedTagToken(node.name, "EndTag"))
1054 break
1055 if (node.nameTuple in specialElements and
1056 node.name not in ("address", "div", "p")):
1057 break
1058
1059 if self.tree.elementInScope("p", variant="button"):
1060 self.parser.phase.processEndTag(
1061 impliedTagToken("p", "EndTag"))
1062
1063 self.tree.insertElement(token)
1064
1065 def startTagPlaintext(self, token):
1066 if self.tree.elementInScope("p", variant="button"):
1067 self.endTagP(impliedTagToken("p"))
1068 self.tree.insertElement(token)
1069 self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
1070
1071 def startTagHeading(self, token):
1072 if self.tree.elementInScope("p", variant="button"):
1073 self.endTagP(impliedTagToken("p"))
1074 if self.tree.openElements[-1].name in headingElements:
1075 self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1076 self.tree.openElements.pop()
1077 self.tree.insertElement(token)
1078
1079 def startTagA(self, token):
1080 afeAElement = self.tree.elementInActiveFormattingElements("a")
1081 if afeAElement:
1082 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1083 {"startName": "a", "endName": "a"})
1084 self.endTagFormatting(impliedTagToken("a"))
1085 if afeAElement in self.tree.openElements:
1086 self.tree.openElements.remove(afeAElement)
1087 if afeAElement in self.tree.activeFormattingElements:
1088 self.tree.activeFormattingElements.remove(afeAElement)
1089 self.tree.reconstructActiveFormattingElements()
1090 self.addFormattingElement(token)
1091
1092 def startTagFormatting(self, token):
1093 self.tree.reconstructActiveFormattingElements()
1094 self.addFormattingElement(token)
1095
1096 def startTagNobr(self, token):
1097 self.tree.reconstructActiveFormattingElements()
1098 if self.tree.elementInScope("nobr"):
1099 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1100 {"startName": "nobr", "endName": "nobr"})
1101 self.processEndTag(impliedTagToken("nobr"))
1102 # XXX Need tests that trigger the following
1103 self.tree.reconstructActiveFormattingElements()
1104 self.addFormattingElement(token)
1105
1106 def startTagButton(self, token):
1107 if self.tree.elementInScope("button"):
1108 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1109 {"startName": "button", "endName": "button"})
1110 self.processEndTag(impliedTagToken("button"))
1111 return token
1112 else:
1113 self.tree.reconstructActiveFormattingElements()
1114 self.tree.insertElement(token)
1115 self.parser.framesetOK = False
1116
1117 def startTagAppletMarqueeObject(self, token):
1118 self.tree.reconstructActiveFormattingElements()
1119 self.tree.insertElement(token)
1120 self.tree.activeFormattingElements.append(Marker)
1121 self.parser.framesetOK = False
1122
1123 def startTagXmp(self, token):
1124 if self.tree.elementInScope("p", variant="button"):
1125 self.endTagP(impliedTagToken("p"))
1126 self.tree.reconstructActiveFormattingElements()
1127 self.parser.framesetOK = False
1128 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1129
1130 def startTagTable(self, token):
1131 if self.parser.compatMode != "quirks":
1132 if self.tree.elementInScope("p", variant="button"):
1133 self.processEndTag(impliedTagToken("p"))
1134 self.tree.insertElement(token)
1135 self.parser.framesetOK = False
1136 self.parser.phase = self.parser.phases["inTable"]
1137
1138 def startTagVoidFormatting(self, token):
1139 self.tree.reconstructActiveFormattingElements()
1140 self.tree.insertElement(token)
1141 self.tree.openElements.pop()
1142 token["selfClosingAcknowledged"] = True
1143 self.parser.framesetOK = False
1144
1145 def startTagInput(self, token):
1146 framesetOK = self.parser.framesetOK
1147 self.startTagVoidFormatting(token)
1148 if ("type" in token["data"] and
1149 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1150 # input type=hidden doesn't change framesetOK
1151 self.parser.framesetOK = framesetOK
1152
1153 def startTagParamSource(self, token):
1154 self.tree.insertElement(token)
1155 self.tree.openElements.pop()
1156 token["selfClosingAcknowledged"] = True
1157
1158 def startTagHr(self, token):
1159 if self.tree.elementInScope("p", variant="button"):
1160 self.endTagP(impliedTagToken("p"))
1161 self.tree.insertElement(token)
1162 self.tree.openElements.pop()
1163 token["selfClosingAcknowledged"] = True
1164 self.parser.framesetOK = False
1165
1166 def startTagImage(self, token):
1167 # No really...
1168 self.parser.parseError("unexpected-start-tag-treated-as",
1169 {"originalName": "image", "newName": "img"})
1170 self.processStartTag(impliedTagToken("img", "StartTag",
1171 attributes=token["data"],
1172 selfClosing=token["selfClosing"]))
1173
1174 def startTagIsIndex(self, token):
1175 self.parser.parseError("deprecated-tag", {"name": "isindex"})
1176 if self.tree.formPointer:
1177 return
1178 form_attrs = {}
1179 if "action" in token["data"]:
1180 form_attrs["action"] = token["data"]["action"]
1181 self.processStartTag(impliedTagToken("form", "StartTag",
1182 attributes=form_attrs))
1183 self.processStartTag(impliedTagToken("hr", "StartTag"))
1184 self.processStartTag(impliedTagToken("label", "StartTag"))
1185 # XXX Localization ...
1186 if "prompt" in token["data"]:
1187 prompt = token["data"]["prompt"]
1188 else:
1189 prompt = "This is a searchable index. Enter search keywords: "
1190 self.processCharacters(
1191 {"type": tokenTypes["Characters"], "data": prompt})
1192 attributes = token["data"].copy()
1193 if "action" in attributes:
1194 del attributes["action"]
1195 if "prompt" in attributes:
1196 del attributes["prompt"]
1197 attributes["name"] = "isindex"
1198 self.processStartTag(impliedTagToken("input", "StartTag",
1199 attributes=attributes,
1200 selfClosing=token["selfClosing"]))
1201 self.processEndTag(impliedTagToken("label"))
1202 self.processStartTag(impliedTagToken("hr", "StartTag"))
1203 self.processEndTag(impliedTagToken("form"))
1204
1205 def startTagTextarea(self, token):
1206 self.tree.insertElement(token)
1207 self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
1208 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1209 self.parser.framesetOK = False
1210
1211 def startTagIFrame(self, token):
1212 self.parser.framesetOK = False
1213 self.startTagRawtext(token)
1214
1215 def startTagNoscript(self, token):
1216 if self.parser.scripting:
1217 self.startTagRawtext(token)
1218 else:
1219 self.startTagOther(token)
1220
1221 def startTagRawtext(self, token):
1222 """iframe, noembed noframes, noscript(if scripting enabled)"""
1223 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1224
1225 def startTagOpt(self, token):
1226 if self.tree.openElements[-1].name == "option":
1227 self.parser.phase.processEndTag(impliedTagToken("option"))
1228 self.tree.reconstructActiveFormattingElements()
1229 self.parser.tree.insertElement(token)
1230
1231 def startTagSelect(self, token):
1232 self.tree.reconstructActiveFormattingElements()
1233 self.tree.insertElement(token)
1234 self.parser.framesetOK = False
1235 if self.parser.phase in (self.parser.phases["inTable"],
1236 self.parser.phases["inCaption"],
1237 self.parser.phases["inColumnGroup"],
1238 self.parser.phases["inTableBody"],
1239 self.parser.phases["inRow"],
1240 self.parser.phases["inCell"]):
1241 self.parser.phase = self.parser.phases["inSelectInTable"]
1242 else:
1243 self.parser.phase = self.parser.phases["inSelect"]
1244
1245 def startTagRpRt(self, token):
1246 if self.tree.elementInScope("ruby"):
1247 self.tree.generateImpliedEndTags()
1248 if self.tree.openElements[-1].name != "ruby":
1249 self.parser.parseError()
1250 self.tree.insertElement(token)
1251
1252 def startTagMath(self, token):
1253 self.tree.reconstructActiveFormattingElements()
1254 self.parser.adjustMathMLAttributes(token)
1255 self.parser.adjustForeignAttributes(token)
1256 token["namespace"] = namespaces["mathml"]
1257 self.tree.insertElement(token)
1258 # Need to get the parse error right for the case where the token
1259 # has a namespace not equal to the xmlns attribute
1260 if token["selfClosing"]:
1261 self.tree.openElements.pop()
1262 token["selfClosingAcknowledged"] = True
1263
1264 def startTagSvg(self, token):
1265 self.tree.reconstructActiveFormattingElements()
1266 self.parser.adjustSVGAttributes(token)
1267 self.parser.adjustForeignAttributes(token)
1268 token["namespace"] = namespaces["svg"]
1269 self.tree.insertElement(token)
1270 # Need to get the parse error right for the case where the token
1271 # has a namespace not equal to the xmlns attribute
1272 if token["selfClosing"]:
1273 self.tree.openElements.pop()
1274 token["selfClosingAcknowledged"] = True
1275
1276 def startTagMisplaced(self, token):
1277 """ Elements that should be children of other elements that have a
1278 different insertion mode; here they are ignored
1279 "caption", "col", "colgroup", "frame", "frameset", "head",
1280 "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
1281 "tr", "noscript"
1282 """
1283 self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
1284
1285 def startTagOther(self, token):
1286 self.tree.reconstructActiveFormattingElements()
1287 self.tree.insertElement(token)
1288
1289 def endTagP(self, token):
1290 if not self.tree.elementInScope("p", variant="button"):
1291 self.startTagCloseP(impliedTagToken("p", "StartTag"))
1292 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1293 self.endTagP(impliedTagToken("p", "EndTag"))
1294 else:
1295 self.tree.generateImpliedEndTags("p")
1296 if self.tree.openElements[-1].name != "p":
1297 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1298 node = self.tree.openElements.pop()
1299 while node.name != "p":
1300 node = self.tree.openElements.pop()
1301
1302 def endTagBody(self, token):
1303 if not self.tree.elementInScope("body"):
1304 self.parser.parseError()
1305 return
1306 elif self.tree.openElements[-1].name != "body":
1307 for node in self.tree.openElements[2:]:
1308 if node.name not in frozenset(("dd", "dt", "li", "optgroup",
1309 "option", "p", "rp", "rt",
1310 "tbody", "td", "tfoot",
1311 "th", "thead", "tr", "body",
1312 "html")):
1313 # Not sure this is the correct name for the parse error
1314 self.parser.parseError(
1315 "expected-one-end-tag-but-got-another",
1316 {"gotName": "body", "expectedName": node.name})
1317 break
1318 self.parser.phase = self.parser.phases["afterBody"]
1319
1320 def endTagHtml(self, token):
1321 # We repeat the test for the body end tag token being ignored here
1322 if self.tree.elementInScope("body"):
1323 self.endTagBody(impliedTagToken("body"))
1324 return token
1325
1326 def endTagBlock(self, token):
1327 # Put us back in the right whitespace handling mode
1328 if token["name"] == "pre":
1329 self.processSpaceCharacters = self.processSpaceCharactersNonPre
1330 inScope = self.tree.elementInScope(token["name"])
1331 if inScope:
1332 self.tree.generateImpliedEndTags()
1333 if self.tree.openElements[-1].name != token["name"]:
1334 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1335 if inScope:
1336 node = self.tree.openElements.pop()
1337 while node.name != token["name"]:
1338 node = self.tree.openElements.pop()
1339
1340 def endTagForm(self, token):
1341 node = self.tree.formPointer
1342 self.tree.formPointer = None
1343 if node is None or not self.tree.elementInScope(node):
1344 self.parser.parseError("unexpected-end-tag",
1345 {"name": "form"})
1346 else:
1347 self.tree.generateImpliedEndTags()
1348 if self.tree.openElements[-1] != node:
1349 self.parser.parseError("end-tag-too-early-ignored",
1350 {"name": "form"})
1351 self.tree.openElements.remove(node)
1352
1353 def endTagListItem(self, token):
1354 if token["name"] == "li":
1355 variant = "list"
1356 else:
1357 variant = None
1358 if not self.tree.elementInScope(token["name"], variant=variant):
1359 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1360 else:
1361 self.tree.generateImpliedEndTags(exclude=token["name"])
1362 if self.tree.openElements[-1].name != token["name"]:
1363 self.parser.parseError(
1364 "end-tag-too-early",
1365 {"name": token["name"]})
1366 node = self.tree.openElements.pop()
1367 while node.name != token["name"]:
1368 node = self.tree.openElements.pop()
1369
1370 def endTagHeading(self, token):
1371 for item in headingElements:
1372 if self.tree.elementInScope(item):
1373 self.tree.generateImpliedEndTags()
1374 break
1375 if self.tree.openElements[-1].name != token["name"]:
1376 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1377
1378 for item in headingElements:
1379 if self.tree.elementInScope(item):
1380 item = self.tree.openElements.pop()
1381 while item.name not in headingElements:
1382 item = self.tree.openElements.pop()
1383 break
1384
    def endTagFormatting(self, token):
        """The much-feared adoption agency algorithm

        Handles the end tag of a formatting element (see the end-tag
        dispatcher for the names routed here). The outer loop runs at most
        eight times; each pass either finishes (``return``) or re-parents
        part of the tree and retries. Falls back to ``endTagOther`` when no
        usable formatting element is found.
        """
        # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
        # XXX Better parseError messages appreciated.

        # Step 1
        outerLoopCounter = 0

        # Step 2
        while outerLoopCounter < 8:

            # Step 3
            outerLoopCounter += 1

            # Step 4:

            # Let the formatting element be the last element in
            # the list of active formatting elements that:
            # - is between the end of the list and the last scope
            # marker in the list, if any, or the start of the list
            # otherwise, and
            # - has the same tag name as the token.
            formattingElement = self.tree.elementInActiveFormattingElements(
                token["name"])
            if (not formattingElement or
                (formattingElement in self.tree.openElements and
                 not self.tree.elementInScope(formattingElement.name))):
                # If there is no such node, then abort these steps
                # and instead act as described in the "any other
                # end tag" entry below.
                self.endTagOther(token)
                return

            # Otherwise, if there is such a node, but that node is
            # not in the stack of open elements, then this is a
            # parse error; remove the element from the list, and
            # abort these steps.
            elif formattingElement not in self.tree.openElements:
                self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
                self.tree.activeFormattingElements.remove(formattingElement)
                return

            # Otherwise, if there is such a node, and that node is
            # also in the stack of open elements, but the element
            # is not in scope, then this is a parse error; ignore
            # the token, and abort these steps.
            # NOTE(review): this branch looks unreachable -- an element that
            # is on the stack but not in scope is already caught by the
            # first condition above. Confirm before relying on it.
            elif not self.tree.elementInScope(formattingElement.name):
                self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
                return

            # Otherwise, there is a formatting element and that
            # element is in the stack and is in scope. If the
            # element is not the current node, this is a parse
            # error. In any case, proceed with the algorithm as
            # written in the following steps.
            else:
                if formattingElement != self.tree.openElements[-1]:
                    self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

            # Step 5:

            # Let the furthest block be the topmost node in the
            # stack of open elements that is lower in the stack
            # than the formatting element, and is an element in
            # the special category. There might not be one.
            afeIndex = self.tree.openElements.index(formattingElement)
            furthestBlock = None
            for element in self.tree.openElements[afeIndex:]:
                if element.nameTuple in specialElements:
                    furthestBlock = element
                    break

            # Step 6:

            # If there is no furthest block, then the UA must
            # first pop all the nodes from the bottom of the stack
            # of open elements, from the current node up to and
            # including the formatting element, then remove the
            # formatting element from the list of active
            # formatting elements, and finally abort these steps.
            if furthestBlock is None:
                element = self.tree.openElements.pop()
                while element != formattingElement:
                    element = self.tree.openElements.pop()
                self.tree.activeFormattingElements.remove(element)
                return

            # Step 7
            # The element immediately before the formatting element on the
            # stack of open elements.
            commonAncestor = self.tree.openElements[afeIndex - 1]

            # Step 8:
            # The bookmark is supposed to help us identify where to reinsert
            # nodes in step 15. We have to ensure that we reinsert nodes after
            # the node before the active formatting element. Note the bookmark
            # can move in step 9.7
            bookmark = self.tree.activeFormattingElements.index(formattingElement)

            # Step 9
            lastNode = node = furthestBlock
            innerLoopCounter = 0

            index = self.tree.openElements.index(node)
            # Walk up the stack from the furthest block, at most three steps.
            while innerLoopCounter < 3:
                innerLoopCounter += 1
                # Node is element before node in open elements
                index -= 1
                node = self.tree.openElements[index]
                if node not in self.tree.activeFormattingElements:
                    # Not a formatting element: drop it from the stack and
                    # keep walking.
                    self.tree.openElements.remove(node)
                    continue
                # Step 9.6
                if node == formattingElement:
                    break
                # Step 9.7
                if lastNode == furthestBlock:
                    bookmark = self.tree.activeFormattingElements.index(node) + 1
                # Step 9.8
                clone = node.cloneNode()
                # Replace node with clone
                self.tree.activeFormattingElements[
                    self.tree.activeFormattingElements.index(node)] = clone
                self.tree.openElements[
                    self.tree.openElements.index(node)] = clone
                node = clone
                # Step 9.9
                # Remove lastNode from its parents, if any
                if lastNode.parent:
                    lastNode.parent.removeChild(lastNode)
                node.appendChild(lastNode)
                # Step 9.10
                lastNode = node

            # Step 10
            # Foster parent lastNode if commonAncestor is a
            # table, tbody, tfoot, thead, or tr we need to foster
            # parent the lastNode
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)

            if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
                parent, insertBefore = self.tree.getTableMisnestedNodePosition()
                parent.insertBefore(lastNode, insertBefore)
            else:
                commonAncestor.appendChild(lastNode)

            # Step 11
            clone = formattingElement.cloneNode()

            # Step 12
            furthestBlock.reparentChildren(clone)

            # Step 13
            furthestBlock.appendChild(clone)

            # Step 14
            # The clone takes the formatting element's place at the bookmark.
            self.tree.activeFormattingElements.remove(formattingElement)
            self.tree.activeFormattingElements.insert(bookmark, clone)

            # Step 15
            # On the stack the clone goes directly after the furthest block.
            self.tree.openElements.remove(formattingElement)
            self.tree.openElements.insert(
                self.tree.openElements.index(furthestBlock) + 1, clone)
1547
1548 def endTagAppletMarqueeObject(self, token):
1549 if self.tree.elementInScope(token["name"]):
1550 self.tree.generateImpliedEndTags()
1551 if self.tree.openElements[-1].name != token["name"]:
1552 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1553
1554 if self.tree.elementInScope(token["name"]):
1555 element = self.tree.openElements.pop()
1556 while element.name != token["name"]:
1557 element = self.tree.openElements.pop()
1558 self.tree.clearActiveFormattingElements()
1559
1560 def endTagBr(self, token):
1561 self.parser.parseError("unexpected-end-tag-treated-as",
1562 {"originalName": "br", "newName": "br element"})
1563 self.tree.reconstructActiveFormattingElements()
1564 self.tree.insertElement(impliedTagToken("br", "StartTag"))
1565 self.tree.openElements.pop()
1566
1567 def endTagOther(self, token):
1568 for node in self.tree.openElements[::-1]:
1569 if node.name == token["name"]:
1570 self.tree.generateImpliedEndTags(exclude=token["name"])
1571 if self.tree.openElements[-1].name != token["name"]:
1572 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1573 while self.tree.openElements.pop() != node:
1574 pass
1575 break
1576 else:
1577 if node.nameTuple in specialElements:
1578 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1579 break
1580
    # Start-tag dispatch table: tag name (or tuple of names) -> handler.
    # Names not listed fall through to ``startTagOther``.
    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("base", "basefont", "bgsound", "command", "link", "meta",
          "script", "style", "title"),
         startTagProcessInHead),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("address", "article", "aside", "blockquote", "center", "details",
          "dir", "div", "dl", "fieldset", "figcaption", "figure",
          "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
          "section", "summary", "ul"),
         startTagCloseP),
        (headingElements, startTagHeading),
        (("pre", "listing"), startTagPreListing),
        ("form", startTagForm),
        (("li", "dd", "dt"), startTagListItem),
        ("plaintext", startTagPlaintext),
        ("a", startTagA),
        (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
          "strong", "tt", "u"), startTagFormatting),
        ("nobr", startTagNobr),
        ("button", startTagButton),
        (("applet", "marquee", "object"), startTagAppletMarqueeObject),
        ("xmp", startTagXmp),
        ("table", startTagTable),
        (("area", "br", "embed", "img", "keygen", "wbr"),
         startTagVoidFormatting),
        (("param", "source", "track"), startTagParamSource),
        ("input", startTagInput),
        ("hr", startTagHr),
        ("image", startTagImage),
        ("isindex", startTagIsIndex),
        ("textarea", startTagTextarea),
        ("iframe", startTagIFrame),
        ("noscript", startTagNoscript),
        (("noembed", "noframes"), startTagRawtext),
        ("select", startTagSelect),
        (("rp", "rt"), startTagRpRt),
        (("option", "optgroup"), startTagOpt),
        # NOTE: ("math") and ("svg") are plain strings, not 1-tuples.
        (("math"), startTagMath),
        (("svg"), startTagSvg),
        (("caption", "col", "colgroup", "frame", "head",
          "tbody", "td", "tfoot", "th", "thead",
          "tr"), startTagMisplaced)
    ])
    startTagHandler.default = startTagOther

    # End-tag dispatch table; unlisted names fall through to ``endTagOther``.
    endTagHandler = _utils.MethodDispatcher([
        ("body", endTagBody),
        ("html", endTagHtml),
        (("address", "article", "aside", "blockquote", "button", "center",
          "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
          "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
          "section", "summary", "ul"), endTagBlock),
        ("form", endTagForm),
        ("p", endTagP),
        (("dd", "dt", "li"), endTagListItem),
        (headingElements, endTagHeading),
        (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
          "strike", "strong", "tt", "u"), endTagFormatting),
        (("applet", "marquee", "object"), endTagAppletMarqueeObject),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther
1645
1646
class TextPhase(Phase):
    """Phase used while reading RCDATA/RAWTEXT element content.

    Every way out of this phase pops the current element and restores
    ``parser.originalPhase``.
    """
    __slots__ = ()

    def processCharacters(self, token):
        """Append the character data to the current node."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Report the unclosed element, leave the phase and reprocess EOF."""
        unclosed = self.tree.openElements[-1]
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosed.name})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Start tags are never expected in this phase."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Close the <script> element and return to the original phase."""
        closed = self.tree.openElements.pop()
        assert closed.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Close the current element and return to the original phase."""
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase

    startTagHandler = _utils.MethodDispatcher([])
    startTagHandler.default = startTagOther
    endTagHandler = _utils.MethodDispatcher([("script", endTagScript)])
    endTagHandler.default = endTagOther
1679
1680
class InTablePhase(Phase):
    """Token handlers for the "in table" insertion mode.

    Tokens that do not belong in a table are handed to the "inBody" phase
    with ``tree.insertFromTable`` set for the duration of the call.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table
    __slots__ = tuple()
1684
1685 # helper methods
1686 def clearStackToTableContext(self):
1687 # "clear the stack back to a table context"
1688 while self.tree.openElements[-1].name not in ("table", "html"):
1689 # self.parser.parseError("unexpected-implied-end-tag-in-table",
1690 # {"name": self.tree.openElements[-1].name})
1691 self.tree.openElements.pop()
1692 # When the current node is <html> it's an innerHTML case
1693
1694 # processing methods
1695 def processEOF(self):
1696 if self.tree.openElements[-1].name != "html":
1697 self.parser.parseError("eof-in-table")
1698 else:
1699 assert self.parser.innerHTML
1700 # Stop parsing
1701
1702 def processSpaceCharacters(self, token):
1703 originalPhase = self.parser.phase
1704 self.parser.phase = self.parser.phases["inTableText"]
1705 self.parser.phase.originalPhase = originalPhase
1706 self.parser.phase.processSpaceCharacters(token)
1707
1708 def processCharacters(self, token):
1709 originalPhase = self.parser.phase
1710 self.parser.phase = self.parser.phases["inTableText"]
1711 self.parser.phase.originalPhase = originalPhase
1712 self.parser.phase.processCharacters(token)
1713
1714 def insertText(self, token):
1715 # If we get here there must be at least one non-whitespace character
1716 # Do the table magic!
1717 self.tree.insertFromTable = True
1718 self.parser.phases["inBody"].processCharacters(token)
1719 self.tree.insertFromTable = False
1720
1721 def startTagCaption(self, token):
1722 self.clearStackToTableContext()
1723 self.tree.activeFormattingElements.append(Marker)
1724 self.tree.insertElement(token)
1725 self.parser.phase = self.parser.phases["inCaption"]
1726
1727 def startTagColgroup(self, token):
1728 self.clearStackToTableContext()
1729 self.tree.insertElement(token)
1730 self.parser.phase = self.parser.phases["inColumnGroup"]
1731
1732 def startTagCol(self, token):
1733 self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
1734 return token
1735
1736 def startTagRowGroup(self, token):
1737 self.clearStackToTableContext()
1738 self.tree.insertElement(token)
1739 self.parser.phase = self.parser.phases["inTableBody"]
1740
1741 def startTagImplyTbody(self, token):
1742 self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
1743 return token
1744
1745 def startTagTable(self, token):
1746 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1747 {"startName": "table", "endName": "table"})
1748 self.parser.phase.processEndTag(impliedTagToken("table"))
1749 if not self.parser.innerHTML:
1750 return token
1751
1752 def startTagStyleScript(self, token):
1753 return self.parser.phases["inHead"].processStartTag(token)
1754
1755 def startTagInput(self, token):
1756 if ("type" in token["data"] and
1757 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1758 self.parser.parseError("unexpected-hidden-input-in-table")
1759 self.tree.insertElement(token)
1760 # XXX associate with form
1761 self.tree.openElements.pop()
1762 else:
1763 self.startTagOther(token)
1764
1765 def startTagForm(self, token):
1766 self.parser.parseError("unexpected-form-in-table")
1767 if self.tree.formPointer is None:
1768 self.tree.insertElement(token)
1769 self.tree.formPointer = self.tree.openElements[-1]
1770 self.tree.openElements.pop()
1771
1772 def startTagOther(self, token):
1773 self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
1774 # Do the table magic!
1775 self.tree.insertFromTable = True
1776 self.parser.phases["inBody"].processStartTag(token)
1777 self.tree.insertFromTable = False
1778
1779 def endTagTable(self, token):
1780 if self.tree.elementInScope("table", variant="table"):
1781 self.tree.generateImpliedEndTags()
1782 if self.tree.openElements[-1].name != "table":
1783 self.parser.parseError("end-tag-too-early-named",
1784 {"gotName": "table",
1785 "expectedName": self.tree.openElements[-1].name})
1786 while self.tree.openElements[-1].name != "table":
1787 self.tree.openElements.pop()
1788 self.tree.openElements.pop()
1789 self.parser.resetInsertionMode()
1790 else:
1791 # innerHTML case
1792 assert self.parser.innerHTML
1793 self.parser.parseError()
1794
1795 def endTagIgnore(self, token):
1796 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1797
1798 def endTagOther(self, token):
1799 self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
1800 # Do the table magic!
1801 self.tree.insertFromTable = True
1802 self.parser.phases["inBody"].processEndTag(token)
1803 self.tree.insertFromTable = False
1804
    # Start-tag dispatch table; unlisted names fall through to
    # ``startTagOther``.
    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("caption", startTagCaption),
        ("colgroup", startTagColgroup),
        ("col", startTagCol),
        (("tbody", "tfoot", "thead"), startTagRowGroup),
        (("td", "th", "tr"), startTagImplyTbody),
        ("table", startTagTable),
        (("style", "script"), startTagStyleScript),
        ("input", startTagInput),
        ("form", startTagForm)
    ])
    startTagHandler.default = startTagOther

    # End-tag dispatch table; unlisted names fall through to ``endTagOther``.
    endTagHandler = _utils.MethodDispatcher([
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "tbody", "td",
          "tfoot", "th", "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
1825
1826
class InTableTextPhase(Phase):
    """Phase that buffers character tokens until another kind of token.

    Character tokens are collected in ``characterTokens``.  On a comment,
    start tag, end tag or EOF the buffered text is flushed, control goes
    back to ``originalPhase`` and the triggering token is handed back to
    the caller.
    """
    __slots__ = ("originalPhase", "characterTokens")

    def __init__(self, *args, **kwargs):
        super(InTableTextPhase, self).__init__(*args, **kwargs)
        # Phase to return to after flushing; None until assigned from
        # outside this class.
        self.originalPhase = None
        # Character tokens buffered since the last flush.
        self.characterTokens = []

    def flushCharacters(self):
        """Emit the buffered text and empty the buffer.

        Text containing anything but space characters is passed, wrapped
        in a Characters token, to the "inTable" phase's ``insertText``;
        non-empty text made only of space characters is inserted directly
        through the tree builder.
        """
        text = "".join(tok["data"] for tok in self.characterTokens)
        if not all(char in spaceCharacters for char in text):
            self.parser.phases["inTable"].insertText(
                {"type": tokenTypes["Characters"], "data": text})
        elif text:
            self.tree.insertText(text)
        self.characterTokens = []

    def _flushAndLeave(self):
        # Flush the buffer, then give control back to the saved phase.
        self.flushCharacters()
        self.parser.phase = self.originalPhase

    def processComment(self, token):
        self._flushAndLeave()
        return token

    def processEOF(self):
        self._flushAndLeave()
        return True

    def processCharacters(self, token):
        # A token that is exactly one NUL character is dropped.
        if token["data"] != "\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        # pretty sure we should never reach here
        self.characterTokens.append(token)

    def processStartTag(self, token):
        self._flushAndLeave()
        return token

    def processEndTag(self, token):
        self._flushAndLeave()
        return token
1873
1874
class InCaptionPhase(Phase):
    """Token handling while a ``caption`` element is being parsed.

    Most tokens are delegated to the "inBody" phase; table-structure tags
    first imply ``</caption>``.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    __slots__ = tuple()

    def ignoreEndTagCaption(self):
        """Return True when no ``caption`` element is in table scope."""
        in_scope = self.tree.elementInScope("caption", variant="table")
        return not in_scope

    def _impliedCaptionEnd(self, token):
        # Report an error, act as if </caption> had been seen, and hand the
        # token back (for reprocessing) unless that end tag was ignored.
        self.parser.parseError()
        # XXX Have to duplicate logic here to find out if the tag is ignored
        ignored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        if ignored:
            return None
        return token

    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        body = self.parser.phases["inBody"]
        return body.processCharacters(token)

    def startTagTableElement(self, token):
        return self._impliedCaptionEnd(token)

    def startTagOther(self, token):
        body = self.parser.phases["inBody"]
        return body.processStartTag(token)

    def endTagCaption(self, token):
        """Close the caption and go back to the "inTable" phase."""
        parser = self.parser
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert parser.innerHTML
            parser.parseError()
            return

        # AT this code is quite similar to endTagTable in "InTable"
        tree = self.tree
        tree.generateImpliedEndTags()
        stack = tree.openElements
        current = stack[-1].name
        if current != "caption":
            parser.parseError("expected-one-end-tag-but-got-another",
                              {"gotName": "caption",
                               "expectedName": current})
        # Pop up to and including the caption element.
        while stack.pop().name != "caption":
            pass
        tree.clearActiveFormattingElements()
        parser.phase = parser.phases["inTable"]

    def endTagTable(self, token):
        return self._impliedCaptionEnd(token)

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def endTagOther(self, token):
        body = self.parser.phases["inBody"]
        return body.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableElement)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("caption", endTagCaption),
        ("table", endTagTable),
        (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
          "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
1944
1945
class InColumnGroupPhase(Phase):
    """Token handling while a ``colgroup`` element is being parsed.

    Only ``<col>`` is inserted here; almost every other token first implies
    ``</colgroup>`` and is then handed back for reprocessing.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column
    __slots__ = tuple()

    def ignoreEndTagColgroup(self):
        """Return True when the current node is the ``html`` element."""
        current = self.tree.openElements[-1]
        return current.name == "html"

    def _impliedColgroupEnd(self):
        # Act as if </colgroup> had been seen.  Returns True when that end
        # tag was honoured, False when it was ignored.
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return not ignored

    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return None
        if self._impliedColgroupEnd():
            return True
        return None

    def processCharacters(self, token):
        if self._impliedColgroupEnd():
            return token
        return None

    def startTagCol(self, token):
        tree = self.tree
        tree.insertElement(token)
        # The col element is taken straight off the stack again.
        tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        if self._impliedColgroupEnd():
            return token
        return None

    def endTagColgroup(self, token):
        parser = self.parser
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            parser.phase = parser.phases["inTable"]
            return
        # innerHTML case
        assert parser.innerHTML
        parser.parseError()

    def endTagCol(self, token):
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        if self._impliedColgroupEnd():
            return token
        return None

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("col", startTagCol)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("colgroup", endTagColgroup),
        ("col", endTagCol)
    ])
    endTagHandler.default = endTagOther
2009
2010
class InTableBodyPhase(Phase):
    """Token handling inside ``tbody`` / ``thead`` / ``tfoot``.

    Rows switch to the "inRow" phase; anything not handled here is passed
    to the "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    __slots__ = tuple()

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until a row group or ``html`` is on top.

        No parse error is reported for the nodes that are dropped.
        """
        stack = self.tree.openElements
        while stack[-1].name not in ("tbody", "tfoot", "thead", "html"):
            stack.pop()
        if stack[-1].name == "html":
            assert self.parser.innerHTML

    def _closeRowGroupAndReprocess(self, token):
        # Shared by startTagTableOther and endTagTable: close the current
        # row group as if its end tag had been seen, then hand the token
        # back for reprocessing.
        tree = self.tree
        row_group_open = any(tree.elementInScope(group, variant="table")
                             for group in ("tbody", "thead", "tfoot"))
        if not row_group_open:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        self.endTagTableRowGroup(
            impliedTagToken(tree.openElements[-1].name))
        return token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        table = self.parser.phases["inTable"]
        return table.processSpaceCharacters(token)

    def processCharacters(self, token):
        table = self.parser.phases["inTable"]
        return table.processCharacters(token)

    def startTagTr(self, token):
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        # A cell directly inside a row group implies a <tr> first; the cell
        # token is then handed back for reprocessing.
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-cell-in-table-body", details)
        self.startTagTr(impliedTagToken("tr", "StartTag"))
        return token

    def startTagTableOther(self, token):
        return self._closeRowGroupAndReprocess(token)

    def startTagOther(self, token):
        table = self.parser.phases["inTable"]
        return table.processStartTag(token)

    def endTagTableRowGroup(self, token):
        name = token["name"]
        if not self.tree.elementInScope(name, variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": name})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        return self._closeRowGroupAndReprocess(token)

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-body", details)

    def endTagOther(self, token):
        table = self.parser.phases["inTable"]
        return table.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("tr", startTagTr),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
         startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "td", "th",
          "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
2108
2109
class InRowPhase(Phase):
    """Token handling inside a ``tr`` element.

    Cells switch to the "inCell" phase; anything not handled here is passed
    to the "inTable" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    __slots__ = tuple()

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until ``tr`` or ``html`` is on top.

        A parse error is reported for every element that gets dropped.
        """
        stack = self.tree.openElements
        while stack[-1].name not in ("tr", "html"):
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": stack[-1].name})
            stack.pop()

    def ignoreEndTagTr(self):
        """Return True when no ``tr`` element is in table scope."""
        in_scope = self.tree.elementInScope("tr", variant="table")
        return not in_scope

    def _closeRowAndReprocess(self, token):
        # Act as if </tr> had been seen; hand the token back for
        # reprocessing unless that end tag was ignored.
        ignored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # XXX how are we sure it's always ignored in the innerHTML case?
        if ignored:
            return None
        return token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        table = self.parser.phases["inTable"]
        return table.processSpaceCharacters(token)

    def processCharacters(self, token):
        table = self.parser.phases["inTable"]
        return table.processCharacters(token)

    def startTagTableCell(self, token):
        self.clearStackToTableRowContext()
        tree = self.tree
        tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        return self._closeRowAndReprocess(token)

    def startTagOther(self, token):
        table = self.parser.phases["inTable"]
        return table.processStartTag(token)

    def endTagTr(self, token):
        parser = self.parser
        if self.ignoreEndTagTr():
            # innerHTML case
            assert parser.innerHTML
            parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        parser.phase = parser.phases["inTableBody"]

    def endTagTable(self, token):
        # Reprocess the current tag if the tr end tag was not ignored
        return self._closeRowAndReprocess(token)

    def endTagTableRowGroup(self, token):
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return None
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-row", details)

    def endTagOther(self, token):
        table = self.parser.phases["inTable"]
        return table.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
          "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("tr", endTagTr),
        ("table", endTagTable),
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        (("body", "caption", "col", "colgroup", "html", "td", "th"),
         endTagIgnore)
    ])
    endTagHandler.default = endTagOther
2198
2199
class InCellPhase(Phase):
    """Token handling inside a ``td`` or ``th`` element.

    Content is delegated to the "inBody" phase; table-structure tags close
    the cell first and are then handed back for reprocessing.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    __slots__ = tuple()

    # helper
    def closeCell(self):
        """Close the open cell: ``td`` if one is in table scope, else ``th``."""
        for cell in ("td", "th"):
            if self.tree.elementInScope(cell, variant="table"):
                self.endTagTableCell(impliedTagToken(cell))
                break

    # the rest
    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        body = self.parser.phases["inBody"]
        return body.processCharacters(token)

    def startTagTableOther(self, token):
        tree = self.tree
        cell_open = (tree.elementInScope("td", variant="table") or
                     tree.elementInScope("th", variant="table"))
        if not cell_open:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def startTagOther(self, token):
        body = self.parser.phases["inBody"]
        return body.processStartTag(token)

    def endTagTableCell(self, token):
        name = token["name"]
        tree = self.tree
        if not tree.elementInScope(name, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": name})
            return

        tree.generateImpliedEndTags(name)
        stack = tree.openElements
        if stack[-1].name != name:
            self.parser.parseError("unexpected-cell-end-tag",
                                   {"name": name})
        # Pop up to and including the cell element (a single pop when the
        # cell is already the current node).
        while stack.pop().name != name:
            pass
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def endTagImply(self, token):
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def endTagOther(self, token):
        body = self.parser.phases["inBody"]
        return body.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("td", "th"), endTagTableCell),
        (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
        (("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
    ])
    endTagHandler.default = endTagOther
2275
2276
class InSelectPhase(Phase):
    """Token handling inside a ``select`` element.

    Only option / optgroup structure and text are inserted; most other tags
    are reported as parse errors and dropped.
    """
    __slots__ = tuple()

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        # A token that is exactly one NUL character is dropped.
        if token["data"] != "\u0000":
            self.tree.insertText(token["data"])

    def startTagOption(self, token):
        # We need to imply </option> if <option> is the current node.
        stack = self.tree.openElements
        if stack[-1].name == "option":
            stack.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        # Imply </option> and then </optgroup>, each only when that element
        # is the current node at that point.
        stack = self.tree.openElements
        for implied in ("option", "optgroup"):
            if stack[-1].name == implied:
                stack.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        # A nested <select> is treated like </select>.
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return None
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        head = self.parser.phases["inHead"]
        return head.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-select", details)

    def endTagOption(self, token):
        stack = self.tree.openElements
        if stack[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
            return
        stack.pop()

    def endTagOptgroup(self, token):
        stack = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if stack[-1].name == "option" and stack[-2].name == "optgroup":
            stack.pop()
        # It also closes </optgroup>
        if stack[-1].name == "optgroup":
            stack.pop()
            return
        # But nothing else
        self.parser.parseError("unexpected-end-tag-in-select",
                               {"name": "optgroup"})

    def endTagSelect(self, token):
        parser = self.parser
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert parser.innerHTML
            parser.parseError()
            return
        stack = self.tree.openElements
        # Pop up to and including the select element.
        while stack.pop().name != "select":
            pass
        parser.resetInsertionMode()

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-select", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("option", startTagOption),
        ("optgroup", startTagOptgroup),
        ("select", startTagSelect),
        (("input", "keygen", "textarea"), startTagInput),
        ("script", startTagScript)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("option", endTagOption),
        ("optgroup", endTagOptgroup),
        ("select", endTagSelect)
    ])
    endTagHandler.default = endTagOther
2375
2376
class InSelectInTablePhase(Phase):
    """Token handling for a ``select`` element nested in table content.

    Behaves like the "inSelect" phase, except that table-structure tags
    close the select first and are then handed back for reprocessing.
    """
    __slots__ = tuple()

    def processEOF(self):
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        select = self.parser.phases["inSelect"]
        return select.processCharacters(token)

    def startTagTable(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", details)
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        select = self.parser.phases["inSelect"]
        return select.processStartTag(token)

    def endTagTable(self, token):
        name = token["name"]
        self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": name})
        if not self.tree.elementInScope(name, variant="table"):
            # No matching element in table scope: the end tag is dropped.
            return None
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        select = self.parser.phases["inSelect"]
        return select.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         startTagTable)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         endTagTable)
    ])
    endTagHandler.default = endTagOther
2414
2415
class InForeignContentPhase(Phase):
    """Token handling for foreign (MathML / SVG) content.

    Start tags are inserted in the namespace of the current node after the
    relevant name / attribute adjustments, unless they are "breakout" HTML
    elements, in which case foreign elements are popped and the token is
    handed back for reprocessing.
    """
    __slots__ = tuple()

    # HTML element names that end foreign content when seen as a start tag.
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # Lower-case SVG tag name -> its mixed-case spelling.  Kept at class
    # level so the table is built once rather than on every start tag.
    _svgTagNameReplacements = {
        "altglyph": "altGlyph",
        "altglyphdef": "altGlyphDef",
        "altglyphitem": "altGlyphItem",
        "animatecolor": "animateColor",
        "animatemotion": "animateMotion",
        "animatetransform": "animateTransform",
        "clippath": "clipPath",
        "feblend": "feBlend",
        "fecolormatrix": "feColorMatrix",
        "fecomponenttransfer": "feComponentTransfer",
        "fecomposite": "feComposite",
        "feconvolvematrix": "feConvolveMatrix",
        "fediffuselighting": "feDiffuseLighting",
        "fedisplacementmap": "feDisplacementMap",
        "fedistantlight": "feDistantLight",
        "feflood": "feFlood",
        "fefunca": "feFuncA",
        "fefuncb": "feFuncB",
        "fefuncg": "feFuncG",
        "fefuncr": "feFuncR",
        "fegaussianblur": "feGaussianBlur",
        "feimage": "feImage",
        "femerge": "feMerge",
        "femergenode": "feMergeNode",
        "femorphology": "feMorphology",
        "feoffset": "feOffset",
        "fepointlight": "fePointLight",
        "fespecularlighting": "feSpecularLighting",
        "fespotlight": "feSpotLight",
        "fetile": "feTile",
        "feturbulence": "feTurbulence",
        "foreignobject": "foreignObject",
        "glyphref": "glyphRef",
        "lineargradient": "linearGradient",
        "radialgradient": "radialGradient",
        "textpath": "textPath",
    }

    def adjustSVGTagNames(self, token):
        """Rewrite ``token["name"]`` in place to its mixed-case SVG spelling.

        Names without an entry in the replacement table are left untouched.
        """
        name = token["name"]
        if name in self._svgTagNameReplacements:
            token["name"] = self._svgTagNameReplacements[name]

    def processCharacters(self, token):
        """Insert character data via ``Phase.processCharacters``.

        A token that is exactly one NUL character is replaced by U+FFFD;
        any other data containing a non-space character clears the
        parser's ``framesetOK`` flag.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Insert a foreign element, or break out of foreign content.

        :returns: the token when it must be reprocessed (breakout case),
            otherwise ``None``
        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
            (token["name"] == "font" and
             set(token["data"].keys()) & {"color", "face", "size"})):
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            # Pop until the current node is in the default namespace or is
            # an HTML / MathML text integration point.
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        else:
            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            # The new element lives in the namespace of the current node.
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Close the matching foreign element, or defer to the current phase.

        The stack of open elements is walked from the current node
        downwards, comparing lower-cased node names with the token name.
        On a match everything up to and including that node is popped.  If
        a node in the default namespace is reached first, the token is
        passed to the parser's current phase instead.

        :returns: whatever the current phase returned when the token was
            deferred to it, otherwise ``None``
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token
2529
2530
class AfterBodyPhase(Phase):
    """Token handling after ``</body>`` has been seen.

    Apart from comments, ``<html>`` and ``</html>``, any token is a parse
    error that switches back to the "inBody" phase for reprocessing.
    """
    __slots__ = tuple()

    def _backToBody(self, token):
        # Switch to "inBody" and hand the token back for reprocessing.
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEOF(self):
        # Stop parsing
        pass

    def processComment(self, token):
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        html_element = self.tree.openElements[0]
        self.tree.insertComment(token, html_element)

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-after-body")
        return self._backToBody(token)

    def startTagHtml(self, token):
        body = self.parser.phases["inBody"]
        return body.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-body", details)
        return self._backToBody(token)

    def endTagHtml(self, name):
        parser = self.parser
        if not parser.innerHTML:
            parser.phase = parser.phases["afterAfterBody"]
            return
        parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-body", details)
        return self._backToBody(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
    endTagHandler.default = endTagOther
2576
2577
class InFramesetPhase(Phase):
    """Token handling inside a ``frameset`` element.

    Only frameset, frame and noframes start tags are acted upon; other
    tags and character data are reported as parse errors.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    __slots__ = tuple()

    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-frameset")

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        tree = self.tree
        tree.insertElement(token)
        # The frame element is taken straight off the stack again.
        tree.openElements.pop()

    def startTagNoframes(self, token):
        body = self.parser.phases["inBody"]
        return body.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-frameset", details)

    def endTagFrameset(self, token):
        parser = self.parser
        stack = self.tree.openElements
        if stack[-1].name == "html":
            # innerHTML case
            parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        else:
            stack.pop()
        if parser.innerHTML or stack[-1].name == "frameset":
            return
        # If we're not in innerHTML mode and the current node is not a
        # "frameset" element (anymore) then switch.
        parser.phase = parser.phases["afterFrameset"]

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-frameset", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("frameset", startTagFrameset),
        ("frame", startTagFrame),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("frameset", endTagFrameset)
    ])
    endTagHandler.default = endTagOther
2633
2634
class AfterFramesetPhase(Phase):
    """Token handling after the outermost ``</frameset>``.

    ``<noframes>`` goes to the "inHead" phase and ``</html>`` moves on to
    "afterAfterFrameset"; other tags and character data are parse errors.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    __slots__ = tuple()

    def processEOF(self):
        # Stop parsing
        pass

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        head = self.parser.phases["inHead"]
        return head.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset", details)

    def endTagHtml(self, token):
        parser = self.parser
        parser.phase = parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("html", endTagHtml)
    ])
    endTagHandler.default = endTagOther
2670
2671
class AfterAfterBodyPhase(Phase):
    """Token handling after ``</html>`` that followed the body.

    Comments are attached to the document.  Non-space characters and any
    tag other than ``<html>`` are parse errors that switch back to the
    "inBody" phase for reprocessing.
    """
    __slots__ = tuple()

    def _backToBody(self, token):
        # Switch to "inBody" and hand the token back for reprocessing.
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEOF(self):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        body = self.parser.phases["inBody"]
        return body.processSpaceCharacters(token)

    def processCharacters(self, token):
        self.parser.parseError("expected-eof-but-got-char")
        return self._backToBody(token)

    def startTagHtml(self, token):
        body = self.parser.phases["inBody"]
        return body.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)
        return self._backToBody(token)

    def processEndTag(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)
        return self._backToBody(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml)
    ])
    startTagHandler.default = startTagOther
2708
2709
class AfterAfterFramesetPhase(Phase):
    """Token handling after ``</html>`` that followed a frameset.

    Comments are attached to the document; ``<html>`` and ``<noframes>``
    are delegated to other phases.  Other tags and non-space characters
    are reported as parse errors and the phase does not change.
    """
    __slots__ = tuple()

    def processEOF(self):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        body = self.parser.phases["inBody"]
        return body.processSpaceCharacters(token)

    def processCharacters(self, token):
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        body = self.parser.phases["inBody"]
        return body.processStartTag(token)

    def startTagNoFrames(self, token):
        head = self.parser.phases["inHead"]
        return head.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)

    def processEndTag(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("noframes", startTagNoFrames)
    ])
    startTagHandler.default = startTagOther
2744
2745# pylint:enable=unused-argument
2746
2747
# Registry of phase classes, keyed by the names used in
# ``parser.phases[...]`` lookups throughout this module.
_phases = {
    "initial": InitialPhase,
    "beforeHtml": BeforeHtmlPhase,
    "beforeHead": BeforeHeadPhase,
    "inHead": InHeadPhase,
    "inHeadNoscript": InHeadNoscriptPhase,
    "afterHead": AfterHeadPhase,
    "inBody": InBodyPhase,
    "text": TextPhase,
    "inTable": InTablePhase,
    "inTableText": InTableTextPhase,
    "inCaption": InCaptionPhase,
    "inColumnGroup": InColumnGroupPhase,
    "inTableBody": InTableBodyPhase,
    "inRow": InRowPhase,
    "inCell": InCellPhase,
    "inSelect": InSelectPhase,
    "inSelectInTable": InSelectInTablePhase,
    "inForeignContent": InForeignContentPhase,
    "afterBody": AfterBodyPhase,
    "inFrameset": InFramesetPhase,
    "afterFrameset": AfterFramesetPhase,
    "afterAfterBody": AfterAfterBodyPhase,
    "afterAfterFrameset": AfterAfterFramesetPhase,
    # NOTE(review): an earlier "XXX after after frameset" marker stood here;
    # that phase is registered on the line above.
}
2774
2775
def adjust_attributes(token, replacements):
    """Rename the attributes of *token* according to *replacements*.

    ``token['data']`` is rebuilt (as the same mapping type) only when at
    least one of its keys appears in *replacements*; otherwise the token is
    left untouched.  Values are kept as they are.
    """
    attributes = token['data']
    if viewkeys(attributes) & viewkeys(replacements):
        renamed = ((replacements.get(name, name), value)
                   for name, value in attributes.items())
        token['data'] = type(attributes)(renamed)
2781
2782
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict for a tag that is implied, not parsed.

    :arg name: tag name stored under ``"name"``
    :arg type: key into ``tokenTypes`` selecting the token type
    :arg attributes: mapping stored under ``"data"``; a fresh empty dict
        is used when omitted
    :arg selfClosing: value stored under ``"selfClosing"``
    :returns: the token dict
    """
    data = {} if attributes is None else attributes
    token = {"type": tokenTypes[type], "name": name}
    token["data"] = data
    token["selfClosing"] = selfClosing
    return token
2789
2790
class ParseError(Exception):
    """Error in parsed document"""