Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/html5lib/html5parser.py: 94%
1535 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:18 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:18 +0000
1from __future__ import absolute_import, division, unicode_literals
2from six import with_metaclass, viewkeys
4import types
6from . import _inputstream
7from . import _tokenizer
9from . import treebuilders
10from .treebuilders.base import Marker
12from . import _utils
13from .constants import (
14 spaceCharacters, asciiUpper2Lower,
15 specialElements, headingElements, cdataElements, rcdataElements,
16 tokenTypes, tagTokenTypes,
17 namespaces,
18 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
19 adjustForeignAttributes as adjustForeignAttributesMap,
20 adjustMathMLAttributes, adjustSVGAttributes,
21 E,
22 _ReparseException
23)
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML document as a string or file-like object into a tree

    Convenience wrapper: looks up the named treebuilder, builds an
    :class:`HTMLParser` around it and forwards ``doc`` plus any extra
    keyword arguments to :meth:`HTMLParser.parse`.

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder_class = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_class,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, **kwargs)
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    Convenience wrapper: looks up the named treebuilder, builds an
    :class:`HTMLParser` around it and forwards ``doc``, ``container`` and
    any extra keyword arguments to :meth:`HTMLParser.parseFragment`.

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, **kwargs)
def method_decorator_metaclass(function):
    """Return a metaclass that applies ``function`` to every plain function
    found in the namespace of each class it creates.

    Attributes that are not ``types.FunctionType`` instances are left
    untouched.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Replace the values in place (keys are unchanged, so the
            # namespace keeps its size and ordering).
            classDict.update({
                attr_name: function(attr_value)
                for attr_name, attr_value in classDict.items()
                if isinstance(attr_value, types.FunctionType)
            })
            return type.__new__(meta, classname, bases, classDict)

    return Decorated
class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned, or the name of one as a string. Built in treebuilders can
            be accessed through html5lib.treebuilders.getTreeBuilder(treeType)

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()                     # generates parser with etree builder
        >>> parser = HTMLParser('lxml', strict=True)  # generates parser with lxml builder which is strict

        """

        # Raise an exception on the first error encountered
        self.strict = strict

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        elif isinstance(tree, str):
            tree = treebuilders.getTreeBuilder(tree)

        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        # One phase instance per phase name, all sharing this parser and tree.
        self.phases = {name: cls(self, self.tree) for name, cls in
                       getPhases(debug).items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Tokenize ``stream`` and run the main loop over it.

        Remaining keyword arguments are forwarded to the tokenizer. If the
        main loop raises ``_ReparseException`` the parser state is reset and
        the main loop is run once more.
        """
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            self.reset()
            self.mainLoop()

    def reset(self):
        """Reset the tree and all per-parse parser state."""
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            self.innerHTML = self.container.lower()

            # The tokenizer start state depends on the fragment's container.
            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0].name

    def isHTMLIntegrationPoint(self, element):
        """Return whether ``element`` counts as an HTML integration point.

        A MathML ``annotation-xml`` element qualifies only when its
        ``encoding`` attribute is (ASCII case-insensitively) ``text/html``
        or ``application/xhtml+xml``; every other element is looked up in
        ``htmlIntegrationPointElements``.
        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether ``element`` is listed in
        ``mathmlTextIntegrationPointElements``."""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Feed every token from the tokenizer to the appropriate phase.

        A phase handler may return a token to have it reprocessed (the
        current phase may have changed in the meantime); processing of one
        tokenizer token continues until a handler returns ``None``. After
        the tokenizer is exhausted, ``processEOF`` is called on the current
        phase until it stops asking for reprocessing.
        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        # Loop invariants, built once rather than for every token.
        text_token_types = (CharactersToken, SpaceCharactersToken)
        html_token_types = (StartTagToken, CharactersToken, SpaceCharactersToken)
        mathml_text_exceptions = frozenset(["mglyph", "malignmark"])

        for token in self.tokenizer:
            prev_token = None
            new_token = token
            while new_token is not None:
                prev_token = new_token
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                token_type = new_token["type"]

                if token_type == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # Decide between the current phase and foreign-content
                    # handling. The tag name is read from new_token, the
                    # token whose type was just inspected.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((token_type == StartTagToken and
                           new_token["name"] not in mathml_text_exceptions) or
                          token_type in text_token_types)) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         token_type == StartTagToken and
                         new_token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         token_type in html_token_types)):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if token_type == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif token_type == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif token_type == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif token_type == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif token_type == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif token_type == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

            # A self-closing start tag that no handler acknowledged is an error.
            if (token_type == StartTagToken and prev_token["selfClosing"] and
                    not prev_token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # Guard against bouncing between phases forever.
                assert self.phase not in phases

    def parse(self, stream, *args, **kwargs):
        """Parse an HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse an HTML fragment into a well-formed tree fragment

        :arg container: name of the element whose innerHTML is being set
            (``_parse`` defaults this to ``'div'`` when it is not given)

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('<b>this is a fragment</b>')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error at the current stream position.

        :arg errorcode: key identifying the error message

        :arg datavars: mapping interpolated into the error message

        :raises ParseError: when the parser was created with ``strict=True``
        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def adjustMathMLAttributes(self, token):
        """Apply the ``adjustMathMLAttributes`` replacements to ``token``."""
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        """Apply the ``adjustSVGAttributes`` replacements to ``token``."""
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        """Apply the ``adjustForeignAttributes`` replacements to ``token``."""
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # pylint:disable=unused-argument
        # NOTE(review): nothing in this class assigns ``self.parser``, so this
        # looks like it raises AttributeError if it is ever called. Behaviour
        # is left untouched; confirm whether the method is still needed.
        self.parser.phase()

    def resetInsertionMode(self):
        """Pick the phase that matches the current stack of open elements.

        Walks the open elements from the innermost outwards and switches to
        the phase mapped to the first element name found in the table below.
        On reaching the bottom of the stack the fragment container name is
        used instead, falling back to the "inBody" phase.
        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            # Elements outside the default namespace never select a phase.
            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Insert the element for ``token`` and switch to the "text" phase.

        :arg token: start tag token of the element to insert

        :arg contentType: ``"RAWTEXT"`` or ``"RCDATA"``; selects the
            tokenizer state used for the element's content
        """
        # Generic RCDATA/RAWTEXT Parsing algorithm
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        # Remember the phase in effect before switching to "text".
        self.originalPhase = self.phase

        self.phase = self.phases["text"]
399@_utils.memoize
400def getPhases(debug):
def log(function):
    """Logger that records which phase processes each token

    Returns a wrapper around ``function``. When the wrapped function's name
    starts with "process" and it is called with at least one positional
    argument, an entry describing the tokenizer state, the parser's current
    phase, the handling phase, the function name and the token is appended
    to ``self.parser.log`` before the call is forwarded.
    """
    # Local import keeps this block self-contained.
    import functools

    type_names = {value: key for key, value in tokenTypes.items()}

    # functools.wraps keeps the original __name__/__doc__ on the wrapper.
    @functools.wraps(function)
    def wrapped(self, *args, **kwargs):
        if function.__name__.startswith("process") and len(args) > 0:
            token = args[0]
            info = {"type": type_names[token['type']]}
            if token['type'] in tagTokenTypes:
                info["name"] = token['name']

            self.parser.log.append((self.parser.tokenizer.state.__name__,
                                    self.parser.phase.__class__.__name__,
                                    self.__class__.__name__,
                                    function.__name__,
                                    info))
        return function(self, *args, **kwargs)

    return wrapped
def getMetaclass(use_metaclass, metaclass_func):
    """Return a metaclass decorating every method with ``metaclass_func``
    when ``use_metaclass`` is truthy, otherwise plain ``type``."""
    if not use_metaclass:
        return type
    return method_decorator_metaclass(metaclass_func)
428 # pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))):
    """Base class for helper object that implements each phase of processing

    Holds the owning parser and its tree, plus two per-instance caches that
    map tag names to the handlers looked up in the subclass's
    ``startTagHandler`` / ``endTagHandler``.
    """
    __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        self.__startTagCache = {}
        self.__endTagCache = {}

    def processEOF(self):
        """Handle end of input; subclasses must override."""
        raise NotImplementedError

    def processComment(self, token):
        """Insert the comment under the current (innermost open) element."""
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        current_node = self.tree.openElements[-1]
        self.tree.insertComment(token, current_node)

    def processDoctype(self, token):
        """Report a doctype seen where none is expected."""
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        """Insert the token's character data as text."""
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        """Insert the token's whitespace as text."""
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        """Dispatch a start tag token to the handler registered for its name."""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tag_name = token["name"]
        cache = self.__startTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tag_name in cache:
            handler = cache[tag_name]
        else:
            handler = cache[tag_name] = self.startTagHandler[tag_name]
            # bound the cache size in case we get loads of unknown tags
            size_limit = len(self.startTagHandler) * 1.1
            while len(cache) > size_limit:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        return handler(token)

    def startTagHtml(self, token):
        """Copy attributes of a repeated <html> tag onto the root element.

        Only attributes the root element does not already have are added.
        """
        parser = self.parser
        if token["name"] == "html" and not parser.firstStartTag:
            parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        for attr, value in token["data"].items():
            root = self.tree.openElements[0]
            if attr not in root.attributes:
                root.attributes[attr] = value
        parser.firstStartTag = False

    def processEndTag(self, token):
        """Dispatch an end tag token to the handler registered for its name."""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tag_name = token["name"]
        cache = self.__endTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tag_name in cache:
            handler = cache[tag_name]
        else:
            handler = cache[tag_name] = self.endTagHandler[tag_name]
            # bound the cache size in case we get loads of unknown tags
            size_limit = len(self.endTagHandler) * 1.1
            while len(cache) > size_limit:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        return handler(token)
class InitialPhase(Phase):
    """Phase in effect before any doctype or content has been seen.

    A doctype token selects the document's compatibility mode; any other
    significant token is a parse error that forces quirks mode and is then
    handed back for reprocessing in the "beforeHtml" phase.
    """
    __slots__ = tuple()

    # Lower-cased public identifier prefixes that select quirks mode.
    _QUIRKS_PUBLIC_ID_PREFIXES = (
        "+//silmaril//dtd html pro v0r11 19970101//",
        "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
        "-//as//dtd html 3.0 aswedit + extensions//",
        "-//ietf//dtd html 2.0 level 1//",
        "-//ietf//dtd html 2.0 level 2//",
        "-//ietf//dtd html 2.0 strict level 1//",
        "-//ietf//dtd html 2.0 strict level 2//",
        "-//ietf//dtd html 2.0 strict//",
        "-//ietf//dtd html 2.0//",
        "-//ietf//dtd html 2.1e//",
        "-//ietf//dtd html 3.0//",
        "-//ietf//dtd html 3.2 final//",
        "-//ietf//dtd html 3.2//",
        "-//ietf//dtd html 3//",
        "-//ietf//dtd html level 0//",
        "-//ietf//dtd html level 1//",
        "-//ietf//dtd html level 2//",
        "-//ietf//dtd html level 3//",
        "-//ietf//dtd html strict level 0//",
        "-//ietf//dtd html strict level 1//",
        "-//ietf//dtd html strict level 2//",
        "-//ietf//dtd html strict level 3//",
        "-//ietf//dtd html strict//",
        "-//ietf//dtd html//",
        "-//metrius//dtd metrius presentational//",
        "-//microsoft//dtd internet explorer 2.0 html strict//",
        "-//microsoft//dtd internet explorer 2.0 html//",
        "-//microsoft//dtd internet explorer 2.0 tables//",
        "-//microsoft//dtd internet explorer 3.0 html strict//",
        "-//microsoft//dtd internet explorer 3.0 html//",
        "-//microsoft//dtd internet explorer 3.0 tables//",
        "-//netscape comm. corp.//dtd html//",
        "-//netscape comm. corp.//dtd strict html//",
        "-//o'reilly and associates//dtd html 2.0//",
        "-//o'reilly and associates//dtd html extended 1.0//",
        "-//o'reilly and associates//dtd html extended relaxed 1.0//",
        "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
        "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
        "-//spyglass//dtd html 2.0 extended//",
        "-//sq//dtd html 2.0 hotmetal + extensions//",
        "-//sun microsystems corp.//dtd hotjava html//",
        "-//sun microsystems corp.//dtd hotjava strict html//",
        "-//w3c//dtd html 3 1995-03-24//",
        "-//w3c//dtd html 3.2 draft//",
        "-//w3c//dtd html 3.2 final//",
        "-//w3c//dtd html 3.2//",
        "-//w3c//dtd html 3.2s draft//",
        "-//w3c//dtd html 4.0 frameset//",
        "-//w3c//dtd html 4.0 transitional//",
        "-//w3c//dtd html experimental 19960712//",
        "-//w3c//dtd html experimental 970421//",
        "-//w3c//dtd w3 html//",
        "-//w3o//dtd w3 html 3.0//",
        "-//webtechs//dtd mozilla html 2.0//",
        "-//webtechs//dtd mozilla html//",
    )

    # Lower-cased public identifiers that select quirks mode on exact match.
    _QUIRKS_PUBLIC_IDS = (
        "-//w3o//dtd w3 html strict 3.0//en//",
        "-/w3c/dtd html 4.0 transitional/en",
        "html",
    )

    # Quirks without a system identifier, limited quirks with one.
    _HTML401_LOOSE_PREFIXES = (
        "-//w3c//dtd html 4.01 frameset//",
        "-//w3c//dtd html 4.01 transitional//",
    )

    # Prefixes that always select limited quirks mode.
    _LIMITED_QUIRKS_PUBLIC_ID_PREFIXES = (
        "-//w3c//dtd xhtml 1.0 frameset//",
        "-//w3c//dtd xhtml 1.0 transitional//",
    )

    # System identifier (compared lower-cased) that selects quirks mode.
    _QUIRKS_SYSTEM_ID = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

    def processSpaceCharacters(self, token):
        """Ignore whitespace in this phase."""
        pass

    def processComment(self, token):
        """Attach the comment directly to the document."""
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        """Insert the doctype, set the compat mode and move to "beforeHtml"."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        # Anything other than <!DOCTYPE html> (optionally with the
        # about:legacy-compat system identifier) is reported.
        is_expected_doctype = (
            name == "html" and
            publicId is None and
            (systemId is None or systemId == "about:legacy-compat"))
        if not is_expected_doctype:
            self.parser.parseError("unknown-doctype")

        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        is_html401_loose = publicId.startswith(self._HTML401_LOOSE_PREFIXES)

        if (not correct or name != "html" or
                publicId.startswith(self._QUIRKS_PUBLIC_ID_PREFIXES) or
                publicId in self._QUIRKS_PUBLIC_IDS or
                (is_html401_loose and systemId is None) or
                (systemId and systemId.lower() == self._QUIRKS_SYSTEM_ID)):
            self.parser.compatMode = "quirks"
        elif (publicId.startswith(self._LIMITED_QUIRKS_PUBLIC_ID_PREFIXES) or
              (is_html401_loose and systemId is not None)):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        """Switch to quirks mode and to the "beforeHtml" phase."""
        parser = self.parser
        parser.compatMode = "quirks"
        parser.phase = parser.phases["beforeHtml"]

    def processCharacters(self, token):
        """Report the missing doctype and hand the token back for reprocessing."""
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        return token

    def processStartTag(self, token):
        """Report the missing doctype and hand the token back for reprocessing."""
        self.parser.parseError("expected-doctype-but-got-start-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEndTag(self, token):
        """Report the missing doctype and hand the token back for reprocessing."""
        self.parser.parseError("expected-doctype-but-got-end-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEOF(self):
        """Report the missing doctype and ask for EOF to be reprocessed."""
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True
class BeforeHtmlPhase(Phase):
    """Phase in effect until the root <html> element has been created."""
    __slots__ = tuple()

    # helper methods
    def insertHtmlElement(self):
        """Insert an implied <html> root and move to the "beforeHead" phase."""
        root_token = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(root_token)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        """Create the root element and ask for EOF to be reprocessed."""
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        """Attach the comment directly to the document."""
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        """Ignore whitespace in this phase."""
        pass

    def processCharacters(self, token):
        """Create the root element, then have the token reprocessed."""
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        """Create the root element, then have the token reprocessed."""
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        """Imply the root for head/body/html/br end tags; report any other."""
        tag_name = token["name"]
        if tag_name in ("head", "body", "html", "br"):
            self.insertHtmlElement()
            return token
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": tag_name})
class BeforeHeadPhase(Phase):
    """Phase in effect after the root element and before <head>."""
    __slots__ = tuple()

    def processEOF(self):
        """Imply a <head> element and ask for EOF to be reprocessed."""
        implied_head = impliedTagToken("head", "StartTag")
        self.startTagHead(implied_head)
        return True

    def processSpaceCharacters(self, token):
        """Ignore whitespace in this phase."""
        pass

    def processCharacters(self, token):
        """Imply a <head> element, then have the token reprocessed."""
        implied_head = impliedTagToken("head", "StartTag")
        self.startTagHead(implied_head)
        return token

    def startTagHtml(self, token):
        """Delegate <html> start tags to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        """Insert <head>, remember it as the head pointer, switch to "inHead"."""
        tree = self.tree
        tree.insertElement(token)
        tree.headPointer = tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        """Imply a <head> element, then have the token reprocessed."""
        implied_head = impliedTagToken("head", "StartTag")
        self.startTagHead(implied_head)
        return token

    def endTagImplyHead(self, token):
        """Imply a <head> element, then have the end tag reprocessed."""
        implied_head = impliedTagToken("head", "StartTag")
        self.startTagHead(implied_head)
        return token

    def endTagOther(self, token):
        """Report an end tag that is not allowed here."""
        self.parser.parseError("end-tag-after-implied-root",
                               {"name": token["name"]})

    # Tag name -> handler tables; unlisted names go to the default handler.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("head", startTagHead),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("head", "body", "html", "br"), endTagImplyHead),
    ])
    endTagHandler.default = endTagOther
class InHeadPhase(Phase):
    """Phase in effect while the <head> element is open."""
    __slots__ = tuple()

    # the real thing
    def processEOF(self):
        """Close the head and ask for EOF to be reprocessed."""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Close the head, then have the characters reprocessed."""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Delegate <html> start tags to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        """Report a second <head> start tag."""
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        """Insert the element, pop it straight away and acknowledge self-closing."""
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        """Insert a <meta> element and, while the stream's encoding is still
        tentative, ask the stream to switch to the encoding it declares."""
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        stream = self.parser.tokenizer.stream
        if stream.charEncoding[1] != "tentative":
            return

        attributes = token["data"]
        if "charset" in attributes:
            stream.changeEncoding(attributes["charset"])
        elif ("content" in attributes and
              "http-equiv" in attributes and
              attributes["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            content_bytes = attributes["content"].encode("utf-8")
            data = _inputstream.EncodingBytes(content_bytes)
            codec = _inputstream.ContentAttrParser(data).parse()
            stream.changeEncoding(codec)

    def startTagTitle(self, token):
        """Parse <title> content as RCDATA."""
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        """Parse <noframes>/<style> content as RAWTEXT."""
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        """Treat <noscript> as RAWTEXT when scripting is on; otherwise insert
        it and switch to the "inHeadNoscript" phase."""
        parser = self.parser
        if parser.scripting:
            parser.parseRCDataRawtext(token, "RAWTEXT")
            return
        self.tree.insertElement(token)
        parser.phase = parser.phases["inHeadNoscript"]

    def startTagScript(self, token):
        """Insert <script>, switch the tokenizer to script data and the
        parser to the "text" phase, remembering the current phase."""
        parser = self.parser
        self.tree.insertElement(token)
        parser.tokenizer.state = parser.tokenizer.scriptDataState
        parser.originalPhase = parser.phase
        parser.phase = parser.phases["text"]

    def startTagOther(self, token):
        """Close the head, then have the start tag reprocessed."""
        self.anythingElse()
        return token

    def endTagHead(self, token):
        """Pop the <head> element and move to the "afterHead" phase."""
        node = self.parser.tree.openElements.pop()
        assert node.name == "head", "Expected head got %s" % node.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        """Close the head, then have the end tag reprocessed."""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report an end tag that is not allowed here."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a </head> end tag had been seen."""
        self.endTagHead(impliedTagToken("head"))

    # Tag name -> handler tables; unlisted names go to the default handler.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("title", startTagTitle),
        (("noframes", "style"), startTagNoFramesStyle),
        ("noscript", startTagNoscript),
        ("script", startTagScript),
        (("base", "basefont", "bgsound", "command", "link"),
         startTagBaseLinkCommand),
        ("meta", startTagMeta),
        ("head", startTagHead),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("head", endTagHead),
        (("br", "html", "body"), endTagHtmlBodyBr),
    ])
    endTagHandler.default = endTagOther
class InHeadNoscriptPhase(Phase):
    """Phase in effect inside a <noscript> element within <head>."""
    __slots__ = tuple()

    def processEOF(self):
        """Report the error, close <noscript> and ask for EOF reprocessing."""
        self.parser.parseError("eof-in-head-noscript")
        self.anythingElse()
        return True

    def processComment(self, token):
        """Delegate comments to the "inHead" phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processComment(token)

    def processCharacters(self, token):
        """Report the error, close <noscript> and reprocess the characters."""
        self.parser.parseError("char-in-head-noscript")
        self.anythingElse()
        return token

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inHead" phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processSpaceCharacters(token)

    def startTagHtml(self, token):
        """Delegate <html> start tags to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        """Delegate head-level start tags to the "inHead" phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagHeadNoscript(self, token):
        """Report a nested <head> or <noscript> start tag."""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Report the error, close <noscript> and reprocess the start tag."""
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagNoscript(self, token):
        """Pop the <noscript> element and return to the "inHead" phase."""
        node = self.parser.tree.openElements.pop()
        assert node.name == "noscript", "Expected noscript got %s" % node.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        """Report the error, close <noscript> and reprocess the end tag."""
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report an end tag that is not allowed here."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a </noscript> end tag had been seen."""
        # Caller must raise parse error first!
        self.endTagNoscript(impliedTagToken("noscript"))

    # Tag name -> handler tables; unlisted names go to the default handler.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
        (("head", "noscript"), startTagHeadNoscript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("noscript", endTagNoscript),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther
class AfterHeadPhase(Phase):
    """Phase in effect after </head> and before <body> or <frameset>."""
    __slots__ = tuple()

    def processEOF(self):
        """Imply a <body> element and ask for EOF to be reprocessed."""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Imply a <body> element, then have the characters reprocessed."""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Delegate <html> start tags to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBody(self, token):
        """Insert an explicit <body>, clear framesetOK, switch to "inBody"."""
        parser = self.parser
        parser.framesetOK = False
        self.tree.insertElement(token)
        parser.phase = parser.phases["inBody"]

    def startTagFrameset(self, token):
        """Insert <frameset> and switch to the "inFrameset" phase."""
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        """Handle a head-only element that appears after </head>.

        The head element is temporarily pushed back onto the stack of open
        elements, the token is processed by the "inHead" phase, and the
        innermost element named "head" is then removed from the stack.
        """
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        self.tree.openElements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        for candidate in reversed(self.tree.openElements):
            if candidate.name == "head":
                self.tree.openElements.remove(candidate)
                break

    def startTagHead(self, token):
        """Report a <head> start tag seen after the head was closed."""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Imply a <body> element, then have the start tag reprocessed."""
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        """Imply a <body> element, then have the end tag reprocessed."""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report an end tag that is not allowed here."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Insert an implied <body>, switch to "inBody", set framesetOK."""
        parser = self.parser
        self.tree.insertElement(impliedTagToken("body", "StartTag"))
        parser.phase = parser.phases["inBody"]
        parser.framesetOK = True

    # Tag name -> handler tables; unlisted names go to the default handler.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"),
         startTagFromHead),
        ("head", startTagHead),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("body", "html", "br"), endTagHtmlBodyBr),
    ])
    endTagHandler.default = endTagOther
class InBodyPhase(Phase):
    # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
    # the really-really-really-very crazy mode
    #
    # ``processSpaceCharacters`` is an instance slot, not a method, so that
    # the handlers below can swap it between the "drop a leading newline"
    # variant and the plain variant while parsing.
    __slots__ = ("processSpaceCharacters",)

    def __init__(self, *args, **kwargs):
        super(InBodyPhase, self).__init__(*args, **kwargs)
        # Set this to the default handler
        self.processSpaceCharacters = self.processSpaceCharactersNonPre

    def isMatchingFormattingElement(self, node1, node2):
        """Return True when both nodes have equal name, namespace and attributes."""
        return (node1.name == node2.name and
                node1.namespace == node2.namespace and
                node1.attributes == node2.attributes)

    # helper
    def addFormattingElement(self, token):
        """Insert an element for ``token`` and add it to the active formatting list.

        Entries after the last ``Marker`` that match the new element are
        collected first; when three of them already exist, one is removed
        before the new element is appended.
        """
        self.tree.insertElement(token)
        element = self.tree.openElements[-1]

        matchingElements = []
        for node in self.tree.activeFormattingElements[::-1]:
            if node is Marker:
                break
            elif self.isMatchingFormattingElement(node, element):
                matchingElements.append(node)

        assert len(matchingElements) <= 3
        if len(matchingElements) == 3:
            # The list was walked back to front, so [-1] is the match that
            # sits earliest in activeFormattingElements.
            self.tree.activeFormattingElements.remove(matchingElements[-1])
        self.tree.activeFormattingElements.append(element)

    # the real deal
    def processEOF(self):
        """Report a parse error if an element that requires a closing tag is still open."""
        allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                                      "tfoot", "th", "thead", "tr", "body",
                                      "html"))
        # The stack is only read here, so iterate it in place (no copy).
        for node in reversed(self.tree.openElements):
            if node.name not in allowed_elements:
                self.parser.parseError("expected-closing-tag-but-got-eof")
                break
        # Stop parsing

    def processSpaceCharactersDropNewline(self, token):
        """Insert whitespace, dropping one leading newline in an empty pre/listing/textarea.

        The handler always resets ``processSpaceCharacters`` to the plain
        variant, so it only applies to the first whitespace token.
        """
        # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
        # want to drop leading newlines
        data = token["data"]
        self.processSpaceCharacters = self.processSpaceCharactersNonPre
        if (data.startswith("\n") and
                self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
                not self.tree.openElements[-1].hasContent()):
            data = data[1:]
        if data:
            self.tree.reconstructActiveFormattingElements()
            self.tree.insertText(data)

    def processCharacters(self, token):
        """Insert character data; any non-space character clears ``framesetOK``."""
        if token["data"] == "\u0000":
            # The tokenizer should always emit null on its own
            return
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertText(token["data"])
        # This must be bad for performance
        if (self.parser.framesetOK and
                any(char not in spaceCharacters
                    for char in token["data"])):
            self.parser.framesetOK = False

    def processSpaceCharactersNonPre(self, token):
        """Insert whitespace as-is after reconstructing the active formatting elements."""
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertText(token["data"])

    def startTagProcessInHead(self, token):
        """Delegate the start tag to the "inHead" phase and return its result."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagBody(self, token):
        """Handle a stray ``<body>``: parse error, then merge new attributes.

        Attributes from the token are copied onto the element at index 1 of
        the open-element stack only when that element does not already have
        them.
        """
        self.parser.parseError("unexpected-start-tag", {"name": "body"})
        if (len(self.tree.openElements) == 1 or
                self.tree.openElements[1].name != "body"):
            assert self.parser.innerHTML
        else:
            self.parser.framesetOK = False
            for attr, value in token["data"].items():
                if attr not in self.tree.openElements[1].attributes:
                    self.tree.openElements[1].attributes[attr] = value

    def startTagFrameset(self, token):
        """Handle ``<frameset>``: parse error; replace the body when ``framesetOK`` is set."""
        self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
        if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
            assert self.parser.innerHTML
        elif not self.parser.framesetOK:
            pass
        else:
            # Detach the body element, unwind the stack to <html>, then
            # insert the frameset and switch phase.
            if self.tree.openElements[1].parent:
                self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
            while self.tree.openElements[-1].name != "html":
                self.tree.openElements.pop()
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inFrameset"]

    def startTagCloseP(self, token):
        """Close an open ``p`` in button scope, then insert the element."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.insertElement(token)

    def startTagPreListing(self, token):
        """Insert ``pre``/``listing`` and arm the leading-newline-dropping handler."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        self.processSpaceCharacters = self.processSpaceCharactersDropNewline

    def startTagForm(self, token):
        """Insert a ``form`` and record it as the form pointer, unless one is already set."""
        if self.tree.formPointer:
            self.parser.parseError("unexpected-start-tag", {"name": "form"})
        else:
            if self.tree.elementInScope("p", variant="button"):
                self.endTagP(impliedTagToken("p"))
            self.tree.insertElement(token)
            self.tree.formPointer = self.tree.openElements[-1]

    def startTagListItem(self, token):
        """Handle ``li``/``dd``/``dt``: close a previous open item, then insert."""
        self.parser.framesetOK = False

        stopNamesMap = {"li": ["li"],
                        "dt": ["dt", "dd"],
                        "dd": ["dt", "dd"]}
        stopNames = stopNamesMap[token["name"]]
        for node in reversed(self.tree.openElements):
            if node.name in stopNames:
                self.parser.phase.processEndTag(
                    impliedTagToken(node.name, "EndTag"))
                break
            # Stop at any special element other than address, div and p
            if (node.nameTuple in specialElements and
                    node.name not in ("address", "div", "p")):
                break

        if self.tree.elementInScope("p", variant="button"):
            self.parser.phase.processEndTag(
                impliedTagToken("p", "EndTag"))

        self.tree.insertElement(token)

    def startTagPlaintext(self, token):
        """Insert ``plaintext`` and switch the tokenizer to its plaintext state."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.insertElement(token)
        self.parser.tokenizer.state = self.parser.tokenizer.plaintextState

    def startTagHeading(self, token):
        """Insert a heading; a heading that is the current node is popped with a parse error."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        if self.tree.openElements[-1].name in headingElements:
            self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
            self.tree.openElements.pop()
        self.tree.insertElement(token)

    def startTagA(self, token):
        """Insert ``a``, first closing any ``a`` still in the active formatting list."""
        afeAElement = self.tree.elementInActiveFormattingElements("a")
        if afeAElement:
            self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                   {"startName": "a", "endName": "a"})
            self.endTagFormatting(impliedTagToken("a"))
            # The implied end tag may have left the old element in either
            # list; remove it from whichever still holds it.
            if afeAElement in self.tree.openElements:
                self.tree.openElements.remove(afeAElement)
            if afeAElement in self.tree.activeFormattingElements:
                self.tree.activeFormattingElements.remove(afeAElement)
        self.tree.reconstructActiveFormattingElements()
        self.addFormattingElement(token)

    def startTagFormatting(self, token):
        """Insert a formatting element and add it to the active formatting list."""
        self.tree.reconstructActiveFormattingElements()
        self.addFormattingElement(token)

    def startTagNobr(self, token):
        """Insert ``nobr``, closing one that is already in scope (parse error)."""
        self.tree.reconstructActiveFormattingElements()
        if self.tree.elementInScope("nobr"):
            self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                   {"startName": "nobr", "endName": "nobr"})
            self.processEndTag(impliedTagToken("nobr"))
            # XXX Need tests that trigger the following
            self.tree.reconstructActiveFormattingElements()
        self.addFormattingElement(token)

    def startTagButton(self, token):
        """Insert ``button``; if one is in scope, close it and return the token.

        :returns: ``token`` when an open button had to be closed first,
            otherwise None
        """
        if self.tree.elementInScope("button"):
            self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                   {"startName": "button", "endName": "button"})
            self.processEndTag(impliedTagToken("button"))
            return token
        else:
            self.tree.reconstructActiveFormattingElements()
            self.tree.insertElement(token)
            self.parser.framesetOK = False

    def startTagAppletMarqueeObject(self, token):
        """Insert the element and push a ``Marker`` onto the active formatting list."""
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.tree.activeFormattingElements.append(Marker)
        self.parser.framesetOK = False

    def startTagXmp(self, token):
        """Close an open ``p``, then parse ``xmp`` content as RAWTEXT."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.reconstructActiveFormattingElements()
        self.parser.framesetOK = False
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagTable(self, token):
        """Insert ``table`` and switch to the "inTable" phase.

        An open ``p`` in button scope is closed first unless the parser's
        ``compatMode`` is "quirks".
        """
        if self.parser.compatMode != "quirks":
            if self.tree.elementInScope("p", variant="button"):
                self.processEndTag(impliedTagToken("p"))
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        self.parser.phase = self.parser.phases["inTable"]

    def startTagVoidFormatting(self, token):
        """Insert a void element, pop it at once, and acknowledge the self-closing flag."""
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True
        self.parser.framesetOK = False

    def startTagInput(self, token):
        """Insert ``input``; ``type=hidden`` leaves ``framesetOK`` as it was."""
        framesetOK = self.parser.framesetOK
        self.startTagVoidFormatting(token)
        if ("type" in token["data"] and
                token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
            # input type=hidden doesn't change framesetOK
            self.parser.framesetOK = framesetOK

    def startTagParamSource(self, token):
        """Insert the element, pop it at once, and acknowledge the self-closing flag."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagHr(self, token):
        """Close an open ``p``, then insert ``hr`` as a void element."""
        if self.tree.elementInScope("p", variant="button"):
            self.endTagP(impliedTagToken("p"))
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True
        self.parser.framesetOK = False

    def startTagImage(self, token):
        """Treat ``<image>`` as ``<img>`` with the same attributes (parse error)."""
        # No really...
        self.parser.parseError("unexpected-start-tag-treated-as",
                               {"originalName": "image", "newName": "img"})
        self.processStartTag(impliedTagToken("img", "StartTag",
                                             attributes=token["data"],
                                             selfClosing=token["selfClosing"]))

    def startTagIsIndex(self, token):
        """Expand ``<isindex>`` into form / hr / label / input / hr markup.

        Does nothing beyond the parse error when a form pointer is already
        set. The token's "action" attribute goes onto the form and "prompt"
        becomes the label text; every other attribute is passed on to the
        generated ``input``, whose name is forced to "isindex".
        """
        self.parser.parseError("deprecated-tag", {"name": "isindex"})
        if self.tree.formPointer:
            return
        form_attrs = {}
        if "action" in token["data"]:
            form_attrs["action"] = token["data"]["action"]
        self.processStartTag(impliedTagToken("form", "StartTag",
                                             attributes=form_attrs))
        self.processStartTag(impliedTagToken("hr", "StartTag"))
        self.processStartTag(impliedTagToken("label", "StartTag"))
        # XXX Localization ...
        prompt = token["data"].get(
            "prompt", "This is a searchable index. Enter search keywords: ")
        self.processCharacters(
            {"type": tokenTypes["Characters"], "data": prompt})
        # Work on a copy so the original token's attributes stay untouched
        attributes = token["data"].copy()
        attributes.pop("action", None)
        attributes.pop("prompt", None)
        attributes["name"] = "isindex"
        self.processStartTag(impliedTagToken("input", "StartTag",
                                             attributes=attributes,
                                             selfClosing=token["selfClosing"]))
        self.processEndTag(impliedTagToken("label"))
        self.processStartTag(impliedTagToken("hr", "StartTag"))
        self.processEndTag(impliedTagToken("form"))

    def startTagTextarea(self, token):
        """Insert ``textarea``, switch the tokenizer to RCDATA, and arm newline dropping."""
        self.tree.insertElement(token)
        self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
        self.processSpaceCharacters = self.processSpaceCharactersDropNewline
        self.parser.framesetOK = False

    def startTagIFrame(self, token):
        """Clear ``framesetOK`` and parse ``iframe`` content as RAWTEXT."""
        self.parser.framesetOK = False
        self.startTagRawtext(token)

    def startTagNoscript(self, token):
        """Parse ``noscript`` as RAWTEXT when scripting is on, else as an ordinary element."""
        if self.parser.scripting:
            self.startTagRawtext(token)
        else:
            self.startTagOther(token)

    def startTagRawtext(self, token):
        """iframe, noembed noframes, noscript(if scripting enabled)"""
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagOpt(self, token):
        """Insert ``option``/``optgroup``, closing an ``option`` that is the current node."""
        if self.tree.openElements[-1].name == "option":
            self.parser.phase.processEndTag(impliedTagToken("option"))
        self.tree.reconstructActiveFormattingElements()
        self.parser.tree.insertElement(token)

    def startTagSelect(self, token):
        """Insert ``select`` and switch to "inSelect" or "inSelectInTable".

        "inSelectInTable" is chosen when the parser's current phase is one
        of the table-related phases listed below.
        """
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        if self.parser.phase in (self.parser.phases["inTable"],
                                 self.parser.phases["inCaption"],
                                 self.parser.phases["inColumnGroup"],
                                 self.parser.phases["inTableBody"],
                                 self.parser.phases["inRow"],
                                 self.parser.phases["inCell"]):
            self.parser.phase = self.parser.phases["inSelectInTable"]
        else:
            self.parser.phase = self.parser.phases["inSelect"]

    def startTagRpRt(self, token):
        """Insert ``rp``/``rt``; inside ``ruby``, implied end tags are generated first."""
        if self.tree.elementInScope("ruby"):
            self.tree.generateImpliedEndTags()
            if self.tree.openElements[-1].name != "ruby":
                self.parser.parseError()
        self.tree.insertElement(token)

    def startTagMath(self, token):
        """Insert ``math`` in the MathML namespace after adjusting its attributes."""
        self.tree.reconstructActiveFormattingElements()
        self.parser.adjustMathMLAttributes(token)
        self.parser.adjustForeignAttributes(token)
        token["namespace"] = namespaces["mathml"]
        self.tree.insertElement(token)
        # Need to get the parse error right for the case where the token
        # has a namespace not equal to the xmlns attribute
        if token["selfClosing"]:
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

    def startTagSvg(self, token):
        """Insert ``svg`` in the SVG namespace after adjusting its attributes."""
        self.tree.reconstructActiveFormattingElements()
        self.parser.adjustSVGAttributes(token)
        self.parser.adjustForeignAttributes(token)
        token["namespace"] = namespaces["svg"]
        self.tree.insertElement(token)
        # Need to get the parse error right for the case where the token
        # has a namespace not equal to the xmlns attribute
        if token["selfClosing"]:
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

    def startTagMisplaced(self, token):
        """Report and ignore a start tag that belongs to another insertion mode.

        Dispatched for "caption", "col", "colgroup", "frame", "head",
        "tbody", "td", "tfoot", "th", "thead" and "tr".
        """
        self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})

    def startTagOther(self, token):
        """Insert any other element after reconstructing the active formatting elements."""
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)

    def endTagP(self, token):
        """Close the open ``p``; with none in button scope, create an empty one first."""
        if not self.tree.elementInScope("p", variant="button"):
            # No <p> to close: insert an implied one, report the error, then
            # run again so the else branch closes it.
            self.startTagCloseP(impliedTagToken("p", "StartTag"))
            self.parser.parseError("unexpected-end-tag", {"name": "p"})
            self.endTagP(impliedTagToken("p", "EndTag"))
        else:
            self.tree.generateImpliedEndTags("p")
            if self.tree.openElements[-1].name != "p":
                self.parser.parseError("unexpected-end-tag", {"name": "p"})
            node = self.tree.openElements.pop()
            while node.name != "p":
                node = self.tree.openElements.pop()

    def endTagBody(self, token):
        """Handle ``</body>``: switch to "afterBody" when a body is in scope.

        A parse error is reported for the first open element, from index 2
        of the stack onwards, whose end tag may not be omitted.
        """
        if not self.tree.elementInScope("body"):
            self.parser.parseError()
            return
        elif self.tree.openElements[-1].name != "body":
            # Built once here rather than on every loop iteration
            allowedOpen = frozenset(("dd", "dt", "li", "optgroup",
                                     "option", "p", "rp", "rt",
                                     "tbody", "td", "tfoot",
                                     "th", "thead", "tr", "body",
                                     "html"))
            for node in self.tree.openElements[2:]:
                if node.name not in allowedOpen:
                    # Not sure this is the correct name for the parse error
                    self.parser.parseError(
                        "expected-one-end-tag-but-got-another",
                        {"gotName": "body", "expectedName": node.name})
                    break
        self.parser.phase = self.parser.phases["afterBody"]

    def endTagHtml(self, token):
        """Handle ``</html>`` as an implied ``</body>``, then return the token.

        :returns: ``token`` when a body is in scope, otherwise None
        """
        # We repeat the test for the body end tag token being ignored here
        if self.tree.elementInScope("body"):
            self.endTagBody(impliedTagToken("body"))
            return token

    def endTagBlock(self, token):
        """Close a block-level element, popping the stack up to and including it."""
        # Put us back in the right whitespace handling mode
        if token["name"] == "pre":
            self.processSpaceCharacters = self.processSpaceCharactersNonPre
        inScope = self.tree.elementInScope(token["name"])
        if inScope:
            self.tree.generateImpliedEndTags()
        if self.tree.openElements[-1].name != token["name"]:
            self.parser.parseError("end-tag-too-early", {"name": token["name"]})
        if inScope:
            node = self.tree.openElements.pop()
            while node.name != token["name"]:
                node = self.tree.openElements.pop()

    def endTagForm(self, token):
        """Handle ``</form>``: clear the form pointer and drop that form from the stack."""
        node = self.tree.formPointer
        self.tree.formPointer = None
        if node is None or not self.tree.elementInScope(node):
            self.parser.parseError("unexpected-end-tag",
                                   {"name": "form"})
        else:
            self.tree.generateImpliedEndTags()
            if self.tree.openElements[-1] != node:
                self.parser.parseError("end-tag-too-early-ignored",
                                       {"name": "form"})
            # Only the form itself is removed; elements above it stay open
            self.tree.openElements.remove(node)

    def endTagListItem(self, token):
        """Close ``li``/``dd``/``dt``; ``li`` is looked up with the "list" scope variant."""
        if token["name"] == "li":
            variant = "list"
        else:
            variant = None
        if not self.tree.elementInScope(token["name"], variant=variant):
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
        else:
            self.tree.generateImpliedEndTags(exclude=token["name"])
            if self.tree.openElements[-1].name != token["name"]:
                self.parser.parseError(
                    "end-tag-too-early",
                    {"name": token["name"]})
            node = self.tree.openElements.pop()
            while node.name != token["name"]:
                node = self.tree.openElements.pop()

    def endTagHeading(self, token):
        """Close a heading: pop the stack until any heading element has been popped."""
        for item in headingElements:
            if self.tree.elementInScope(item):
                self.tree.generateImpliedEndTags()
                break
        if self.tree.openElements[-1].name != token["name"]:
            self.parser.parseError("end-tag-too-early", {"name": token["name"]})

        for item in headingElements:
            if self.tree.elementInScope(item):
                # ``item`` is reused: from here on it holds the popped node
                item = self.tree.openElements.pop()
                while item.name not in headingElements:
                    item = self.tree.openElements.pop()
                break

    def endTagFormatting(self, token):
        """The much-feared adoption agency algorithm"""
        # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
        # XXX Better parseError messages appreciated.

        # Step 1
        outerLoopCounter = 0

        # Step 2
        while outerLoopCounter < 8:

            # Step 3
            outerLoopCounter += 1

            # Step 4:

            # Let the formatting element be the last element in
            # the list of active formatting elements that:
            # - is between the end of the list and the last scope
            # marker in the list, if any, or the start of the list
            # otherwise, and
            # - has the same tag name as the token.
            formattingElement = self.tree.elementInActiveFormattingElements(
                token["name"])
            if (not formattingElement or
                (formattingElement in self.tree.openElements and
                 not self.tree.elementInScope(formattingElement.name))):
                # If there is no such node, then abort these steps
                # and instead act as described in the "any other
                # end tag" entry below.
                self.endTagOther(token)
                return

            # Otherwise, if there is such a node, but that node is
            # not in the stack of open elements, then this is a
            # parse error; remove the element from the list, and
            # abort these steps.
            elif formattingElement not in self.tree.openElements:
                self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
                self.tree.activeFormattingElements.remove(formattingElement)
                return

            # Otherwise, if there is such a node, and that node is
            # also in the stack of open elements, but the element
            # is not in scope, then this is a parse error; ignore
            # the token, and abort these steps.
            # NOTE(review): the first branch above already returns when the
            # element is on the stack but out of scope, so this branch
            # looks unreachable; confirm before removing it.
            elif not self.tree.elementInScope(formattingElement.name):
                self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
                return

            # Otherwise, there is a formatting element and that
            # element is in the stack and is in scope. If the
            # element is not the current node, this is a parse
            # error. In any case, proceed with the algorithm as
            # written in the following steps.
            else:
                if formattingElement != self.tree.openElements[-1]:
                    self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

            # Step 5:

            # Let the furthest block be the topmost node in the
            # stack of open elements that is lower in the stack
            # than the formatting element, and is an element in
            # the special category. There might not be one.
            afeIndex = self.tree.openElements.index(formattingElement)
            furthestBlock = None
            for element in self.tree.openElements[afeIndex:]:
                if element.nameTuple in specialElements:
                    furthestBlock = element
                    break

            # Step 6:

            # If there is no furthest block, then the UA must
            # first pop all the nodes from the bottom of the stack
            # of open elements, from the current node up to and
            # including the formatting element, then remove the
            # formatting element from the list of active
            # formatting elements, and finally abort these steps.
            if furthestBlock is None:
                element = self.tree.openElements.pop()
                while element != formattingElement:
                    element = self.tree.openElements.pop()
                self.tree.activeFormattingElements.remove(element)
                return

            # Step 7
            commonAncestor = self.tree.openElements[afeIndex - 1]

            # Step 8:
            # The bookmark is supposed to help us identify where to reinsert
            # nodes in step 15. We have to ensure that we reinsert nodes after
            # the node before the active formatting element. Note the bookmark
            # can move in step 9.7
            bookmark = self.tree.activeFormattingElements.index(formattingElement)

            # Step 9
            lastNode = node = furthestBlock
            innerLoopCounter = 0

            index = self.tree.openElements.index(node)
            while innerLoopCounter < 3:
                innerLoopCounter += 1
                # Node is element before node in open elements
                index -= 1
                node = self.tree.openElements[index]
                if node not in self.tree.activeFormattingElements:
                    self.tree.openElements.remove(node)
                    continue
                # Step 9.6
                if node == formattingElement:
                    break
                # Step 9.7
                if lastNode == furthestBlock:
                    bookmark = self.tree.activeFormattingElements.index(node) + 1
                # Step 9.8
                clone = node.cloneNode()
                # Replace node with clone
                self.tree.activeFormattingElements[
                    self.tree.activeFormattingElements.index(node)] = clone
                self.tree.openElements[
                    self.tree.openElements.index(node)] = clone
                node = clone
                # Step 9.9
                # Remove lastNode from its parents, if any
                if lastNode.parent:
                    lastNode.parent.removeChild(lastNode)
                node.appendChild(lastNode)
                # Step 9.10
                lastNode = node

            # Step 10
            # Foster parent lastNode if commonAncestor is a
            # table, tbody, tfoot, thead, or tr we need to foster
            # parent the lastNode
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)

            if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
                parent, insertBefore = self.tree.getTableMisnestedNodePosition()
                parent.insertBefore(lastNode, insertBefore)
            else:
                commonAncestor.appendChild(lastNode)

            # Step 11
            clone = formattingElement.cloneNode()

            # Step 12
            furthestBlock.reparentChildren(clone)

            # Step 13
            furthestBlock.appendChild(clone)

            # Step 14
            self.tree.activeFormattingElements.remove(formattingElement)
            self.tree.activeFormattingElements.insert(bookmark, clone)

            # Step 15
            self.tree.openElements.remove(formattingElement)
            self.tree.openElements.insert(
                self.tree.openElements.index(furthestBlock) + 1, clone)

    def endTagAppletMarqueeObject(self, token):
        """Close ``applet``/``marquee``/``object`` and clear the active formatting list."""
        if self.tree.elementInScope(token["name"]):
            self.tree.generateImpliedEndTags()
        if self.tree.openElements[-1].name != token["name"]:
            self.parser.parseError("end-tag-too-early", {"name": token["name"]})

        if self.tree.elementInScope(token["name"]):
            element = self.tree.openElements.pop()
            while element.name != token["name"]:
                element = self.tree.openElements.pop()
            self.tree.clearActiveFormattingElements()

    def endTagBr(self, token):
        """Treat ``</br>`` as a ``<br>`` start tag without attributes (parse error)."""
        self.parser.parseError("unexpected-end-tag-treated-as",
                               {"originalName": "br", "newName": "br element"})
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(impliedTagToken("br", "StartTag"))
        self.tree.openElements.pop()

    def endTagOther(self, token):
        """Close the nearest open element with the token's name.

        The stack is walked from the current node downwards; reaching a
        special element before a match is a parse error and the token is
        ignored.
        """
        # Iterate over a copy: the matching branch pops from the real stack
        for node in self.tree.openElements[::-1]:
            if node.name == token["name"]:
                self.tree.generateImpliedEndTags(exclude=token["name"])
                if self.tree.openElements[-1].name != token["name"]:
                    self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
                while self.tree.openElements.pop() != node:
                    pass
                break
            else:
                if node.nameTuple in specialElements:
                    self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
                    break

    # Start-tag routing table; names without an entry use startTagOther
    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("base", "basefont", "bgsound", "command", "link", "meta",
          "script", "style", "title"),
         startTagProcessInHead),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("address", "article", "aside", "blockquote", "center", "details",
          "dir", "div", "dl", "fieldset", "figcaption", "figure",
          "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
          "section", "summary", "ul"),
         startTagCloseP),
        (headingElements, startTagHeading),
        (("pre", "listing"), startTagPreListing),
        ("form", startTagForm),
        (("li", "dd", "dt"), startTagListItem),
        ("plaintext", startTagPlaintext),
        ("a", startTagA),
        (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
          "strong", "tt", "u"), startTagFormatting),
        ("nobr", startTagNobr),
        ("button", startTagButton),
        (("applet", "marquee", "object"), startTagAppletMarqueeObject),
        ("xmp", startTagXmp),
        ("table", startTagTable),
        (("area", "br", "embed", "img", "keygen", "wbr"),
         startTagVoidFormatting),
        (("param", "source", "track"), startTagParamSource),
        ("input", startTagInput),
        ("hr", startTagHr),
        ("image", startTagImage),
        ("isindex", startTagIsIndex),
        ("textarea", startTagTextarea),
        ("iframe", startTagIFrame),
        ("noscript", startTagNoscript),
        (("noembed", "noframes"), startTagRawtext),
        ("select", startTagSelect),
        (("rp", "rt"), startTagRpRt),
        (("option", "optgroup"), startTagOpt),
        (("math"), startTagMath),
        (("svg"), startTagSvg),
        (("caption", "col", "colgroup", "frame", "head",
          "tbody", "td", "tfoot", "th", "thead",
          "tr"), startTagMisplaced)
    ])
    startTagHandler.default = startTagOther

    # End-tag routing table; names without an entry use endTagOther
    endTagHandler = _utils.MethodDispatcher([
        ("body", endTagBody),
        ("html", endTagHtml),
        (("address", "article", "aside", "blockquote", "button", "center",
          "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
          "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
          "section", "summary", "ul"), endTagBlock),
        ("form", endTagForm),
        ("p", endTagP),
        (("dd", "dt", "li"), endTagListItem),
        (headingElements, endTagHeading),
        (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
          "strike", "strong", "tt", "u"), endTagFormatting),
        (("applet", "marquee", "object"), endTagAppletMarqueeObject),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther
class TextPhase(Phase):
    # Handlers used while the content of an RCDATA/RAWTEXT element is read
    __slots__ = ()

    def processCharacters(self, token):
        """Append the token's character data to the tree."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Report the unclosed element, pop it, and restore the parser's original phase.

        :returns: True
        """
        unclosedName = self.tree.openElements[-1].name
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosedName})
        self.tree.openElements.pop()
        parser = self.parser
        parser.phase = parser.originalPhase
        return True

    def startTagOther(self, token):
        """Fail an assertion: start tags are not expected in this phase."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Pop the ``script`` element and restore the parser's original phase."""
        closed = self.tree.openElements.pop()
        assert closed.name == "script"
        parser = self.parser
        parser.phase = parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Pop the current element and restore the parser's original phase."""
        self.tree.openElements.pop()
        parser = self.parser
        parser.phase = parser.originalPhase

    # No start tag has a dedicated handler; every one goes to startTagOther
    startTagHandler = _utils.MethodDispatcher([])
    startTagHandler.default = startTagOther
    # Only "script" has a dedicated end-tag handler
    endTagHandler = _utils.MethodDispatcher([("script", endTagScript)])
    endTagHandler.default = endTagOther
class InTablePhase(Phase):
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table
    __slots__ = tuple()

    # helper methods
    def clearStackToTableContext(self):
        """Pop open elements until the current node is ``table`` or ``html``."""
        # "clear the stack back to a table context"
        while self.tree.openElements[-1].name not in ("table", "html"):
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name":  self.tree.openElements[-1].name})
            self.tree.openElements.pop()
        # When the current node is <html> it's an innerHTML case

    # processing methods
    def processEOF(self):
        """Report "eof-in-table" unless the current node is ``html``."""
        if self.tree.openElements[-1].name != "html":
            self.parser.parseError("eof-in-table")
        else:
            assert self.parser.innerHTML
        # Stop parsing

    def processSpaceCharacters(self, token):
        """Switch to the "inTableText" phase and let it process the whitespace token."""
        originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["inTableText"]
        # Remember the phase to restore once the text run ends
        self.parser.phase.originalPhase = originalPhase
        self.parser.phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Switch to the "inTableText" phase and let it process the character token."""
        originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["inTableText"]
        # Remember the phase to restore once the text run ends
        self.parser.phase.originalPhase = originalPhase
        self.parser.phase.processCharacters(token)

    def insertText(self, token):
        """Insert character data through the "inBody" phase with ``insertFromTable`` set.

        The flag is reset to False even when the in-body handler raises,
        so a failed insert cannot leave the flag set on the tree.
        """
        # If we get here there must be at least one non-whitespace character
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            self.parser.phases["inBody"].processCharacters(token)
        finally:
            self.tree.insertFromTable = False

    def startTagCaption(self, token):
        """Insert ``caption`` after a ``Marker`` and switch to the "inCaption" phase."""
        self.clearStackToTableContext()
        self.tree.activeFormattingElements.append(Marker)
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCaption"]

    def startTagColgroup(self, token):
        """Insert ``colgroup`` and switch to the "inColumnGroup" phase."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inColumnGroup"]

    def startTagCol(self, token):
        """Imply a ``colgroup`` start tag, then return the token.

        :returns: ``token`` unchanged
        """
        self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
        return token

    def startTagRowGroup(self, token):
        """Insert ``tbody``/``tfoot``/``thead`` and switch to the "inTableBody" phase."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inTableBody"]

    def startTagImplyTbody(self, token):
        """Imply a ``tbody`` start tag, then return the token.

        :returns: ``token`` unchanged
        """
        self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
        return token

    def startTagTable(self, token):
        """Handle a nested ``<table>``: imply ``</table>`` (parse error).

        :returns: ``token`` unless the parser is in innerHTML mode, in
            which case None
        """
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "table", "endName": "table"})
        self.parser.phase.processEndTag(impliedTagToken("table"))
        if not self.parser.innerHTML:
            return token

    def startTagStyleScript(self, token):
        """Delegate ``style``/``script`` to the "inHead" phase and return its result."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagInput(self, token):
        """Insert a hidden ``input`` directly; any other ``input`` goes to startTagOther."""
        if ("type" in token["data"] and
                token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
            self.parser.parseError("unexpected-hidden-input-in-table")
            self.tree.insertElement(token)
            # XXX associate with form
            self.tree.openElements.pop()
            # Acknowledge the self-closing flag, as the other void-element
            # handlers do, so <input type=hidden/> is not flagged as a
            # trailing solidus on a non-void element.
            token["selfClosingAcknowledged"] = True
        else:
            self.startTagOther(token)

    def startTagForm(self, token):
        """Handle ``<form>``: parse error; record an empty form if no pointer is set."""
        self.parser.parseError("unexpected-form-in-table")
        if self.tree.formPointer is None:
            self.tree.insertElement(token)
            self.tree.formPointer = self.tree.openElements[-1]
            # The form is popped at once, so it never receives children here
            self.tree.openElements.pop()

    def startTagOther(self, token):
        """Process any other start tag through the "inBody" phase (parse error).

        ``insertFromTable`` is set for the duration of the call and reset to
        False afterwards, including when the in-body handler raises.
        """
        self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            self.parser.phases["inBody"].processStartTag(token)
        finally:
            self.tree.insertFromTable = False

    def endTagTable(self, token):
        """Close the table in table scope and reset the insertion mode."""
        if self.tree.elementInScope("table", variant="table"):
            self.tree.generateImpliedEndTags()
            if self.tree.openElements[-1].name != "table":
                self.parser.parseError("end-tag-too-early-named",
                                       {"gotName": "table",
                                        "expectedName": self.tree.openElements[-1].name})
            while self.tree.openElements[-1].name != "table":
                self.tree.openElements.pop()
            self.tree.openElements.pop()
            self.parser.resetInsertionMode()
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagIgnore(self, token):
        """Report the end tag as a parse error; the token is dropped."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        """Process any other end tag through the "inBody" phase (parse error).

        ``insertFromTable`` is set for the duration of the call and reset to
        False afterwards, including when the in-body handler raises.
        """
        self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            self.parser.phases["inBody"].processEndTag(token)
        finally:
            self.tree.insertFromTable = False

    # Start-tag routing table; names without an entry use startTagOther
    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("caption", startTagCaption),
        ("colgroup", startTagColgroup),
        ("col", startTagCol),
        (("tbody", "tfoot", "thead"), startTagRowGroup),
        (("td", "th", "tr"), startTagImplyTbody),
        ("table", startTagTable),
        (("style", "script"), startTagStyleScript),
        ("input", startTagInput),
        ("form", startTagForm)
    ])
    startTagHandler.default = startTagOther

    # End-tag routing table; names without an entry use endTagOther
    endTagHandler = _utils.MethodDispatcher([
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "tbody", "td",
          "tfoot", "th", "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
class InTableTextPhase(Phase):
    """Phase that buffers character tokens and flushes them in one go.

    Tokens are collected in ``characterTokens``.  A comment, start tag, end
    tag or EOF flushes the buffer and switches the parser back to
    ``originalPhase``.
    """
    __slots__ = ("originalPhase", "characterTokens")

    def __init__(self, *args, **kwargs):
        super(InTableTextPhase, self).__init__(*args, **kwargs)
        # Phase the parser is switched back to after a flush.
        self.originalPhase = None
        # Character tokens collected since the last flush.
        self.characterTokens = []

    def flushCharacters(self):
        """Insert the buffered text and empty the buffer.

        Text containing any non-space character is handed to the "inTable"
        phase's ``insertText``; non-empty text made only of space characters
        is inserted directly into the tree.
        """
        text = "".join(tok["data"] for tok in self.characterTokens)
        onlySpaces = all(ch in spaceCharacters for ch in text)
        if not onlySpaces:
            self.parser.phases["inTable"].insertText(
                {"type": tokenTypes["Characters"], "data": text})
        elif text:
            self.tree.insertText(text)
        self.characterTokens = []

    def processComment(self, token):
        """Flush, restore the original phase and return the token unchanged."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

    def processEOF(self):
        """Flush, restore the original phase and return True."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return True

    def processCharacters(self, token):
        """Buffer a character token; a lone U+0000 token is dropped."""
        if token["data"] != "\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        """Buffer a space-character token."""
        # pretty sure we should never reach here
        self.characterTokens.append(token)
        # assert False

    def processStartTag(self, token):
        """Flush, restore the original phase and return the token unchanged."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

    def processEndTag(self, token):
        """Flush, restore the original phase and return the token unchanged."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token
class InCaptionPhase(Phase):
    """Token handlers for the "in caption" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    __slots__ = ()

    def ignoreEndTagCaption(self):
        """Return True when no ``caption`` element is in table scope."""
        captionOpen = self.tree.elementInScope("caption", variant="table")
        return not captionOpen

    def processEOF(self):
        """Delegate EOF handling to the "inBody" phase."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processCharacters(token)

    def startTagTableElement(self, token):
        """Close the caption implicitly, then ask for the tag to be reprocessed.

        Returns the token unless the implied ``</caption>`` was ignored.
        """
        self.parser.parseError()
        # XXX Have to duplicate logic here to find out if the tag is ignored
        captionIgnored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        if captionIgnored:
            return None
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def endTagCaption(self, token):
        """Close the caption element and switch to the "inTable" phase."""
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return

        # AT this code is quite similar to endTagTable in "InTable"
        self.tree.generateImpliedEndTags()
        stack = self.tree.openElements
        currentName = stack[-1].name
        if currentName != "caption":
            self.parser.parseError("expected-one-end-tag-but-got-another",
                                   {"gotName": "caption",
                                    "expectedName": currentName})
        # Pop up to and including the caption element.
        while stack.pop().name != "caption":
            pass
        self.tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Close the caption implicitly, then ask for ``</table>`` to be reprocessed.

        Returns the token unless the implied ``</caption>`` was ignored.
        """
        self.parser.parseError()
        captionIgnored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        if captionIgnored:
            return None
        return token

    def endTagIgnore(self, token):
        """Report an "unexpected-end-tag" parse error and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def endTagOther(self, token):
        """Delegate any other end tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (
            ("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
             "thead", "tr"),
            startTagTableElement,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("caption", endTagCaption),
        ("table", endTagTable),
        (
            ("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
             "thead", "tr"),
            endTagIgnore,
        ),
    ])
    endTagHandler.default = endTagOther
class InColumnGroupPhase(Phase):
    """Token handlers for the "in column group" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column
    __slots__ = ()

    def ignoreEndTagColgroup(self):
        """Return True when the current node is the ``html`` element."""
        currentNode = self.tree.openElements[-1]
        return currentNode.name == "html"

    def processEOF(self):
        """Close the colgroup at EOF.

        Returns True when the colgroup was closed, otherwise None.
        """
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return None
        colgroupIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        if colgroupIgnored:
            return None
        return True

    def processCharacters(self, token):
        """Close the colgroup; return the token unless the close was ignored."""
        colgroupIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if colgroupIgnored else token

    def startTagCol(self, token):
        """Insert a ``col`` element, pop it at once and acknowledge self-closing."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        """Close the colgroup; return the token unless the close was ignored."""
        colgroupIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if colgroupIgnored else token

    def endTagColgroup(self, token):
        """Pop the current node and switch to "inTable", unless it is ``html``."""
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
            return
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def endTagCol(self, token):
        """Report a "no-end-tag" parse error for ``</col>``."""
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        """Close the colgroup; return the token unless the close was ignored."""
        colgroupIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if colgroupIgnored else token

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("col", startTagCol),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("colgroup", endTagColgroup),
        ("col", endTagCol),
    ])
    endTagHandler.default = endTagOther
class InTableBodyPhase(Phase):
    """Token handlers for the "in table body" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    __slots__ = ()

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until the current node is tbody/tfoot/thead/html."""
        stack = self.tree.openElements
        # No parse error is reported for the elements dropped here.
        while stack[-1].name not in ("tbody", "tfoot", "thead", "html"):
            stack.pop()
        if stack[-1].name == "html":
            assert self.parser.innerHTML

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the "inTable" phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inTable" phase."""
        inTable = self.parser.phases["inTable"]
        return inTable.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character tokens to the "inTable" phase."""
        inTable = self.parser.phases["inTable"]
        return inTable.processCharacters(token)

    def startTagTr(self, token):
        """Insert a ``tr`` element and switch to the "inRow" phase."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        """Open an implied ``tr`` for a stray cell and return the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-cell-in-table-body", details)
        self.startTagTr(impliedTagToken("tr", "StartTag"))
        return token

    def startTagTableOther(self, token):
        """Close the open row group and return the token.

        When no tbody/thead/tfoot is in table scope a parse error is
        reported and None is returned.
        """
        # XXX AT Any ideas on how to share this with endTagTable?
        rowGroupOpen = any(
            self.tree.elementInScope(name, variant="table")
            for name in ("tbody", "thead", "tfoot"))
        if not rowGroupOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        currentName = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inTable" phase."""
        inTable = self.parser.phases["inTable"]
        return inTable.processStartTag(token)

    def endTagTableRowGroup(self, token):
        """Close the named row group and switch to the "inTable" phase."""
        groupName = token["name"]
        if not self.tree.elementInScope(groupName, variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": groupName})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Close the open row group and return the ``</table>`` token.

        When no tbody/thead/tfoot is in table scope a parse error is
        reported and None is returned.
        """
        rowGroupOpen = any(
            self.tree.elementInScope(name, variant="table")
            for name in ("tbody", "thead", "tfoot"))
        if not rowGroupOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        currentName = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    def endTagIgnore(self, token):
        """Report "unexpected-end-tag-in-table-body" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-body", details)

    def endTagOther(self, token):
        """Delegate any other end tag to the "inTable" phase."""
        inTable = self.parser.phases["inTable"]
        return inTable.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("tr", startTagTr),
        (("td", "th"), startTagTableCell),
        (
            ("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
            startTagTableOther,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        ("table", endTagTable),
        (
            ("body", "caption", "col", "colgroup", "html", "td", "th",
             "tr"),
            endTagIgnore,
        ),
    ])
    endTagHandler.default = endTagOther
class InRowPhase(Phase):
    """Token handlers for the "in row" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    __slots__ = ()

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until the current node is ``tr`` or ``html``.

        A parse error is reported for every element that gets popped.
        """
        stack = self.tree.openElements
        while stack[-1].name not in ("tr", "html"):
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": stack[-1].name})
            stack.pop()

    def ignoreEndTagTr(self):
        """Return True when no ``tr`` element is in table scope."""
        rowOpen = self.tree.elementInScope("tr", variant="table")
        return not rowOpen

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the "inTable" phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inTable" phase."""
        inTable = self.parser.phases["inTable"]
        return inTable.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character tokens to the "inTable" phase."""
        inTable = self.parser.phases["inTable"]
        return inTable.processCharacters(token)

    def startTagTableCell(self, token):
        """Insert a cell, switch to "inCell" and push a formatting marker."""
        self.clearStackToTableRowContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        self.tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        """Close the row implicitly; return the token unless that was ignored."""
        rowIgnored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # XXX how are we sure it's always ignored in the innerHTML case?
        return None if rowIgnored else token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inTable" phase."""
        inTable = self.parser.phases["inTable"]
        return inTable.processStartTag(token)

    def endTagTr(self, token):
        """Close the current row and switch to the "inTableBody" phase."""
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, token):
        """Close the row implicitly; return the token unless that was ignored."""
        rowIgnored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        return None if rowIgnored else token

    def endTagTableRowGroup(self, token):
        """Close the row when the named row group is in table scope.

        Returns the token in that case; otherwise reports a parse error and
        returns None.
        """
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return None
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        """Report "unexpected-end-tag-in-table-row" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-row", details)

    def endTagOther(self, token):
        """Delegate any other end tag to the "inTable" phase."""
        inTable = self.parser.phases["inTable"]
        return inTable.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("td", "th"), startTagTableCell),
        (
            ("caption", "col", "colgroup", "tbody", "tfoot", "thead",
             "tr"),
            startTagTableOther,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("tr", endTagTr),
        ("table", endTagTable),
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        (
            ("body", "caption", "col", "colgroup", "html", "td", "th"),
            endTagIgnore,
        ),
    ])
    endTagHandler.default = endTagOther
class InCellPhase(Phase):
    """Token handlers for the "in cell" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    __slots__ = ()

    # helper
    def closeCell(self):
        """Close the open cell: ``td`` if in table scope, otherwise ``th``."""
        for cellName in ("td", "th"):
            if self.tree.elementInScope(cellName, variant="table"):
                self.endTagTableCell(impliedTagToken(cellName))
                break

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the "inBody" phase."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processCharacters(token)

    def startTagTableOther(self, token):
        """Close the open cell and return the token.

        When neither ``td`` nor ``th`` is in table scope a parse error is
        reported and None is returned.
        """
        cellOpen = any(
            self.tree.elementInScope(name, variant="table")
            for name in ("td", "th"))
        if not cellOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell and switch to the "inRow" phase."""
        cellName = token["name"]
        if not self.tree.elementInScope(cellName, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": cellName})
            return

        self.tree.generateImpliedEndTags(cellName)
        stack = self.tree.openElements
        if stack[-1].name != cellName:
            self.parser.parseError("unexpected-cell-end-tag",
                                   {"name": cellName})
        # Pop up to and including the cell element; when the cell is already
        # the current node this removes just that one element.
        while stack.pop().name != cellName:
            pass
        self.tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        """Report an "unexpected-end-tag" parse error and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def endTagImply(self, token):
        """Close the cell when the named element is in table scope.

        Returns the token in that case; otherwise reports a parse error and
        returns None.
        """
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def endTagOther(self, token):
        """Delegate any other end tag to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (
            ("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
             "thead", "tr"),
            startTagTableOther,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("td", "th"), endTagTableCell),
        (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
        (("table", "tbody", "tfoot", "thead", "tr"), endTagImply),
    ])
    endTagHandler.default = endTagOther
class InSelectPhase(Phase):
    """Token handlers for the "in select" phase."""
    __slots__ = ()

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        """Report "eof-in-select" unless the current node is ``html``."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        """Insert the token's text; a lone U+0000 token is dropped."""
        text = token["data"]
        if text != "\u0000":
            self.tree.insertText(text)

    def startTagOption(self, token):
        """Insert an ``option`` element, closing an open one first."""
        stack = self.tree.openElements
        # We need to imply </option> if <option> is the current node.
        if stack[-1].name == "option":
            stack.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        """Insert an ``optgroup``, closing an open option/optgroup first."""
        stack = self.tree.openElements
        if stack[-1].name == "option":
            stack.pop()
        if stack[-1].name == "optgroup":
            stack.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        """Treat a nested ``<select>`` as an end tag for the open select."""
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        """Close the select and return the token for reprocessing.

        Returns None when no select element is in select scope.
        """
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return None
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        """Delegate ``<script>`` to the "inHead" phase."""
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagOther(self, token):
        """Report "unexpected-start-tag-in-select" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-select", details)

    def endTagOption(self, token):
        """Pop the current node if it is an ``option``, else report an error."""
        stack = self.tree.openElements
        if stack[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
            return
        stack.pop()

    def endTagOptgroup(self, token):
        """Close the current ``optgroup``, and an ``option`` directly inside it."""
        stack = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if stack[-1].name == "option" and stack[-2].name == "optgroup":
            stack.pop()
        # It also closes </optgroup>
        if stack[-1].name == "optgroup":
            stack.pop()
        # But nothing else
        else:
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})

    def endTagSelect(self, token):
        """Pop up to and including the select, then reset the insertion mode."""
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        stack = self.tree.openElements
        while stack.pop().name != "select":
            pass
        self.parser.resetInsertionMode()

    def endTagOther(self, token):
        """Report "unexpected-end-tag-in-select" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-select", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("option", startTagOption),
        ("optgroup", startTagOptgroup),
        ("select", startTagSelect),
        (("input", "keygen", "textarea"), startTagInput),
        ("script", startTagScript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("option", endTagOption),
        ("optgroup", endTagOptgroup),
        ("select", endTagSelect),
    ])
    endTagHandler.default = endTagOther
class InSelectInTablePhase(Phase):
    """Token handlers for the "in select in table" phase.

    Table-related tags close the select; everything else is delegated to
    the "inSelect" phase.
    """
    __slots__ = ()

    def processEOF(self):
        """Delegate EOF handling to the "inSelect" phase."""
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inSelect" phase."""
        inSelect = self.parser.phases["inSelect"]
        return inSelect.processCharacters(token)

    def startTagTable(self, token):
        """Close the select and return the token for reprocessing."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", details)
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inSelect" phase."""
        inSelect = self.parser.phases["inSelect"]
        return inSelect.processStartTag(token)

    def endTagTable(self, token):
        """Close the select when the named element is in table scope.

        Returns the token in that case, otherwise None.  The parse error is
        reported either way.
        """
        tagName = token["name"]
        self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": tagName})
        if not self.tree.elementInScope(tagName, variant="table"):
            return None
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        """Delegate any other end tag to the "inSelect" phase."""
        inSelect = self.parser.phases["inSelect"]
        return inSelect.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        (
            ("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
            startTagTable,
        ),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (
            ("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
            endTagTable,
        ),
    ])
    endTagHandler.default = endTagOther
class InForeignContentPhase(Phase):
    """Token handlers used while the current node is in a foreign namespace."""
    __slots__ = tuple()

    # Start tags that break out of foreign content (see processStartTag).
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # Lower-case SVG tag names mapped to their mixed-case spelling.  Kept at
    # class level so the table is built once rather than on every call to
    # adjustSVGTagNames.
    svgTagNameReplacements = {"altglyph": "altGlyph",
                              "altglyphdef": "altGlyphDef",
                              "altglyphitem": "altGlyphItem",
                              "animatecolor": "animateColor",
                              "animatemotion": "animateMotion",
                              "animatetransform": "animateTransform",
                              "clippath": "clipPath",
                              "feblend": "feBlend",
                              "fecolormatrix": "feColorMatrix",
                              "fecomponenttransfer": "feComponentTransfer",
                              "fecomposite": "feComposite",
                              "feconvolvematrix": "feConvolveMatrix",
                              "fediffuselighting": "feDiffuseLighting",
                              "fedisplacementmap": "feDisplacementMap",
                              "fedistantlight": "feDistantLight",
                              "feflood": "feFlood",
                              "fefunca": "feFuncA",
                              "fefuncb": "feFuncB",
                              "fefuncg": "feFuncG",
                              "fefuncr": "feFuncR",
                              "fegaussianblur": "feGaussianBlur",
                              "feimage": "feImage",
                              "femerge": "feMerge",
                              "femergenode": "feMergeNode",
                              "femorphology": "feMorphology",
                              "feoffset": "feOffset",
                              "fepointlight": "fePointLight",
                              "fespecularlighting": "feSpecularLighting",
                              "fespotlight": "feSpotLight",
                              "fetile": "feTile",
                              "feturbulence": "feTurbulence",
                              "foreignobject": "foreignObject",
                              "glyphref": "glyphRef",
                              "lineargradient": "linearGradient",
                              "radialgradient": "radialGradient",
                              "textpath": "textPath"}

    def adjustSVGTagNames(self, token):
        """Rewrite ``token["name"]`` in place to its mixed-case SVG spelling.

        Names without an entry in ``svgTagNameReplacements`` are left as-is.
        """
        replacements = self.svgTagNameReplacements
        if token["name"] in replacements:
            token["name"] = replacements[token["name"]]

    def processCharacters(self, token):
        """Insert character data via the base ``Phase`` implementation.

        A lone U+0000 token becomes U+FFFD; otherwise any non-space
        character clears ``parser.framesetOK``.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Insert a foreign element, or break out of foreign content.

        A breakout element (or ``font`` with a color/face/size attribute)
        pops the stack back to an element in the default namespace or an
        integration point and returns the token for reprocessing.  Any other
        tag is adjusted for the current node's namespace and inserted;
        nothing is returned in that case.
        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
            (token["name"] == "font" and
             set(token["data"].keys()) & {"color", "face", "size"})):
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        else:
            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            # The new element inherits the namespace of the current node.
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Close the matching foreign element, or defer to the current phase.

        Walks down the stack of open elements from the current node.  A node
        whose lower-cased name equals the tag name is popped along with
        everything above it, and None is returned.  If a node in the default
        namespace is reached first, the result of the parser's current
        phase handling the end tag is returned instead.
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                # Pop until the matched node itself has been removed.
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token
class AfterBodyPhase(Phase):
    """Token handlers for the "after body" phase."""
    __slots__ = ()

    def processEOF(self):
        """Do nothing: parsing stops here."""
        # Stop parsing
        pass

    def processComment(self, token):
        """Insert the comment under the first element on the open-element stack."""
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        rootElement = self.tree.openElements[0]
        self.tree.insertComment(token, rootElement)

    def processCharacters(self, token):
        """Report an error, switch to "inBody" and return the token."""
        self.parser.parseError("unexpected-char-after-body")
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate ``<html>`` to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagOther(self, token):
        """Report an error, switch to "inBody" and return the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-body", details)
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def endTagHtml(self, name):
        """Switch to "afterAfterBody", or report an error when in innerHTML mode.

        Note: the argument is named ``name`` here, unlike the other handlers;
        it is not read by this method.
        """
        if not self.parser.innerHTML:
            self.parser.phase = self.parser.phases["afterAfterBody"]
        else:
            self.parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        """Report an error, switch to "inBody" and return the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-body", details)
        self.parser.phase = self.parser.phases["inBody"]
        return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("html", endTagHtml),
    ])
    endTagHandler.default = endTagOther
class InFramesetPhase(Phase):
    """Token handlers for the "in frameset" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    __slots__ = ()

    def processEOF(self):
        """Report "eof-in-frameset" unless the current node is ``html``."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-frameset")

    def processCharacters(self, token):
        """Report "unexpected-char-in-frameset" and ignore the token."""
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        """Insert a nested ``frameset`` element."""
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        """Insert a ``frame`` element and pop it straight away."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()

    def startTagNoframes(self, token):
        """Delegate ``<noframes>`` to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagOther(self, token):
        """Report "unexpected-start-tag-in-frameset" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-frameset", details)

    def endTagFrameset(self, token):
        """Pop the current frameset and possibly switch to "afterFrameset"."""
        stack = self.tree.openElements
        if stack[-1].name != "html":
            stack.pop()
        else:
            # innerHTML case
            self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        # If we're not in innerHTML mode and the current node is not a
        # "frameset" element (anymore) then switch.
        if not self.parser.innerHTML and stack[-1].name != "frameset":
            self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagOther(self, token):
        """Report "unexpected-end-tag-in-frameset" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-frameset", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("frameset", startTagFrameset),
        ("frame", startTagFrame),
        ("noframes", startTagNoframes),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("frameset", endTagFrameset),
    ])
    endTagHandler.default = endTagOther
class AfterFramesetPhase(Phase):
    """Token handlers for the "after frameset" phase."""
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    __slots__ = ()

    def processEOF(self):
        """Do nothing: parsing stops here."""
        # Stop parsing
        pass

    def processCharacters(self, token):
        """Report "unexpected-char-after-frameset" and ignore the token."""
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        """Delegate ``<noframes>`` to the "inHead" phase."""
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagOther(self, token):
        """Report "unexpected-start-tag-after-frameset" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset", details)

    def endTagHtml(self, token):
        """Switch the parser to the "afterAfterFrameset" phase."""
        self.parser.phase = self.parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        """Report "unexpected-end-tag-after-frameset" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("noframes", startTagNoframes),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("html", endTagHtml),
    ])
    endTagHandler.default = endTagOther
class AfterAfterBodyPhase(Phase):
    """Token handlers for the "after after body" phase."""
    __slots__ = ()

    def processEOF(self):
        """Do nothing at EOF."""
        pass

    def processComment(self, token):
        """Insert the comment as a child of the document."""
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report an error, switch to "inBody" and return the token."""
        self.parser.parseError("expected-eof-but-got-char")
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate ``<html>`` to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagOther(self, token):
        """Report an error, switch to "inBody" and return the token."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEndTag(self, token):
        """Report an error, switch to "inBody" and return the token."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)
        self.parser.phase = self.parser.phases["inBody"]
        return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
    ])
    startTagHandler.default = startTagOther
class AfterAfterFramesetPhase(Phase):
    """Token handlers for the "after after frameset" phase."""
    __slots__ = ()

    def processEOF(self):
        """Do nothing at EOF."""
        pass

    def processComment(self, token):
        """Insert the comment as a child of the document."""
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report "expected-eof-but-got-char" and ignore the token."""
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        """Delegate ``<html>`` to the "inBody" phase."""
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagNoFrames(self, token):
        """Delegate ``<noframes>`` to the "inHead" phase."""
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagOther(self, token):
        """Report "expected-eof-but-got-start-tag" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)

    def processEndTag(self, token):
        """Report "expected-eof-but-got-end-tag" and ignore the token."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("noframes", startTagNoFrames),
    ])
    startTagHandler.default = startTagOther
2751 # pylint:enable=unused-argument
2753 return {
2754 "initial": InitialPhase,
2755 "beforeHtml": BeforeHtmlPhase,
2756 "beforeHead": BeforeHeadPhase,
2757 "inHead": InHeadPhase,
2758 "inHeadNoscript": InHeadNoscriptPhase,
2759 "afterHead": AfterHeadPhase,
2760 "inBody": InBodyPhase,
2761 "text": TextPhase,
2762 "inTable": InTablePhase,
2763 "inTableText": InTableTextPhase,
2764 "inCaption": InCaptionPhase,
2765 "inColumnGroup": InColumnGroupPhase,
2766 "inTableBody": InTableBodyPhase,
2767 "inRow": InRowPhase,
2768 "inCell": InCellPhase,
2769 "inSelect": InSelectPhase,
2770 "inSelectInTable": InSelectInTablePhase,
2771 "inForeignContent": InForeignContentPhase,
2772 "afterBody": AfterBodyPhase,
2773 "inFrameset": InFramesetPhase,
2774 "afterFrameset": AfterFramesetPhase,
2775 "afterAfterBody": AfterAfterBodyPhase,
2776 "afterAfterFrameset": AfterAfterFramesetPhase,
2777 # XXX after after frameset
2778 }
def adjust_attributes(token, replacements):
    """Rename attributes of ``token`` according to ``replacements``.

    ``token['data']`` is replaced by a new mapping of the same type in which
    every key found in ``replacements`` is swapped for its mapped value.
    Nothing is changed when no attribute name appears in ``replacements``.
    """
    attributes = token['data']
    if not (viewkeys(attributes) & viewkeys(replacements)):
        return
    renamed = ((replacements.get(name, name), value)
               for name, value in attributes.items())
    token['data'] = type(attributes)(renamed)
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict for a tag that was not present in the input.

    :arg name: tag name stored under ``"name"``
    :arg type: key looked up in ``tokenTypes`` for the ``"type"`` entry
    :arg attributes: mapping stored under ``"data"``; a fresh empty dict is
        used when None
    :arg selfClosing: value stored under ``"selfClosing"``

    :returns: the token dict
    """
    data = {} if attributes is None else attributes
    token = {
        "type": tokenTypes[type],
        "name": name,
        "data": data,
        "selfClosing": selfClosing,
    }
    return token
class ParseError(Exception):
    """Exception type representing an error in the parsed document."""