Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/_vendor/html5lib/html5parser.py: 2%
1533 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 06:10 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-03 06:10 +0000
1from __future__ import absolute_import, division, unicode_literals
2from six import with_metaclass, viewkeys
4import types
6from . import _inputstream
7from . import _tokenizer
9from . import treebuilders
10from .treebuilders.base import Marker
12from . import _utils
13from .constants import (
14 spaceCharacters, asciiUpper2Lower,
15 specialElements, headingElements, cdataElements, rcdataElements,
16 tokenTypes, tagTokenTypes,
17 namespaces,
18 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
19 adjustForeignAttributes as adjustForeignAttributesMap,
20 adjustMathMLAttributes, adjustSVGAttributes,
21 E,
22 _ReparseException
23)
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML document as a string or file-like object into a tree

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any further keyword arguments are forwarded to ``HTMLParser.parse``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder_class = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder_class,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, **kwargs)
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any further keyword arguments are forwarded to
    ``HTMLParser.parseFragment``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, **kwargs)
def method_decorator_metaclass(function):
    """Build a metaclass that wraps every plain function of a class.

    :arg function: a decorator; it is applied to each attribute of the class
        namespace that is a ``types.FunctionType``

    :returns: a ``type`` subclass usable as a metaclass
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Snapshot the keys first so the namespace can be rewritten
            # while walking it.
            for key in list(classDict):
                member = classDict[key]
                if isinstance(member, types.FunctionType):
                    classDict[key] = function(member)
            return type.__new__(meta, classname, bases, classDict)

    return Decorated
class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned. Built in treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType)

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib import treebuilders
        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()  # generates parser with etree builder
        >>> lxml_builder = treebuilders.getTreeBuilder('lxml')
        >>> parser = HTMLParser(lxml_builder, strict=True)  # strict parser with lxml builder

        """

        # Raise an exception on the first error encountered
        self.strict = strict

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        # ``tree`` is a class: it is instantiated here, once per parser.
        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        self.phases = {name: cls(self, self.tree) for name, cls in
                       getPhases(debug).items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Tokenize ``stream`` and run the tree-construction loop.

        Remaining keyword arguments are passed to the tokenizer. If the main
        loop raises ``_ReparseException`` the parser state is reset and the
        loop is run a second time.
        """
        # parseFragment() documents ``container=None`` as meaning "div";
        # reset() lower-cases the container name, so normalise it here.
        if innerHTML and container is None:
            container = "div"

        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            self.reset()
            self.mainLoop()

    def reset(self):
        """Reset tree, error list, log and insertion mode for a new run."""
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            self.innerHTML = self.container.lower()

            # The constant names look swapped relative to the tokenizer state
            # they select; the pairing is kept exactly as it was.
            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0].name

    def isHTMLIntegrationPoint(self, element):
        """Return whether ``element`` is an HTML integration point.

        A MathML ``annotation-xml`` element qualifies only when its
        ``encoding`` attribute is (ASCII case-insensitively) ``text/html`` or
        ``application/xhtml+xml``; any other element is looked up in
        ``htmlIntegrationPointElements``.
        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether ``element`` is a MathML text integration point."""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Feed every token from the tokenizer to the appropriate phase.

        A phase method may return a token to have it reprocessed (typically
        after the phase changed); the inner loop runs until it returns None.
        After the tokenizer is exhausted, EOF is processed the same way.
        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        # Built once instead of once per token.
        mathmlForeignStartTags = frozenset(["mglyph", "malignmark"])

        for token in self.tokenizer:
            prev_token = None
            new_token = token
            while new_token is not None:
                prev_token = new_token
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                token_type = new_token["type"]

                if token_type == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # The routing decision is made on the token actually being
                    # processed (new_token), consistently with token_type.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((token_type == StartTagToken and
                           new_token["name"] not in mathmlForeignStartTags) or
                          token_type in (CharactersToken, SpaceCharactersToken))) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         token_type == StartTagToken and
                         new_token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         token_type in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if token_type == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif token_type == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif token_type == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif token_type == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif token_type == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif token_type == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

            if (token_type == StartTagToken and prev_token["selfClosing"] and
                    not prev_token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # A phase asking for reprocessing must have switched phase,
                # otherwise this loop would never terminate.
                assert self.phase not in phases

    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment

        :arg container: name of the element we're setting the innerHTML
            property if set to None, default to 'div'

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('<b>this is a fragment</b>')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error; raise ``ParseError`` when in strict mode.

        Each entry appended to ``self.errors`` is a tuple of
        ``(stream position, errorcode, datavars)``.
        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def adjustMathMLAttributes(self, token):
        """Apply the MathML attribute adjustment table to ``token``."""
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        """Apply the SVG attribute adjustment table to ``token``."""
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        """Apply the foreign attribute adjustment table to ``token``."""
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # pylint:disable=unused-argument
        # NOTE(review): nothing in this class assigns ``self.parser``, so this
        # call looks like it would raise AttributeError if ever reached. The
        # intended behaviour is unclear, so it is left as-is - confirm whether
        # this method is dead code.
        self.parser.phase()

    def resetInsertionMode(self):
        """Select ``self.phase`` from the stack of open elements.

        Walks the open elements from the innermost outwards and picks the
        phase mapped to the first recognised element name, falling back to
        "inBody" once the outermost element is reached.
        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                assert self.innerHTML
                last = True
                # For the outermost element the fragment's context element
                # name decides, not the element's own name.
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Insert ``token``'s element and switch to the "text" phase.

        :arg contentType: either "RAWTEXT" or "RCDATA"; selects the tokenizer
            state used for the element's content
        """
        # Generic RCDATA/RAWTEXT Parsing algorithm
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        # Remembered so the previous phase can be restored afterwards.
        self.originalPhase = self.phase

        self.phase = self.phases["text"]
396@_utils.memoize
397def getPhases(debug):
def log(function):
    """Logger that records which phase processes each token"""
    # Reverse mapping: numeric token type -> its name.
    type_names = {value: key for key, value in tokenTypes.items()}

    def wrapped(self, *args, **kwargs):
        # Only the process* methods called with a token are recorded.
        if function.__name__.startswith("process") and len(args) > 0:
            token = args[0]
            token_type = token['type']
            info = {"type": type_names[token_type]}
            if token_type in tagTokenTypes:
                info["name"] = token['name']

            entry = (self.parser.tokenizer.state.__name__,
                     self.parser.phase.__class__.__name__,
                     self.__class__.__name__,
                     function.__name__,
                     info)
            self.parser.log.append(entry)
        return function(self, *args, **kwargs)

    return wrapped
def getMetaclass(use_metaclass, metaclass_func):
    """Return a decorating metaclass when requested, else plain ``type``.

    :arg use_metaclass: when truthy, build a metaclass that applies
        ``metaclass_func`` to the functions of the class

    :arg metaclass_func: decorator handed to ``method_decorator_metaclass``
    """
    if not use_metaclass:
        return type
    return method_decorator_metaclass(metaclass_func)
425 # pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))):
    """Base class for helper object that implements each phase of processing
    """
    __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        # Per-instance caches mapping tag name -> handler.
        self.__startTagCache = {}
        self.__endTagCache = {}

    def processEOF(self):
        """Handle end of input; concrete phases must override this."""
        raise NotImplementedError

    def processComment(self, token):
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        current = self.tree.openElements[-1]
        self.tree.insertComment(token, current)

    def processDoctype(self, token):
        """Report a doctype token as a parse error."""
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        """Insert the token's character data into the tree."""
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        """Insert the token's whitespace data into the tree."""
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        name = token["name"]
        cache = self.__startTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if name not in cache:
            handler = cache[name] = self.startTagHandler[name]
            # bound the cache size in case we get loads of unknown tags
            while len(cache) > len(self.startTagHandler) * 1.1:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        else:
            handler = cache[name]
        return handler(token)

    def startTagHtml(self, token):
        """Merge attributes of a repeated <html> tag into the root element."""
        if token["name"] == "html" and not self.parser.firstStartTag:
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        for attr, value in token["data"].items():
            # Attributes already present on the root element win.
            if attr not in self.tree.openElements[0].attributes:
                self.tree.openElements[0].attributes[attr] = value
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        name = token["name"]
        cache = self.__endTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if name not in cache:
            handler = cache[name] = self.endTagHandler[name]
            # bound the cache size in case we get loads of unknown tags
            while len(cache) > len(self.endTagHandler) * 1.1:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        else:
            handler = cache[name]
        return handler(token)
class InitialPhase(Phase):
    """Phase active before any doctype has been seen.

    Decides the document's compatibility mode from the doctype (or from its
    absence) and then hands over to the "beforeHtml" phase.
    """
    __slots__ = ()

    def processSpaceCharacters(self, token):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        """Insert the doctype and derive ``parser.compatMode`` from it."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        # Public identifier prefixes that force quirks mode.
        quirksPrefixes = (
            "+//silmaril//dtd html pro v0r11 19970101//",
            "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
            "-//as//dtd html 3.0 aswedit + extensions//",
            "-//ietf//dtd html 2.0 level 1//",
            "-//ietf//dtd html 2.0 level 2//",
            "-//ietf//dtd html 2.0 strict level 1//",
            "-//ietf//dtd html 2.0 strict level 2//",
            "-//ietf//dtd html 2.0 strict//",
            "-//ietf//dtd html 2.0//",
            "-//ietf//dtd html 2.1e//",
            "-//ietf//dtd html 3.0//",
            "-//ietf//dtd html 3.2 final//",
            "-//ietf//dtd html 3.2//",
            "-//ietf//dtd html 3//",
            "-//ietf//dtd html level 0//",
            "-//ietf//dtd html level 1//",
            "-//ietf//dtd html level 2//",
            "-//ietf//dtd html level 3//",
            "-//ietf//dtd html strict level 0//",
            "-//ietf//dtd html strict level 1//",
            "-//ietf//dtd html strict level 2//",
            "-//ietf//dtd html strict level 3//",
            "-//ietf//dtd html strict//",
            "-//ietf//dtd html//",
            "-//metrius//dtd metrius presentational//",
            "-//microsoft//dtd internet explorer 2.0 html strict//",
            "-//microsoft//dtd internet explorer 2.0 html//",
            "-//microsoft//dtd internet explorer 2.0 tables//",
            "-//microsoft//dtd internet explorer 3.0 html strict//",
            "-//microsoft//dtd internet explorer 3.0 html//",
            "-//microsoft//dtd internet explorer 3.0 tables//",
            "-//netscape comm. corp.//dtd html//",
            "-//netscape comm. corp.//dtd strict html//",
            "-//o'reilly and associates//dtd html 2.0//",
            "-//o'reilly and associates//dtd html extended 1.0//",
            "-//o'reilly and associates//dtd html extended relaxed 1.0//",
            "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
            "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
            "-//spyglass//dtd html 2.0 extended//",
            "-//sq//dtd html 2.0 hotmetal + extensions//",
            "-//sun microsystems corp.//dtd hotjava html//",
            "-//sun microsystems corp.//dtd hotjava strict html//",
            "-//w3c//dtd html 3 1995-03-24//",
            "-//w3c//dtd html 3.2 draft//",
            "-//w3c//dtd html 3.2 final//",
            "-//w3c//dtd html 3.2//",
            "-//w3c//dtd html 3.2s draft//",
            "-//w3c//dtd html 4.0 frameset//",
            "-//w3c//dtd html 4.0 transitional//",
            "-//w3c//dtd html experimental 19960712//",
            "-//w3c//dtd html experimental 970421//",
            "-//w3c//dtd w3 html//",
            "-//w3o//dtd w3 html 3.0//",
            "-//webtechs//dtd mozilla html 2.0//",
            "-//webtechs//dtd mozilla html//",
        )
        # Public identifiers that force quirks mode on an exact match.
        quirksExactIds = (
            "-//w3o//dtd w3 html strict 3.0//en//",
            "-/w3c/dtd html 4.0 transitional/en",
            "html",
        )
        # Quirks without a system identifier, limited quirks with one.
        html401Prefixes = (
            "-//w3c//dtd html 4.01 frameset//",
            "-//w3c//dtd html 4.01 transitional//",
        )
        limitedQuirksPrefixes = (
            "-//w3c//dtd xhtml 1.0 frameset//",
            "-//w3c//dtd xhtml 1.0 transitional//",
        )

        if (name != "html" or publicId is not None or
                (systemId is not None and systemId != "about:legacy-compat")):
            self.parser.parseError("unknown-doctype")

        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        if (not correct or token["name"] != "html" or
                publicId.startswith(quirksPrefixes) or
                publicId in quirksExactIds or
                (publicId.startswith(html401Prefixes) and systemId is None) or
                (systemId and
                 systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
            self.parser.compatMode = "quirks"
        elif (publicId.startswith(limitedQuirksPrefixes) or
              (publicId.startswith(html401Prefixes) and systemId is not None)):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        """No doctype was seen: switch to quirks mode and move on."""
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        # Returning the token asks the main loop to reprocess it.
        return token

    def processStartTag(self, token):
        self.parser.parseError("expected-doctype-but-got-start-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEndTag(self, token):
        self.parser.parseError("expected-doctype-but-got-end-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEOF(self):
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True
class BeforeHtmlPhase(Phase):
    """Phase between the doctype and the root <html> element."""
    __slots__ = ()

    # helper methods
    def insertHtmlElement(self):
        """Create the implied root element and move to "beforeHead"."""
        root_token = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(root_token)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self.insertHtmlElement()
        # Reprocess the token in the new phase.
        return token

    def processStartTag(self, token):
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        # Only these end tags imply the root element; anything else is
        # reported and dropped.
        if token["name"] in ("head", "body", "html", "br"):
            self.insertHtmlElement()
            return token
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": token["name"]})
class BeforeHeadPhase(Phase):
    """Phase after the root element and before the <head> element."""
    __slots__ = ()

    def _implyHead(self):
        """Open an implied <head> element."""
        self.startTagHead(impliedTagToken("head", "StartTag"))

    def processEOF(self):
        self._implyHead()
        return True

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self._implyHead()
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        """Insert <head>, remember it as the head pointer, go to "inHead"."""
        self.tree.insertElement(token)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        self._implyHead()
        return token

    def endTagImplyHead(self, token):
        self._implyHead()
        return token

    def endTagOther(self, token):
        self.parser.parseError("end-tag-after-implied-root",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("head", startTagHead),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("head", "body", "html", "br"), endTagImplyHead),
    ])
    endTagHandler.default = endTagOther
class InHeadPhase(Phase):
    """Phase handling the content of the <head> element."""
    __slots__ = ()

    # the real thing
    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        """Insert an element and pop it again right away."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        """Insert <meta> and, while the encoding is tentative, honour it."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        attributes = token["data"]
        stream = self.parser.tokenizer.stream
        if stream.charEncoding[1] != "tentative":
            return

        if "charset" in attributes:
            stream.changeEncoding(attributes["charset"])
        elif ("content" in attributes and
              "http-equiv" in attributes and
              attributes["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            encoded = attributes["content"].encode("utf-8")
            data = _inputstream.EncodingBytes(encoded)
            contentParser = _inputstream.ContentAttrParser(data)
            codec = contentParser.parse()
            stream.changeEncoding(codec)

    def startTagTitle(self, token):
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        if not self.parser.scripting:
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inHeadNoscript"]
        else:
            self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagScript(self, token):
        """Insert <script> and switch tokenizer and parser to script text."""
        parser = self.parser
        self.tree.insertElement(token)
        parser.tokenizer.state = parser.tokenizer.scriptDataState
        parser.originalPhase = parser.phase
        parser.phase = parser.phases["text"]

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHead(self, token):
        node = self.parser.tree.openElements.pop()
        assert node.name == "head", "Expected head got %s" % node.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Close the head element as if an end tag had been seen."""
        self.endTagHead(impliedTagToken("head"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("title", startTagTitle),
        (("noframes", "style"), startTagNoFramesStyle),
        ("noscript", startTagNoscript),
        ("script", startTagScript),
        (("base", "basefont", "bgsound", "command", "link"),
         startTagBaseLinkCommand),
        ("meta", startTagMeta),
        ("head", startTagHead),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("head", endTagHead),
        (("br", "html", "body"), endTagHtmlBodyBr),
    ])
    endTagHandler.default = endTagOther
class InHeadNoscriptPhase(Phase):
    """Phase handling a <noscript> element inside <head>."""
    __slots__ = ()

    def processEOF(self):
        self.parser.parseError("eof-in-head-noscript")
        self.anythingElse()
        return True

    def processComment(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processComment(token)

    def processCharacters(self, token):
        self.parser.parseError("char-in-head-noscript")
        self.anythingElse()
        return token

    def processSpaceCharacters(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processSpaceCharacters(token)

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagHeadNoscript(self, token):
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagNoscript(self, token):
        node = self.parser.tree.openElements.pop()
        assert node.name == "noscript", "Expected noscript got %s" % node.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        # Caller must raise parse error first!
        implied_end = impliedTagToken("noscript")
        self.endTagNoscript(implied_end)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
        (("head", "noscript"), startTagHeadNoscript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("noscript", endTagNoscript),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther
class AfterHeadPhase(Phase):
    """Phase between the end of <head> and the start of <body>/<frameset>."""
    __slots__ = ()

    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBody(self, token):
        self.parser.framesetOK = False
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, token):
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        """Process a head-only element that appeared after </head>.

        The head element is pushed back on the stack of open elements, the
        token is handled by the "inHead" phase, and the head element is then
        removed from the stack again.
        """
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        open_elements = self.tree.openElements
        open_elements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        for candidate in self.tree.openElements[::-1]:
            if candidate.name == "head":
                self.tree.openElements.remove(candidate)
                break

    def startTagHead(self, token):
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Open an implied <body> and continue in the "inBody" phase."""
        body_token = impliedTagToken("body", "StartTag")
        self.tree.insertElement(body_token)
        self.parser.phase = self.parser.phases["inBody"]
        self.parser.framesetOK = True

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"),
         startTagFromHead),
        ("head", startTagHead),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("body", "html", "br"), endTagHtmlBodyBr),
    ])
    endTagHandler.default = endTagOther
941 class InBodyPhase(Phase):
942 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
943 # the really-really-really-very crazy mode
944 __slots__ = ("processSpaceCharacters",)
def __init__(self, *args, **kwargs):
    """Initialise the phase and install the default whitespace handler."""
    super(InBodyPhase, self).__init__(*args, **kwargs)
    # Set this to the default handler
    default_handler = self.processSpaceCharactersNonPre
    self.processSpaceCharacters = default_handler
def isMatchingFormattingElement(self, node1, node2):
    """Return whether two nodes agree in name, namespace and attributes."""
    if not node1.name == node2.name:
        return False
    if not node1.namespace == node2.namespace:
        return False
    return node1.attributes == node2.attributes
956 # helper
def addFormattingElement(self, token):
    """Insert a formatting element and track it as active.

    If three matching elements already follow the last marker in the list
    of active formatting elements, the earliest of them is removed first.
    """
    self.tree.insertElement(token)
    element = self.tree.openElements[-1]

    duplicates = []
    for candidate in reversed(self.tree.activeFormattingElements):
        if candidate is Marker:
            break
        if self.isMatchingFormattingElement(candidate, element):
            duplicates.append(candidate)

    assert len(duplicates) <= 3
    if len(duplicates) == 3:
        # The list was collected newest-first, so the last entry is the
        # earliest one.
        self.tree.activeFormattingElements.remove(duplicates[-1])
    self.tree.activeFormattingElements.append(element)
973 # the real deal
974 def processEOF(self):
975 allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
976 "tfoot", "th", "thead", "tr", "body",
977 "html"))
978 for node in self.tree.openElements[::-1]:
979 if node.name not in allowed_elements:
980 self.parser.parseError("expected-closing-tag-but-got-eof")
981 break
982 # Stop parsing
984 def processSpaceCharactersDropNewline(self, token):
985 # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
986 # want to drop leading newlines
987 data = token["data"]
988 self.processSpaceCharacters = self.processSpaceCharactersNonPre
989 if (data.startswith("\n") and
990 self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
991 not self.tree.openElements[-1].hasContent()):
992 data = data[1:]
993 if data:
994 self.tree.reconstructActiveFormattingElements()
995 self.tree.insertText(data)
997 def processCharacters(self, token):
998 if token["data"] == "\u0000":
999 # The tokenizer should always emit null on its own
1000 return
1001 self.tree.reconstructActiveFormattingElements()
1002 self.tree.insertText(token["data"])
1003 # This must be bad for performance
1004 if (self.parser.framesetOK and
1005 any([char not in spaceCharacters
1006 for char in token["data"]])):
1007 self.parser.framesetOK = False
1009 def processSpaceCharactersNonPre(self, token):
1010 self.tree.reconstructActiveFormattingElements()
1011 self.tree.insertText(token["data"])
1013 def startTagProcessInHead(self, token):
1014 return self.parser.phases["inHead"].processStartTag(token)
1016 def startTagBody(self, token):
1017 self.parser.parseError("unexpected-start-tag", {"name": "body"})
1018 if (len(self.tree.openElements) == 1 or
1019 self.tree.openElements[1].name != "body"):
1020 assert self.parser.innerHTML
1021 else:
1022 self.parser.framesetOK = False
1023 for attr, value in token["data"].items():
1024 if attr not in self.tree.openElements[1].attributes:
1025 self.tree.openElements[1].attributes[attr] = value
1027 def startTagFrameset(self, token):
1028 self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
1029 if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
1030 assert self.parser.innerHTML
1031 elif not self.parser.framesetOK:
1032 pass
1033 else:
1034 if self.tree.openElements[1].parent:
1035 self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
1036 while self.tree.openElements[-1].name != "html":
1037 self.tree.openElements.pop()
1038 self.tree.insertElement(token)
1039 self.parser.phase = self.parser.phases["inFrameset"]
1041 def startTagCloseP(self, token):
1042 if self.tree.elementInScope("p", variant="button"):
1043 self.endTagP(impliedTagToken("p"))
1044 self.tree.insertElement(token)
1046 def startTagPreListing(self, token):
1047 if self.tree.elementInScope("p", variant="button"):
1048 self.endTagP(impliedTagToken("p"))
1049 self.tree.insertElement(token)
1050 self.parser.framesetOK = False
1051 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1053 def startTagForm(self, token):
1054 if self.tree.formPointer:
1055 self.parser.parseError("unexpected-start-tag", {"name": "form"})
1056 else:
1057 if self.tree.elementInScope("p", variant="button"):
1058 self.endTagP(impliedTagToken("p"))
1059 self.tree.insertElement(token)
1060 self.tree.formPointer = self.tree.openElements[-1]
1062 def startTagListItem(self, token):
1063 self.parser.framesetOK = False
1065 stopNamesMap = {"li": ["li"],
1066 "dt": ["dt", "dd"],
1067 "dd": ["dt", "dd"]}
1068 stopNames = stopNamesMap[token["name"]]
1069 for node in reversed(self.tree.openElements):
1070 if node.name in stopNames:
1071 self.parser.phase.processEndTag(
1072 impliedTagToken(node.name, "EndTag"))
1073 break
1074 if (node.nameTuple in specialElements and
1075 node.name not in ("address", "div", "p")):
1076 break
1078 if self.tree.elementInScope("p", variant="button"):
1079 self.parser.phase.processEndTag(
1080 impliedTagToken("p", "EndTag"))
1082 self.tree.insertElement(token)
1084 def startTagPlaintext(self, token):
1085 if self.tree.elementInScope("p", variant="button"):
1086 self.endTagP(impliedTagToken("p"))
1087 self.tree.insertElement(token)
1088 self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
1090 def startTagHeading(self, token):
1091 if self.tree.elementInScope("p", variant="button"):
1092 self.endTagP(impliedTagToken("p"))
1093 if self.tree.openElements[-1].name in headingElements:
1094 self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1095 self.tree.openElements.pop()
1096 self.tree.insertElement(token)
1098 def startTagA(self, token):
1099 afeAElement = self.tree.elementInActiveFormattingElements("a")
1100 if afeAElement:
1101 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1102 {"startName": "a", "endName": "a"})
1103 self.endTagFormatting(impliedTagToken("a"))
1104 if afeAElement in self.tree.openElements:
1105 self.tree.openElements.remove(afeAElement)
1106 if afeAElement in self.tree.activeFormattingElements:
1107 self.tree.activeFormattingElements.remove(afeAElement)
1108 self.tree.reconstructActiveFormattingElements()
1109 self.addFormattingElement(token)
1111 def startTagFormatting(self, token):
1112 self.tree.reconstructActiveFormattingElements()
1113 self.addFormattingElement(token)
1115 def startTagNobr(self, token):
1116 self.tree.reconstructActiveFormattingElements()
1117 if self.tree.elementInScope("nobr"):
1118 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1119 {"startName": "nobr", "endName": "nobr"})
1120 self.processEndTag(impliedTagToken("nobr"))
1121 # XXX Need tests that trigger the following
1122 self.tree.reconstructActiveFormattingElements()
1123 self.addFormattingElement(token)
1125 def startTagButton(self, token):
1126 if self.tree.elementInScope("button"):
1127 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1128 {"startName": "button", "endName": "button"})
1129 self.processEndTag(impliedTagToken("button"))
1130 return token
1131 else:
1132 self.tree.reconstructActiveFormattingElements()
1133 self.tree.insertElement(token)
1134 self.parser.framesetOK = False
1136 def startTagAppletMarqueeObject(self, token):
1137 self.tree.reconstructActiveFormattingElements()
1138 self.tree.insertElement(token)
1139 self.tree.activeFormattingElements.append(Marker)
1140 self.parser.framesetOK = False
1142 def startTagXmp(self, token):
1143 if self.tree.elementInScope("p", variant="button"):
1144 self.endTagP(impliedTagToken("p"))
1145 self.tree.reconstructActiveFormattingElements()
1146 self.parser.framesetOK = False
1147 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1149 def startTagTable(self, token):
1150 if self.parser.compatMode != "quirks":
1151 if self.tree.elementInScope("p", variant="button"):
1152 self.processEndTag(impliedTagToken("p"))
1153 self.tree.insertElement(token)
1154 self.parser.framesetOK = False
1155 self.parser.phase = self.parser.phases["inTable"]
1157 def startTagVoidFormatting(self, token):
1158 self.tree.reconstructActiveFormattingElements()
1159 self.tree.insertElement(token)
1160 self.tree.openElements.pop()
1161 token["selfClosingAcknowledged"] = True
1162 self.parser.framesetOK = False
1164 def startTagInput(self, token):
1165 framesetOK = self.parser.framesetOK
1166 self.startTagVoidFormatting(token)
1167 if ("type" in token["data"] and
1168 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1169 # input type=hidden doesn't change framesetOK
1170 self.parser.framesetOK = framesetOK
1172 def startTagParamSource(self, token):
1173 self.tree.insertElement(token)
1174 self.tree.openElements.pop()
1175 token["selfClosingAcknowledged"] = True
1177 def startTagHr(self, token):
1178 if self.tree.elementInScope("p", variant="button"):
1179 self.endTagP(impliedTagToken("p"))
1180 self.tree.insertElement(token)
1181 self.tree.openElements.pop()
1182 token["selfClosingAcknowledged"] = True
1183 self.parser.framesetOK = False
1185 def startTagImage(self, token):
1186 # No really...
1187 self.parser.parseError("unexpected-start-tag-treated-as",
1188 {"originalName": "image", "newName": "img"})
1189 self.processStartTag(impliedTagToken("img", "StartTag",
1190 attributes=token["data"],
1191 selfClosing=token["selfClosing"]))
1193 def startTagIsIndex(self, token):
1194 self.parser.parseError("deprecated-tag", {"name": "isindex"})
1195 if self.tree.formPointer:
1196 return
1197 form_attrs = {}
1198 if "action" in token["data"]:
1199 form_attrs["action"] = token["data"]["action"]
1200 self.processStartTag(impliedTagToken("form", "StartTag",
1201 attributes=form_attrs))
1202 self.processStartTag(impliedTagToken("hr", "StartTag"))
1203 self.processStartTag(impliedTagToken("label", "StartTag"))
1204 # XXX Localization ...
1205 if "prompt" in token["data"]:
1206 prompt = token["data"]["prompt"]
1207 else:
1208 prompt = "This is a searchable index. Enter search keywords: "
1209 self.processCharacters(
1210 {"type": tokenTypes["Characters"], "data": prompt})
1211 attributes = token["data"].copy()
1212 if "action" in attributes:
1213 del attributes["action"]
1214 if "prompt" in attributes:
1215 del attributes["prompt"]
1216 attributes["name"] = "isindex"
1217 self.processStartTag(impliedTagToken("input", "StartTag",
1218 attributes=attributes,
1219 selfClosing=token["selfClosing"]))
1220 self.processEndTag(impliedTagToken("label"))
1221 self.processStartTag(impliedTagToken("hr", "StartTag"))
1222 self.processEndTag(impliedTagToken("form"))
1224 def startTagTextarea(self, token):
1225 self.tree.insertElement(token)
1226 self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
1227 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1228 self.parser.framesetOK = False
1230 def startTagIFrame(self, token):
1231 self.parser.framesetOK = False
1232 self.startTagRawtext(token)
1234 def startTagNoscript(self, token):
1235 if self.parser.scripting:
1236 self.startTagRawtext(token)
1237 else:
1238 self.startTagOther(token)
1240 def startTagRawtext(self, token):
1241 """iframe, noembed noframes, noscript(if scripting enabled)"""
1242 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1244 def startTagOpt(self, token):
1245 if self.tree.openElements[-1].name == "option":
1246 self.parser.phase.processEndTag(impliedTagToken("option"))
1247 self.tree.reconstructActiveFormattingElements()
1248 self.parser.tree.insertElement(token)
1250 def startTagSelect(self, token):
1251 self.tree.reconstructActiveFormattingElements()
1252 self.tree.insertElement(token)
1253 self.parser.framesetOK = False
1254 if self.parser.phase in (self.parser.phases["inTable"],
1255 self.parser.phases["inCaption"],
1256 self.parser.phases["inColumnGroup"],
1257 self.parser.phases["inTableBody"],
1258 self.parser.phases["inRow"],
1259 self.parser.phases["inCell"]):
1260 self.parser.phase = self.parser.phases["inSelectInTable"]
1261 else:
1262 self.parser.phase = self.parser.phases["inSelect"]
1264 def startTagRpRt(self, token):
1265 if self.tree.elementInScope("ruby"):
1266 self.tree.generateImpliedEndTags()
1267 if self.tree.openElements[-1].name != "ruby":
1268 self.parser.parseError()
1269 self.tree.insertElement(token)
1271 def startTagMath(self, token):
1272 self.tree.reconstructActiveFormattingElements()
1273 self.parser.adjustMathMLAttributes(token)
1274 self.parser.adjustForeignAttributes(token)
1275 token["namespace"] = namespaces["mathml"]
1276 self.tree.insertElement(token)
1277 # Need to get the parse error right for the case where the token
1278 # has a namespace not equal to the xmlns attribute
1279 if token["selfClosing"]:
1280 self.tree.openElements.pop()
1281 token["selfClosingAcknowledged"] = True
1283 def startTagSvg(self, token):
1284 self.tree.reconstructActiveFormattingElements()
1285 self.parser.adjustSVGAttributes(token)
1286 self.parser.adjustForeignAttributes(token)
1287 token["namespace"] = namespaces["svg"]
1288 self.tree.insertElement(token)
1289 # Need to get the parse error right for the case where the token
1290 # has a namespace not equal to the xmlns attribute
1291 if token["selfClosing"]:
1292 self.tree.openElements.pop()
1293 token["selfClosingAcknowledged"] = True
1295 def startTagMisplaced(self, token):
1296 """ Elements that should be children of other elements that have a
1297 different insertion mode; here they are ignored
1298 "caption", "col", "colgroup", "frame", "frameset", "head",
1299 "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
1300 "tr", "noscript"
1301 """
1302 self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
1304 def startTagOther(self, token):
1305 self.tree.reconstructActiveFormattingElements()
1306 self.tree.insertElement(token)
1308 def endTagP(self, token):
1309 if not self.tree.elementInScope("p", variant="button"):
1310 self.startTagCloseP(impliedTagToken("p", "StartTag"))
1311 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1312 self.endTagP(impliedTagToken("p", "EndTag"))
1313 else:
1314 self.tree.generateImpliedEndTags("p")
1315 if self.tree.openElements[-1].name != "p":
1316 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1317 node = self.tree.openElements.pop()
1318 while node.name != "p":
1319 node = self.tree.openElements.pop()
1321 def endTagBody(self, token):
1322 if not self.tree.elementInScope("body"):
1323 self.parser.parseError()
1324 return
1325 elif self.tree.openElements[-1].name != "body":
1326 for node in self.tree.openElements[2:]:
1327 if node.name not in frozenset(("dd", "dt", "li", "optgroup",
1328 "option", "p", "rp", "rt",
1329 "tbody", "td", "tfoot",
1330 "th", "thead", "tr", "body",
1331 "html")):
1332 # Not sure this is the correct name for the parse error
1333 self.parser.parseError(
1334 "expected-one-end-tag-but-got-another",
1335 {"gotName": "body", "expectedName": node.name})
1336 break
1337 self.parser.phase = self.parser.phases["afterBody"]
1339 def endTagHtml(self, token):
1340 # We repeat the test for the body end tag token being ignored here
1341 if self.tree.elementInScope("body"):
1342 self.endTagBody(impliedTagToken("body"))
1343 return token
1345 def endTagBlock(self, token):
1346 # Put us back in the right whitespace handling mode
1347 if token["name"] == "pre":
1348 self.processSpaceCharacters = self.processSpaceCharactersNonPre
1349 inScope = self.tree.elementInScope(token["name"])
1350 if inScope:
1351 self.tree.generateImpliedEndTags()
1352 if self.tree.openElements[-1].name != token["name"]:
1353 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1354 if inScope:
1355 node = self.tree.openElements.pop()
1356 while node.name != token["name"]:
1357 node = self.tree.openElements.pop()
1359 def endTagForm(self, token):
1360 node = self.tree.formPointer
1361 self.tree.formPointer = None
1362 if node is None or not self.tree.elementInScope(node):
1363 self.parser.parseError("unexpected-end-tag",
1364 {"name": "form"})
1365 else:
1366 self.tree.generateImpliedEndTags()
1367 if self.tree.openElements[-1] != node:
1368 self.parser.parseError("end-tag-too-early-ignored",
1369 {"name": "form"})
1370 self.tree.openElements.remove(node)
1372 def endTagListItem(self, token):
1373 if token["name"] == "li":
1374 variant = "list"
1375 else:
1376 variant = None
1377 if not self.tree.elementInScope(token["name"], variant=variant):
1378 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1379 else:
1380 self.tree.generateImpliedEndTags(exclude=token["name"])
1381 if self.tree.openElements[-1].name != token["name"]:
1382 self.parser.parseError(
1383 "end-tag-too-early",
1384 {"name": token["name"]})
1385 node = self.tree.openElements.pop()
1386 while node.name != token["name"]:
1387 node = self.tree.openElements.pop()
1389 def endTagHeading(self, token):
1390 for item in headingElements:
1391 if self.tree.elementInScope(item):
1392 self.tree.generateImpliedEndTags()
1393 break
1394 if self.tree.openElements[-1].name != token["name"]:
1395 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1397 for item in headingElements:
1398 if self.tree.elementInScope(item):
1399 item = self.tree.openElements.pop()
1400 while item.name not in headingElements:
1401 item = self.tree.openElements.pop()
1402 break
1404 def endTagFormatting(self, token):
1405 """The much-feared adoption agency algorithm"""
1406 # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
1407 # XXX Better parseError messages appreciated.
1409 # Step 1
1410 outerLoopCounter = 0
1412 # Step 2
1413 while outerLoopCounter < 8:
1415 # Step 3
1416 outerLoopCounter += 1
1418 # Step 4:
1420 # Let the formatting element be the last element in
1421 # the list of active formatting elements that:
1422 # - is between the end of the list and the last scope
1423 # marker in the list, if any, or the start of the list
1424 # otherwise, and
1425 # - has the same tag name as the token.
1426 formattingElement = self.tree.elementInActiveFormattingElements(
1427 token["name"])
1428 if (not formattingElement or
1429 (formattingElement in self.tree.openElements and
1430 not self.tree.elementInScope(formattingElement.name))):
1431 # If there is no such node, then abort these steps
1432 # and instead act as described in the "any other
1433 # end tag" entry below.
1434 self.endTagOther(token)
1435 return
1437 # Otherwise, if there is such a node, but that node is
1438 # not in the stack of open elements, then this is a
1439 # parse error; remove the element from the list, and
1440 # abort these steps.
1441 elif formattingElement not in self.tree.openElements:
1442 self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
1443 self.tree.activeFormattingElements.remove(formattingElement)
1444 return
1446 # Otherwise, if there is such a node, and that node is
1447 # also in the stack of open elements, but the element
1448 # is not in scope, then this is a parse error; ignore
1449 # the token, and abort these steps.
1450 elif not self.tree.elementInScope(formattingElement.name):
1451 self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
1452 return
1454 # Otherwise, there is a formatting element and that
1455 # element is in the stack and is in scope. If the
1456 # element is not the current node, this is a parse
1457 # error. In any case, proceed with the algorithm as
1458 # written in the following steps.
1459 else:
1460 if formattingElement != self.tree.openElements[-1]:
1461 self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
1463 # Step 5:
1465 # Let the furthest block be the topmost node in the
1466 # stack of open elements that is lower in the stack
1467 # than the formatting element, and is an element in
1468 # the special category. There might not be one.
1469 afeIndex = self.tree.openElements.index(formattingElement)
1470 furthestBlock = None
1471 for element in self.tree.openElements[afeIndex:]:
1472 if element.nameTuple in specialElements:
1473 furthestBlock = element
1474 break
1476 # Step 6:
1478 # If there is no furthest block, then the UA must
1479 # first pop all the nodes from the bottom of the stack
1480 # of open elements, from the current node up to and
1481 # including the formatting element, then remove the
1482 # formatting element from the list of active
1483 # formatting elements, and finally abort these steps.
1484 if furthestBlock is None:
1485 element = self.tree.openElements.pop()
1486 while element != formattingElement:
1487 element = self.tree.openElements.pop()
1488 self.tree.activeFormattingElements.remove(element)
1489 return
1491 # Step 7
1492 commonAncestor = self.tree.openElements[afeIndex - 1]
1494 # Step 8:
1495 # The bookmark is supposed to help us identify where to reinsert
1496 # nodes in step 15. We have to ensure that we reinsert nodes after
1497 # the node before the active formatting element. Note the bookmark
1498 # can move in step 9.7
1499 bookmark = self.tree.activeFormattingElements.index(formattingElement)
1501 # Step 9
1502 lastNode = node = furthestBlock
1503 innerLoopCounter = 0
1505 index = self.tree.openElements.index(node)
1506 while innerLoopCounter < 3:
1507 innerLoopCounter += 1
1508 # Node is element before node in open elements
1509 index -= 1
1510 node = self.tree.openElements[index]
1511 if node not in self.tree.activeFormattingElements:
1512 self.tree.openElements.remove(node)
1513 continue
1514 # Step 9.6
1515 if node == formattingElement:
1516 break
1517 # Step 9.7
1518 if lastNode == furthestBlock:
1519 bookmark = self.tree.activeFormattingElements.index(node) + 1
1520 # Step 9.8
1521 clone = node.cloneNode()
1522 # Replace node with clone
1523 self.tree.activeFormattingElements[
1524 self.tree.activeFormattingElements.index(node)] = clone
1525 self.tree.openElements[
1526 self.tree.openElements.index(node)] = clone
1527 node = clone
1528 # Step 9.9
1529 # Remove lastNode from its parents, if any
1530 if lastNode.parent:
1531 lastNode.parent.removeChild(lastNode)
1532 node.appendChild(lastNode)
1533 # Step 9.10
1534 lastNode = node
1536 # Step 10
1537 # Foster parent lastNode if commonAncestor is a
1538 # table, tbody, tfoot, thead, or tr we need to foster
1539 # parent the lastNode
1540 if lastNode.parent:
1541 lastNode.parent.removeChild(lastNode)
1543 if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
1544 parent, insertBefore = self.tree.getTableMisnestedNodePosition()
1545 parent.insertBefore(lastNode, insertBefore)
1546 else:
1547 commonAncestor.appendChild(lastNode)
1549 # Step 11
1550 clone = formattingElement.cloneNode()
1552 # Step 12
1553 furthestBlock.reparentChildren(clone)
1555 # Step 13
1556 furthestBlock.appendChild(clone)
1558 # Step 14
1559 self.tree.activeFormattingElements.remove(formattingElement)
1560 self.tree.activeFormattingElements.insert(bookmark, clone)
1562 # Step 15
1563 self.tree.openElements.remove(formattingElement)
1564 self.tree.openElements.insert(
1565 self.tree.openElements.index(furthestBlock) + 1, clone)
1567 def endTagAppletMarqueeObject(self, token):
1568 if self.tree.elementInScope(token["name"]):
1569 self.tree.generateImpliedEndTags()
1570 if self.tree.openElements[-1].name != token["name"]:
1571 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1573 if self.tree.elementInScope(token["name"]):
1574 element = self.tree.openElements.pop()
1575 while element.name != token["name"]:
1576 element = self.tree.openElements.pop()
1577 self.tree.clearActiveFormattingElements()
1579 def endTagBr(self, token):
1580 self.parser.parseError("unexpected-end-tag-treated-as",
1581 {"originalName": "br", "newName": "br element"})
1582 self.tree.reconstructActiveFormattingElements()
1583 self.tree.insertElement(impliedTagToken("br", "StartTag"))
1584 self.tree.openElements.pop()
1586 def endTagOther(self, token):
1587 for node in self.tree.openElements[::-1]:
1588 if node.name == token["name"]:
1589 self.tree.generateImpliedEndTags(exclude=token["name"])
1590 if self.tree.openElements[-1].name != token["name"]:
1591 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1592 while self.tree.openElements.pop() != node:
1593 pass
1594 break
1595 else:
1596 if node.nameTuple in specialElements:
1597 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1598 break
1600 startTagHandler = _utils.MethodDispatcher([
1601 ("html", Phase.startTagHtml),
1602 (("base", "basefont", "bgsound", "command", "link", "meta",
1603 "script", "style", "title"),
1604 startTagProcessInHead),
1605 ("body", startTagBody),
1606 ("frameset", startTagFrameset),
1607 (("address", "article", "aside", "blockquote", "center", "details",
1608 "dir", "div", "dl", "fieldset", "figcaption", "figure",
1609 "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
1610 "section", "summary", "ul"),
1611 startTagCloseP),
1612 (headingElements, startTagHeading),
1613 (("pre", "listing"), startTagPreListing),
1614 ("form", startTagForm),
1615 (("li", "dd", "dt"), startTagListItem),
1616 ("plaintext", startTagPlaintext),
1617 ("a", startTagA),
1618 (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
1619 "strong", "tt", "u"), startTagFormatting),
1620 ("nobr", startTagNobr),
1621 ("button", startTagButton),
1622 (("applet", "marquee", "object"), startTagAppletMarqueeObject),
1623 ("xmp", startTagXmp),
1624 ("table", startTagTable),
1625 (("area", "br", "embed", "img", "keygen", "wbr"),
1626 startTagVoidFormatting),
1627 (("param", "source", "track"), startTagParamSource),
1628 ("input", startTagInput),
1629 ("hr", startTagHr),
1630 ("image", startTagImage),
1631 ("isindex", startTagIsIndex),
1632 ("textarea", startTagTextarea),
1633 ("iframe", startTagIFrame),
1634 ("noscript", startTagNoscript),
1635 (("noembed", "noframes"), startTagRawtext),
1636 ("select", startTagSelect),
1637 (("rp", "rt"), startTagRpRt),
1638 (("option", "optgroup"), startTagOpt),
1639 (("math"), startTagMath),
1640 (("svg"), startTagSvg),
1641 (("caption", "col", "colgroup", "frame", "head",
1642 "tbody", "td", "tfoot", "th", "thead",
1643 "tr"), startTagMisplaced)
1644 ])
1645 startTagHandler.default = startTagOther
1647 endTagHandler = _utils.MethodDispatcher([
1648 ("body", endTagBody),
1649 ("html", endTagHtml),
1650 (("address", "article", "aside", "blockquote", "button", "center",
1651 "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
1652 "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
1653 "section", "summary", "ul"), endTagBlock),
1654 ("form", endTagForm),
1655 ("p", endTagP),
1656 (("dd", "dt", "li"), endTagListItem),
1657 (headingElements, endTagHeading),
1658 (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
1659 "strike", "strong", "tt", "u"), endTagFormatting),
1660 (("applet", "marquee", "object"), endTagAppletMarqueeObject),
1661 ("br", endTagBr),
1662 ])
1663 endTagHandler.default = endTagOther
class TextPhase(Phase):
    """Token handlers used while parsing RCDATA/RAWTEXT element content.

    Every exit path pops the current node and hands control back to
    ``parser.originalPhase``.
    """
    __slots__ = tuple()

    def processCharacters(self, token):
        """Append the character data to the tree as text."""
        self.tree.insertText(token["data"])

    def processEOF(self):
        """Handle EOF inside the element.

        Records a parse error naming the unclosed element, pops it and
        restores the original phase.

        :returns: ``True``
        """
        unclosed_name = self.tree.openElements[-1].name
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosed_name})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """No start tag is expected in this phase; fail the assertion."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Close the ``script`` element and restore the original phase."""
        closed = self.tree.openElements.pop()
        assert closed.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Close the current element and restore the original phase."""
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase

    # No start tag has a dedicated handler; all of them hit the assertion.
    startTagHandler = _utils.MethodDispatcher([])
    startTagHandler.default = startTagOther
    endTagHandler = _utils.MethodDispatcher([
        ("script", endTagScript),
    ])
    endTagHandler.default = endTagOther
class InTablePhase(Phase):
    """Token handlers for the "in table" insertion mode.

    Content that does not belong directly inside a table is delegated to
    the "inBody" phase with ``tree.insertFromTable`` set for the duration
    of the call.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table
    __slots__ = tuple()

    # helper methods
    def clearStackToTableContext(self):
        """Pop open elements until the current node is ``table`` or ``html``."""
        # "clear the stack back to a table context"
        while self.tree.openElements[-1].name not in ("table", "html"):
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name":  self.tree.openElements[-1].name})
            self.tree.openElements.pop()
        # When the current node is <html> it's an innerHTML case

    # processing methods
    def processEOF(self):
        """Report "eof-in-table" unless the current node is ``html``."""
        if self.tree.openElements[-1].name != "html":
            self.parser.parseError("eof-in-table")
        else:
            assert self.parser.innerHTML
        # Stop parsing

    def processSpaceCharacters(self, token):
        """Switch to the "inTableText" phase (remembering the current phase
        on it) and forward the whitespace token there."""
        originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["inTableText"]
        self.parser.phase.originalPhase = originalPhase
        self.parser.phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Switch to the "inTableText" phase (remembering the current phase
        on it) and forward the character token there."""
        originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["inTableText"]
        self.parser.phase.originalPhase = originalPhase
        self.parser.phase.processCharacters(token)

    def insertText(self, token):
        """Insert non-whitespace text via the "inBody" phase with
        ``insertFromTable`` set."""
        # If we get here there must be at least one non-whitespace character
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            self.parser.phases["inBody"].processCharacters(token)
        finally:
            # Never leave the flag set if the in-body handler raises.
            self.tree.insertFromTable = False

    def startTagCaption(self, token):
        """Insert ``caption`` (after pushing a ``Marker``) and enter
        "inCaption"."""
        self.clearStackToTableContext()
        self.tree.activeFormattingElements.append(Marker)
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCaption"]

    def startTagColgroup(self, token):
        """Insert ``colgroup`` and enter "inColumnGroup"."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inColumnGroup"]

    def startTagCol(self, token):
        """Open an implied ``colgroup`` and return the ``col`` token."""
        self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
        return token

    def startTagRowGroup(self, token):
        """Insert ``tbody``/``tfoot``/``thead`` and enter "inTableBody"."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inTableBody"]

    def startTagImplyTbody(self, token):
        """Open an implied ``tbody`` and return the ``td``/``th``/``tr`` token."""
        self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
        return token

    def startTagTable(self, token):
        """Handle a nested ``<table>``: parse error, close the current table
        and return the token unless parsing a fragment (innerHTML)."""
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "table", "endName": "table"})
        self.parser.phase.processEndTag(impliedTagToken("table"))
        if not self.parser.innerHTML:
            return token

    def startTagStyleScript(self, token):
        """Delegate ``style``/``script`` to the "inHead" phase."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagInput(self, token):
        """Handle ``<input>``: a ``type=hidden`` input (compared ASCII
        case-insensitively) is inserted and popped with a parse error; any
        other input goes to ``startTagOther``."""
        if ("type" in token["data"] and
                token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
            self.parser.parseError("unexpected-hidden-input-in-table")
            self.tree.insertElement(token)
            # XXX associate with form
            self.tree.openElements.pop()
        else:
            self.startTagOther(token)

    def startTagForm(self, token):
        """Handle ``<form>`` in a table: always a parse error; when no form
        pointer is set, the form is inserted, recorded as ``formPointer``
        and popped immediately."""
        self.parser.parseError("unexpected-form-in-table")
        if self.tree.formPointer is None:
            self.tree.insertElement(token)
            self.tree.formPointer = self.tree.openElements[-1]
            self.tree.openElements.pop()

    def startTagOther(self, token):
        """Default start-tag handler: parse error, then process the token
        via the "inBody" phase with ``insertFromTable`` set."""
        self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            self.parser.phases["inBody"].processStartTag(token)
        finally:
            # Never leave the flag set if the in-body handler raises.
            self.tree.insertFromTable = False

    def endTagTable(self, token):
        """Close the table and reset the insertion mode.

        When no ``table`` is in table scope (expected only for innerHTML) a
        parse error is recorded (no specific error code is passed) and the
        token is ignored.
        """
        if self.tree.elementInScope("table", variant="table"):
            self.tree.generateImpliedEndTags()
            if self.tree.openElements[-1].name != "table":
                self.parser.parseError("end-tag-too-early-named",
                                       {"gotName": "table",
                                        "expectedName": self.tree.openElements[-1].name})
            while self.tree.openElements[-1].name != "table":
                self.tree.openElements.pop()
            self.tree.openElements.pop()
            self.parser.resetInsertionMode()
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagIgnore(self, token):
        """Report the end tag as unexpected and ignore it."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        """Default end-tag handler: parse error, then process the token via
        the "inBody" phase with ``insertFromTable`` set."""
        self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            self.parser.phases["inBody"].processEndTag(token)
        finally:
            # Never leave the flag set if the in-body handler raises.
            self.tree.insertFromTable = False

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("caption", startTagCaption),
        ("colgroup", startTagColgroup),
        ("col", startTagCol),
        (("tbody", "tfoot", "thead"), startTagRowGroup),
        (("td", "th", "tr"), startTagImplyTbody),
        ("table", startTagTable),
        (("style", "script"), startTagStyleScript),
        ("input", startTagInput),
        ("form", startTagForm)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "tbody", "td",
          "tfoot", "th", "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
class InTableTextPhase(Phase):
    """Phase that buffers character tokens (registered as "inTableText").

    Character tokens are collected in ``characterTokens``.  When a comment,
    start tag, end tag or EOF arrives, the buffered text is flushed, the
    parser is switched back to ``originalPhase`` and the triggering token
    (or ``True`` for EOF) is returned to the caller.
    """
    __slots__ = ("originalPhase", "characterTokens")

    def __init__(self, *args, **kwargs):
        super(InTableTextPhase, self).__init__(*args, **kwargs)
        # Phase to return to after flushing; presumably assigned by the
        # code that switches the parser into this phase.
        self.originalPhase = None
        # Character tokens buffered since the last flush.
        self.characterTokens = []

    def flushCharacters(self):
        """Emit the buffered text and empty the buffer.

        Text containing any non-space character is sent through the
        "inTable" phase's ``insertText``; text made only of space
        characters is inserted directly into the tree; empty text is
        dropped.
        """
        data = "".join([item["data"] for item in self.characterTokens])
        # Generator expression lets any() stop at the first non-space
        # character instead of building a full list first.
        if any(char not in spaceCharacters for char in data):
            token = {"type": tokenTypes["Characters"], "data": data}
            self.parser.phases["inTable"].insertText(token)
        elif data:
            self.tree.insertText(data)
        self.characterTokens = []

    def processComment(self, token):
        """Flush, restore the original phase and return the token."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

    def processEOF(self):
        """Flush, restore the original phase and return ``True``."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return True

    def processCharacters(self, token):
        """Buffer a character token; a lone U+0000 token is dropped."""
        if token["data"] == "\u0000":
            return
        self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        """Buffer a space-character token."""
        # pretty sure we should never reach here
        self.characterTokens.append(token)
        # assert False

    def processStartTag(self, token):
        """Flush, restore the original phase and return the token."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

    def processEndTag(self, token):
        """Flush, restore the original phase and return the token."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token
class InCaptionPhase(Phase):
    """Token handlers for the phase registered as "inCaption"."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    __slots__ = ()

    def ignoreEndTagCaption(self):
        """Return True when no "caption" element is in table scope."""
        caption_open = self.tree.elementInScope("caption", variant="table")
        return not caption_open

    def processEOF(self):
        """Let the "inBody" phase handle end of input (result discarded)."""
        body_phase = self.parser.phases["inBody"]
        body_phase.processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processCharacters(token)

    def startTagTableElement(self, token):
        """Imply </caption>, then return the token unless that was ignored."""
        self.parser.parseError()
        # XXX Have to duplicate logic here to find out if the tag is ignored.
        # The check must run before the implied end tag is processed.
        caption_closable = not self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        if caption_closable:
            return token
        return None

    def startTagOther(self, token):
        """Delegate all other start tags to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def endTagCaption(self, token):
        """Close the open caption and switch to the "inTable" phase."""
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return

        # AT this code is quite similar to endTagTable in "InTable"
        tree = self.tree
        tree.generateImpliedEndTags()
        current_name = tree.openElements[-1].name
        if current_name != "caption":
            self.parser.parseError("expected-one-end-tag-but-got-another",
                                   {"gotName": "caption",
                                    "expectedName": current_name})
        # Pop until the caption element itself has been removed.
        while tree.openElements.pop().name != "caption":
            pass
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Imply </caption>, then return the token unless that was ignored."""
        self.parser.parseError()
        caption_closable = not self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        if caption_closable:
            return token
        return None

    def endTagIgnore(self, token):
        """Report an "unexpected-end-tag" parse error."""
        tag_name = token["name"]
        self.parser.parseError("unexpected-end-tag", {"name": tag_name})

    def endTagOther(self, token):
        """Delegate all other end tags to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableElement)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("caption", endTagCaption),
        ("table", endTagTable),
        (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
          "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
class InColumnGroupPhase(Phase):
    """Token handlers for the phase registered as "inColumnGroup"."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column
    __slots__ = ()

    def ignoreEndTagColgroup(self):
        """Return True when the current node is the "html" element."""
        current = self.tree.openElements[-1]
        return current.name == "html"

    def processEOF(self):
        """Handle EOF: imply </colgroup> and return True if it took effect."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return None
        skipped = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if skipped else True

    def processCharacters(self, token):
        """Imply </colgroup>; return the token unless that was ignored."""
        skipped = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if skipped else token

    def startTagCol(self, token):
        """Insert a <col>, pop it at once and acknowledge self-closing."""
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        """Imply </colgroup>; return the token unless that was ignored."""
        skipped = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if skipped else token

    def endTagColgroup(self, token):
        """Pop the current node and switch to the "inTable" phase."""
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
            return
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def endTagCol(self, token):
        """Report that "col" has no end tag."""
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        """Imply </colgroup>; return the token unless that was ignored."""
        skipped = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if skipped else token

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("col", startTagCol)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("colgroup", endTagColgroup),
        ("col", endTagCol)
    ])
    endTagHandler.default = endTagOther
class InTableBodyPhase(Phase):
    """Token handlers for the phase registered as "inTableBody"."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    __slots__ = ()

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until tbody/tfoot/thead/html is the current node."""
        boundary = ("tbody", "tfoot", "thead", "html")
        tree = self.tree
        while tree.openElements[-1].name not in boundary:
            tree.openElements.pop()
        if tree.openElements[-1].name == "html":
            assert self.parser.innerHTML

    # the rest
    def processEOF(self):
        """Let the "inTable" phase handle end of input (result discarded)."""
        table_phase = self.parser.phases["inTable"]
        table_phase.processEOF()

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inTable" phase."""
        table_phase = self.parser.phases["inTable"]
        return table_phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character tokens to the "inTable" phase."""
        table_phase = self.parser.phases["inTable"]
        return table_phase.processCharacters(token)

    def startTagTr(self, token):
        """Insert a <tr> and switch to the "inRow" phase."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        """Imply a <tr> for a stray cell, then return the cell token."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-cell-in-table-body", details)
        self.startTagTr(impliedTagToken("tr", "StartTag"))
        return token

    def startTagTableOther(self, token):
        """Close the open row group and return the token."""
        # XXX AT Any ideas on how to share this with endTagTable?
        tree = self.tree
        group_open = any(tree.elementInScope(group, variant="table")
                         for group in ("tbody", "thead", "tfoot"))
        if not group_open:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        current_name = tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(current_name))
        return token

    def startTagOther(self, token):
        """Delegate all other start tags to the "inTable" phase."""
        table_phase = self.parser.phases["inTable"]
        return table_phase.processStartTag(token)

    def endTagTableRowGroup(self, token):
        """Close the named row group and switch to the "inTable" phase."""
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": token["name"]})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Close the open row group and return the </table> token."""
        tree = self.tree
        group_open = any(tree.elementInScope(group, variant="table")
                         for group in ("tbody", "thead", "tfoot"))
        if not group_open:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        current_name = tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(current_name))
        return token

    def endTagIgnore(self, token):
        """Report an "unexpected-end-tag-in-table-body" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-body", details)

    def endTagOther(self, token):
        """Delegate all other end tags to the "inTable" phase."""
        table_phase = self.parser.phases["inTable"]
        return table_phase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("tr", startTagTr),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
         startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "td", "th",
          "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
class InRowPhase(Phase):
    """Token handlers for the phase registered as "inRow"."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    __slots__ = ()

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until "tr" or "html" is the current node.

        A parse error is reported for every element that gets popped.
        """
        tree = self.tree
        while tree.openElements[-1].name not in ("tr", "html"):
            popped_name = tree.openElements[-1].name
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": popped_name})
            tree.openElements.pop()

    def ignoreEndTagTr(self):
        """Return True when no "tr" element is in table scope."""
        row_open = self.tree.elementInScope("tr", variant="table")
        return not row_open

    # the rest
    def processEOF(self):
        """Let the "inTable" phase handle end of input (result discarded)."""
        table_phase = self.parser.phases["inTable"]
        table_phase.processEOF()

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inTable" phase."""
        table_phase = self.parser.phases["inTable"]
        return table_phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character tokens to the "inTable" phase."""
        table_phase = self.parser.phases["inTable"]
        return table_phase.processCharacters(token)

    def startTagTableCell(self, token):
        """Insert a cell, switch to "inCell" and push a formatting Marker."""
        self.clearStackToTableRowContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        self.tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        """Imply </tr>; return the token unless that was ignored."""
        row_closable = not self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # XXX how are we sure it's always ignored in the innerHTML case?
        if row_closable:
            return token
        return None

    def startTagOther(self, token):
        """Delegate all other start tags to the "inTable" phase."""
        table_phase = self.parser.phases["inTable"]
        return table_phase.processStartTag(token)

    def endTagTr(self, token):
        """Close the current row and switch to the "inTableBody" phase."""
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, token):
        """Imply </tr>; return the token unless that was ignored."""
        row_closable = not self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        if row_closable:
            return token
        return None

    def endTagTableRowGroup(self, token):
        """Imply </tr> and return the token if the row group is in scope."""
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return None
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        """Report an "unexpected-end-tag-in-table-row" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-row", details)

    def endTagOther(self, token):
        """Delegate all other end tags to the "inTable" phase."""
        table_phase = self.parser.phases["inTable"]
        return table_phase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
          "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("tr", endTagTr),
        ("table", endTagTable),
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        (("body", "caption", "col", "colgroup", "html", "td", "th"),
         endTagIgnore)
    ])
    endTagHandler.default = endTagOther
class InCellPhase(Phase):
    """Token handlers for the phase registered as "inCell"."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    __slots__ = ()

    # helper
    def closeCell(self):
        """Close the open cell: "td" if in table scope, otherwise "th"."""
        for cell_name in ("td", "th"):
            if self.tree.elementInScope(cell_name, variant="table"):
                self.endTagTableCell(impliedTagToken(cell_name))
                break

    # the rest
    def processEOF(self):
        """Let the "inBody" phase handle end of input (result discarded)."""
        body_phase = self.parser.phases["inBody"]
        body_phase.processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processCharacters(token)

    def startTagTableOther(self, token):
        """Close the open cell and return the token."""
        tree = self.tree
        cell_open = (tree.elementInScope("td", variant="table") or
                     tree.elementInScope("th", variant="table"))
        if not cell_open:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def startTagOther(self, token):
        """Delegate all other start tags to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell and switch to the "inRow" phase."""
        cell_name = token["name"]
        tree = self.tree
        if not tree.elementInScope(cell_name, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": cell_name})
            return

        tree.generateImpliedEndTags(cell_name)
        if tree.openElements[-1].name == cell_name:
            tree.openElements.pop()
        else:
            self.parser.parseError("unexpected-cell-end-tag",
                                   {"name": cell_name})
            # Pop until the cell element itself has been removed.
            while tree.openElements.pop().name != cell_name:
                pass
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        """Report an "unexpected-end-tag" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", details)

    def endTagImply(self, token):
        """Close the cell and return the token if its element is in scope."""
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def endTagOther(self, token):
        """Delegate all other end tags to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("td", "th"), endTagTableCell),
        (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
        (("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
    ])
    endTagHandler.default = endTagOther
class InSelectPhase(Phase):
    """Token handlers for the phase registered as "inSelect"."""
    __slots__ = ()

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        """Report "eof-in-select" unless the current node is "html"."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        """Insert the text, dropping a lone U+0000 token."""
        text = token["data"]
        if text != "\u0000":
            self.tree.insertText(text)

    def startTagOption(self, token):
        """Insert an <option>, closing a currently open one first."""
        # We need to imply </option> if <option> is the current node.
        tree = self.tree
        if tree.openElements[-1].name == "option":
            tree.openElements.pop()
        tree.insertElement(token)

    def startTagOptgroup(self, token):
        """Insert an <optgroup>, closing an open option and optgroup first."""
        tree = self.tree
        # Checked in this order: first a current option, then a current
        # optgroup.
        for implied in ("option", "optgroup"):
            if tree.openElements[-1].name == implied:
                tree.openElements.pop()
        tree.insertElement(token)

    def startTagSelect(self, token):
        """Treat a nested <select> as </select> after a parse error."""
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        """Imply </select> and return the token if a select is in scope."""
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return None
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        """Delegate <script> to the "inHead" phase."""
        head_phase = self.parser.phases["inHead"]
        return head_phase.processStartTag(token)

    def startTagOther(self, token):
        """Report an "unexpected-start-tag-in-select" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-select", details)

    def endTagOption(self, token):
        """Pop the current node if it is an option, else report an error."""
        tree = self.tree
        if tree.openElements[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
            return
        tree.openElements.pop()

    def endTagOptgroup(self, token):
        """Close the current optgroup, and an option directly inside it."""
        stack = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if stack[-1].name == "option" and stack[-2].name == "optgroup":
            stack.pop()
        # It also closes </optgroup>
        if stack[-1].name == "optgroup":
            stack.pop()
        # But nothing else
        else:
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})

    def endTagSelect(self, token):
        """Pop through the select element and reset the insertion mode."""
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        # Pop until the select element itself has been removed.
        while self.tree.openElements.pop().name != "select":
            pass
        self.parser.resetInsertionMode()

    def endTagOther(self, token):
        """Report an "unexpected-end-tag-in-select" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-select", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("option", startTagOption),
        ("optgroup", startTagOptgroup),
        ("select", startTagSelect),
        (("input", "keygen", "textarea"), startTagInput),
        ("script", startTagScript)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("option", endTagOption),
        ("optgroup", endTagOptgroup),
        ("select", endTagSelect)
    ])
    endTagHandler.default = endTagOther
class InSelectInTablePhase(Phase):
    """Token handlers for the phase registered as "inSelectInTable"."""
    __slots__ = ()

    def processEOF(self):
        """Let the "inSelect" phase handle end of input (result discarded)."""
        select_phase = self.parser.phases["inSelect"]
        select_phase.processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inSelect" phase."""
        select_phase = self.parser.phases["inSelect"]
        return select_phase.processCharacters(token)

    def startTagTable(self, token):
        """Imply </select> for a table-related start tag; return the token."""
        details = {"name": token["name"]}
        self.parser.parseError(
            "unexpected-table-element-start-tag-in-select-in-table", details)
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        """Delegate all other start tags to the "inSelect" phase."""
        select_phase = self.parser.phases["inSelect"]
        return select_phase.processStartTag(token)

    def endTagTable(self, token):
        """Imply </select> and return the token if its element is in scope."""
        tag_name = token["name"]
        self.parser.parseError(
            "unexpected-table-element-end-tag-in-select-in-table",
            {"name": tag_name})
        if not self.tree.elementInScope(tag_name, variant="table"):
            return None
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        """Delegate all other end tags to the "inSelect" phase."""
        select_phase = self.parser.phases["inSelect"]
        return select_phase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         startTagTable)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         endTagTable)
    ])
    endTagHandler.default = endTagOther
class InForeignContentPhase(Phase):
    """Token handlers for the phase registered as "inForeignContent"."""
    __slots__ = tuple()

    # Start tags that are reported as unexpected HTML in foreign content
    # and cause non-HTML elements to be popped (see processStartTag).
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # Lower-case SVG tag name -> mixed-case spelling.  Kept at class level
    # so the mapping is built once instead of on every start tag.
    svgTagNameReplacements = {"altglyph": "altGlyph",
                              "altglyphdef": "altGlyphDef",
                              "altglyphitem": "altGlyphItem",
                              "animatecolor": "animateColor",
                              "animatemotion": "animateMotion",
                              "animatetransform": "animateTransform",
                              "clippath": "clipPath",
                              "feblend": "feBlend",
                              "fecolormatrix": "feColorMatrix",
                              "fecomponenttransfer": "feComponentTransfer",
                              "fecomposite": "feComposite",
                              "feconvolvematrix": "feConvolveMatrix",
                              "fediffuselighting": "feDiffuseLighting",
                              "fedisplacementmap": "feDisplacementMap",
                              "fedistantlight": "feDistantLight",
                              "feflood": "feFlood",
                              "fefunca": "feFuncA",
                              "fefuncb": "feFuncB",
                              "fefuncg": "feFuncG",
                              "fefuncr": "feFuncR",
                              "fegaussianblur": "feGaussianBlur",
                              "feimage": "feImage",
                              "femerge": "feMerge",
                              "femergenode": "feMergeNode",
                              "femorphology": "feMorphology",
                              "feoffset": "feOffset",
                              "fepointlight": "fePointLight",
                              "fespecularlighting": "feSpecularLighting",
                              "fespotlight": "feSpotLight",
                              "fetile": "feTile",
                              "feturbulence": "feTurbulence",
                              "foreignobject": "foreignObject",
                              "glyphref": "glyphRef",
                              "lineargradient": "linearGradient",
                              "radialgradient": "radialGradient",
                              "textpath": "textPath"}

    def adjustSVGTagNames(self, token):
        """Rewrite ``token["name"]`` in place to its mixed-case SVG spelling."""
        replacements = self.svgTagNameReplacements
        if token["name"] in replacements:
            token["name"] = replacements[token["name"]]

    def processCharacters(self, token):
        """Handle character data, then defer to ``Phase.processCharacters``.

        A lone U+0000 token is replaced with U+FFFD; otherwise any
        non-space character clears ``parser.framesetOK``.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Handle a start tag while the current node is foreign content.

        Breakout elements (and <font> carrying a color, face or size
        attribute) trigger a parse error, pop open elements until an
        element in the default namespace or an integration point is
        current, and return the token.  Any other tag is adjusted for
        MathML/SVG, inserted in the current node's namespace, and
        popped again at once if it is self-closing.
        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
            (token["name"] == "font" and
             # True when at least one of the three attributes is present.
             not {"color", "face", "size"}.isdisjoint(token["data"]))):
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        else:
            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Handle an end tag while the current node is foreign content.

        Walks down the stack of open elements from the current node.  If a
        node whose lower-cased name equals the tag name is found, everything
        up to and including it is popped and ``None`` is returned.  If an
        element in the default namespace is reached first, the tag is
        passed to the parser's current phase and that result is returned.
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token
class AfterBodyPhase(Phase):
    """Token handlers for the phase registered as "afterBody"."""
    __slots__ = ()

    def processEOF(self):
        """Do nothing at end of input."""
        # Stop parsing
        return None

    def processComment(self, token):
        """Attach the comment to the first open element."""
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        root = self.tree.openElements[0]
        self.tree.insertComment(token, root)

    def processCharacters(self, token):
        """Report an error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("unexpected-char-after-body")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate <html> to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def startTagOther(self, token):
        """Report an error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("unexpected-start-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    def endTagHtml(self, name):
        """Handle </html>: move to "afterAfterBody" unless in innerHTML mode.

        The parameter is called ``name`` here, unlike the other handlers
        in this class, which call it ``token``.
        """
        parser = self.parser
        if not parser.innerHTML:
            parser.phase = parser.phases["afterAfterBody"]
        else:
            parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        """Report an error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("unexpected-end-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
    endTagHandler.default = endTagOther
class InFramesetPhase(Phase):
    """Token handlers for the phase registered as "inFrameset"."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    __slots__ = ()

    def processEOF(self):
        """Report "eof-in-frameset" unless the current node is "html"."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-frameset")

    def processCharacters(self, token):
        """Report an "unexpected-char-in-frameset" parse error."""
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        """Insert a nested <frameset> element."""
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        """Insert a <frame> element and pop it again at once."""
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()

    def startTagNoframes(self, token):
        """Delegate <noframes> to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def startTagOther(self, token):
        """Report an "unexpected-start-tag-in-frameset" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-frameset", details)

    def endTagFrameset(self, token):
        """Pop the current frameset; maybe switch to "afterFrameset"."""
        tree = self.tree
        parser = self.parser
        if tree.openElements[-1].name != "html":
            tree.openElements.pop()
        else:
            # innerHTML case
            parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        # If we're not in innerHTML mode and the current node is not a
        # "frameset" element (anymore) then switch.
        if not parser.innerHTML and tree.openElements[-1].name != "frameset":
            parser.phase = parser.phases["afterFrameset"]

    def endTagOther(self, token):
        """Report an "unexpected-end-tag-in-frameset" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-frameset", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("frameset", startTagFrameset),
        ("frame", startTagFrame),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("frameset", endTagFrameset)
    ])
    endTagHandler.default = endTagOther
class AfterFramesetPhase(Phase):
    """Token handlers for the phase registered as "afterFrameset"."""
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    __slots__ = ()

    def processEOF(self):
        """Do nothing at end of input."""
        # Stop parsing
        return None

    def processCharacters(self, token):
        """Report an "unexpected-char-after-frameset" parse error."""
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        """Delegate <noframes> to the "inHead" phase."""
        head_phase = self.parser.phases["inHead"]
        return head_phase.processStartTag(token)

    def startTagOther(self, token):
        """Report an "unexpected-start-tag-after-frameset" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset", details)

    def endTagHtml(self, token):
        """Switch to the "afterAfterFrameset" phase."""
        parser = self.parser
        parser.phase = parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        """Report an "unexpected-end-tag-after-frameset" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("html", endTagHtml)
    ])
    endTagHandler.default = endTagOther
class AfterAfterBodyPhase(Phase):
    """Token handlers for the phase registered as "afterAfterBody"."""
    __slots__ = ()

    def processEOF(self):
        """Do nothing at end of input."""
        return None

    def processComment(self, token):
        """Attach the comment to the document node."""
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report an error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-char")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate <html> to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def startTagOther(self, token):
        """Report an error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-start-tag",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    def processEndTag(self, token):
        """Report an error, switch to "inBody" and return the token."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-end-tag",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml)
    ])
    startTagHandler.default = startTagOther
class AfterAfterFramesetPhase(Phase):
    """Token handlers for the phase registered as "afterAfterFrameset"."""
    __slots__ = ()

    def processEOF(self):
        """Do nothing at end of input."""
        return None

    def processComment(self, token):
        """Attach the comment to the document node."""
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        """Delegate space characters to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report an "expected-eof-but-got-char" parse error."""
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        """Delegate <html> to the "inBody" phase."""
        body_phase = self.parser.phases["inBody"]
        return body_phase.processStartTag(token)

    def startTagNoFrames(self, token):
        """Delegate <noframes> to the "inHead" phase."""
        head_phase = self.parser.phases["inHead"]
        return head_phase.processStartTag(token)

    def startTagOther(self, token):
        """Report an "expected-eof-but-got-start-tag" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)

    def processEndTag(self, token):
        """Report an "expected-eof-but-got-end-tag" parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("noframes", startTagNoFrames)
    ])
    startTagHandler.default = startTagOther
2748 # pylint:enable=unused-argument
# Map each phase key to its phase class.  Other code in this file looks
# phases up by these keys, e.g. ``self.parser.phases["inBody"]``.
return {
    "initial": InitialPhase,
    "beforeHtml": BeforeHtmlPhase,
    "beforeHead": BeforeHeadPhase,
    "inHead": InHeadPhase,
    "inHeadNoscript": InHeadNoscriptPhase,
    "afterHead": AfterHeadPhase,
    "inBody": InBodyPhase,
    "text": TextPhase,
    "inTable": InTablePhase,
    "inTableText": InTableTextPhase,
    "inCaption": InCaptionPhase,
    "inColumnGroup": InColumnGroupPhase,
    "inTableBody": InTableBodyPhase,
    "inRow": InRowPhase,
    "inCell": InCellPhase,
    "inSelect": InSelectPhase,
    "inSelectInTable": InSelectInTablePhase,
    "inForeignContent": InForeignContentPhase,
    "afterBody": AfterBodyPhase,
    "inFrameset": InFramesetPhase,
    "afterFrameset": AfterFramesetPhase,
    "afterAfterBody": AfterAfterBodyPhase,
    "afterAfterFrameset": AfterAfterFramesetPhase,
    # XXX after after frameset
    # NOTE(review): "afterAfterFrameset" is registered just above, so this
    # marker looks stale -- confirm before removing it.
}
def adjust_attributes(token, replacements):
    """Rename attributes of *token* in place according to *replacements*.

    :arg token: a token whose ``'data'`` entry is a mapping of attributes

    :arg replacements: a mapping from existing attribute names to the names
        that should replace them

    The mapping is rebuilt (using the same mapping type) only when at
    least one attribute name appears in *replacements*; otherwise the
    token is left untouched.

    """
    attributes = token['data']
    if not (viewkeys(attributes) & viewkeys(replacements)):
        return
    renamed = ((replacements.get(key, key), value)
               for key, value in attributes.items())
    token['data'] = type(attributes)(renamed)
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict for a tag the parser implies itself.

    :arg name: the tag name

    :arg type: key into ``tokenTypes`` giving the kind of tag

    :arg attributes: attribute mapping stored under ``"data"``; a fresh
        empty dict is used when omitted

    :arg selfClosing: value stored under ``"selfClosing"``

    :returns: dict with the keys ``"type"``, ``"name"``, ``"data"`` and
        ``"selfClosing"``

    """
    # A new dict per call, so callers never share one default mapping.
    data = {} if attributes is None else attributes
    token = {"type": tokenTypes[type], "name": name, "data": data,
             "selfClosing": selfClosing}
    return token
class ParseError(Exception):
    """Raised to signal an error in the document being parsed."""