Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/html5lib/html5parser.py: 95%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import absolute_import, division, unicode_literals
2from six import viewkeys
4from . import _inputstream
5from . import _tokenizer
7from . import treebuilders
8from .treebuilders.base import Marker
10from . import _utils
11from .constants import (
12 spaceCharacters, asciiUpper2Lower,
13 specialElements, headingElements, cdataElements, rcdataElements,
14 tokenTypes,
15 namespaces,
16 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
17 adjustForeignAttributes as adjustForeignAttributesMap,
18 adjustMathMLAttributes, adjustSVGAttributes,
19 E,
20 _ReparseException
21)
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML document as a string or file-like object into a tree

    Convenience wrapper: looks up the treebuilder by name, builds an
    :class:`HTMLParser` around it and delegates to its ``parse`` method.
    Any extra keyword arguments are forwarded to ``HTMLParser.parse``.

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, **kwargs)
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    Convenience wrapper: looks up the treebuilder by name, builds an
    :class:`HTMLParser` around it and delegates to its ``parseFragment``
    method. Any extra keyword arguments are forwarded to that method.

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, **kwargs)
class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned. Built in treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType). A string is
            resolved through ``getTreeBuilder`` as well.

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()                     # generates parser with etree builder
        >>> parser = HTMLParser('lxml', strict=True)  # generates parser with lxml builder which is strict

        """

        # Raise an exception on the first error encountered
        self.strict = strict
        self.debug = debug

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        elif isinstance(tree, str):
            tree = treebuilders.getTreeBuilder(tree)

        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        # One phase object per insertion mode, all sharing this parser/tree.
        self.phases = {name: cls(self, self.tree) for name, cls in
                       _phases.items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Tokenize ``stream`` and run the main loop.

        Remaining keyword arguments are handed to the tokenizer. If the main
        loop raises ``_ReparseException`` the parser state is reset and the
        loop is run once more.
        """
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            self.reset()
            self.mainLoop()

    def reset(self):
        """Reset tree, error list and phase state ready for a (re)parse."""
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            # parseFragment() documents that container=None means 'div';
            # honour that here instead of failing on None.lower().
            container = self.container if self.container is not None else "div"
            self.innerHTML = container.lower()

            # NOTE(review): cdataElements selects the RCDATA state and
            # rcdataElements the RAWTEXT state; presumably the constant
            # names are historical - confirm against .constants before
            # "fixing" the apparent mismatch.
            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0].name

    def isHTMLIntegrationPoint(self, element):
        """Return whether ``element`` is an HTML integration point.

        A MathML ``annotation-xml`` element qualifies only when its
        ``encoding`` attribute (compared ASCII case-insensitively) is
        ``text/html`` or ``application/xhtml+xml``; every other element is
        looked up in ``htmlIntegrationPointElements``.
        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether ``element`` is a MathML text integration point."""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Feed every token from the tokenizer to the appropriate phase.

        A phase handler may return a token, in which case that token is
        processed again (possibly by a different phase) until a handler
        returns ``None``. After the token stream is exhausted, EOF is
        processed until no phase asks for reprocessing.
        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        type_names = {value: key for key, value in tokenTypes.items()}
        debug = self.debug

        # Loop invariants, built once rather than per token.
        textTokenTypes = (CharactersToken, SpaceCharactersToken)
        mathmlTextExcluded = frozenset(["mglyph", "malignmark"])

        for token in self.tokenizer:
            prev_token = None
            new_token = token
            while new_token is not None:
                prev_token = new_token
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                tokenType = new_token["type"]

                if tokenType == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # The tag name is read from new_token (the token actually
                    # being processed), matching where tokenType comes from;
                    # reading it from the outer token would break if a phase
                    # ever handed back a different token.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((tokenType == StartTagToken and
                           new_token["name"] not in mathmlTextExcluded) or
                          tokenType in textTokenTypes)) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         tokenType == StartTagToken and
                         new_token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         tokenType in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if debug:
                        info = {"type": type_names[tokenType]}
                        if tokenType in (StartTagToken, EndTagToken):
                            info["name"] = new_token['name']

                        self.log.append((self.tokenizer.state.__name__,
                                         self.phase.__class__.__name__,
                                         phase.__class__.__name__,
                                         "process" + info["type"],
                                         info))

                    if tokenType == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif tokenType == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif tokenType == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif tokenType == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif tokenType == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif tokenType == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

                # A start tag written as <x/> that no handler acknowledged
                # as self-closing is a parse error.
                if (tokenType == StartTagToken and prev_token["selfClosing"] and
                        not prev_token["selfClosingAcknowledged"]):
                    self.parseError("non-void-element-with-trailing-solidus",
                                    {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # A phase asking for reprocessing must have switched phase.
                assert self.phase not in phases

    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment

        :arg container: name of the element we're setting the innerHTML
            property if set to None, default to 'div'

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('<b>this is a fragment</b>')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error; in strict mode raise ``ParseError`` as well.

        :arg errorcode: key into the ``E`` message table

        :arg datavars: mapping used to format the message in strict mode
        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def adjustMathMLAttributes(self, token):
        """Apply the MathML attribute adjustment map to ``token``."""
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        """Apply the SVG attribute adjustment map to ``token``."""
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        """Apply the foreign attribute adjustment map to ``token``."""
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # pylint:disable=unused-argument
        # NOTE(review): nothing in this class assigns ``self.parser``, so
        # this looks like it raises AttributeError if ever called. Behaviour
        # is left untouched because the intended semantics are unclear -
        # confirm whether this method can simply be removed.
        self.parser.phase()

    def resetInsertionMode(self):
        """Choose the current phase from the stack of open elements.

        Walks the open elements from the innermost outwards and selects the
        phase mapped to the first recognised element name; falls back to
        "inBody" once the outermost element is reached.
        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                # Only the fragment case reaches the root of the stack; the
                # context element name then stands in for the node name.
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Insert ``token``'s element and switch to text parsing.

        :arg token: the start tag token to insert

        :arg contentType: ``"RAWTEXT"`` or ``"RCDATA"``; selects the
            tokenizer state. The current phase is remembered in
            ``originalPhase`` before switching to the "text" phase.
        """
        # Generic RCDATA/RAWTEXT Parsing algorithm
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        self.originalPhase = self.phase

        self.phase = self.phases["text"]
class Phase(object):
    """Base class for helper object that implements each phase of processing

    Subclasses provide ``startTagHandler`` / ``endTagHandler`` lookup tables;
    this base class resolves tag names through them and memoises the result
    per instance.
    """
    __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        # tag name -> resolved handler, filled lazily by process*Tag
        self.__startTagCache = {}
        self.__endTagCache = {}

    def processEOF(self):
        raise NotImplementedError

    def processComment(self, token):
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        self.tree.insertComment(token, self.tree.openElements[-1])

    def processDoctype(self, token):
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        """Dispatch a start tag token to its handler and return the result."""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tagName = token["name"]
        cache = self.__startTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tagName in cache:
            handler = cache[tagName]
        else:
            handler = cache[tagName] = self.startTagHandler[tagName]
            # bound the cache size in case we get loads of unknown tags
            limit = len(self.startTagHandler) * 1.1
            while len(cache) > limit:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        return handler(token)

    def startTagHtml(self, token):
        """Merge attributes of a repeated <html> tag into the root element.

        Attributes already present on the root element are left untouched.
        """
        if not self.parser.firstStartTag and token["name"] == "html":
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        rootAttributes = self.tree.openElements[0].attributes
        for attr, value in token["data"].items():
            if attr not in rootAttributes:
                rootAttributes[attr] = value
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        """Dispatch an end tag token to its handler and return the result."""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tagName = token["name"]
        cache = self.__endTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tagName in cache:
            handler = cache[tagName]
        else:
            handler = cache[tagName] = self.endTagHandler[tagName]
            # bound the cache size in case we get loads of unknown tags
            limit = len(self.endTagHandler) * 1.1
            while len(cache) > limit:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        return handler(token)
class InitialPhase(Phase):
    """Phase used before any doctype has been seen.

    A doctype token decides the document's compat mode; any other
    non-whitespace, non-comment token is a parse error that forces quirks
    mode and is handed on to the "beforeHtml" phase.
    """
    __slots__ = tuple()

    def processSpaceCharacters(self, token):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        """Insert the doctype and derive ``compatMode`` from its identifiers."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        if (name != "html" or publicId is not None or
                systemId is not None and systemId != "about:legacy-compat"):
            self.parser.parseError("unknown-doctype")

        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        # Public identifier prefixes that always select quirks mode.
        quirksPrefixes = (
            "+//silmaril//dtd html pro v0r11 19970101//",
            "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
            "-//as//dtd html 3.0 aswedit + extensions//",
            "-//ietf//dtd html 2.0 level 1//",
            "-//ietf//dtd html 2.0 level 2//",
            "-//ietf//dtd html 2.0 strict level 1//",
            "-//ietf//dtd html 2.0 strict level 2//",
            "-//ietf//dtd html 2.0 strict//",
            "-//ietf//dtd html 2.0//",
            "-//ietf//dtd html 2.1e//",
            "-//ietf//dtd html 3.0//",
            "-//ietf//dtd html 3.2 final//",
            "-//ietf//dtd html 3.2//",
            "-//ietf//dtd html 3//",
            "-//ietf//dtd html level 0//",
            "-//ietf//dtd html level 1//",
            "-//ietf//dtd html level 2//",
            "-//ietf//dtd html level 3//",
            "-//ietf//dtd html strict level 0//",
            "-//ietf//dtd html strict level 1//",
            "-//ietf//dtd html strict level 2//",
            "-//ietf//dtd html strict level 3//",
            "-//ietf//dtd html strict//",
            "-//ietf//dtd html//",
            "-//metrius//dtd metrius presentational//",
            "-//microsoft//dtd internet explorer 2.0 html strict//",
            "-//microsoft//dtd internet explorer 2.0 html//",
            "-//microsoft//dtd internet explorer 2.0 tables//",
            "-//microsoft//dtd internet explorer 3.0 html strict//",
            "-//microsoft//dtd internet explorer 3.0 html//",
            "-//microsoft//dtd internet explorer 3.0 tables//",
            "-//netscape comm. corp.//dtd html//",
            "-//netscape comm. corp.//dtd strict html//",
            "-//o'reilly and associates//dtd html 2.0//",
            "-//o'reilly and associates//dtd html extended 1.0//",
            "-//o'reilly and associates//dtd html extended relaxed 1.0//",
            "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
            "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
            "-//spyglass//dtd html 2.0 extended//",
            "-//sq//dtd html 2.0 hotmetal + extensions//",
            "-//sun microsystems corp.//dtd hotjava html//",
            "-//sun microsystems corp.//dtd hotjava strict html//",
            "-//w3c//dtd html 3 1995-03-24//",
            "-//w3c//dtd html 3.2 draft//",
            "-//w3c//dtd html 3.2 final//",
            "-//w3c//dtd html 3.2//",
            "-//w3c//dtd html 3.2s draft//",
            "-//w3c//dtd html 4.0 frameset//",
            "-//w3c//dtd html 4.0 transitional//",
            "-//w3c//dtd html experimental 19960712//",
            "-//w3c//dtd html experimental 970421//",
            "-//w3c//dtd w3 html//",
            "-//w3o//dtd w3 html 3.0//",
            "-//webtechs//dtd mozilla html 2.0//",
            "-//webtechs//dtd mozilla html//",
        )
        # Public identifiers that select quirks mode on an exact match.
        quirksExact = (
            "-//w3o//dtd w3 html strict 3.0//en//",
            "-/w3c/dtd html 4.0 transitional/en",
            "html",
        )
        # HTML 4.01 frameset/transitional: quirks without a system
        # identifier, limited quirks with one.
        html401Prefixes = (
            "-//w3c//dtd html 4.01 frameset//",
            "-//w3c//dtd html 4.01 transitional//",
        )
        xhtml10Prefixes = (
            "-//w3c//dtd xhtml 1.0 frameset//",
            "-//w3c//dtd xhtml 1.0 transitional//",
        )

        isHtml401 = publicId.startswith(html401Prefixes)
        isIbmSystemId = bool(
            systemId and
            systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")

        if (not correct or
                token["name"] != "html" or
                publicId.startswith(quirksPrefixes) or
                publicId in quirksExact or
                (isHtml401 and systemId is None) or
                isIbmSystemId):
            self.parser.compatMode = "quirks"
        elif (publicId.startswith(xhtml10Prefixes) or
              (isHtml401 and systemId is not None)):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        """Fall back to quirks mode and move on to the "beforeHtml" phase."""
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        return token

    def processStartTag(self, token):
        self.parser.parseError("expected-doctype-but-got-start-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEndTag(self, token):
        self.parser.parseError("expected-doctype-but-got-end-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEOF(self):
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True
class BeforeHtmlPhase(Phase):
    """Phase that creates the root <html> element.

    Whitespace is dropped and comments go on the document; most other
    input inserts an implied <html> root, moves to "beforeHead" and hands
    the token back for reprocessing.
    """
    __slots__ = tuple()

    # helper methods
    def insertHtmlElement(self):
        """Insert an implied <html> root and switch to "beforeHead"."""
        root = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(root)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        # Only these end tags imply the root element; anything else is
        # reported and dropped.
        if token["name"] in ("head", "body", "html", "br"):
            self.insertHtmlElement()
            return token
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": token["name"]})
class BeforeHeadPhase(Phase):
    """Phase between the root <html> element and the <head> element.

    Most input implies a <head> start tag and is then reprocessed by the
    "inHead" phase.
    """
    __slots__ = tuple()

    def processEOF(self):
        impliedHead = impliedTagToken("head", "StartTag")
        self.startTagHead(impliedHead)
        return True

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        impliedHead = impliedTagToken("head", "StartTag")
        self.startTagHead(impliedHead)
        return token

    def startTagHtml(self, token):
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagHead(self, token):
        """Insert <head>, remember it as the head pointer, go to "inHead"."""
        self.tree.insertElement(token)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        impliedHead = impliedTagToken("head", "StartTag")
        self.startTagHead(impliedHead)
        return token

    def endTagImplyHead(self, token):
        impliedHead = impliedTagToken("head", "StartTag")
        self.startTagHead(impliedHead)
        return token

    def endTagOther(self, token):
        self.parser.parseError("end-tag-after-implied-root",
                               {"name": token["name"]})

    # Dispatch tables: tag name -> handler, with a default for the rest.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("head", "body", "html", "br"), endTagImplyHead)
    ])
    endTagHandler.default = endTagOther
class InHeadPhase(Phase):
    """Phase used while the <head> element is open."""
    __slots__ = tuple()

    # the real thing
    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagHead(self, token):
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        # The element is inserted and immediately popped again, and the
        # self-closing flag is acknowledged.
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        """Insert <meta> and, while the encoding is tentative, apply it.

        A ``charset`` attribute wins; otherwise a ``content`` attribute is
        parsed when ``http-equiv`` is "content-type" (case-insensitive).
        """
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        attributes = token["data"]
        stream = self.parser.tokenizer.stream
        if stream.charEncoding[1] != "tentative":
            return

        if "charset" in attributes:
            stream.changeEncoding(attributes["charset"])
        elif ("content" in attributes and
              "http-equiv" in attributes and
              attributes["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            content = attributes["content"].encode("utf-8")
            data = _inputstream.EncodingBytes(content)
            codec = _inputstream.ContentAttrParser(data).parse()
            stream.changeEncoding(codec)

    def startTagTitle(self, token):
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        if self.parser.scripting:
            self.parser.parseRCDataRawtext(token, "RAWTEXT")
            return
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inHeadNoscript"]

    def startTagScript(self, token):
        """Insert <script> and switch tokenizer and parser to script text."""
        parser = self.parser
        self.tree.insertElement(token)
        parser.tokenizer.state = parser.tokenizer.scriptDataState
        parser.originalPhase = parser.phase
        parser.phase = parser.phases["text"]

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHead(self, token):
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "head", "Expected head got %s" % popped.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a </head> end tag had been seen."""
        self.endTagHead(impliedTagToken("head"))

    # Dispatch tables: tag name -> handler, with a default for the rest.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("title", startTagTitle),
        (("noframes", "style"), startTagNoFramesStyle),
        ("noscript", startTagNoscript),
        ("script", startTagScript),
        (("base", "basefont", "bgsound", "command", "link"),
         startTagBaseLinkCommand),
        ("meta", startTagMeta),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("head", endTagHead),
        (("br", "html", "body"), endTagHtmlBodyBr)
    ])
    endTagHandler.default = endTagOther
class InHeadNoscriptPhase(Phase):
    """Phase used inside a <noscript> element within <head>.

    Comments, whitespace and a handful of start tags are delegated to the
    "inHead" phase; most other input is a parse error that closes the
    <noscript> element and is reprocessed.
    """
    __slots__ = tuple()

    def processEOF(self):
        self.parser.parseError("eof-in-head-noscript")
        self.anythingElse()
        return True

    def processComment(self, token):
        inHead = self.parser.phases["inHead"]
        return inHead.processComment(token)

    def processCharacters(self, token):
        self.parser.parseError("char-in-head-noscript")
        self.anythingElse()
        return token

    def processSpaceCharacters(self, token):
        inHead = self.parser.phases["inHead"]
        return inHead.processSpaceCharacters(token)

    def startTagHtml(self, token):
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagHeadNoscript(self, token):
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagNoscript(self, token):
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "noscript", "Expected noscript got %s" % popped.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a </noscript> end tag had been seen."""
        # Caller must raise parse error first!
        self.endTagNoscript(impliedTagToken("noscript"))

    # Dispatch tables: tag name -> handler, with a default for the rest.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
        (("head", "noscript"), startTagHeadNoscript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("noscript", endTagNoscript),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther
class AfterHeadPhase(Phase):
    """Phase used after </head> and before <body> or <frameset>."""
    __slots__ = tuple()

    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagBody(self, token):
        # An explicit <body> tag rules out a later <frameset>.
        self.parser.framesetOK = False
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, token):
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        """Process a head-only start tag that arrived after </head>.

        The head element is temporarily pushed back onto the stack of open
        elements, the token is processed by the "inHead" phase, and the
        innermost open "head" element is then removed again.
        """
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        openElements = self.tree.openElements
        openElements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        for node in self.tree.openElements[::-1]:
            if node.name != "head":
                continue
            self.tree.openElements.remove(node)
            break

    def startTagHead(self, token):
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Insert an implied <body> and switch to the "inBody" phase."""
        impliedBody = impliedTagToken("body", "StartTag")
        self.tree.insertElement(impliedBody)
        self.parser.phase = self.parser.phases["inBody"]
        self.parser.framesetOK = True

    # Dispatch tables: tag name -> handler, with a default for the rest.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"),
         startTagFromHead),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther
    endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
                                              endTagHtmlBodyBr)])
    endTagHandler.default = endTagOther
922class InBodyPhase(Phase):
923 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
924 # the really-really-really-very crazy mode
925 __slots__ = ("processSpaceCharacters",)
def __init__(self, *args, **kwargs):
    """Initialise the phase and install the default whitespace handler."""
    super(InBodyPhase, self).__init__(*args, **kwargs)
    # Set this to the default handler; processSpaceCharactersDropNewline
    # restores this same handler after it has run.
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
def isMatchingFormattingElement(self, node1, node2):
    """Return whether two nodes share name, namespace and attributes."""
    if not node1.name == node2.name:
        return False
    if not node1.namespace == node2.namespace:
        return False
    return node1.attributes == node2.attributes
937 # helper
def addFormattingElement(self, token):
    """Insert a formatting element and record it as active.

    Entries after the last Marker in the list of active formatting
    elements are scanned; when three of them already match the new
    element, the earliest of those is dropped before the new one is
    appended.
    """
    self.tree.insertElement(token)
    inserted = self.tree.openElements[-1]

    duplicates = []
    for candidate in reversed(self.tree.activeFormattingElements):
        if candidate is Marker:
            break
        if self.isMatchingFormattingElement(candidate, inserted):
            duplicates.append(candidate)

    assert len(duplicates) <= 3
    if len(duplicates) == 3:
        # The scan ran newest-to-oldest, so the last match is the oldest.
        self.tree.activeFormattingElements.remove(duplicates[-1])
    self.tree.activeFormattingElements.append(inserted)
954 # the real deal
def processEOF(self):
    """Report a parse error if any open element may not stay open at EOF.

    At most one error is reported, and nothing is returned.
    """
    allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                                  "tfoot", "th", "thead", "tr", "body",
                                  "html"))
    unexpectedlyOpen = any(node.name not in allowed_elements
                           for node in self.tree.openElements[::-1])
    if unexpectedlyOpen:
        self.parser.parseError("expected-closing-tag-but-got-eof")
    # Stop parsing
def processSpaceCharactersDropNewline(self, token):
    """Insert whitespace, dropping one leading newline where applicable.

    The regular whitespace handler is reinstated first, so this handler
    only applies to a single token.
    """
    # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
    # want to drop leading newlines
    text = token["data"]
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if text.startswith("\n"):
        current = self.tree.openElements[-1]
        if (current.name in ("pre", "listing", "textarea") and
                not current.hasContent()):
            text = text[1:]
    if not text:
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(text)
978 def processCharacters(self, token):
979 if token["data"] == "\u0000":
980 # The tokenizer should always emit null on its own
981 return
982 self.tree.reconstructActiveFormattingElements()
983 self.tree.insertText(token["data"])
984 # This must be bad for performance
985 if (self.parser.framesetOK and
986 any(char not in spaceCharacters
987 for char in token["data"])):
988 self.parser.framesetOK = False
990 def processSpaceCharactersNonPre(self, token):
991 self.tree.reconstructActiveFormattingElements()
992 self.tree.insertText(token["data"])
994 def startTagProcessInHead(self, token):
995 return self.parser.phases["inHead"].processStartTag(token)
997 def startTagBody(self, token):
998 self.parser.parseError("unexpected-start-tag", {"name": "body"})
999 if (len(self.tree.openElements) == 1 or
1000 self.tree.openElements[1].name != "body"):
1001 assert self.parser.innerHTML
1002 else:
1003 self.parser.framesetOK = False
1004 for attr, value in token["data"].items():
1005 if attr not in self.tree.openElements[1].attributes:
1006 self.tree.openElements[1].attributes[attr] = value
1008 def startTagFrameset(self, token):
1009 self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
1010 if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
1011 assert self.parser.innerHTML
1012 elif not self.parser.framesetOK:
1013 pass
1014 else:
1015 if self.tree.openElements[1].parent:
1016 self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
1017 while self.tree.openElements[-1].name != "html":
1018 self.tree.openElements.pop()
1019 self.tree.insertElement(token)
1020 self.parser.phase = self.parser.phases["inFrameset"]
1022 def startTagCloseP(self, token):
1023 if self.tree.elementInScope("p", variant="button"):
1024 self.endTagP(impliedTagToken("p"))
1025 self.tree.insertElement(token)
1027 def startTagPreListing(self, token):
1028 if self.tree.elementInScope("p", variant="button"):
1029 self.endTagP(impliedTagToken("p"))
1030 self.tree.insertElement(token)
1031 self.parser.framesetOK = False
1032 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1034 def startTagForm(self, token):
1035 if self.tree.formPointer:
1036 self.parser.parseError("unexpected-start-tag", {"name": "form"})
1037 else:
1038 if self.tree.elementInScope("p", variant="button"):
1039 self.endTagP(impliedTagToken("p"))
1040 self.tree.insertElement(token)
1041 self.tree.formPointer = self.tree.openElements[-1]
1043 def startTagListItem(self, token):
1044 self.parser.framesetOK = False
1046 stopNamesMap = {"li": ["li"],
1047 "dt": ["dt", "dd"],
1048 "dd": ["dt", "dd"]}
1049 stopNames = stopNamesMap[token["name"]]
1050 for node in reversed(self.tree.openElements):
1051 if node.name in stopNames:
1052 self.parser.phase.processEndTag(
1053 impliedTagToken(node.name, "EndTag"))
1054 break
1055 if (node.nameTuple in specialElements and
1056 node.name not in ("address", "div", "p")):
1057 break
1059 if self.tree.elementInScope("p", variant="button"):
1060 self.parser.phase.processEndTag(
1061 impliedTagToken("p", "EndTag"))
1063 self.tree.insertElement(token)
1065 def startTagPlaintext(self, token):
1066 if self.tree.elementInScope("p", variant="button"):
1067 self.endTagP(impliedTagToken("p"))
1068 self.tree.insertElement(token)
1069 self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
1071 def startTagHeading(self, token):
1072 if self.tree.elementInScope("p", variant="button"):
1073 self.endTagP(impliedTagToken("p"))
1074 if self.tree.openElements[-1].name in headingElements:
1075 self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1076 self.tree.openElements.pop()
1077 self.tree.insertElement(token)
1079 def startTagA(self, token):
1080 afeAElement = self.tree.elementInActiveFormattingElements("a")
1081 if afeAElement:
1082 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1083 {"startName": "a", "endName": "a"})
1084 self.endTagFormatting(impliedTagToken("a"))
1085 if afeAElement in self.tree.openElements:
1086 self.tree.openElements.remove(afeAElement)
1087 if afeAElement in self.tree.activeFormattingElements:
1088 self.tree.activeFormattingElements.remove(afeAElement)
1089 self.tree.reconstructActiveFormattingElements()
1090 self.addFormattingElement(token)
1092 def startTagFormatting(self, token):
1093 self.tree.reconstructActiveFormattingElements()
1094 self.addFormattingElement(token)
1096 def startTagNobr(self, token):
1097 self.tree.reconstructActiveFormattingElements()
1098 if self.tree.elementInScope("nobr"):
1099 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1100 {"startName": "nobr", "endName": "nobr"})
1101 self.processEndTag(impliedTagToken("nobr"))
1102 # XXX Need tests that trigger the following
1103 self.tree.reconstructActiveFormattingElements()
1104 self.addFormattingElement(token)
1106 def startTagButton(self, token):
1107 if self.tree.elementInScope("button"):
1108 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1109 {"startName": "button", "endName": "button"})
1110 self.processEndTag(impliedTagToken("button"))
1111 return token
1112 else:
1113 self.tree.reconstructActiveFormattingElements()
1114 self.tree.insertElement(token)
1115 self.parser.framesetOK = False
1117 def startTagAppletMarqueeObject(self, token):
1118 self.tree.reconstructActiveFormattingElements()
1119 self.tree.insertElement(token)
1120 self.tree.activeFormattingElements.append(Marker)
1121 self.parser.framesetOK = False
1123 def startTagXmp(self, token):
1124 if self.tree.elementInScope("p", variant="button"):
1125 self.endTagP(impliedTagToken("p"))
1126 self.tree.reconstructActiveFormattingElements()
1127 self.parser.framesetOK = False
1128 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1130 def startTagTable(self, token):
1131 if self.parser.compatMode != "quirks":
1132 if self.tree.elementInScope("p", variant="button"):
1133 self.processEndTag(impliedTagToken("p"))
1134 self.tree.insertElement(token)
1135 self.parser.framesetOK = False
1136 self.parser.phase = self.parser.phases["inTable"]
1138 def startTagVoidFormatting(self, token):
1139 self.tree.reconstructActiveFormattingElements()
1140 self.tree.insertElement(token)
1141 self.tree.openElements.pop()
1142 token["selfClosingAcknowledged"] = True
1143 self.parser.framesetOK = False
1145 def startTagInput(self, token):
1146 framesetOK = self.parser.framesetOK
1147 self.startTagVoidFormatting(token)
1148 if ("type" in token["data"] and
1149 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1150 # input type=hidden doesn't change framesetOK
1151 self.parser.framesetOK = framesetOK
1153 def startTagParamSource(self, token):
1154 self.tree.insertElement(token)
1155 self.tree.openElements.pop()
1156 token["selfClosingAcknowledged"] = True
1158 def startTagHr(self, token):
1159 if self.tree.elementInScope("p", variant="button"):
1160 self.endTagP(impliedTagToken("p"))
1161 self.tree.insertElement(token)
1162 self.tree.openElements.pop()
1163 token["selfClosingAcknowledged"] = True
1164 self.parser.framesetOK = False
1166 def startTagImage(self, token):
1167 # No really...
1168 self.parser.parseError("unexpected-start-tag-treated-as",
1169 {"originalName": "image", "newName": "img"})
1170 self.processStartTag(impliedTagToken("img", "StartTag",
1171 attributes=token["data"],
1172 selfClosing=token["selfClosing"]))
1174 def startTagIsIndex(self, token):
1175 self.parser.parseError("deprecated-tag", {"name": "isindex"})
1176 if self.tree.formPointer:
1177 return
1178 form_attrs = {}
1179 if "action" in token["data"]:
1180 form_attrs["action"] = token["data"]["action"]
1181 self.processStartTag(impliedTagToken("form", "StartTag",
1182 attributes=form_attrs))
1183 self.processStartTag(impliedTagToken("hr", "StartTag"))
1184 self.processStartTag(impliedTagToken("label", "StartTag"))
1185 # XXX Localization ...
1186 if "prompt" in token["data"]:
1187 prompt = token["data"]["prompt"]
1188 else:
1189 prompt = "This is a searchable index. Enter search keywords: "
1190 self.processCharacters(
1191 {"type": tokenTypes["Characters"], "data": prompt})
1192 attributes = token["data"].copy()
1193 if "action" in attributes:
1194 del attributes["action"]
1195 if "prompt" in attributes:
1196 del attributes["prompt"]
1197 attributes["name"] = "isindex"
1198 self.processStartTag(impliedTagToken("input", "StartTag",
1199 attributes=attributes,
1200 selfClosing=token["selfClosing"]))
1201 self.processEndTag(impliedTagToken("label"))
1202 self.processStartTag(impliedTagToken("hr", "StartTag"))
1203 self.processEndTag(impliedTagToken("form"))
1205 def startTagTextarea(self, token):
1206 self.tree.insertElement(token)
1207 self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
1208 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1209 self.parser.framesetOK = False
1211 def startTagIFrame(self, token):
1212 self.parser.framesetOK = False
1213 self.startTagRawtext(token)
1215 def startTagNoscript(self, token):
1216 if self.parser.scripting:
1217 self.startTagRawtext(token)
1218 else:
1219 self.startTagOther(token)
1221 def startTagRawtext(self, token):
1222 """iframe, noembed noframes, noscript(if scripting enabled)"""
1223 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1225 def startTagOpt(self, token):
1226 if self.tree.openElements[-1].name == "option":
1227 self.parser.phase.processEndTag(impliedTagToken("option"))
1228 self.tree.reconstructActiveFormattingElements()
1229 self.parser.tree.insertElement(token)
1231 def startTagSelect(self, token):
1232 self.tree.reconstructActiveFormattingElements()
1233 self.tree.insertElement(token)
1234 self.parser.framesetOK = False
1235 if self.parser.phase in (self.parser.phases["inTable"],
1236 self.parser.phases["inCaption"],
1237 self.parser.phases["inColumnGroup"],
1238 self.parser.phases["inTableBody"],
1239 self.parser.phases["inRow"],
1240 self.parser.phases["inCell"]):
1241 self.parser.phase = self.parser.phases["inSelectInTable"]
1242 else:
1243 self.parser.phase = self.parser.phases["inSelect"]
1245 def startTagRpRt(self, token):
1246 if self.tree.elementInScope("ruby"):
1247 self.tree.generateImpliedEndTags()
1248 if self.tree.openElements[-1].name != "ruby":
1249 self.parser.parseError()
1250 self.tree.insertElement(token)
1252 def startTagMath(self, token):
1253 self.tree.reconstructActiveFormattingElements()
1254 self.parser.adjustMathMLAttributes(token)
1255 self.parser.adjustForeignAttributes(token)
1256 token["namespace"] = namespaces["mathml"]
1257 self.tree.insertElement(token)
1258 # Need to get the parse error right for the case where the token
1259 # has a namespace not equal to the xmlns attribute
1260 if token["selfClosing"]:
1261 self.tree.openElements.pop()
1262 token["selfClosingAcknowledged"] = True
1264 def startTagSvg(self, token):
1265 self.tree.reconstructActiveFormattingElements()
1266 self.parser.adjustSVGAttributes(token)
1267 self.parser.adjustForeignAttributes(token)
1268 token["namespace"] = namespaces["svg"]
1269 self.tree.insertElement(token)
1270 # Need to get the parse error right for the case where the token
1271 # has a namespace not equal to the xmlns attribute
1272 if token["selfClosing"]:
1273 self.tree.openElements.pop()
1274 token["selfClosingAcknowledged"] = True
1276 def startTagMisplaced(self, token):
1277 """ Elements that should be children of other elements that have a
1278 different insertion mode; here they are ignored
1279 "caption", "col", "colgroup", "frame", "frameset", "head",
1280 "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
1281 "tr", "noscript"
1282 """
1283 self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
1285 def startTagOther(self, token):
1286 self.tree.reconstructActiveFormattingElements()
1287 self.tree.insertElement(token)
1289 def endTagP(self, token):
1290 if not self.tree.elementInScope("p", variant="button"):
1291 self.startTagCloseP(impliedTagToken("p", "StartTag"))
1292 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1293 self.endTagP(impliedTagToken("p", "EndTag"))
1294 else:
1295 self.tree.generateImpliedEndTags("p")
1296 if self.tree.openElements[-1].name != "p":
1297 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1298 node = self.tree.openElements.pop()
1299 while node.name != "p":
1300 node = self.tree.openElements.pop()
1302 def endTagBody(self, token):
1303 if not self.tree.elementInScope("body"):
1304 self.parser.parseError()
1305 return
1306 elif self.tree.openElements[-1].name != "body":
1307 for node in self.tree.openElements[2:]:
1308 if node.name not in frozenset(("dd", "dt", "li", "optgroup",
1309 "option", "p", "rp", "rt",
1310 "tbody", "td", "tfoot",
1311 "th", "thead", "tr", "body",
1312 "html")):
1313 # Not sure this is the correct name for the parse error
1314 self.parser.parseError(
1315 "expected-one-end-tag-but-got-another",
1316 {"gotName": "body", "expectedName": node.name})
1317 break
1318 self.parser.phase = self.parser.phases["afterBody"]
1320 def endTagHtml(self, token):
1321 # We repeat the test for the body end tag token being ignored here
1322 if self.tree.elementInScope("body"):
1323 self.endTagBody(impliedTagToken("body"))
1324 return token
1326 def endTagBlock(self, token):
1327 # Put us back in the right whitespace handling mode
1328 if token["name"] == "pre":
1329 self.processSpaceCharacters = self.processSpaceCharactersNonPre
1330 inScope = self.tree.elementInScope(token["name"])
1331 if inScope:
1332 self.tree.generateImpliedEndTags()
1333 if self.tree.openElements[-1].name != token["name"]:
1334 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1335 if inScope:
1336 node = self.tree.openElements.pop()
1337 while node.name != token["name"]:
1338 node = self.tree.openElements.pop()
1340 def endTagForm(self, token):
1341 node = self.tree.formPointer
1342 self.tree.formPointer = None
1343 if node is None or not self.tree.elementInScope(node):
1344 self.parser.parseError("unexpected-end-tag",
1345 {"name": "form"})
1346 else:
1347 self.tree.generateImpliedEndTags()
1348 if self.tree.openElements[-1] != node:
1349 self.parser.parseError("end-tag-too-early-ignored",
1350 {"name": "form"})
1351 self.tree.openElements.remove(node)
1353 def endTagListItem(self, token):
1354 if token["name"] == "li":
1355 variant = "list"
1356 else:
1357 variant = None
1358 if not self.tree.elementInScope(token["name"], variant=variant):
1359 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1360 else:
1361 self.tree.generateImpliedEndTags(exclude=token["name"])
1362 if self.tree.openElements[-1].name != token["name"]:
1363 self.parser.parseError(
1364 "end-tag-too-early",
1365 {"name": token["name"]})
1366 node = self.tree.openElements.pop()
1367 while node.name != token["name"]:
1368 node = self.tree.openElements.pop()
1370 def endTagHeading(self, token):
1371 for item in headingElements:
1372 if self.tree.elementInScope(item):
1373 self.tree.generateImpliedEndTags()
1374 break
1375 if self.tree.openElements[-1].name != token["name"]:
1376 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1378 for item in headingElements:
1379 if self.tree.elementInScope(item):
1380 item = self.tree.openElements.pop()
1381 while item.name not in headingElements:
1382 item = self.tree.openElements.pop()
1383 break
1385 def endTagFormatting(self, token):
1386 """The much-feared adoption agency algorithm"""
1387 # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
1388 # XXX Better parseError messages appreciated.
1390 # Step 1
1391 outerLoopCounter = 0
1393 # Step 2
1394 while outerLoopCounter < 8:
1396 # Step 3
1397 outerLoopCounter += 1
1399 # Step 4:
1401 # Let the formatting element be the last element in
1402 # the list of active formatting elements that:
1403 # - is between the end of the list and the last scope
1404 # marker in the list, if any, or the start of the list
1405 # otherwise, and
1406 # - has the same tag name as the token.
1407 formattingElement = self.tree.elementInActiveFormattingElements(
1408 token["name"])
1409 if (not formattingElement or
1410 (formattingElement in self.tree.openElements and
1411 not self.tree.elementInScope(formattingElement.name))):
1412 # If there is no such node, then abort these steps
1413 # and instead act as described in the "any other
1414 # end tag" entry below.
1415 self.endTagOther(token)
1416 return
1418 # Otherwise, if there is such a node, but that node is
1419 # not in the stack of open elements, then this is a
1420 # parse error; remove the element from the list, and
1421 # abort these steps.
1422 elif formattingElement not in self.tree.openElements:
1423 self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
1424 self.tree.activeFormattingElements.remove(formattingElement)
1425 return
1427 # Otherwise, if there is such a node, and that node is
1428 # also in the stack of open elements, but the element
1429 # is not in scope, then this is a parse error; ignore
1430 # the token, and abort these steps.
1431 elif not self.tree.elementInScope(formattingElement.name):
1432 self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
1433 return
1435 # Otherwise, there is a formatting element and that
1436 # element is in the stack and is in scope. If the
1437 # element is not the current node, this is a parse
1438 # error. In any case, proceed with the algorithm as
1439 # written in the following steps.
1440 else:
1441 if formattingElement != self.tree.openElements[-1]:
1442 self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
1444 # Step 5:
1446 # Let the furthest block be the topmost node in the
1447 # stack of open elements that is lower in the stack
1448 # than the formatting element, and is an element in
1449 # the special category. There might not be one.
1450 afeIndex = self.tree.openElements.index(formattingElement)
1451 furthestBlock = None
1452 for element in self.tree.openElements[afeIndex:]:
1453 if element.nameTuple in specialElements:
1454 furthestBlock = element
1455 break
1457 # Step 6:
1459 # If there is no furthest block, then the UA must
1460 # first pop all the nodes from the bottom of the stack
1461 # of open elements, from the current node up to and
1462 # including the formatting element, then remove the
1463 # formatting element from the list of active
1464 # formatting elements, and finally abort these steps.
1465 if furthestBlock is None:
1466 element = self.tree.openElements.pop()
1467 while element != formattingElement:
1468 element = self.tree.openElements.pop()
1469 self.tree.activeFormattingElements.remove(element)
1470 return
1472 # Step 7
1473 commonAncestor = self.tree.openElements[afeIndex - 1]
1475 # Step 8:
1476 # The bookmark is supposed to help us identify where to reinsert
1477 # nodes in step 15. We have to ensure that we reinsert nodes after
1478 # the node before the active formatting element. Note the bookmark
1479 # can move in step 9.7
1480 bookmark = self.tree.activeFormattingElements.index(formattingElement)
1482 # Step 9
1483 lastNode = node = furthestBlock
1484 innerLoopCounter = 0
1486 index = self.tree.openElements.index(node)
1487 while innerLoopCounter < 3:
1488 innerLoopCounter += 1
1489 # Node is element before node in open elements
1490 index -= 1
1491 node = self.tree.openElements[index]
1492 if node not in self.tree.activeFormattingElements:
1493 self.tree.openElements.remove(node)
1494 continue
1495 # Step 9.6
1496 if node == formattingElement:
1497 break
1498 # Step 9.7
1499 if lastNode == furthestBlock:
1500 bookmark = self.tree.activeFormattingElements.index(node) + 1
1501 # Step 9.8
1502 clone = node.cloneNode()
1503 # Replace node with clone
1504 self.tree.activeFormattingElements[
1505 self.tree.activeFormattingElements.index(node)] = clone
1506 self.tree.openElements[
1507 self.tree.openElements.index(node)] = clone
1508 node = clone
1509 # Step 9.9
1510 # Remove lastNode from its parents, if any
1511 if lastNode.parent:
1512 lastNode.parent.removeChild(lastNode)
1513 node.appendChild(lastNode)
1514 # Step 9.10
1515 lastNode = node
1517 # Step 10
1518 # Foster parent lastNode if commonAncestor is a
1519 # table, tbody, tfoot, thead, or tr we need to foster
1520 # parent the lastNode
1521 if lastNode.parent:
1522 lastNode.parent.removeChild(lastNode)
1524 if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
1525 parent, insertBefore = self.tree.getTableMisnestedNodePosition()
1526 parent.insertBefore(lastNode, insertBefore)
1527 else:
1528 commonAncestor.appendChild(lastNode)
1530 # Step 11
1531 clone = formattingElement.cloneNode()
1533 # Step 12
1534 furthestBlock.reparentChildren(clone)
1536 # Step 13
1537 furthestBlock.appendChild(clone)
1539 # Step 14
1540 self.tree.activeFormattingElements.remove(formattingElement)
1541 self.tree.activeFormattingElements.insert(bookmark, clone)
1543 # Step 15
1544 self.tree.openElements.remove(formattingElement)
1545 self.tree.openElements.insert(
1546 self.tree.openElements.index(furthestBlock) + 1, clone)
1548 def endTagAppletMarqueeObject(self, token):
1549 if self.tree.elementInScope(token["name"]):
1550 self.tree.generateImpliedEndTags()
1551 if self.tree.openElements[-1].name != token["name"]:
1552 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1554 if self.tree.elementInScope(token["name"]):
1555 element = self.tree.openElements.pop()
1556 while element.name != token["name"]:
1557 element = self.tree.openElements.pop()
1558 self.tree.clearActiveFormattingElements()
1560 def endTagBr(self, token):
1561 self.parser.parseError("unexpected-end-tag-treated-as",
1562 {"originalName": "br", "newName": "br element"})
1563 self.tree.reconstructActiveFormattingElements()
1564 self.tree.insertElement(impliedTagToken("br", "StartTag"))
1565 self.tree.openElements.pop()
1567 def endTagOther(self, token):
1568 for node in self.tree.openElements[::-1]:
1569 if node.name == token["name"]:
1570 self.tree.generateImpliedEndTags(exclude=token["name"])
1571 if self.tree.openElements[-1].name != token["name"]:
1572 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1573 while self.tree.openElements.pop() != node:
1574 pass
1575 break
1576 else:
1577 if node.nameTuple in specialElements:
1578 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1579 break
1581 startTagHandler = _utils.MethodDispatcher([
1582 ("html", Phase.startTagHtml),
1583 (("base", "basefont", "bgsound", "command", "link", "meta",
1584 "script", "style", "title"),
1585 startTagProcessInHead),
1586 ("body", startTagBody),
1587 ("frameset", startTagFrameset),
1588 (("address", "article", "aside", "blockquote", "center", "details",
1589 "dir", "div", "dl", "fieldset", "figcaption", "figure",
1590 "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
1591 "section", "summary", "ul"),
1592 startTagCloseP),
1593 (headingElements, startTagHeading),
1594 (("pre", "listing"), startTagPreListing),
1595 ("form", startTagForm),
1596 (("li", "dd", "dt"), startTagListItem),
1597 ("plaintext", startTagPlaintext),
1598 ("a", startTagA),
1599 (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
1600 "strong", "tt", "u"), startTagFormatting),
1601 ("nobr", startTagNobr),
1602 ("button", startTagButton),
1603 (("applet", "marquee", "object"), startTagAppletMarqueeObject),
1604 ("xmp", startTagXmp),
1605 ("table", startTagTable),
1606 (("area", "br", "embed", "img", "keygen", "wbr"),
1607 startTagVoidFormatting),
1608 (("param", "source", "track"), startTagParamSource),
1609 ("input", startTagInput),
1610 ("hr", startTagHr),
1611 ("image", startTagImage),
1612 ("isindex", startTagIsIndex),
1613 ("textarea", startTagTextarea),
1614 ("iframe", startTagIFrame),
1615 ("noscript", startTagNoscript),
1616 (("noembed", "noframes"), startTagRawtext),
1617 ("select", startTagSelect),
1618 (("rp", "rt"), startTagRpRt),
1619 (("option", "optgroup"), startTagOpt),
1620 (("math"), startTagMath),
1621 (("svg"), startTagSvg),
1622 (("caption", "col", "colgroup", "frame", "head",
1623 "tbody", "td", "tfoot", "th", "thead",
1624 "tr"), startTagMisplaced)
1625 ])
1626 startTagHandler.default = startTagOther
1628 endTagHandler = _utils.MethodDispatcher([
1629 ("body", endTagBody),
1630 ("html", endTagHtml),
1631 (("address", "article", "aside", "blockquote", "button", "center",
1632 "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
1633 "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
1634 "section", "summary", "ul"), endTagBlock),
1635 ("form", endTagForm),
1636 ("p", endTagP),
1637 (("dd", "dt", "li"), endTagListItem),
1638 (headingElements, endTagHeading),
1639 (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
1640 "strike", "strong", "tt", "u"), endTagFormatting),
1641 (("applet", "marquee", "object"), endTagAppletMarqueeObject),
1642 ("br", endTagBr),
1643 ])
1644 endTagHandler.default = endTagOther
class TextPhase(Phase):
    """Phase used while the content of an RCDATA/RAWTEXT element is read.

    Character data is appended to the tree; any end tag (or EOF) pops the
    current node and hands control back to ``parser.originalPhase``.
    """
    __slots__ = ()

    def processCharacters(self, token):
        """Append the token's character data to the tree."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Report the unclosed element, pop it and restore the original phase.

        Returns True after the phase has been restored.
        """
        unclosed = self.tree.openElements[-1]
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosed.name})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Start tags are not expected in this phase; fail the assertion."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Pop the current node (asserted to be script) and restore the phase."""
        closed = self.tree.openElements.pop()
        assert closed.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Pop the current node and restore the original phase."""
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase

    # No start tag has a dedicated handler; everything hits the default.
    startTagHandler = _utils.MethodDispatcher([])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([("script", endTagScript)])
    endTagHandler.default = endTagOther
class InTablePhase(Phase):
    """Token handlers for the "in table" insertion mode.

    Table-structure tags are handled here. Everything else is forwarded to
    the "inBody" phase with ``tree.insertFromTable`` set for the duration of
    the call, and character data is buffered by the "inTableText" phase.
    """
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table
    __slots__ = tuple()

    # helper methods
    def clearStackToTableContext(self):
        """Pop open elements until the current node is table or html."""
        # "clear the stack back to a table context"
        while self.tree.openElements[-1].name not in ("table", "html"):
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name":  self.tree.openElements[-1].name})
            self.tree.openElements.pop()
        # When the current node is <html> it's an innerHTML case

    # processing methods
    def processEOF(self):
        """Report "eof-in-table" unless the current node is html (fragment case)."""
        if self.tree.openElements[-1].name != "html":
            self.parser.parseError("eof-in-table")
        else:
            assert self.parser.innerHTML
        # Stop parsing

    def processSpaceCharacters(self, token):
        """Switch to "inTableText" (remembering the current phase) and let it
        buffer the whitespace token."""
        originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["inTableText"]
        self.parser.phase.originalPhase = originalPhase
        self.parser.phase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Switch to "inTableText" (remembering the current phase) and let it
        buffer the character token."""
        originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["inTableText"]
        self.parser.phase.originalPhase = originalPhase
        self.parser.phase.processCharacters(token)

    def insertText(self, token):
        """Insert character data via the in-body rules with
        ``tree.insertFromTable`` enabled."""
        # If we get here there must be at least one non-whitespace character
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            self.parser.phases["inBody"].processCharacters(token)
        finally:
            # Never leave the flag set, even if the handler raises.
            self.tree.insertFromTable = False

    def startTagCaption(self, token):
        """Insert <caption>, push a formatting ``Marker``, go to "inCaption"."""
        self.clearStackToTableContext()
        self.tree.activeFormattingElements.append(Marker)
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCaption"]

    def startTagColgroup(self, token):
        """Insert <colgroup> and switch to "inColumnGroup"."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inColumnGroup"]

    def startTagCol(self, token):
        """Imply a <colgroup>, then return the token for reprocessing."""
        self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
        return token

    def startTagRowGroup(self, token):
        """Insert <tbody>/<tfoot>/<thead> and switch to "inTableBody"."""
        self.clearStackToTableContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inTableBody"]

    def startTagImplyTbody(self, token):
        """Imply a <tbody>, then return the token for reprocessing."""
        self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
        return token

    def startTagTable(self, token):
        """Handle a nested <table>: imply </table> (parse error).

        The token is returned for reprocessing except in the innerHTML case.
        """
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "table", "endName": "table"})
        self.parser.phase.processEndTag(impliedTagToken("table"))
        if not self.parser.innerHTML:
            return token

    def startTagStyleScript(self, token):
        """Delegate <style>/<script> to the "inHead" phase."""
        return self.parser.phases["inHead"].processStartTag(token)

    def startTagInput(self, token):
        """Handle <input>: ``type=hidden`` is inserted directly as a void
        element (parse error); any other input takes the generic path."""
        if ("type" in token["data"] and
                token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
            self.parser.parseError("unexpected-hidden-input-in-table")
            self.tree.insertElement(token)
            # XXX associate with form
            self.tree.openElements.pop()
            # input is a void element: acknowledge the self-closing flag like
            # the in-body void-element handlers do, so that "<input ... />"
            # is not reported as a non-void element with a trailing solidus.
            token["selfClosingAcknowledged"] = True
        else:
            return self.startTagOther(token)

    def startTagForm(self, token):
        """Handle <form> in a table (parse error): when no form pointer is
        set, insert the form, remember it and pop it again immediately."""
        self.parser.parseError("unexpected-form-in-table")
        if self.tree.formPointer is None:
            self.tree.insertElement(token)
            self.tree.formPointer = self.tree.openElements[-1]
            self.tree.openElements.pop()

    def startTagOther(self, token):
        """Process any other start tag with the in-body rules (parse error).

        The in-body handler's return value is propagated so that a token it
        hands back for reprocessing is not silently dropped.
        """
        self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            return self.parser.phases["inBody"].processStartTag(token)
        finally:
            # Never leave the flag set, even if the handler raises.
            self.tree.insertFromTable = False

    def endTagTable(self, token):
        """Handle </table>: pop through the table element and reset the
        insertion mode; with no table in table scope it is a parse error."""
        if self.tree.elementInScope("table", variant="table"):
            self.tree.generateImpliedEndTags()
            if self.tree.openElements[-1].name != "table":
                self.parser.parseError("end-tag-too-early-named",
                                       {"gotName": "table",
                                        "expectedName": self.tree.openElements[-1].name})
            while self.tree.openElements[-1].name != "table":
                self.tree.openElements.pop()
            self.tree.openElements.pop()
            self.parser.resetInsertionMode()
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagIgnore(self, token):
        """Ignore the end tag with a parse error."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        """Process any other end tag with the in-body rules (parse error),
        propagating the in-body handler's return value."""
        self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
        # Do the table magic!
        self.tree.insertFromTable = True
        try:
            return self.parser.phases["inBody"].processEndTag(token)
        finally:
            # Never leave the flag set, even if the handler raises.
            self.tree.insertFromTable = False

    # Start-tag dispatch table: tag name(s) -> handler defined above.
    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("caption", startTagCaption),
        ("colgroup", startTagColgroup),
        ("col", startTagCol),
        (("tbody", "tfoot", "thead"), startTagRowGroup),
        (("td", "th", "tr"), startTagImplyTbody),
        ("table", startTagTable),
        (("style", "script"), startTagStyleScript),
        ("input", startTagInput),
        ("form", startTagForm)
    ])
    startTagHandler.default = startTagOther

    # End-tag dispatch table: tag name(s) -> handler defined above.
    endTagHandler = _utils.MethodDispatcher([
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "tbody", "td",
          "tfoot", "th", "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
class InTableTextPhase(Phase):
    """Buffers character tokens seen inside a table until another token arrives.

    ``originalPhase`` is the phase to return to; ``characterTokens`` holds
    the tokens collected so far.
    """
    __slots__ = ("originalPhase", "characterTokens")

    def __init__(self, *args, **kwargs):
        """Start with no phase to return to and an empty buffer."""
        super(InTableTextPhase, self).__init__(*args, **kwargs)
        self.originalPhase = None
        self.characterTokens = []

    def flushCharacters(self):
        """Emit the buffered text and empty the buffer.

        Text containing any non-space character is handed to the "inTable"
        phase's ``insertText``; non-empty whitespace-only text is inserted
        directly into the tree.
        """
        text = "".join([pending["data"] for pending in self.characterTokens])
        if not all(char in spaceCharacters for char in text):
            self.parser.phases["inTable"].insertText(
                {"type": tokenTypes["Characters"], "data": text})
        elif text:
            self.tree.insertText(text)
        self.characterTokens = []

    def _flushAndRestorePhase(self):
        # Shared epilogue: emit buffered text, then hand control back to the
        # phase that was active before the text run started.
        self.flushCharacters()
        self.parser.phase = self.originalPhase

    def processComment(self, token):
        """Flush, restore the original phase, return the token for reprocessing."""
        self._flushAndRestorePhase()
        return token

    def processEOF(self):
        """Flush, restore the original phase and return True."""
        self._flushAndRestorePhase()
        return True

    def processCharacters(self, token):
        """Buffer a character token; a lone U+0000 token is dropped."""
        if token["data"] != "\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        """Buffer a whitespace token (InTablePhase.processSpaceCharacters
        forwards its tokens here)."""
        self.characterTokens.append(token)

    def processStartTag(self, token):
        """Flush, restore the original phase, return the token for reprocessing."""
        self._flushAndRestorePhase()
        return token

    def processEndTag(self, token):
        """Flush, restore the original phase, return the token for reprocessing."""
        self._flushAndRestorePhase()
        return token
class InCaptionPhase(Phase):
    """Phase for tokens seen inside a table caption."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    __slots__ = ()

    def ignoreEndTagCaption(self):
        """Return True when no "caption" element is in table scope."""
        captionInScope = self.tree.elementInScope("caption", variant="table")
        return not captionInScope

    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableElement(self, token):
        """Imply </caption>; return the token unless that end tag is ignored."""
        self.parser.parseError()
        # XXX Have to duplicate logic here to find out if the tag is ignored
        endTagIgnored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        return None if endTagIgnored else token

    def startTagOther(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagCaption(self, token):
        """Close the open caption and switch to the "inTable" phase."""
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return

        # AT this code is quite similar to endTagTable in "InTable"
        self.tree.generateImpliedEndTags()
        currentName = self.tree.openElements[-1].name
        if currentName != "caption":
            self.parser.parseError("expected-one-end-tag-but-got-another",
                                   {"gotName": "caption",
                                    "expectedName": currentName})
        # Pop everything above the caption, then the caption itself.
        stack = self.tree.openElements
        while stack[-1].name != "caption":
            stack.pop()
        stack.pop()
        self.tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Imply </caption>; return the token unless that end tag is ignored."""
        self.parser.parseError()
        endTagIgnored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        return None if endTagIgnored else token

    def endTagIgnore(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", errorVars)

    def endTagOther(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableElement)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("caption", endTagCaption),
        ("table", endTagTable),
        (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
          "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
class InColumnGroupPhase(Phase):
    """Phase for tokens seen inside a column group."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column
    __slots__ = ()

    def ignoreEndTagColgroup(self):
        """Return True when the current node is named "html"."""
        currentNode = self.tree.openElements[-1]
        return currentNode.name == "html"

    def _impliedEndColgroup(self, token):
        """Act as if </colgroup> was seen.

        Returns *token* so it gets reprocessed, or None when the implied
        end tag was ignored.
        """
        endTagIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if endTagIgnored else token

    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return None
        endTagIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if endTagIgnored else True

    def processCharacters(self, token):
        return self._impliedEndColgroup(token)

    def startTagCol(self, token):
        """Insert the element, pop it at once and acknowledge self-closing."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        return self._impliedEndColgroup(token)

    def endTagColgroup(self, token):
        """Pop the current node and switch to the "inTable" phase."""
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
            return
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def endTagCol(self, token):
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        return self._impliedEndColgroup(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("col", startTagCol)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("colgroup", endTagColgroup),
        ("col", endTagCol)
    ])
    endTagHandler.default = endTagOther
class InTableBodyPhase(Phase):
    """Phase for tokens seen inside a table row group."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    __slots__ = ()

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until a row group or "html" is the current node.

        No parse error is reported for the elements popped here.
        """
        stopNames = ("tbody", "tfoot", "thead", "html")
        while self.tree.openElements[-1].name not in stopNames:
            self.tree.openElements.pop()
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML

    def _rowGroupInTableScope(self):
        """Tell whether a tbody, thead or tfoot element is in table scope."""
        inScope = self.tree.elementInScope
        return (inScope("tbody", variant="table") or
                inScope("thead", variant="table") or
                inScope("tfoot", variant="table"))

    def _closeRowGroupAndReprocess(self, token):
        """Close the current row group and return *token* for reprocessing.

        When no row group is in table scope a parse error is reported and
        None is returned instead.
        """
        if not self._rowGroupInTableScope():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        currentName = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTr(self, token):
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        """Report the stray cell, imply a <tr> and reprocess the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-cell-in-table-body", errorVars)
        self.startTagTr(impliedTagToken("tr", "StartTag"))
        return token

    def startTagTableOther(self, token):
        # XXX AT Any ideas on how to share this with endTagTable?
        return self._closeRowGroupAndReprocess(token)

    def startTagOther(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTableRowGroup(self, token):
        name = token["name"]
        if not self.tree.elementInScope(name, variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": name})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        return self._closeRowGroupAndReprocess(token)

    def endTagIgnore(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-body", errorVars)

    def endTagOther(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("tr", startTagTr),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
         startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "td", "th",
          "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther
class InRowPhase(Phase):
    """Phase for tokens seen inside a table row."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    __slots__ = ()

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until "tr" or "html" is the current node.

        A parse error is reported for every element that gets popped.
        """
        while True:
            currentName = self.tree.openElements[-1].name
            if currentName in ("tr", "html"):
                break
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": currentName})
            self.tree.openElements.pop()

    def ignoreEndTagTr(self):
        """Return True when no "tr" element is in table scope."""
        rowInScope = self.tree.elementInScope("tr", variant="table")
        return not rowInScope

    def _impliedEndTr(self, token):
        """Act as if </tr> was seen.

        Returns *token* so it gets reprocessed, or None when the implied
        end tag was ignored.
        """
        endTagIgnored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # XXX how are we sure it's always ignored in the innerHTML case?
        return None if endTagIgnored else token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTableCell(self, token):
        """Insert the cell, enter "inCell" and push a formatting marker."""
        self.clearStackToTableRowContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        self.tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        return self._impliedEndTr(token)

    def startTagOther(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTr(self, token):
        """Pop the row and switch to the "inTableBody" phase."""
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, token):
        # Reprocess the current tag if the tr end tag was not ignored
        return self._impliedEndTr(token)

    def endTagTableRowGroup(self, token):
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return None
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-row", errorVars)

    def endTagOther(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
          "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("tr", endTagTr),
        ("table", endTagTable),
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        (("body", "caption", "col", "colgroup", "html", "td", "th"),
         endTagIgnore)
    ])
    endTagHandler.default = endTagOther
class InCellPhase(Phase):
    """Phase for tokens seen inside a table cell."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    __slots__ = ()

    # helper
    def closeCell(self):
        """Imply the end tag of the cell in table scope, "td" before "th"."""
        for cellName in ("td", "th"):
            if self.tree.elementInScope(cellName, variant="table"):
                self.endTagTableCell(impliedTagToken(cellName))
                break

    # the rest
    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableOther(self, token):
        """Close the open cell and return the token for reprocessing."""
        cellInScope = (self.tree.elementInScope("td", variant="table") or
                       self.tree.elementInScope("th", variant="table"))
        if cellInScope:
            self.closeCell()
            return token
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def startTagOther(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell and switch to the "inRow" phase."""
        name = token["name"]
        if not self.tree.elementInScope(name, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": name})
            return

        self.tree.generateImpliedEndTags(name)
        if self.tree.openElements[-1].name == name:
            self.tree.openElements.pop()
        else:
            self.parser.parseError("unexpected-cell-end-tag", {"name": name})
            # Pop up to and including the element with the requested name.
            popped = self.tree.openElements.pop()
            while popped.name != name:
                popped = self.tree.openElements.pop()
        self.tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", errorVars)

    def endTagImply(self, token):
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def endTagOther(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("td", "th"), endTagTableCell),
        (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
        (("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
    ])
    endTagHandler.default = endTagOther
class InSelectPhase(Phase):
    """Phase for tokens seen inside a select element."""
    __slots__ = ()

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        # A lone NUL character token is dropped.
        if token["data"] != "\u0000":
            self.tree.insertText(token["data"])

    def startTagOption(self, token):
        # We need to imply </option> if <option> is the current node.
        stack = self.tree.openElements
        if stack[-1].name == "option":
            stack.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        # An open option, and then an open optgroup, are closed first.
        stack = self.tree.openElements
        if stack[-1].name == "option":
            stack.pop()
        if stack[-1].name == "optgroup":
            stack.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        """Close the select, if in scope, and reprocess the token."""
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return None
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagOther(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-select", errorVars)

    def endTagOption(self, token):
        if self.tree.openElements[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
        else:
            self.tree.openElements.pop()

    def endTagOptgroup(self, token):
        stack = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if stack[-1].name == "option" and stack[-2].name == "optgroup":
            stack.pop()
        # It also closes </optgroup>
        if stack[-1].name == "optgroup":
            stack.pop()
        # But nothing else
        else:
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})

    def endTagSelect(self, token):
        """Pop up to and including the select, then reset the insertion mode."""
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        while self.tree.openElements.pop().name != "select":
            pass
        self.parser.resetInsertionMode()

    def endTagOther(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-select", errorVars)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("option", startTagOption),
        ("optgroup", startTagOptgroup),
        ("select", startTagSelect),
        (("input", "keygen", "textarea"), startTagInput),
        ("script", startTagScript)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("option", endTagOption),
        ("optgroup", endTagOptgroup),
        ("select", endTagSelect)
    ])
    endTagHandler.default = endTagOther
class InSelectInTablePhase(Phase):
    """Variant of the "inSelect" phase that reacts to table-related tags.

    Everything except the listed table tags is forwarded to "inSelect".
    """
    __slots__ = ()

    def processEOF(self):
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processCharacters(token)

    def startTagTable(self, token):
        """Report the tag, imply </select> and reprocess the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError(
            "unexpected-table-element-start-tag-in-select-in-table", errorVars)
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processStartTag(token)

    def endTagTable(self, token):
        """Report the tag; imply </select> only if the element is in table scope."""
        name = token["name"]
        self.parser.parseError(
            "unexpected-table-element-end-tag-in-select-in-table",
            {"name": name})
        if not self.tree.elementInScope(name, variant="table"):
            return None
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         startTagTable)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         endTagTable)
    ])
    endTagHandler.default = endTagOther
class InForeignContentPhase(Phase):
    """Phase for tokens seen inside foreign (MathML / SVG) content."""
    __slots__ = tuple()

    # Start tags with these names are reported as errors in foreign content
    # and cause foreign elements to be popped before the tag is reprocessed.
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # Lower-case SVG tag names mapped to their mixed-case spelling. Kept at
    # class level so the table is built once instead of on every call to
    # adjustSVGTagNames.
    _svgTagNameReplacements = {
        "altglyph": "altGlyph",
        "altglyphdef": "altGlyphDef",
        "altglyphitem": "altGlyphItem",
        "animatecolor": "animateColor",
        "animatemotion": "animateMotion",
        "animatetransform": "animateTransform",
        "clippath": "clipPath",
        "feblend": "feBlend",
        "fecolormatrix": "feColorMatrix",
        "fecomponenttransfer": "feComponentTransfer",
        "fecomposite": "feComposite",
        "feconvolvematrix": "feConvolveMatrix",
        "fediffuselighting": "feDiffuseLighting",
        "fedisplacementmap": "feDisplacementMap",
        "fedistantlight": "feDistantLight",
        "feflood": "feFlood",
        "fefunca": "feFuncA",
        "fefuncb": "feFuncB",
        "fefuncg": "feFuncG",
        "fefuncr": "feFuncR",
        "fegaussianblur": "feGaussianBlur",
        "feimage": "feImage",
        "femerge": "feMerge",
        "femergenode": "feMergeNode",
        "femorphology": "feMorphology",
        "feoffset": "feOffset",
        "fepointlight": "fePointLight",
        "fespecularlighting": "feSpecularLighting",
        "fespotlight": "feSpotLight",
        "fetile": "feTile",
        "feturbulence": "feTurbulence",
        "foreignobject": "foreignObject",
        "glyphref": "glyphRef",
        "lineargradient": "linearGradient",
        "radialgradient": "radialGradient",
        "textpath": "textPath",
    }

    def adjustSVGTagNames(self, token):
        """Replace a lower-cased SVG tag name in *token* by its proper case.

        Tokens whose name is not in the replacement table are left as-is.
        """
        adjusted = self._svgTagNameReplacements.get(token["name"])
        if adjusted is not None:
            token["name"] = adjusted

    def processCharacters(self, token):
        """Process character data, replacing a lone NUL by U+FFFD.

        Any other data containing a non-space character clears the
        parser's ``framesetOK`` flag.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Insert a foreign element, or break out of foreign content.

        Returns the token for reprocessing when it is a breakout element
        (or a ``font`` tag with a color, face or size attribute); otherwise
        the element is inserted in the current node's namespace and nothing
        is returned.
        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
                (token["name"] == "font" and
                 set(token["data"].keys()) & {"color", "face", "size"})):
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            # Pop until the current node is in the default namespace or is
            # an HTML / MathML text integration point.
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        else:
            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Close the matching foreign element, or delegate the end tag.

        The stack of open elements is walked downwards from the current
        node. If a node whose lower-cased name equals the tag name is found,
        everything up to and including it is popped and None is returned.
        If a node in the default namespace is reached first, the token is
        handed to the parser's current phase and its result is returned.
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token
class AfterBodyPhase(Phase):
    """Phase for tokens seen after the body has been closed."""
    __slots__ = ()

    def _reprocessInBody(self, token):
        """Switch to the "inBody" phase and return *token* for reprocessing."""
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEOF(self):
        # Stop parsing
        pass

    def processComment(self, token):
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        firstOpenElement = self.tree.openElements[0]
        self.tree.insertComment(token, firstOpenElement)

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-after-body")
        return self._reprocessInBody(token)

    def startTagHtml(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def startTagOther(self, token):
        self.parser.parseError("unexpected-start-tag-after-body",
                               {"name": token["name"]})
        return self._reprocessInBody(token)

    def endTagHtml(self, name):
        # NOTE: despite its name, the ``name`` parameter is filled by the
        # end-tag dispatcher just like ``token`` in the other handlers.
        if not self.parser.innerHTML:
            self.parser.phase = self.parser.phases["afterAfterBody"]
        else:
            self.parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag-after-body",
                               {"name": token["name"]})
        return self._reprocessInBody(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
    endTagHandler.default = endTagOther
class InFramesetPhase(Phase):
    """Phase for tokens seen inside a frameset element."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    __slots__ = tuple()

    def processEOF(self):
        if self.tree.openElements[-1].name != "html":
            self.parser.parseError("eof-in-frameset")
        else:
            assert self.parser.innerHTML

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        """Insert the frame element and pop it again straight away.

        The self-closing flag is acknowledged, as is done for ``col`` in
        the column-group phase, so that ``<frame/>`` is not reported as a
        non-void element with a trailing solidus.
        """
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagNoframes(self, token):
        return self.parser.phases["inBody"].processStartTag(token)

    def startTagOther(self, token):
        self.parser.parseError("unexpected-start-tag-in-frameset",
                               {"name": token["name"]})

    def endTagFrameset(self, token):
        """Pop the current frameset; maybe switch to "afterFrameset"."""
        if self.tree.openElements[-1].name == "html":
            # innerHTML case
            self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        else:
            self.tree.openElements.pop()
        if (not self.parser.innerHTML and
                self.tree.openElements[-1].name != "frameset"):
            # If we're not in innerHTML mode and the current node is not a
            # "frameset" element (anymore) then switch.
            self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag-in-frameset",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("frameset", startTagFrameset),
        ("frame", startTagFrame),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("frameset", endTagFrameset)
    ])
    endTagHandler.default = endTagOther
class AfterFramesetPhase(Phase):
    """Phase for tokens seen after the outermost frameset was closed."""
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    __slots__ = ()

    def processEOF(self):
        # Stop parsing
        pass

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagOther(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset",
                               errorVars)

    def endTagHtml(self, token):
        nextPhase = self.parser.phases["afterAfterFrameset"]
        self.parser.phase = nextPhase

    def endTagOther(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset",
                               errorVars)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("html", endTagHtml)
    ])
    endTagHandler.default = endTagOther
class AfterAfterBodyPhase(Phase):
    """Phase entered from "afterBody" once the html end tag was seen.

    Characters and tags are reported as errors and reprocessed by "inBody".
    """
    __slots__ = ()

    def _reprocessInBody(self, token):
        """Switch to the "inBody" phase and return *token* for reprocessing."""
        self.parser.phase = self.parser.phases["inBody"]
        return token

    def processEOF(self):
        pass

    def processComment(self, token):
        # Comments are attached to the document itself.
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        self.parser.parseError("expected-eof-but-got-char")
        return self._reprocessInBody(token)

    def startTagHtml(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def startTagOther(self, token):
        self.parser.parseError("expected-eof-but-got-start-tag",
                               {"name": token["name"]})
        return self._reprocessInBody(token)

    def processEndTag(self, token):
        self.parser.parseError("expected-eof-but-got-end-tag",
                               {"name": token["name"]})
        return self._reprocessInBody(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml)
    ])
    startTagHandler.default = startTagOther
class AfterAfterFramesetPhase(Phase):
    """Phase entered from "afterFrameset" once the html end tag was seen.

    Unexpected characters and tags are reported but the phase is kept.
    """
    __slots__ = ()

    def processEOF(self):
        pass

    def processComment(self, token):
        # Comments are attached to the document itself.
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def startTagNoFrames(self, token):
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagOther(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", errorVars)

    def processEndTag(self, token):
        errorVars = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", errorVars)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("noframes", startTagNoFrames)
    ])
    startTagHandler.default = startTagOther
2745# pylint:enable=unused-argument
# Phase name -> phase class. The keys are the same names the phase classes
# use when they look up another phase through ``parser.phases[...]``.
_phases = {
    "initial": InitialPhase,
    "beforeHtml": BeforeHtmlPhase,
    "beforeHead": BeforeHeadPhase,
    "inHead": InHeadPhase,
    "inHeadNoscript": InHeadNoscriptPhase,
    "afterHead": AfterHeadPhase,
    "inBody": InBodyPhase,
    "text": TextPhase,
    "inTable": InTablePhase,
    "inTableText": InTableTextPhase,
    "inCaption": InCaptionPhase,
    "inColumnGroup": InColumnGroupPhase,
    "inTableBody": InTableBodyPhase,
    "inRow": InRowPhase,
    "inCell": InCellPhase,
    "inSelect": InSelectPhase,
    "inSelectInTable": InSelectInTablePhase,
    "inForeignContent": InForeignContentPhase,
    "afterBody": AfterBodyPhase,
    "inFrameset": InFramesetPhase,
    "afterFrameset": AfterFramesetPhase,
    "afterAfterBody": AfterAfterBodyPhase,
    "afterAfterFrameset": AfterAfterFramesetPhase,
}
def adjust_attributes(token, replacements):
    """Rename attributes of *token* according to the *replacements* mapping.

    ``token['data']`` is rebuilt, as a mapping of the same type, only when
    at least one of its keys occurs in *replacements*; otherwise the token
    is left untouched. Nothing is returned.
    """
    attributes = token['data']
    if not (viewkeys(attributes) & viewkeys(replacements)):
        return
    renamed = ((replacements.get(name, name), value)
               for name, value in attributes.items())
    token['data'] = type(attributes)(renamed)
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict for a tag that was not in the input.

    :arg name: the tag name stored under ``"name"``

    :arg type: key into ``tokenTypes`` selecting the token type

    :arg attributes: mapping stored under ``"data"``; a fresh empty dict
        is used when omitted

    :arg selfClosing: value stored under ``"selfClosing"``

    :returns: the token dict
    """
    data = {} if attributes is None else attributes
    return {
        "type": tokenTypes[type],
        "name": name,
        "data": data,
        "selfClosing": selfClosing,
    }
class ParseError(Exception):
    """Exception type representing an error in a parsed document."""