1from __future__ import absolute_import, division, unicode_literals
2from six import with_metaclass, viewkeys
3
4import types
5
6from . import _inputstream
7from . import _tokenizer
8
9from . import treebuilders
10from .treebuilders.base import Marker
11
12from . import _utils
13from .constants import (
14 spaceCharacters, asciiUpper2Lower,
15 specialElements, headingElements, cdataElements, rcdataElements,
16 tokenTypes, tagTokenTypes,
17 namespaces,
18 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,
19 adjustForeignAttributes as adjustForeignAttributesMap,
20 adjustMathMLAttributes, adjustSVGAttributes,
21 E,
22 _ReparseException
23)
24
25
def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML document as a string or file-like object into a tree

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: name of the treebuilder to use; it is resolved through
        ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any further keyword arguments are handed on to ``HTMLParser.parse``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, **kwargs)
47
48
def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: name of the treebuilder to use; it is resolved through
        ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any further keyword arguments are handed on to
    ``HTMLParser.parseFragment``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    builder = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builder, namespaceHTMLElements=namespaceHTMLElements)
    return parser.parseFragment(doc, container=container, **kwargs)
72
73
def method_decorator_metaclass(function):
    """Build a metaclass that applies ``function`` to every plain function
    found in the namespace of a class as that class is created.

    :arg function: decorator called with each function attribute; its return
        value replaces the attribute

    :returns: the metaclass (a subclass of ``type``)
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Snapshot the names first; only existing keys are re-assigned.
            for attributeName in list(classDict):
                member = classDict[attributeName]
                if isinstance(member, types.FunctionType):
                    classDict[attributeName] = function(member)
            return type.__new__(meta, classname, bases, classDict)
    return Decorated
84
85
class HTMLParser(object):
    """HTML parser

    Builds a tree structure out of a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned. Built in treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType)

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib import treebuilders
        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()  # generates parser with etree builder
        >>> lxml_builder = treebuilders.getTreeBuilder('lxml')
        >>> parser = HTMLParser(lxml_builder, strict=True)  # strict parser using the lxml builder

        """

        # Raise an exception on the first error encountered
        self.strict = strict

        # ``tree`` is a class (it is instantiated below), not a name.
        builder = treebuilders.getTreeBuilder("etree") if tree is None else tree
        self.tree = builder(namespaceHTMLElements)
        self.errors = []

        # One instance of every phase class, all sharing this parser and tree.
        phaseClasses = getPhases(debug)
        self.phases = {name: cls(self, self.tree)
                       for name, cls in phaseClasses.items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Shared driver behind :meth:`parse` and :meth:`parseFragment`.

        :arg stream: passed to the tokenizer as its input

        :arg innerHTML: true when parsing a fragment

        :arg container: name of the context element for fragment parsing

        :arg scripting: stored on the parser for the phases to consult

        Remaining keyword arguments are forwarded to the tokenizer.
        """
        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            # Throw away everything built so far and run once more from the
            # start; a second _ReparseException is not caught.
            self.reset()
            self.mainLoop()

    def reset(self):
        """Put the tree and all per-parse state back to their initial values
        and pick the starting phase.
        """
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if not self.innerHTMLMode:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]
        else:
            context = self.container.lower()
            self.innerHTML = context

            tokenizer = self.tokenizer
            if context in cdataElements:
                tokenizer.state = tokenizer.rcdataState
            elif context in rcdataElements:
                tokenizer.state = tokenizer.rawtextState
            elif context == 'plaintext':
                tokenizer.state = tokenizer.plaintextState
            # otherwise: state already is data state, nothing to set

            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if hasattr(self, 'tokenizer'):
            return self.tokenizer.stream.charEncoding[0].name
        return None

    def isHTMLIntegrationPoint(self, element):
        """Return whether ``element`` is an HTML integration point.

        A MathML ``annotation-xml`` element qualifies only through its
        ``encoding`` attribute; every other element is looked up in
        ``htmlIntegrationPointElements``.
        """
        isAnnotationXml = (element.name == "annotation-xml" and
                           element.namespace == namespaces["mathml"])
        if not isAnnotationXml:
            return (element.namespace, element.name) in htmlIntegrationPointElements

        if "encoding" not in element.attributes:
            return False
        # ASCII case-insensitive comparison of the attribute value
        encoding = element.attributes["encoding"].translate(asciiUpper2Lower)
        return encoding in ("text/html", "application/xhtml+xml")

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether ``element`` is listed in
        ``mathmlTextIntegrationPointElements``.
        """
        key = (element.namespace, element.name)
        return key in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Hand every token from the tokenizer to the phase that should
        process it, then run end-of-file processing.
        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        textTokens = (CharactersToken, SpaceCharactersToken)
        integrationTokens = (StartTagToken, CharactersToken, SpaceCharactersToken)
        mathmlTextExceptions = frozenset(["mglyph", "malignmark"])

        for token in self.tokenizer:
            prev_token = None
            new_token = token
            # A phase method returns a token when it wants that token to be
            # processed again (typically after switching phase), else None.
            while new_token is not None:
                prev_token = new_token
                openElements = self.tree.openElements
                currentNode = openElements[-1] if openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                tokenType = new_token["type"]

                if tokenType == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                    continue

                # Decide between the current phase and the foreign-content
                # phase; the checks run in this order and stop at the first
                # one that holds.
                if len(openElements) == 0:
                    useCurrentPhase = True
                elif currentNodeNamespace == self.tree.defaultNamespace:
                    useCurrentPhase = True
                elif (self.isMathMLTextIntegrationPoint(currentNode) and
                      ((tokenType == StartTagToken and
                        token["name"] not in mathmlTextExceptions) or
                       tokenType in textTokens)):
                    useCurrentPhase = True
                elif (currentNodeNamespace == namespaces["mathml"] and
                      currentNodeName == "annotation-xml" and
                      tokenType == StartTagToken and
                      token["name"] == "svg"):
                    useCurrentPhase = True
                else:
                    useCurrentPhase = (self.isHTMLIntegrationPoint(currentNode) and
                                       tokenType in integrationTokens)

                if useCurrentPhase:
                    phase = self.phase
                else:
                    phase = self.phases["inForeignContent"]

                if tokenType == CharactersToken:
                    new_token = phase.processCharacters(new_token)
                elif tokenType == SpaceCharactersToken:
                    new_token = phase.processSpaceCharacters(new_token)
                elif tokenType == StartTagToken:
                    new_token = phase.processStartTag(new_token)
                elif tokenType == EndTagToken:
                    new_token = phase.processEndTag(new_token)
                elif tokenType == CommentToken:
                    new_token = phase.processComment(new_token)
                elif tokenType == DoctypeToken:
                    new_token = phase.processDoctype(new_token)

            # A self-closing flag nobody acknowledged is a parse error.
            if (tokenType == StartTagToken and prev_token["selfClosing"] and
                    not prev_token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        visited = []
        while True:
            visited.append(self.phase)
            if not self.phase.processEOF():
                break
            # A request to reprocess EOF must have moved us to a new phase.
            assert self.phase not in visited

    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding. If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        # innerHTML=False, container=None: whole-document mode
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment

        :arg container: name of the element we're setting the innerHTML
            property of; 'div' is used when it is not given

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding. If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('<b>this is a fragment</b>')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        # innerHTML=True: fragment mode
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error and, in strict mode, raise ``ParseError``.

        :arg errorcode: key identifying the error

        :arg datavars: values for the error message; an empty dict is used
            when omitted
        """
        # XXX The idea is to make errorcode mandatory.
        datavars = {} if datavars is None else datavars
        position = self.tokenizer.stream.position()
        self.errors.append((position, errorcode, datavars))
        if not self.strict:
            return
        raise ParseError(E[errorcode] % datavars)

    def adjustMathMLAttributes(self, token):
        """Run ``adjust_attributes`` on ``token`` with the MathML table."""
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        """Run ``adjust_attributes`` on ``token`` with the SVG table."""
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        """Run ``adjust_attributes`` on ``token`` with the foreign-attribute
        table.
        """
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # pylint:disable=unused-argument
        # NOTE(review): nothing in this class assigns ``self.parser``, so this
        # looks like it would raise AttributeError if it were ever called -
        # confirm whether it is dead code before relying on it.
        self.parser.phase()

    def resetInsertionMode(self):
        """Choose ``self.phase`` from the stack of open elements, looking from
        the innermost element outwards.
        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        innerHTMLOnly = ("select", "colgroup", "head", "html")
        for node in reversed(self.tree.openElements):
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                # Bottom of the stack: the fragment context stands in for it.
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in innerHTMLOnly:
                assert self.innerHTML

            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            modeName = newModes.get(nodeName)
            if modeName is not None:
                new_phase = self.phases[modeName]
                break
            if last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Generic RCDATA/RAWTEXT parsing algorithm.

        Inserts the element for ``token``, switches the tokenizer state
        according to ``contentType`` ("RAWTEXT" or "RCDATA"), remembers the
        current phase in ``originalPhase`` and moves to the "text" phase.
        """
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        tokenizer = self.tokenizer
        if contentType == "RAWTEXT":
            nextState = tokenizer.rawtextState
        else:
            nextState = tokenizer.rcdataState
        tokenizer.state = nextState

        self.originalPhase = self.phase

        self.phase = self.phases["text"]
394
395
396@_utils.memoize
397def getPhases(debug):
398 def log(function):
399 """Logger that records which phase processes each token"""
400 type_names = {value: key for key, value in tokenTypes.items()}
401
402 def wrapped(self, *args, **kwargs):
403 if function.__name__.startswith("process") and len(args) > 0:
404 token = args[0]
405 info = {"type": type_names[token['type']]}
406 if token['type'] in tagTokenTypes:
407 info["name"] = token['name']
408
409 self.parser.log.append((self.parser.tokenizer.state.__name__,
410 self.parser.phase.__class__.__name__,
411 self.__class__.__name__,
412 function.__name__,
413 info))
414 return function(self, *args, **kwargs)
415 else:
416 return function(self, *args, **kwargs)
417 return wrapped
418
419 def getMetaclass(use_metaclass, metaclass_func):
420 if use_metaclass:
421 return method_decorator_metaclass(metaclass_func)
422 else:
423 return type
424
425 # pylint:disable=unused-argument
426 class Phase(with_metaclass(getMetaclass(debug, log))):
427 """Base class for helper object that implements each phase of processing
428 """
429 __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
430
431 def __init__(self, parser, tree):
432 self.parser = parser
433 self.tree = tree
434 self.__startTagCache = {}
435 self.__endTagCache = {}
436
437 def processEOF(self):
438 raise NotImplementedError
439
440 def processComment(self, token):
441 # For most phases the following is correct. Where it's not it will be
442 # overridden.
443 self.tree.insertComment(token, self.tree.openElements[-1])
444
445 def processDoctype(self, token):
446 self.parser.parseError("unexpected-doctype")
447
448 def processCharacters(self, token):
449 self.tree.insertText(token["data"])
450
451 def processSpaceCharacters(self, token):
452 self.tree.insertText(token["data"])
453
454 def processStartTag(self, token):
455 # Note the caching is done here rather than BoundMethodDispatcher as doing it there
456 # requires a circular reference to the Phase, and this ends up with a significant
457 # (CPython 2.7, 3.8) GC cost when parsing many short inputs
458 name = token["name"]
459 # In Py2, using `in` is quicker in general than try/except KeyError
460 # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
461 if name in self.__startTagCache:
462 func = self.__startTagCache[name]
463 else:
464 func = self.__startTagCache[name] = self.startTagHandler[name]
465 # bound the cache size in case we get loads of unknown tags
466 while len(self.__startTagCache) > len(self.startTagHandler) * 1.1:
467 # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
468 self.__startTagCache.pop(next(iter(self.__startTagCache)))
469 return func(token)
470
471 def startTagHtml(self, token):
472 if not self.parser.firstStartTag and token["name"] == "html":
473 self.parser.parseError("non-html-root")
474 # XXX Need a check here to see if the first start tag token emitted is
475 # this token... If it's not, invoke self.parser.parseError().
476 for attr, value in token["data"].items():
477 if attr not in self.tree.openElements[0].attributes:
478 self.tree.openElements[0].attributes[attr] = value
479 self.parser.firstStartTag = False
480
481 def processEndTag(self, token):
482 # Note the caching is done here rather than BoundMethodDispatcher as doing it there
483 # requires a circular reference to the Phase, and this ends up with a significant
484 # (CPython 2.7, 3.8) GC cost when parsing many short inputs
485 name = token["name"]
486 # In Py2, using `in` is quicker in general than try/except KeyError
487 # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
488 if name in self.__endTagCache:
489 func = self.__endTagCache[name]
490 else:
491 func = self.__endTagCache[name] = self.endTagHandler[name]
492 # bound the cache size in case we get loads of unknown tags
493 while len(self.__endTagCache) > len(self.endTagHandler) * 1.1:
494 # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
495 self.__endTagCache.pop(next(iter(self.__endTagCache)))
496 return func(token)
497
498 class InitialPhase(Phase):
499 __slots__ = tuple()
500
501 def processSpaceCharacters(self, token):
502 pass
503
504 def processComment(self, token):
505 self.tree.insertComment(token, self.tree.document)
506
507 def processDoctype(self, token):
508 name = token["name"]
509 publicId = token["publicId"]
510 systemId = token["systemId"]
511 correct = token["correct"]
512
513 if (name != "html" or publicId is not None or
514 systemId is not None and systemId != "about:legacy-compat"):
515 self.parser.parseError("unknown-doctype")
516
517 if publicId is None:
518 publicId = ""
519
520 self.tree.insertDoctype(token)
521
522 if publicId != "":
523 publicId = publicId.translate(asciiUpper2Lower)
524
525 if (not correct or token["name"] != "html" or
526 publicId.startswith(
527 ("+//silmaril//dtd html pro v0r11 19970101//",
528 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
529 "-//as//dtd html 3.0 aswedit + extensions//",
530 "-//ietf//dtd html 2.0 level 1//",
531 "-//ietf//dtd html 2.0 level 2//",
532 "-//ietf//dtd html 2.0 strict level 1//",
533 "-//ietf//dtd html 2.0 strict level 2//",
534 "-//ietf//dtd html 2.0 strict//",
535 "-//ietf//dtd html 2.0//",
536 "-//ietf//dtd html 2.1e//",
537 "-//ietf//dtd html 3.0//",
538 "-//ietf//dtd html 3.2 final//",
539 "-//ietf//dtd html 3.2//",
540 "-//ietf//dtd html 3//",
541 "-//ietf//dtd html level 0//",
542 "-//ietf//dtd html level 1//",
543 "-//ietf//dtd html level 2//",
544 "-//ietf//dtd html level 3//",
545 "-//ietf//dtd html strict level 0//",
546 "-//ietf//dtd html strict level 1//",
547 "-//ietf//dtd html strict level 2//",
548 "-//ietf//dtd html strict level 3//",
549 "-//ietf//dtd html strict//",
550 "-//ietf//dtd html//",
551 "-//metrius//dtd metrius presentational//",
552 "-//microsoft//dtd internet explorer 2.0 html strict//",
553 "-//microsoft//dtd internet explorer 2.0 html//",
554 "-//microsoft//dtd internet explorer 2.0 tables//",
555 "-//microsoft//dtd internet explorer 3.0 html strict//",
556 "-//microsoft//dtd internet explorer 3.0 html//",
557 "-//microsoft//dtd internet explorer 3.0 tables//",
558 "-//netscape comm. corp.//dtd html//",
559 "-//netscape comm. corp.//dtd strict html//",
560 "-//o'reilly and associates//dtd html 2.0//",
561 "-//o'reilly and associates//dtd html extended 1.0//",
562 "-//o'reilly and associates//dtd html extended relaxed 1.0//",
563 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
564 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
565 "-//spyglass//dtd html 2.0 extended//",
566 "-//sq//dtd html 2.0 hotmetal + extensions//",
567 "-//sun microsystems corp.//dtd hotjava html//",
568 "-//sun microsystems corp.//dtd hotjava strict html//",
569 "-//w3c//dtd html 3 1995-03-24//",
570 "-//w3c//dtd html 3.2 draft//",
571 "-//w3c//dtd html 3.2 final//",
572 "-//w3c//dtd html 3.2//",
573 "-//w3c//dtd html 3.2s draft//",
574 "-//w3c//dtd html 4.0 frameset//",
575 "-//w3c//dtd html 4.0 transitional//",
576 "-//w3c//dtd html experimental 19960712//",
577 "-//w3c//dtd html experimental 970421//",
578 "-//w3c//dtd w3 html//",
579 "-//w3o//dtd w3 html 3.0//",
580 "-//webtechs//dtd mozilla html 2.0//",
581 "-//webtechs//dtd mozilla html//")) or
582 publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
583 "-/w3c/dtd html 4.0 transitional/en",
584 "html") or
585 publicId.startswith(
586 ("-//w3c//dtd html 4.01 frameset//",
587 "-//w3c//dtd html 4.01 transitional//")) and
588 systemId is None or
589 systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
590 self.parser.compatMode = "quirks"
591 elif (publicId.startswith(
592 ("-//w3c//dtd xhtml 1.0 frameset//",
593 "-//w3c//dtd xhtml 1.0 transitional//")) or
594 publicId.startswith(
595 ("-//w3c//dtd html 4.01 frameset//",
596 "-//w3c//dtd html 4.01 transitional//")) and
597 systemId is not None):
598 self.parser.compatMode = "limited quirks"
599
600 self.parser.phase = self.parser.phases["beforeHtml"]
601
602 def anythingElse(self):
603 self.parser.compatMode = "quirks"
604 self.parser.phase = self.parser.phases["beforeHtml"]
605
606 def processCharacters(self, token):
607 self.parser.parseError("expected-doctype-but-got-chars")
608 self.anythingElse()
609 return token
610
611 def processStartTag(self, token):
612 self.parser.parseError("expected-doctype-but-got-start-tag",
613 {"name": token["name"]})
614 self.anythingElse()
615 return token
616
617 def processEndTag(self, token):
618 self.parser.parseError("expected-doctype-but-got-end-tag",
619 {"name": token["name"]})
620 self.anythingElse()
621 return token
622
623 def processEOF(self):
624 self.parser.parseError("expected-doctype-but-got-eof")
625 self.anythingElse()
626 return True
627
628 class BeforeHtmlPhase(Phase):
629 __slots__ = tuple()
630
631 # helper methods
632 def insertHtmlElement(self):
633 self.tree.insertRoot(impliedTagToken("html", "StartTag"))
634 self.parser.phase = self.parser.phases["beforeHead"]
635
636 # other
637 def processEOF(self):
638 self.insertHtmlElement()
639 return True
640
641 def processComment(self, token):
642 self.tree.insertComment(token, self.tree.document)
643
644 def processSpaceCharacters(self, token):
645 pass
646
647 def processCharacters(self, token):
648 self.insertHtmlElement()
649 return token
650
651 def processStartTag(self, token):
652 if token["name"] == "html":
653 self.parser.firstStartTag = True
654 self.insertHtmlElement()
655 return token
656
657 def processEndTag(self, token):
658 if token["name"] not in ("head", "body", "html", "br"):
659 self.parser.parseError("unexpected-end-tag-before-html",
660 {"name": token["name"]})
661 else:
662 self.insertHtmlElement()
663 return token
664
665 class BeforeHeadPhase(Phase):
666 __slots__ = tuple()
667
668 def processEOF(self):
669 self.startTagHead(impliedTagToken("head", "StartTag"))
670 return True
671
672 def processSpaceCharacters(self, token):
673 pass
674
675 def processCharacters(self, token):
676 self.startTagHead(impliedTagToken("head", "StartTag"))
677 return token
678
679 def startTagHtml(self, token):
680 return self.parser.phases["inBody"].processStartTag(token)
681
682 def startTagHead(self, token):
683 self.tree.insertElement(token)
684 self.tree.headPointer = self.tree.openElements[-1]
685 self.parser.phase = self.parser.phases["inHead"]
686
687 def startTagOther(self, token):
688 self.startTagHead(impliedTagToken("head", "StartTag"))
689 return token
690
691 def endTagImplyHead(self, token):
692 self.startTagHead(impliedTagToken("head", "StartTag"))
693 return token
694
695 def endTagOther(self, token):
696 self.parser.parseError("end-tag-after-implied-root",
697 {"name": token["name"]})
698
699 startTagHandler = _utils.MethodDispatcher([
700 ("html", startTagHtml),
701 ("head", startTagHead)
702 ])
703 startTagHandler.default = startTagOther
704
705 endTagHandler = _utils.MethodDispatcher([
706 (("head", "body", "html", "br"), endTagImplyHead)
707 ])
708 endTagHandler.default = endTagOther
709
710 class InHeadPhase(Phase):
711 __slots__ = tuple()
712
713 # the real thing
714 def processEOF(self):
715 self.anythingElse()
716 return True
717
718 def processCharacters(self, token):
719 self.anythingElse()
720 return token
721
722 def startTagHtml(self, token):
723 return self.parser.phases["inBody"].processStartTag(token)
724
725 def startTagHead(self, token):
726 self.parser.parseError("two-heads-are-not-better-than-one")
727
728 def startTagBaseLinkCommand(self, token):
729 self.tree.insertElement(token)
730 self.tree.openElements.pop()
731 token["selfClosingAcknowledged"] = True
732
733 def startTagMeta(self, token):
734 self.tree.insertElement(token)
735 self.tree.openElements.pop()
736 token["selfClosingAcknowledged"] = True
737
738 attributes = token["data"]
739 if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
740 if "charset" in attributes:
741 self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
742 elif ("content" in attributes and
743 "http-equiv" in attributes and
744 attributes["http-equiv"].lower() == "content-type"):
745 # Encoding it as UTF-8 here is a hack, as really we should pass
746 # the abstract Unicode string, and just use the
747 # ContentAttrParser on that, but using UTF-8 allows all chars
748 # to be encoded and as a ASCII-superset works.
749 data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
750 parser = _inputstream.ContentAttrParser(data)
751 codec = parser.parse()
752 self.parser.tokenizer.stream.changeEncoding(codec)
753
754 def startTagTitle(self, token):
755 self.parser.parseRCDataRawtext(token, "RCDATA")
756
757 def startTagNoFramesStyle(self, token):
758 # Need to decide whether to implement the scripting-disabled case
759 self.parser.parseRCDataRawtext(token, "RAWTEXT")
760
761 def startTagNoscript(self, token):
762 if self.parser.scripting:
763 self.parser.parseRCDataRawtext(token, "RAWTEXT")
764 else:
765 self.tree.insertElement(token)
766 self.parser.phase = self.parser.phases["inHeadNoscript"]
767
768 def startTagScript(self, token):
769 self.tree.insertElement(token)
770 self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
771 self.parser.originalPhase = self.parser.phase
772 self.parser.phase = self.parser.phases["text"]
773
774 def startTagOther(self, token):
775 self.anythingElse()
776 return token
777
778 def endTagHead(self, token):
779 node = self.parser.tree.openElements.pop()
780 assert node.name == "head", "Expected head got %s" % node.name
781 self.parser.phase = self.parser.phases["afterHead"]
782
783 def endTagHtmlBodyBr(self, token):
784 self.anythingElse()
785 return token
786
787 def endTagOther(self, token):
788 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
789
790 def anythingElse(self):
791 self.endTagHead(impliedTagToken("head"))
792
793 startTagHandler = _utils.MethodDispatcher([
794 ("html", startTagHtml),
795 ("title", startTagTitle),
796 (("noframes", "style"), startTagNoFramesStyle),
797 ("noscript", startTagNoscript),
798 ("script", startTagScript),
799 (("base", "basefont", "bgsound", "command", "link"),
800 startTagBaseLinkCommand),
801 ("meta", startTagMeta),
802 ("head", startTagHead)
803 ])
804 startTagHandler.default = startTagOther
805
806 endTagHandler = _utils.MethodDispatcher([
807 ("head", endTagHead),
808 (("br", "html", "body"), endTagHtmlBodyBr)
809 ])
810 endTagHandler.default = endTagOther
811
812 class InHeadNoscriptPhase(Phase):
813 __slots__ = tuple()
814
815 def processEOF(self):
816 self.parser.parseError("eof-in-head-noscript")
817 self.anythingElse()
818 return True
819
820 def processComment(self, token):
821 return self.parser.phases["inHead"].processComment(token)
822
823 def processCharacters(self, token):
824 self.parser.parseError("char-in-head-noscript")
825 self.anythingElse()
826 return token
827
828 def processSpaceCharacters(self, token):
829 return self.parser.phases["inHead"].processSpaceCharacters(token)
830
831 def startTagHtml(self, token):
832 return self.parser.phases["inBody"].processStartTag(token)
833
834 def startTagBaseLinkCommand(self, token):
835 return self.parser.phases["inHead"].processStartTag(token)
836
837 def startTagHeadNoscript(self, token):
838 self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
839
840 def startTagOther(self, token):
841 self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
842 self.anythingElse()
843 return token
844
845 def endTagNoscript(self, token):
846 node = self.parser.tree.openElements.pop()
847 assert node.name == "noscript", "Expected noscript got %s" % node.name
848 self.parser.phase = self.parser.phases["inHead"]
849
850 def endTagBr(self, token):
851 self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
852 self.anythingElse()
853 return token
854
855 def endTagOther(self, token):
856 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
857
858 def anythingElse(self):
859 # Caller must raise parse error first!
860 self.endTagNoscript(impliedTagToken("noscript"))
861
862 startTagHandler = _utils.MethodDispatcher([
863 ("html", startTagHtml),
864 (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
865 (("head", "noscript"), startTagHeadNoscript),
866 ])
867 startTagHandler.default = startTagOther
868
869 endTagHandler = _utils.MethodDispatcher([
870 ("noscript", endTagNoscript),
871 ("br", endTagBr),
872 ])
873 endTagHandler.default = endTagOther
874
875 class AfterHeadPhase(Phase):
876 __slots__ = tuple()
877
878 def processEOF(self):
879 self.anythingElse()
880 return True
881
882 def processCharacters(self, token):
883 self.anythingElse()
884 return token
885
886 def startTagHtml(self, token):
887 return self.parser.phases["inBody"].processStartTag(token)
888
889 def startTagBody(self, token):
890 self.parser.framesetOK = False
891 self.tree.insertElement(token)
892 self.parser.phase = self.parser.phases["inBody"]
893
894 def startTagFrameset(self, token):
895 self.tree.insertElement(token)
896 self.parser.phase = self.parser.phases["inFrameset"]
897
898 def startTagFromHead(self, token):
899 self.parser.parseError("unexpected-start-tag-out-of-my-head",
900 {"name": token["name"]})
901 self.tree.openElements.append(self.tree.headPointer)
902 self.parser.phases["inHead"].processStartTag(token)
903 for node in self.tree.openElements[::-1]:
904 if node.name == "head":
905 self.tree.openElements.remove(node)
906 break
907
908 def startTagHead(self, token):
909 self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
910
911 def startTagOther(self, token):
912 self.anythingElse()
913 return token
914
915 def endTagHtmlBodyBr(self, token):
916 self.anythingElse()
917 return token
918
919 def endTagOther(self, token):
920 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
921
922 def anythingElse(self):
923 self.tree.insertElement(impliedTagToken("body", "StartTag"))
924 self.parser.phase = self.parser.phases["inBody"]
925 self.parser.framesetOK = True
926
927 startTagHandler = _utils.MethodDispatcher([
928 ("html", startTagHtml),
929 ("body", startTagBody),
930 ("frameset", startTagFrameset),
931 (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
932 "style", "title"),
933 startTagFromHead),
934 ("head", startTagHead)
935 ])
936 startTagHandler.default = startTagOther
937 endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
938 endTagHtmlBodyBr)])
939 endTagHandler.default = endTagOther
940
941 class InBodyPhase(Phase):
942 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
943 # the really-really-really-very crazy mode
944 __slots__ = ("processSpaceCharacters",)
945
        def __init__(self, *args, **kwargs):
            """Initialise the base phase, then install the default handler
            for space characters.

            ``processSpaceCharacters`` is an instance attribute (it has its own
            slot) so it can be re-pointed at a different handler later.
            """
            super(InBodyPhase, self).__init__(*args, **kwargs)
            # Set this to the default handler
            self.processSpaceCharacters = self.processSpaceCharactersNonPre
950
951 def isMatchingFormattingElement(self, node1, node2):
952 return (node1.name == node2.name and
953 node1.namespace == node2.namespace and
954 node1.attributes == node2.attributes)
955
956 # helper
957 def addFormattingElement(self, token):
958 self.tree.insertElement(token)
959 element = self.tree.openElements[-1]
960
961 matchingElements = []
962 for node in self.tree.activeFormattingElements[::-1]:
963 if node is Marker:
964 break
965 elif self.isMatchingFormattingElement(node, element):
966 matchingElements.append(node)
967
968 assert len(matchingElements) <= 3
969 if len(matchingElements) == 3:
970 self.tree.activeFormattingElements.remove(matchingElements[-1])
971 self.tree.activeFormattingElements.append(element)
972
973 # the real deal
974 def processEOF(self):
975 allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
976 "tfoot", "th", "thead", "tr", "body",
977 "html"))
978 for node in self.tree.openElements[::-1]:
979 if node.name not in allowed_elements:
980 self.parser.parseError("expected-closing-tag-but-got-eof")
981 break
982 # Stop parsing
983
984 def processSpaceCharactersDropNewline(self, token):
985 # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
986 # want to drop leading newlines
987 data = token["data"]
988 self.processSpaceCharacters = self.processSpaceCharactersNonPre
989 if (data.startswith("\n") and
990 self.tree.openElements[-1].name in ("pre", "listing", "textarea") and
991 not self.tree.openElements[-1].hasContent()):
992 data = data[1:]
993 if data:
994 self.tree.reconstructActiveFormattingElements()
995 self.tree.insertText(data)
996
997 def processCharacters(self, token):
998 if token["data"] == "\u0000":
999 # The tokenizer should always emit null on its own
1000 return
1001 self.tree.reconstructActiveFormattingElements()
1002 self.tree.insertText(token["data"])
1003 # This must be bad for performance
1004 if (self.parser.framesetOK and
1005 any([char not in spaceCharacters
1006 for char in token["data"]])):
1007 self.parser.framesetOK = False
1008
1009 def processSpaceCharactersNonPre(self, token):
1010 self.tree.reconstructActiveFormattingElements()
1011 self.tree.insertText(token["data"])
1012
1013 def startTagProcessInHead(self, token):
1014 return self.parser.phases["inHead"].processStartTag(token)
1015
1016 def startTagBody(self, token):
1017 self.parser.parseError("unexpected-start-tag", {"name": "body"})
1018 if (len(self.tree.openElements) == 1 or
1019 self.tree.openElements[1].name != "body"):
1020 assert self.parser.innerHTML
1021 else:
1022 self.parser.framesetOK = False
1023 for attr, value in token["data"].items():
1024 if attr not in self.tree.openElements[1].attributes:
1025 self.tree.openElements[1].attributes[attr] = value
1026
1027 def startTagFrameset(self, token):
1028 self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
1029 if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"):
1030 assert self.parser.innerHTML
1031 elif not self.parser.framesetOK:
1032 pass
1033 else:
1034 if self.tree.openElements[1].parent:
1035 self.tree.openElements[1].parent.removeChild(self.tree.openElements[1])
1036 while self.tree.openElements[-1].name != "html":
1037 self.tree.openElements.pop()
1038 self.tree.insertElement(token)
1039 self.parser.phase = self.parser.phases["inFrameset"]
1040
1041 def startTagCloseP(self, token):
1042 if self.tree.elementInScope("p", variant="button"):
1043 self.endTagP(impliedTagToken("p"))
1044 self.tree.insertElement(token)
1045
1046 def startTagPreListing(self, token):
1047 if self.tree.elementInScope("p", variant="button"):
1048 self.endTagP(impliedTagToken("p"))
1049 self.tree.insertElement(token)
1050 self.parser.framesetOK = False
1051 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1052
1053 def startTagForm(self, token):
1054 if self.tree.formPointer:
1055 self.parser.parseError("unexpected-start-tag", {"name": "form"})
1056 else:
1057 if self.tree.elementInScope("p", variant="button"):
1058 self.endTagP(impliedTagToken("p"))
1059 self.tree.insertElement(token)
1060 self.tree.formPointer = self.tree.openElements[-1]
1061
1062 def startTagListItem(self, token):
1063 self.parser.framesetOK = False
1064
1065 stopNamesMap = {"li": ["li"],
1066 "dt": ["dt", "dd"],
1067 "dd": ["dt", "dd"]}
1068 stopNames = stopNamesMap[token["name"]]
1069 for node in reversed(self.tree.openElements):
1070 if node.name in stopNames:
1071 self.parser.phase.processEndTag(
1072 impliedTagToken(node.name, "EndTag"))
1073 break
1074 if (node.nameTuple in specialElements and
1075 node.name not in ("address", "div", "p")):
1076 break
1077
1078 if self.tree.elementInScope("p", variant="button"):
1079 self.parser.phase.processEndTag(
1080 impliedTagToken("p", "EndTag"))
1081
1082 self.tree.insertElement(token)
1083
1084 def startTagPlaintext(self, token):
1085 if self.tree.elementInScope("p", variant="button"):
1086 self.endTagP(impliedTagToken("p"))
1087 self.tree.insertElement(token)
1088 self.parser.tokenizer.state = self.parser.tokenizer.plaintextState
1089
1090 def startTagHeading(self, token):
1091 if self.tree.elementInScope("p", variant="button"):
1092 self.endTagP(impliedTagToken("p"))
1093 if self.tree.openElements[-1].name in headingElements:
1094 self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
1095 self.tree.openElements.pop()
1096 self.tree.insertElement(token)
1097
1098 def startTagA(self, token):
1099 afeAElement = self.tree.elementInActiveFormattingElements("a")
1100 if afeAElement:
1101 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1102 {"startName": "a", "endName": "a"})
1103 self.endTagFormatting(impliedTagToken("a"))
1104 if afeAElement in self.tree.openElements:
1105 self.tree.openElements.remove(afeAElement)
1106 if afeAElement in self.tree.activeFormattingElements:
1107 self.tree.activeFormattingElements.remove(afeAElement)
1108 self.tree.reconstructActiveFormattingElements()
1109 self.addFormattingElement(token)
1110
1111 def startTagFormatting(self, token):
1112 self.tree.reconstructActiveFormattingElements()
1113 self.addFormattingElement(token)
1114
1115 def startTagNobr(self, token):
1116 self.tree.reconstructActiveFormattingElements()
1117 if self.tree.elementInScope("nobr"):
1118 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1119 {"startName": "nobr", "endName": "nobr"})
1120 self.processEndTag(impliedTagToken("nobr"))
1121 # XXX Need tests that trigger the following
1122 self.tree.reconstructActiveFormattingElements()
1123 self.addFormattingElement(token)
1124
1125 def startTagButton(self, token):
1126 if self.tree.elementInScope("button"):
1127 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1128 {"startName": "button", "endName": "button"})
1129 self.processEndTag(impliedTagToken("button"))
1130 return token
1131 else:
1132 self.tree.reconstructActiveFormattingElements()
1133 self.tree.insertElement(token)
1134 self.parser.framesetOK = False
1135
1136 def startTagAppletMarqueeObject(self, token):
1137 self.tree.reconstructActiveFormattingElements()
1138 self.tree.insertElement(token)
1139 self.tree.activeFormattingElements.append(Marker)
1140 self.parser.framesetOK = False
1141
1142 def startTagXmp(self, token):
1143 if self.tree.elementInScope("p", variant="button"):
1144 self.endTagP(impliedTagToken("p"))
1145 self.tree.reconstructActiveFormattingElements()
1146 self.parser.framesetOK = False
1147 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1148
1149 def startTagTable(self, token):
1150 if self.parser.compatMode != "quirks":
1151 if self.tree.elementInScope("p", variant="button"):
1152 self.processEndTag(impliedTagToken("p"))
1153 self.tree.insertElement(token)
1154 self.parser.framesetOK = False
1155 self.parser.phase = self.parser.phases["inTable"]
1156
1157 def startTagVoidFormatting(self, token):
1158 self.tree.reconstructActiveFormattingElements()
1159 self.tree.insertElement(token)
1160 self.tree.openElements.pop()
1161 token["selfClosingAcknowledged"] = True
1162 self.parser.framesetOK = False
1163
1164 def startTagInput(self, token):
1165 framesetOK = self.parser.framesetOK
1166 self.startTagVoidFormatting(token)
1167 if ("type" in token["data"] and
1168 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1169 # input type=hidden doesn't change framesetOK
1170 self.parser.framesetOK = framesetOK
1171
1172 def startTagParamSource(self, token):
1173 self.tree.insertElement(token)
1174 self.tree.openElements.pop()
1175 token["selfClosingAcknowledged"] = True
1176
1177 def startTagHr(self, token):
1178 if self.tree.elementInScope("p", variant="button"):
1179 self.endTagP(impliedTagToken("p"))
1180 self.tree.insertElement(token)
1181 self.tree.openElements.pop()
1182 token["selfClosingAcknowledged"] = True
1183 self.parser.framesetOK = False
1184
1185 def startTagImage(self, token):
1186 # No really...
1187 self.parser.parseError("unexpected-start-tag-treated-as",
1188 {"originalName": "image", "newName": "img"})
1189 self.processStartTag(impliedTagToken("img", "StartTag",
1190 attributes=token["data"],
1191 selfClosing=token["selfClosing"]))
1192
1193 def startTagIsIndex(self, token):
1194 self.parser.parseError("deprecated-tag", {"name": "isindex"})
1195 if self.tree.formPointer:
1196 return
1197 form_attrs = {}
1198 if "action" in token["data"]:
1199 form_attrs["action"] = token["data"]["action"]
1200 self.processStartTag(impliedTagToken("form", "StartTag",
1201 attributes=form_attrs))
1202 self.processStartTag(impliedTagToken("hr", "StartTag"))
1203 self.processStartTag(impliedTagToken("label", "StartTag"))
1204 # XXX Localization ...
1205 if "prompt" in token["data"]:
1206 prompt = token["data"]["prompt"]
1207 else:
1208 prompt = "This is a searchable index. Enter search keywords: "
1209 self.processCharacters(
1210 {"type": tokenTypes["Characters"], "data": prompt})
1211 attributes = token["data"].copy()
1212 if "action" in attributes:
1213 del attributes["action"]
1214 if "prompt" in attributes:
1215 del attributes["prompt"]
1216 attributes["name"] = "isindex"
1217 self.processStartTag(impliedTagToken("input", "StartTag",
1218 attributes=attributes,
1219 selfClosing=token["selfClosing"]))
1220 self.processEndTag(impliedTagToken("label"))
1221 self.processStartTag(impliedTagToken("hr", "StartTag"))
1222 self.processEndTag(impliedTagToken("form"))
1223
1224 def startTagTextarea(self, token):
1225 self.tree.insertElement(token)
1226 self.parser.tokenizer.state = self.parser.tokenizer.rcdataState
1227 self.processSpaceCharacters = self.processSpaceCharactersDropNewline
1228 self.parser.framesetOK = False
1229
1230 def startTagIFrame(self, token):
1231 self.parser.framesetOK = False
1232 self.startTagRawtext(token)
1233
1234 def startTagNoscript(self, token):
1235 if self.parser.scripting:
1236 self.startTagRawtext(token)
1237 else:
1238 self.startTagOther(token)
1239
1240 def startTagRawtext(self, token):
1241 """iframe, noembed noframes, noscript(if scripting enabled)"""
1242 self.parser.parseRCDataRawtext(token, "RAWTEXT")
1243
1244 def startTagOpt(self, token):
1245 if self.tree.openElements[-1].name == "option":
1246 self.parser.phase.processEndTag(impliedTagToken("option"))
1247 self.tree.reconstructActiveFormattingElements()
1248 self.parser.tree.insertElement(token)
1249
1250 def startTagSelect(self, token):
1251 self.tree.reconstructActiveFormattingElements()
1252 self.tree.insertElement(token)
1253 self.parser.framesetOK = False
1254 if self.parser.phase in (self.parser.phases["inTable"],
1255 self.parser.phases["inCaption"],
1256 self.parser.phases["inColumnGroup"],
1257 self.parser.phases["inTableBody"],
1258 self.parser.phases["inRow"],
1259 self.parser.phases["inCell"]):
1260 self.parser.phase = self.parser.phases["inSelectInTable"]
1261 else:
1262 self.parser.phase = self.parser.phases["inSelect"]
1263
1264 def startTagRpRt(self, token):
1265 if self.tree.elementInScope("ruby"):
1266 self.tree.generateImpliedEndTags()
1267 if self.tree.openElements[-1].name != "ruby":
1268 self.parser.parseError()
1269 self.tree.insertElement(token)
1270
1271 def startTagMath(self, token):
1272 self.tree.reconstructActiveFormattingElements()
1273 self.parser.adjustMathMLAttributes(token)
1274 self.parser.adjustForeignAttributes(token)
1275 token["namespace"] = namespaces["mathml"]
1276 self.tree.insertElement(token)
1277 # Need to get the parse error right for the case where the token
1278 # has a namespace not equal to the xmlns attribute
1279 if token["selfClosing"]:
1280 self.tree.openElements.pop()
1281 token["selfClosingAcknowledged"] = True
1282
1283 def startTagSvg(self, token):
1284 self.tree.reconstructActiveFormattingElements()
1285 self.parser.adjustSVGAttributes(token)
1286 self.parser.adjustForeignAttributes(token)
1287 token["namespace"] = namespaces["svg"]
1288 self.tree.insertElement(token)
1289 # Need to get the parse error right for the case where the token
1290 # has a namespace not equal to the xmlns attribute
1291 if token["selfClosing"]:
1292 self.tree.openElements.pop()
1293 token["selfClosingAcknowledged"] = True
1294
1295 def startTagMisplaced(self, token):
1296 """ Elements that should be children of other elements that have a
1297 different insertion mode; here they are ignored
1298 "caption", "col", "colgroup", "frame", "frameset", "head",
1299 "option", "optgroup", "tbody", "td", "tfoot", "th", "thead",
1300 "tr", "noscript"
1301 """
1302 self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]})
1303
1304 def startTagOther(self, token):
1305 self.tree.reconstructActiveFormattingElements()
1306 self.tree.insertElement(token)
1307
1308 def endTagP(self, token):
1309 if not self.tree.elementInScope("p", variant="button"):
1310 self.startTagCloseP(impliedTagToken("p", "StartTag"))
1311 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1312 self.endTagP(impliedTagToken("p", "EndTag"))
1313 else:
1314 self.tree.generateImpliedEndTags("p")
1315 if self.tree.openElements[-1].name != "p":
1316 self.parser.parseError("unexpected-end-tag", {"name": "p"})
1317 node = self.tree.openElements.pop()
1318 while node.name != "p":
1319 node = self.tree.openElements.pop()
1320
1321 def endTagBody(self, token):
1322 if not self.tree.elementInScope("body"):
1323 self.parser.parseError()
1324 return
1325 elif self.tree.openElements[-1].name != "body":
1326 for node in self.tree.openElements[2:]:
1327 if node.name not in frozenset(("dd", "dt", "li", "optgroup",
1328 "option", "p", "rp", "rt",
1329 "tbody", "td", "tfoot",
1330 "th", "thead", "tr", "body",
1331 "html")):
1332 # Not sure this is the correct name for the parse error
1333 self.parser.parseError(
1334 "expected-one-end-tag-but-got-another",
1335 {"gotName": "body", "expectedName": node.name})
1336 break
1337 self.parser.phase = self.parser.phases["afterBody"]
1338
1339 def endTagHtml(self, token):
1340 # We repeat the test for the body end tag token being ignored here
1341 if self.tree.elementInScope("body"):
1342 self.endTagBody(impliedTagToken("body"))
1343 return token
1344
1345 def endTagBlock(self, token):
1346 # Put us back in the right whitespace handling mode
1347 if token["name"] == "pre":
1348 self.processSpaceCharacters = self.processSpaceCharactersNonPre
1349 inScope = self.tree.elementInScope(token["name"])
1350 if inScope:
1351 self.tree.generateImpliedEndTags()
1352 if self.tree.openElements[-1].name != token["name"]:
1353 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1354 if inScope:
1355 node = self.tree.openElements.pop()
1356 while node.name != token["name"]:
1357 node = self.tree.openElements.pop()
1358
1359 def endTagForm(self, token):
1360 node = self.tree.formPointer
1361 self.tree.formPointer = None
1362 if node is None or not self.tree.elementInScope(node):
1363 self.parser.parseError("unexpected-end-tag",
1364 {"name": "form"})
1365 else:
1366 self.tree.generateImpliedEndTags()
1367 if self.tree.openElements[-1] != node:
1368 self.parser.parseError("end-tag-too-early-ignored",
1369 {"name": "form"})
1370 self.tree.openElements.remove(node)
1371
1372 def endTagListItem(self, token):
1373 if token["name"] == "li":
1374 variant = "list"
1375 else:
1376 variant = None
1377 if not self.tree.elementInScope(token["name"], variant=variant):
1378 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1379 else:
1380 self.tree.generateImpliedEndTags(exclude=token["name"])
1381 if self.tree.openElements[-1].name != token["name"]:
1382 self.parser.parseError(
1383 "end-tag-too-early",
1384 {"name": token["name"]})
1385 node = self.tree.openElements.pop()
1386 while node.name != token["name"]:
1387 node = self.tree.openElements.pop()
1388
1389 def endTagHeading(self, token):
1390 for item in headingElements:
1391 if self.tree.elementInScope(item):
1392 self.tree.generateImpliedEndTags()
1393 break
1394 if self.tree.openElements[-1].name != token["name"]:
1395 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1396
1397 for item in headingElements:
1398 if self.tree.elementInScope(item):
1399 item = self.tree.openElements.pop()
1400 while item.name not in headingElements:
1401 item = self.tree.openElements.pop()
1402 break
1403
1404 def endTagFormatting(self, token):
1405 """The much-feared adoption agency algorithm"""
1406 # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
1407 # XXX Better parseError messages appreciated.
1408
1409 # Step 1
1410 outerLoopCounter = 0
1411
1412 # Step 2
1413 while outerLoopCounter < 8:
1414
1415 # Step 3
1416 outerLoopCounter += 1
1417
1418 # Step 4:
1419
1420 # Let the formatting element be the last element in
1421 # the list of active formatting elements that:
1422 # - is between the end of the list and the last scope
1423 # marker in the list, if any, or the start of the list
1424 # otherwise, and
1425 # - has the same tag name as the token.
1426 formattingElement = self.tree.elementInActiveFormattingElements(
1427 token["name"])
1428 if (not formattingElement or
1429 (formattingElement in self.tree.openElements and
1430 not self.tree.elementInScope(formattingElement.name))):
1431 # If there is no such node, then abort these steps
1432 # and instead act as described in the "any other
1433 # end tag" entry below.
1434 self.endTagOther(token)
1435 return
1436
1437 # Otherwise, if there is such a node, but that node is
1438 # not in the stack of open elements, then this is a
1439 # parse error; remove the element from the list, and
1440 # abort these steps.
1441 elif formattingElement not in self.tree.openElements:
1442 self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
1443 self.tree.activeFormattingElements.remove(formattingElement)
1444 return
1445
1446 # Otherwise, if there is such a node, and that node is
1447 # also in the stack of open elements, but the element
1448 # is not in scope, then this is a parse error; ignore
1449 # the token, and abort these steps.
1450 elif not self.tree.elementInScope(formattingElement.name):
1451 self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
1452 return
1453
1454 # Otherwise, there is a formatting element and that
1455 # element is in the stack and is in scope. If the
1456 # element is not the current node, this is a parse
1457 # error. In any case, proceed with the algorithm as
1458 # written in the following steps.
1459 else:
1460 if formattingElement != self.tree.openElements[-1]:
1461 self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})
1462
1463 # Step 5:
1464
1465 # Let the furthest block be the topmost node in the
1466 # stack of open elements that is lower in the stack
1467 # than the formatting element, and is an element in
1468 # the special category. There might not be one.
1469 afeIndex = self.tree.openElements.index(formattingElement)
1470 furthestBlock = None
1471 for element in self.tree.openElements[afeIndex:]:
1472 if element.nameTuple in specialElements:
1473 furthestBlock = element
1474 break
1475
1476 # Step 6:
1477
1478 # If there is no furthest block, then the UA must
1479 # first pop all the nodes from the bottom of the stack
1480 # of open elements, from the current node up to and
1481 # including the formatting element, then remove the
1482 # formatting element from the list of active
1483 # formatting elements, and finally abort these steps.
1484 if furthestBlock is None:
1485 element = self.tree.openElements.pop()
1486 while element != formattingElement:
1487 element = self.tree.openElements.pop()
1488 self.tree.activeFormattingElements.remove(element)
1489 return
1490
1491 # Step 7
1492 commonAncestor = self.tree.openElements[afeIndex - 1]
1493
1494 # Step 8:
1495 # The bookmark is supposed to help us identify where to reinsert
1496 # nodes in step 15. We have to ensure that we reinsert nodes after
1497 # the node before the active formatting element. Note the bookmark
1498 # can move in step 9.7
1499 bookmark = self.tree.activeFormattingElements.index(formattingElement)
1500
1501 # Step 9
1502 lastNode = node = furthestBlock
1503 innerLoopCounter = 0
1504
1505 index = self.tree.openElements.index(node)
1506 while innerLoopCounter < 3:
1507 innerLoopCounter += 1
1508 # Node is element before node in open elements
1509 index -= 1
1510 node = self.tree.openElements[index]
1511 if node not in self.tree.activeFormattingElements:
1512 self.tree.openElements.remove(node)
1513 continue
1514 # Step 9.6
1515 if node == formattingElement:
1516 break
1517 # Step 9.7
1518 if lastNode == furthestBlock:
1519 bookmark = self.tree.activeFormattingElements.index(node) + 1
1520 # Step 9.8
1521 clone = node.cloneNode()
1522 # Replace node with clone
1523 self.tree.activeFormattingElements[
1524 self.tree.activeFormattingElements.index(node)] = clone
1525 self.tree.openElements[
1526 self.tree.openElements.index(node)] = clone
1527 node = clone
1528 # Step 9.9
1529 # Remove lastNode from its parents, if any
1530 if lastNode.parent:
1531 lastNode.parent.removeChild(lastNode)
1532 node.appendChild(lastNode)
1533 # Step 9.10
1534 lastNode = node
1535
1536 # Step 10
1537 # Foster parent lastNode if commonAncestor is a
1538 # table, tbody, tfoot, thead, or tr we need to foster
1539 # parent the lastNode
1540 if lastNode.parent:
1541 lastNode.parent.removeChild(lastNode)
1542
1543 if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
1544 parent, insertBefore = self.tree.getTableMisnestedNodePosition()
1545 parent.insertBefore(lastNode, insertBefore)
1546 else:
1547 commonAncestor.appendChild(lastNode)
1548
1549 # Step 11
1550 clone = formattingElement.cloneNode()
1551
1552 # Step 12
1553 furthestBlock.reparentChildren(clone)
1554
1555 # Step 13
1556 furthestBlock.appendChild(clone)
1557
1558 # Step 14
1559 self.tree.activeFormattingElements.remove(formattingElement)
1560 self.tree.activeFormattingElements.insert(bookmark, clone)
1561
1562 # Step 15
1563 self.tree.openElements.remove(formattingElement)
1564 self.tree.openElements.insert(
1565 self.tree.openElements.index(furthestBlock) + 1, clone)
1566
1567 def endTagAppletMarqueeObject(self, token):
1568 if self.tree.elementInScope(token["name"]):
1569 self.tree.generateImpliedEndTags()
1570 if self.tree.openElements[-1].name != token["name"]:
1571 self.parser.parseError("end-tag-too-early", {"name": token["name"]})
1572
1573 if self.tree.elementInScope(token["name"]):
1574 element = self.tree.openElements.pop()
1575 while element.name != token["name"]:
1576 element = self.tree.openElements.pop()
1577 self.tree.clearActiveFormattingElements()
1578
1579 def endTagBr(self, token):
1580 self.parser.parseError("unexpected-end-tag-treated-as",
1581 {"originalName": "br", "newName": "br element"})
1582 self.tree.reconstructActiveFormattingElements()
1583 self.tree.insertElement(impliedTagToken("br", "StartTag"))
1584 self.tree.openElements.pop()
1585
1586 def endTagOther(self, token):
1587 for node in self.tree.openElements[::-1]:
1588 if node.name == token["name"]:
1589 self.tree.generateImpliedEndTags(exclude=token["name"])
1590 if self.tree.openElements[-1].name != token["name"]:
1591 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1592 while self.tree.openElements.pop() != node:
1593 pass
1594 break
1595 else:
1596 if node.nameTuple in specialElements:
1597 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1598 break
1599
1600 startTagHandler = _utils.MethodDispatcher([
1601 ("html", Phase.startTagHtml),
1602 (("base", "basefont", "bgsound", "command", "link", "meta",
1603 "script", "style", "title"),
1604 startTagProcessInHead),
1605 ("body", startTagBody),
1606 ("frameset", startTagFrameset),
1607 (("address", "article", "aside", "blockquote", "center", "details",
1608 "dir", "div", "dl", "fieldset", "figcaption", "figure",
1609 "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
1610 "section", "summary", "ul"),
1611 startTagCloseP),
1612 (headingElements, startTagHeading),
1613 (("pre", "listing"), startTagPreListing),
1614 ("form", startTagForm),
1615 (("li", "dd", "dt"), startTagListItem),
1616 ("plaintext", startTagPlaintext),
1617 ("a", startTagA),
1618 (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
1619 "strong", "tt", "u"), startTagFormatting),
1620 ("nobr", startTagNobr),
1621 ("button", startTagButton),
1622 (("applet", "marquee", "object"), startTagAppletMarqueeObject),
1623 ("xmp", startTagXmp),
1624 ("table", startTagTable),
1625 (("area", "br", "embed", "img", "keygen", "wbr"),
1626 startTagVoidFormatting),
1627 (("param", "source", "track"), startTagParamSource),
1628 ("input", startTagInput),
1629 ("hr", startTagHr),
1630 ("image", startTagImage),
1631 ("isindex", startTagIsIndex),
1632 ("textarea", startTagTextarea),
1633 ("iframe", startTagIFrame),
1634 ("noscript", startTagNoscript),
1635 (("noembed", "noframes"), startTagRawtext),
1636 ("select", startTagSelect),
1637 (("rp", "rt"), startTagRpRt),
1638 (("option", "optgroup"), startTagOpt),
1639 (("math"), startTagMath),
1640 (("svg"), startTagSvg),
1641 (("caption", "col", "colgroup", "frame", "head",
1642 "tbody", "td", "tfoot", "th", "thead",
1643 "tr"), startTagMisplaced)
1644 ])
1645 startTagHandler.default = startTagOther
1646
1647 endTagHandler = _utils.MethodDispatcher([
1648 ("body", endTagBody),
1649 ("html", endTagHtml),
1650 (("address", "article", "aside", "blockquote", "button", "center",
1651 "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
1652 "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
1653 "section", "summary", "ul"), endTagBlock),
1654 ("form", endTagForm),
1655 ("p", endTagP),
1656 (("dd", "dt", "li"), endTagListItem),
1657 (headingElements, endTagHeading),
1658 (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
1659 "strike", "strong", "tt", "u"), endTagFormatting),
1660 (("applet", "marquee", "object"), endTagAppletMarqueeObject),
1661 ("br", endTagBr),
1662 ])
1663 endTagHandler.default = endTagOther
1664
class TextPhase(Phase):
    """Phase that is active while RCDATA/RAWTEXT element content is read.

    Character data is appended to the tree as it arrives; an end tag (or
    EOF) pops the current element and hands control back to the parser's
    ``originalPhase``.
    """
    __slots__ = ()

    def processCharacters(self, token):
        """Append the token's character data to the tree."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Handle EOF inside the element: report it, pop, restore the phase.

        :returns: True
        """
        unclosed = self.tree.openElements[-1]
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosed.name})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Start tags are never expected in this phase."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Pop the current node (asserted to be a script) and restore the phase."""
        popped = self.tree.openElements.pop()
        assert popped.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Pop the current node and go back to the original phase."""
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase

    # No start tag has a dedicated handler here; everything goes to the default.
    startTagHandler = _utils.MethodDispatcher([])
    startTagHandler.default = startTagOther
    # Only </script> is singled out among the end tags.
    endTagHandler = _utils.MethodDispatcher([("script", endTagScript)])
    endTagHandler.default = endTagOther
1697
1698 class InTablePhase(Phase):
1699 # http://www.whatwg.org/specs/web-apps/current-work/#in-table
1700 __slots__ = tuple()
1701
1702 # helper methods
1703 def clearStackToTableContext(self):
1704 # "clear the stack back to a table context"
1705 while self.tree.openElements[-1].name not in ("table", "html"):
1706 # self.parser.parseError("unexpected-implied-end-tag-in-table",
1707 # {"name": self.tree.openElements[-1].name})
1708 self.tree.openElements.pop()
1709 # When the current node is <html> it's an innerHTML case
1710
1711 # processing methods
1712 def processEOF(self):
1713 if self.tree.openElements[-1].name != "html":
1714 self.parser.parseError("eof-in-table")
1715 else:
1716 assert self.parser.innerHTML
1717 # Stop parsing
1718
1719 def processSpaceCharacters(self, token):
1720 originalPhase = self.parser.phase
1721 self.parser.phase = self.parser.phases["inTableText"]
1722 self.parser.phase.originalPhase = originalPhase
1723 self.parser.phase.processSpaceCharacters(token)
1724
1725 def processCharacters(self, token):
1726 originalPhase = self.parser.phase
1727 self.parser.phase = self.parser.phases["inTableText"]
1728 self.parser.phase.originalPhase = originalPhase
1729 self.parser.phase.processCharacters(token)
1730
1731 def insertText(self, token):
1732 # If we get here there must be at least one non-whitespace character
1733 # Do the table magic!
1734 self.tree.insertFromTable = True
1735 self.parser.phases["inBody"].processCharacters(token)
1736 self.tree.insertFromTable = False
1737
1738 def startTagCaption(self, token):
1739 self.clearStackToTableContext()
1740 self.tree.activeFormattingElements.append(Marker)
1741 self.tree.insertElement(token)
1742 self.parser.phase = self.parser.phases["inCaption"]
1743
1744 def startTagColgroup(self, token):
1745 self.clearStackToTableContext()
1746 self.tree.insertElement(token)
1747 self.parser.phase = self.parser.phases["inColumnGroup"]
1748
1749 def startTagCol(self, token):
1750 self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
1751 return token
1752
1753 def startTagRowGroup(self, token):
1754 self.clearStackToTableContext()
1755 self.tree.insertElement(token)
1756 self.parser.phase = self.parser.phases["inTableBody"]
1757
1758 def startTagImplyTbody(self, token):
1759 self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
1760 return token
1761
1762 def startTagTable(self, token):
1763 self.parser.parseError("unexpected-start-tag-implies-end-tag",
1764 {"startName": "table", "endName": "table"})
1765 self.parser.phase.processEndTag(impliedTagToken("table"))
1766 if not self.parser.innerHTML:
1767 return token
1768
1769 def startTagStyleScript(self, token):
1770 return self.parser.phases["inHead"].processStartTag(token)
1771
1772 def startTagInput(self, token):
1773 if ("type" in token["data"] and
1774 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
1775 self.parser.parseError("unexpected-hidden-input-in-table")
1776 self.tree.insertElement(token)
1777 # XXX associate with form
1778 self.tree.openElements.pop()
1779 else:
1780 self.startTagOther(token)
1781
1782 def startTagForm(self, token):
1783 self.parser.parseError("unexpected-form-in-table")
1784 if self.tree.formPointer is None:
1785 self.tree.insertElement(token)
1786 self.tree.formPointer = self.tree.openElements[-1]
1787 self.tree.openElements.pop()
1788
1789 def startTagOther(self, token):
1790 self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
1791 # Do the table magic!
1792 self.tree.insertFromTable = True
1793 self.parser.phases["inBody"].processStartTag(token)
1794 self.tree.insertFromTable = False
1795
1796 def endTagTable(self, token):
1797 if self.tree.elementInScope("table", variant="table"):
1798 self.tree.generateImpliedEndTags()
1799 if self.tree.openElements[-1].name != "table":
1800 self.parser.parseError("end-tag-too-early-named",
1801 {"gotName": "table",
1802 "expectedName": self.tree.openElements[-1].name})
1803 while self.tree.openElements[-1].name != "table":
1804 self.tree.openElements.pop()
1805 self.tree.openElements.pop()
1806 self.parser.resetInsertionMode()
1807 else:
1808 # innerHTML case
1809 assert self.parser.innerHTML
1810 self.parser.parseError()
1811
1812 def endTagIgnore(self, token):
1813 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1814
1815 def endTagOther(self, token):
1816 self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
1817 # Do the table magic!
1818 self.tree.insertFromTable = True
1819 self.parser.phases["inBody"].processEndTag(token)
1820 self.tree.insertFromTable = False
1821
        # Start-tag dispatch table for this phase: tag name (or tuple of tag
        # names) -> handler function. Names not listed here are handled by
        # ``startTagOther`` via ``default``.
        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            ("caption", startTagCaption),
            ("colgroup", startTagColgroup),
            ("col", startTagCol),
            (("tbody", "tfoot", "thead"), startTagRowGroup),
            (("td", "th", "tr"), startTagImplyTbody),
            ("table", startTagTable),
            (("style", "script"), startTagStyleScript),
            ("input", startTagInput),
            ("form", startTagForm)
        ])
        startTagHandler.default = startTagOther
1835
        # End-tag dispatch table for this phase. Only ``</table>`` is acted
        # on; the listed structural end tags are reported and dropped by
        # ``endTagIgnore``; anything else goes to ``endTagOther``.
        endTagHandler = _utils.MethodDispatcher([
            ("table", endTagTable),
            (("body", "caption", "col", "colgroup", "html", "tbody", "td",
              "tfoot", "th", "thead", "tr"), endTagIgnore)
        ])
        endTagHandler.default = endTagOther
1842
1843 class InTableTextPhase(Phase):
1844 __slots__ = ("originalPhase", "characterTokens")
1845
1846 def __init__(self, *args, **kwargs):
1847 super(InTableTextPhase, self).__init__(*args, **kwargs)
1848 self.originalPhase = None
1849 self.characterTokens = []
1850
1851 def flushCharacters(self):
1852 data = "".join([item["data"] for item in self.characterTokens])
1853 if any([item not in spaceCharacters for item in data]):
1854 token = {"type": tokenTypes["Characters"], "data": data}
1855 self.parser.phases["inTable"].insertText(token)
1856 elif data:
1857 self.tree.insertText(data)
1858 self.characterTokens = []
1859
1860 def processComment(self, token):
1861 self.flushCharacters()
1862 self.parser.phase = self.originalPhase
1863 return token
1864
1865 def processEOF(self):
1866 self.flushCharacters()
1867 self.parser.phase = self.originalPhase
1868 return True
1869
1870 def processCharacters(self, token):
1871 if token["data"] == "\u0000":
1872 return
1873 self.characterTokens.append(token)
1874
1875 def processSpaceCharacters(self, token):
1876 # pretty sure we should never reach here
1877 self.characterTokens.append(token)
1878 # assert False
1879
1880 def processStartTag(self, token):
1881 self.flushCharacters()
1882 self.parser.phase = self.originalPhase
1883 return token
1884
1885 def processEndTag(self, token):
1886 self.flushCharacters()
1887 self.parser.phase = self.originalPhase
1888 return token
1889
1890 class InCaptionPhase(Phase):
1891 # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
1892 __slots__ = tuple()
1893
1894 def ignoreEndTagCaption(self):
1895 return not self.tree.elementInScope("caption", variant="table")
1896
1897 def processEOF(self):
1898 self.parser.phases["inBody"].processEOF()
1899
1900 def processCharacters(self, token):
1901 return self.parser.phases["inBody"].processCharacters(token)
1902
1903 def startTagTableElement(self, token):
1904 self.parser.parseError()
1905 # XXX Have to duplicate logic here to find out if the tag is ignored
1906 ignoreEndTag = self.ignoreEndTagCaption()
1907 self.parser.phase.processEndTag(impliedTagToken("caption"))
1908 if not ignoreEndTag:
1909 return token
1910
1911 def startTagOther(self, token):
1912 return self.parser.phases["inBody"].processStartTag(token)
1913
1914 def endTagCaption(self, token):
1915 if not self.ignoreEndTagCaption():
1916 # AT this code is quite similar to endTagTable in "InTable"
1917 self.tree.generateImpliedEndTags()
1918 if self.tree.openElements[-1].name != "caption":
1919 self.parser.parseError("expected-one-end-tag-but-got-another",
1920 {"gotName": "caption",
1921 "expectedName": self.tree.openElements[-1].name})
1922 while self.tree.openElements[-1].name != "caption":
1923 self.tree.openElements.pop()
1924 self.tree.openElements.pop()
1925 self.tree.clearActiveFormattingElements()
1926 self.parser.phase = self.parser.phases["inTable"]
1927 else:
1928 # innerHTML case
1929 assert self.parser.innerHTML
1930 self.parser.parseError()
1931
1932 def endTagTable(self, token):
1933 self.parser.parseError()
1934 ignoreEndTag = self.ignoreEndTagCaption()
1935 self.parser.phase.processEndTag(impliedTagToken("caption"))
1936 if not ignoreEndTag:
1937 return token
1938
1939 def endTagIgnore(self, token):
1940 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
1941
1942 def endTagOther(self, token):
1943 return self.parser.phases["inBody"].processEndTag(token)
1944
1945 startTagHandler = _utils.MethodDispatcher([
1946 ("html", Phase.startTagHtml),
1947 (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
1948 "thead", "tr"), startTagTableElement)
1949 ])
1950 startTagHandler.default = startTagOther
1951
1952 endTagHandler = _utils.MethodDispatcher([
1953 ("caption", endTagCaption),
1954 ("table", endTagTable),
1955 (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
1956 "thead", "tr"), endTagIgnore)
1957 ])
1958 endTagHandler.default = endTagOther
1959
1960 class InColumnGroupPhase(Phase):
1961 # http://www.whatwg.org/specs/web-apps/current-work/#in-column
1962 __slots__ = tuple()
1963
1964 def ignoreEndTagColgroup(self):
1965 return self.tree.openElements[-1].name == "html"
1966
1967 def processEOF(self):
1968 if self.tree.openElements[-1].name == "html":
1969 assert self.parser.innerHTML
1970 return
1971 else:
1972 ignoreEndTag = self.ignoreEndTagColgroup()
1973 self.endTagColgroup(impliedTagToken("colgroup"))
1974 if not ignoreEndTag:
1975 return True
1976
1977 def processCharacters(self, token):
1978 ignoreEndTag = self.ignoreEndTagColgroup()
1979 self.endTagColgroup(impliedTagToken("colgroup"))
1980 if not ignoreEndTag:
1981 return token
1982
1983 def startTagCol(self, token):
1984 self.tree.insertElement(token)
1985 self.tree.openElements.pop()
1986 token["selfClosingAcknowledged"] = True
1987
1988 def startTagOther(self, token):
1989 ignoreEndTag = self.ignoreEndTagColgroup()
1990 self.endTagColgroup(impliedTagToken("colgroup"))
1991 if not ignoreEndTag:
1992 return token
1993
1994 def endTagColgroup(self, token):
1995 if self.ignoreEndTagColgroup():
1996 # innerHTML case
1997 assert self.parser.innerHTML
1998 self.parser.parseError()
1999 else:
2000 self.tree.openElements.pop()
2001 self.parser.phase = self.parser.phases["inTable"]
2002
2003 def endTagCol(self, token):
2004 self.parser.parseError("no-end-tag", {"name": "col"})
2005
2006 def endTagOther(self, token):
2007 ignoreEndTag = self.ignoreEndTagColgroup()
2008 self.endTagColgroup(impliedTagToken("colgroup"))
2009 if not ignoreEndTag:
2010 return token
2011
2012 startTagHandler = _utils.MethodDispatcher([
2013 ("html", Phase.startTagHtml),
2014 ("col", startTagCol)
2015 ])
2016 startTagHandler.default = startTagOther
2017
2018 endTagHandler = _utils.MethodDispatcher([
2019 ("colgroup", endTagColgroup),
2020 ("col", endTagCol)
2021 ])
2022 endTagHandler.default = endTagOther
2023
2024 class InTableBodyPhase(Phase):
2025 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
2026 __slots__ = tuple()
2027
2028 # helper methods
2029 def clearStackToTableBodyContext(self):
2030 while self.tree.openElements[-1].name not in ("tbody", "tfoot",
2031 "thead", "html"):
2032 # self.parser.parseError("unexpected-implied-end-tag-in-table",
2033 # {"name": self.tree.openElements[-1].name})
2034 self.tree.openElements.pop()
2035 if self.tree.openElements[-1].name == "html":
2036 assert self.parser.innerHTML
2037
2038 # the rest
2039 def processEOF(self):
2040 self.parser.phases["inTable"].processEOF()
2041
2042 def processSpaceCharacters(self, token):
2043 return self.parser.phases["inTable"].processSpaceCharacters(token)
2044
2045 def processCharacters(self, token):
2046 return self.parser.phases["inTable"].processCharacters(token)
2047
2048 def startTagTr(self, token):
2049 self.clearStackToTableBodyContext()
2050 self.tree.insertElement(token)
2051 self.parser.phase = self.parser.phases["inRow"]
2052
2053 def startTagTableCell(self, token):
2054 self.parser.parseError("unexpected-cell-in-table-body",
2055 {"name": token["name"]})
2056 self.startTagTr(impliedTagToken("tr", "StartTag"))
2057 return token
2058
2059 def startTagTableOther(self, token):
2060 # XXX AT Any ideas on how to share this with endTagTable?
2061 if (self.tree.elementInScope("tbody", variant="table") or
2062 self.tree.elementInScope("thead", variant="table") or
2063 self.tree.elementInScope("tfoot", variant="table")):
2064 self.clearStackToTableBodyContext()
2065 self.endTagTableRowGroup(
2066 impliedTagToken(self.tree.openElements[-1].name))
2067 return token
2068 else:
2069 # innerHTML case
2070 assert self.parser.innerHTML
2071 self.parser.parseError()
2072
2073 def startTagOther(self, token):
2074 return self.parser.phases["inTable"].processStartTag(token)
2075
2076 def endTagTableRowGroup(self, token):
2077 if self.tree.elementInScope(token["name"], variant="table"):
2078 self.clearStackToTableBodyContext()
2079 self.tree.openElements.pop()
2080 self.parser.phase = self.parser.phases["inTable"]
2081 else:
2082 self.parser.parseError("unexpected-end-tag-in-table-body",
2083 {"name": token["name"]})
2084
2085 def endTagTable(self, token):
2086 if (self.tree.elementInScope("tbody", variant="table") or
2087 self.tree.elementInScope("thead", variant="table") or
2088 self.tree.elementInScope("tfoot", variant="table")):
2089 self.clearStackToTableBodyContext()
2090 self.endTagTableRowGroup(
2091 impliedTagToken(self.tree.openElements[-1].name))
2092 return token
2093 else:
2094 # innerHTML case
2095 assert self.parser.innerHTML
2096 self.parser.parseError()
2097
2098 def endTagIgnore(self, token):
2099 self.parser.parseError("unexpected-end-tag-in-table-body",
2100 {"name": token["name"]})
2101
2102 def endTagOther(self, token):
2103 return self.parser.phases["inTable"].processEndTag(token)
2104
2105 startTagHandler = _utils.MethodDispatcher([
2106 ("html", Phase.startTagHtml),
2107 ("tr", startTagTr),
2108 (("td", "th"), startTagTableCell),
2109 (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
2110 startTagTableOther)
2111 ])
2112 startTagHandler.default = startTagOther
2113
2114 endTagHandler = _utils.MethodDispatcher([
2115 (("tbody", "tfoot", "thead"), endTagTableRowGroup),
2116 ("table", endTagTable),
2117 (("body", "caption", "col", "colgroup", "html", "td", "th",
2118 "tr"), endTagIgnore)
2119 ])
2120 endTagHandler.default = endTagOther
2121
2122 class InRowPhase(Phase):
2123 # http://www.whatwg.org/specs/web-apps/current-work/#in-row
2124 __slots__ = tuple()
2125
2126 # helper methods (XXX unify this with other table helper methods)
2127 def clearStackToTableRowContext(self):
2128 while self.tree.openElements[-1].name not in ("tr", "html"):
2129 self.parser.parseError("unexpected-implied-end-tag-in-table-row",
2130 {"name": self.tree.openElements[-1].name})
2131 self.tree.openElements.pop()
2132
2133 def ignoreEndTagTr(self):
2134 return not self.tree.elementInScope("tr", variant="table")
2135
2136 # the rest
2137 def processEOF(self):
2138 self.parser.phases["inTable"].processEOF()
2139
2140 def processSpaceCharacters(self, token):
2141 return self.parser.phases["inTable"].processSpaceCharacters(token)
2142
2143 def processCharacters(self, token):
2144 return self.parser.phases["inTable"].processCharacters(token)
2145
2146 def startTagTableCell(self, token):
2147 self.clearStackToTableRowContext()
2148 self.tree.insertElement(token)
2149 self.parser.phase = self.parser.phases["inCell"]
2150 self.tree.activeFormattingElements.append(Marker)
2151
2152 def startTagTableOther(self, token):
2153 ignoreEndTag = self.ignoreEndTagTr()
2154 self.endTagTr(impliedTagToken("tr"))
2155 # XXX how are we sure it's always ignored in the innerHTML case?
2156 if not ignoreEndTag:
2157 return token
2158
2159 def startTagOther(self, token):
2160 return self.parser.phases["inTable"].processStartTag(token)
2161
2162 def endTagTr(self, token):
2163 if not self.ignoreEndTagTr():
2164 self.clearStackToTableRowContext()
2165 self.tree.openElements.pop()
2166 self.parser.phase = self.parser.phases["inTableBody"]
2167 else:
2168 # innerHTML case
2169 assert self.parser.innerHTML
2170 self.parser.parseError()
2171
2172 def endTagTable(self, token):
2173 ignoreEndTag = self.ignoreEndTagTr()
2174 self.endTagTr(impliedTagToken("tr"))
2175 # Reprocess the current tag if the tr end tag was not ignored
2176 # XXX how are we sure it's always ignored in the innerHTML case?
2177 if not ignoreEndTag:
2178 return token
2179
2180 def endTagTableRowGroup(self, token):
2181 if self.tree.elementInScope(token["name"], variant="table"):
2182 self.endTagTr(impliedTagToken("tr"))
2183 return token
2184 else:
2185 self.parser.parseError()
2186
2187 def endTagIgnore(self, token):
2188 self.parser.parseError("unexpected-end-tag-in-table-row",
2189 {"name": token["name"]})
2190
2191 def endTagOther(self, token):
2192 return self.parser.phases["inTable"].processEndTag(token)
2193
2194 startTagHandler = _utils.MethodDispatcher([
2195 ("html", Phase.startTagHtml),
2196 (("td", "th"), startTagTableCell),
2197 (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
2198 "tr"), startTagTableOther)
2199 ])
2200 startTagHandler.default = startTagOther
2201
2202 endTagHandler = _utils.MethodDispatcher([
2203 ("tr", endTagTr),
2204 ("table", endTagTable),
2205 (("tbody", "tfoot", "thead"), endTagTableRowGroup),
2206 (("body", "caption", "col", "colgroup", "html", "td", "th"),
2207 endTagIgnore)
2208 ])
2209 endTagHandler.default = endTagOther
2210
2211 class InCellPhase(Phase):
2212 # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
2213 __slots__ = tuple()
2214
2215 # helper
2216 def closeCell(self):
2217 if self.tree.elementInScope("td", variant="table"):
2218 self.endTagTableCell(impliedTagToken("td"))
2219 elif self.tree.elementInScope("th", variant="table"):
2220 self.endTagTableCell(impliedTagToken("th"))
2221
2222 # the rest
2223 def processEOF(self):
2224 self.parser.phases["inBody"].processEOF()
2225
2226 def processCharacters(self, token):
2227 return self.parser.phases["inBody"].processCharacters(token)
2228
2229 def startTagTableOther(self, token):
2230 if (self.tree.elementInScope("td", variant="table") or
2231 self.tree.elementInScope("th", variant="table")):
2232 self.closeCell()
2233 return token
2234 else:
2235 # innerHTML case
2236 assert self.parser.innerHTML
2237 self.parser.parseError()
2238
2239 def startTagOther(self, token):
2240 return self.parser.phases["inBody"].processStartTag(token)
2241
2242 def endTagTableCell(self, token):
2243 if self.tree.elementInScope(token["name"], variant="table"):
2244 self.tree.generateImpliedEndTags(token["name"])
2245 if self.tree.openElements[-1].name != token["name"]:
2246 self.parser.parseError("unexpected-cell-end-tag",
2247 {"name": token["name"]})
2248 while True:
2249 node = self.tree.openElements.pop()
2250 if node.name == token["name"]:
2251 break
2252 else:
2253 self.tree.openElements.pop()
2254 self.tree.clearActiveFormattingElements()
2255 self.parser.phase = self.parser.phases["inRow"]
2256 else:
2257 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2258
2259 def endTagIgnore(self, token):
2260 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2261
2262 def endTagImply(self, token):
2263 if self.tree.elementInScope(token["name"], variant="table"):
2264 self.closeCell()
2265 return token
2266 else:
2267 # sometimes innerHTML case
2268 self.parser.parseError()
2269
2270 def endTagOther(self, token):
2271 return self.parser.phases["inBody"].processEndTag(token)
2272
2273 startTagHandler = _utils.MethodDispatcher([
2274 ("html", Phase.startTagHtml),
2275 (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
2276 "thead", "tr"), startTagTableOther)
2277 ])
2278 startTagHandler.default = startTagOther
2279
2280 endTagHandler = _utils.MethodDispatcher([
2281 (("td", "th"), endTagTableCell),
2282 (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
2283 (("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
2284 ])
2285 endTagHandler.default = endTagOther
2286
2287 class InSelectPhase(Phase):
2288 __slots__ = tuple()
2289
2290 # http://www.whatwg.org/specs/web-apps/current-work/#in-select
2291 def processEOF(self):
2292 if self.tree.openElements[-1].name != "html":
2293 self.parser.parseError("eof-in-select")
2294 else:
2295 assert self.parser.innerHTML
2296
2297 def processCharacters(self, token):
2298 if token["data"] == "\u0000":
2299 return
2300 self.tree.insertText(token["data"])
2301
2302 def startTagOption(self, token):
2303 # We need to imply </option> if <option> is the current node.
2304 if self.tree.openElements[-1].name == "option":
2305 self.tree.openElements.pop()
2306 self.tree.insertElement(token)
2307
2308 def startTagOptgroup(self, token):
2309 if self.tree.openElements[-1].name == "option":
2310 self.tree.openElements.pop()
2311 if self.tree.openElements[-1].name == "optgroup":
2312 self.tree.openElements.pop()
2313 self.tree.insertElement(token)
2314
2315 def startTagSelect(self, token):
2316 self.parser.parseError("unexpected-select-in-select")
2317 self.endTagSelect(impliedTagToken("select"))
2318
2319 def startTagInput(self, token):
2320 self.parser.parseError("unexpected-input-in-select")
2321 if self.tree.elementInScope("select", variant="select"):
2322 self.endTagSelect(impliedTagToken("select"))
2323 return token
2324 else:
2325 assert self.parser.innerHTML
2326
2327 def startTagScript(self, token):
2328 return self.parser.phases["inHead"].processStartTag(token)
2329
2330 def startTagOther(self, token):
2331 self.parser.parseError("unexpected-start-tag-in-select",
2332 {"name": token["name"]})
2333
2334 def endTagOption(self, token):
2335 if self.tree.openElements[-1].name == "option":
2336 self.tree.openElements.pop()
2337 else:
2338 self.parser.parseError("unexpected-end-tag-in-select",
2339 {"name": "option"})
2340
2341 def endTagOptgroup(self, token):
2342 # </optgroup> implicitly closes <option>
2343 if (self.tree.openElements[-1].name == "option" and
2344 self.tree.openElements[-2].name == "optgroup"):
2345 self.tree.openElements.pop()
2346 # It also closes </optgroup>
2347 if self.tree.openElements[-1].name == "optgroup":
2348 self.tree.openElements.pop()
2349 # But nothing else
2350 else:
2351 self.parser.parseError("unexpected-end-tag-in-select",
2352 {"name": "optgroup"})
2353
2354 def endTagSelect(self, token):
2355 if self.tree.elementInScope("select", variant="select"):
2356 node = self.tree.openElements.pop()
2357 while node.name != "select":
2358 node = self.tree.openElements.pop()
2359 self.parser.resetInsertionMode()
2360 else:
2361 # innerHTML case
2362 assert self.parser.innerHTML
2363 self.parser.parseError()
2364
2365 def endTagOther(self, token):
2366 self.parser.parseError("unexpected-end-tag-in-select",
2367 {"name": token["name"]})
2368
2369 startTagHandler = _utils.MethodDispatcher([
2370 ("html", Phase.startTagHtml),
2371 ("option", startTagOption),
2372 ("optgroup", startTagOptgroup),
2373 ("select", startTagSelect),
2374 (("input", "keygen", "textarea"), startTagInput),
2375 ("script", startTagScript)
2376 ])
2377 startTagHandler.default = startTagOther
2378
2379 endTagHandler = _utils.MethodDispatcher([
2380 ("option", endTagOption),
2381 ("optgroup", endTagOptgroup),
2382 ("select", endTagSelect)
2383 ])
2384 endTagHandler.default = endTagOther
2385
2386 class InSelectInTablePhase(Phase):
2387 __slots__ = tuple()
2388
2389 def processEOF(self):
2390 self.parser.phases["inSelect"].processEOF()
2391
2392 def processCharacters(self, token):
2393 return self.parser.phases["inSelect"].processCharacters(token)
2394
2395 def startTagTable(self, token):
2396 self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
2397 self.endTagOther(impliedTagToken("select"))
2398 return token
2399
2400 def startTagOther(self, token):
2401 return self.parser.phases["inSelect"].processStartTag(token)
2402
2403 def endTagTable(self, token):
2404 self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]})
2405 if self.tree.elementInScope(token["name"], variant="table"):
2406 self.endTagOther(impliedTagToken("select"))
2407 return token
2408
2409 def endTagOther(self, token):
2410 return self.parser.phases["inSelect"].processEndTag(token)
2411
2412 startTagHandler = _utils.MethodDispatcher([
2413 (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
2414 startTagTable)
2415 ])
2416 startTagHandler.default = startTagOther
2417
2418 endTagHandler = _utils.MethodDispatcher([
2419 (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
2420 endTagTable)
2421 ])
2422 endTagHandler.default = endTagOther
2423
2424 class InForeignContentPhase(Phase):
2425 __slots__ = tuple()
2426
2427 breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
2428 "center", "code", "dd", "div", "dl", "dt",
2429 "em", "embed", "h1", "h2", "h3",
2430 "h4", "h5", "h6", "head", "hr", "i", "img",
2431 "li", "listing", "menu", "meta", "nobr",
2432 "ol", "p", "pre", "ruby", "s", "small",
2433 "span", "strong", "strike", "sub", "sup",
2434 "table", "tt", "u", "ul", "var"])
2435
2436 def adjustSVGTagNames(self, token):
2437 replacements = {"altglyph": "altGlyph",
2438 "altglyphdef": "altGlyphDef",
2439 "altglyphitem": "altGlyphItem",
2440 "animatecolor": "animateColor",
2441 "animatemotion": "animateMotion",
2442 "animatetransform": "animateTransform",
2443 "clippath": "clipPath",
2444 "feblend": "feBlend",
2445 "fecolormatrix": "feColorMatrix",
2446 "fecomponenttransfer": "feComponentTransfer",
2447 "fecomposite": "feComposite",
2448 "feconvolvematrix": "feConvolveMatrix",
2449 "fediffuselighting": "feDiffuseLighting",
2450 "fedisplacementmap": "feDisplacementMap",
2451 "fedistantlight": "feDistantLight",
2452 "feflood": "feFlood",
2453 "fefunca": "feFuncA",
2454 "fefuncb": "feFuncB",
2455 "fefuncg": "feFuncG",
2456 "fefuncr": "feFuncR",
2457 "fegaussianblur": "feGaussianBlur",
2458 "feimage": "feImage",
2459 "femerge": "feMerge",
2460 "femergenode": "feMergeNode",
2461 "femorphology": "feMorphology",
2462 "feoffset": "feOffset",
2463 "fepointlight": "fePointLight",
2464 "fespecularlighting": "feSpecularLighting",
2465 "fespotlight": "feSpotLight",
2466 "fetile": "feTile",
2467 "feturbulence": "feTurbulence",
2468 "foreignobject": "foreignObject",
2469 "glyphref": "glyphRef",
2470 "lineargradient": "linearGradient",
2471 "radialgradient": "radialGradient",
2472 "textpath": "textPath"}
2473
2474 if token["name"] in replacements:
2475 token["name"] = replacements[token["name"]]
2476
2477 def processCharacters(self, token):
2478 if token["data"] == "\u0000":
2479 token["data"] = "\uFFFD"
2480 elif (self.parser.framesetOK and
2481 any(char not in spaceCharacters for char in token["data"])):
2482 self.parser.framesetOK = False
2483 Phase.processCharacters(self, token)
2484
2485 def processStartTag(self, token):
2486 currentNode = self.tree.openElements[-1]
2487 if (token["name"] in self.breakoutElements or
2488 (token["name"] == "font" and
2489 set(token["data"].keys()) & {"color", "face", "size"})):
2490 self.parser.parseError("unexpected-html-element-in-foreign-content",
2491 {"name": token["name"]})
2492 while (self.tree.openElements[-1].namespace !=
2493 self.tree.defaultNamespace and
2494 not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
2495 not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
2496 self.tree.openElements.pop()
2497 return token
2498
2499 else:
2500 if currentNode.namespace == namespaces["mathml"]:
2501 self.parser.adjustMathMLAttributes(token)
2502 elif currentNode.namespace == namespaces["svg"]:
2503 self.adjustSVGTagNames(token)
2504 self.parser.adjustSVGAttributes(token)
2505 self.parser.adjustForeignAttributes(token)
2506 token["namespace"] = currentNode.namespace
2507 self.tree.insertElement(token)
2508 if token["selfClosing"]:
2509 self.tree.openElements.pop()
2510 token["selfClosingAcknowledged"] = True
2511
2512 def processEndTag(self, token):
2513 nodeIndex = len(self.tree.openElements) - 1
2514 node = self.tree.openElements[-1]
2515 if node.name.translate(asciiUpper2Lower) != token["name"]:
2516 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
2517
2518 while True:
2519 if node.name.translate(asciiUpper2Lower) == token["name"]:
2520 # XXX this isn't in the spec but it seems necessary
2521 if self.parser.phase == self.parser.phases["inTableText"]:
2522 self.parser.phase.flushCharacters()
2523 self.parser.phase = self.parser.phase.originalPhase
2524 while self.tree.openElements.pop() != node:
2525 assert self.tree.openElements
2526 new_token = None
2527 break
2528 nodeIndex -= 1
2529
2530 node = self.tree.openElements[nodeIndex]
2531 if node.namespace != self.tree.defaultNamespace:
2532 continue
2533 else:
2534 new_token = self.parser.phase.processEndTag(token)
2535 break
2536 return new_token
2537
2538 class AfterBodyPhase(Phase):
2539 __slots__ = tuple()
2540
2541 def processEOF(self):
2542 # Stop parsing
2543 pass
2544
2545 def processComment(self, token):
2546 # This is needed because data is to be appended to the <html> element
2547 # here and not to whatever is currently open.
2548 self.tree.insertComment(token, self.tree.openElements[0])
2549
2550 def processCharacters(self, token):
2551 self.parser.parseError("unexpected-char-after-body")
2552 self.parser.phase = self.parser.phases["inBody"]
2553 return token
2554
2555 def startTagHtml(self, token):
2556 return self.parser.phases["inBody"].processStartTag(token)
2557
2558 def startTagOther(self, token):
2559 self.parser.parseError("unexpected-start-tag-after-body",
2560 {"name": token["name"]})
2561 self.parser.phase = self.parser.phases["inBody"]
2562 return token
2563
2564 def endTagHtml(self, name):
2565 if self.parser.innerHTML:
2566 self.parser.parseError("unexpected-end-tag-after-body-innerhtml")
2567 else:
2568 self.parser.phase = self.parser.phases["afterAfterBody"]
2569
2570 def endTagOther(self, token):
2571 self.parser.parseError("unexpected-end-tag-after-body",
2572 {"name": token["name"]})
2573 self.parser.phase = self.parser.phases["inBody"]
2574 return token
2575
2576 startTagHandler = _utils.MethodDispatcher([
2577 ("html", startTagHtml)
2578 ])
2579 startTagHandler.default = startTagOther
2580
2581 endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
2582 endTagHandler.default = endTagOther
2583
2584 class InFramesetPhase(Phase):
2585 # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
2586 __slots__ = tuple()
2587
2588 def processEOF(self):
2589 if self.tree.openElements[-1].name != "html":
2590 self.parser.parseError("eof-in-frameset")
2591 else:
2592 assert self.parser.innerHTML
2593
2594 def processCharacters(self, token):
2595 self.parser.parseError("unexpected-char-in-frameset")
2596
2597 def startTagFrameset(self, token):
2598 self.tree.insertElement(token)
2599
2600 def startTagFrame(self, token):
2601 self.tree.insertElement(token)
2602 self.tree.openElements.pop()
2603
2604 def startTagNoframes(self, token):
2605 return self.parser.phases["inBody"].processStartTag(token)
2606
2607 def startTagOther(self, token):
2608 self.parser.parseError("unexpected-start-tag-in-frameset",
2609 {"name": token["name"]})
2610
2611 def endTagFrameset(self, token):
2612 if self.tree.openElements[-1].name == "html":
2613 # innerHTML case
2614 self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
2615 else:
2616 self.tree.openElements.pop()
2617 if (not self.parser.innerHTML and
2618 self.tree.openElements[-1].name != "frameset"):
2619 # If we're not in innerHTML mode and the current node is not a
2620 # "frameset" element (anymore) then switch.
2621 self.parser.phase = self.parser.phases["afterFrameset"]
2622
2623 def endTagOther(self, token):
2624 self.parser.parseError("unexpected-end-tag-in-frameset",
2625 {"name": token["name"]})
2626
2627 startTagHandler = _utils.MethodDispatcher([
2628 ("html", Phase.startTagHtml),
2629 ("frameset", startTagFrameset),
2630 ("frame", startTagFrame),
2631 ("noframes", startTagNoframes)
2632 ])
2633 startTagHandler.default = startTagOther
2634
2635 endTagHandler = _utils.MethodDispatcher([
2636 ("frameset", endTagFrameset)
2637 ])
2638 endTagHandler.default = endTagOther
2639
2640 class AfterFramesetPhase(Phase):
2641 # http://www.whatwg.org/specs/web-apps/current-work/#after3
2642 __slots__ = tuple()
2643
2644 def processEOF(self):
2645 # Stop parsing
2646 pass
2647
2648 def processCharacters(self, token):
2649 self.parser.parseError("unexpected-char-after-frameset")
2650
2651 def startTagNoframes(self, token):
2652 return self.parser.phases["inHead"].processStartTag(token)
2653
2654 def startTagOther(self, token):
2655 self.parser.parseError("unexpected-start-tag-after-frameset",
2656 {"name": token["name"]})
2657
2658 def endTagHtml(self, token):
2659 self.parser.phase = self.parser.phases["afterAfterFrameset"]
2660
2661 def endTagOther(self, token):
2662 self.parser.parseError("unexpected-end-tag-after-frameset",
2663 {"name": token["name"]})
2664
2665 startTagHandler = _utils.MethodDispatcher([
2666 ("html", Phase.startTagHtml),
2667 ("noframes", startTagNoframes)
2668 ])
2669 startTagHandler.default = startTagOther
2670
2671 endTagHandler = _utils.MethodDispatcher([
2672 ("html", endTagHtml)
2673 ])
2674 endTagHandler.default = endTagOther
2675
2676 class AfterAfterBodyPhase(Phase):
2677 __slots__ = tuple()
2678
2679 def processEOF(self):
2680 pass
2681
2682 def processComment(self, token):
2683 self.tree.insertComment(token, self.tree.document)
2684
2685 def processSpaceCharacters(self, token):
2686 return self.parser.phases["inBody"].processSpaceCharacters(token)
2687
2688 def processCharacters(self, token):
2689 self.parser.parseError("expected-eof-but-got-char")
2690 self.parser.phase = self.parser.phases["inBody"]
2691 return token
2692
2693 def startTagHtml(self, token):
2694 return self.parser.phases["inBody"].processStartTag(token)
2695
2696 def startTagOther(self, token):
2697 self.parser.parseError("expected-eof-but-got-start-tag",
2698 {"name": token["name"]})
2699 self.parser.phase = self.parser.phases["inBody"]
2700 return token
2701
2702 def processEndTag(self, token):
2703 self.parser.parseError("expected-eof-but-got-end-tag",
2704 {"name": token["name"]})
2705 self.parser.phase = self.parser.phases["inBody"]
2706 return token
2707
2708 startTagHandler = _utils.MethodDispatcher([
2709 ("html", startTagHtml)
2710 ])
2711 startTagHandler.default = startTagOther
2712
2713 class AfterAfterFramesetPhase(Phase):
2714 __slots__ = tuple()
2715
2716 def processEOF(self):
2717 pass
2718
2719 def processComment(self, token):
2720 self.tree.insertComment(token, self.tree.document)
2721
2722 def processSpaceCharacters(self, token):
2723 return self.parser.phases["inBody"].processSpaceCharacters(token)
2724
2725 def processCharacters(self, token):
2726 self.parser.parseError("expected-eof-but-got-char")
2727
2728 def startTagHtml(self, token):
2729 return self.parser.phases["inBody"].processStartTag(token)
2730
2731 def startTagNoFrames(self, token):
2732 return self.parser.phases["inHead"].processStartTag(token)
2733
2734 def startTagOther(self, token):
2735 self.parser.parseError("expected-eof-but-got-start-tag",
2736 {"name": token["name"]})
2737
2738 def processEndTag(self, token):
2739 self.parser.parseError("expected-eof-but-got-end-tag",
2740 {"name": token["name"]})
2741
2742 startTagHandler = _utils.MethodDispatcher([
2743 ("html", startTagHtml),
2744 ("noframes", startTagNoFrames)
2745 ])
2746 startTagHandler.default = startTagOther
2747
2748 # pylint:enable=unused-argument
2749
    # Map each phase name to its phase class. The keys are the same strings
    # the phase classes use when indexing ``self.parser.phases[...]``.
    # NOTE(review): presumably the caller instantiates these classes to
    # build ``parser.phases`` -- confirm against the caller.
    return {
        "initial": InitialPhase,
        "beforeHtml": BeforeHtmlPhase,
        "beforeHead": BeforeHeadPhase,
        "inHead": InHeadPhase,
        "inHeadNoscript": InHeadNoscriptPhase,
        "afterHead": AfterHeadPhase,
        "inBody": InBodyPhase,
        "text": TextPhase,
        "inTable": InTablePhase,
        "inTableText": InTableTextPhase,
        "inCaption": InCaptionPhase,
        "inColumnGroup": InColumnGroupPhase,
        "inTableBody": InTableBodyPhase,
        "inRow": InRowPhase,
        "inCell": InCellPhase,
        "inSelect": InSelectPhase,
        "inSelectInTable": InSelectInTablePhase,
        "inForeignContent": InForeignContentPhase,
        "afterBody": AfterBodyPhase,
        "inFrameset": InFramesetPhase,
        "afterFrameset": AfterFramesetPhase,
        "afterAfterBody": AfterAfterBodyPhase,
        "afterAfterFrameset": AfterAfterFramesetPhase,
        # XXX after after frameset
    }
2776
2777
def adjust_attributes(token, replacements):
    """Rename attributes of ``token`` according to ``replacements``.

    :arg token: token dict whose ``'data'`` entry maps attribute names to
        values

    :arg replacements: mapping from an attribute name to its replacement

    ``token['data']`` is rebuilt (as the same mapping type) only when at
    least one attribute name occurs in ``replacements``; otherwise the token
    is left untouched.
    """
    attributes = token['data']
    if not (viewkeys(attributes) & viewkeys(replacements)):
        return
    renamed = ((replacements.get(name, name), value)
               for name, value in attributes.items())
    token['data'] = type(attributes)(renamed)
2783
2784
def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict for a tag that is implied by the parser.

    :arg name: tag name of the token

    :arg type: key into ``tokenTypes`` (``"EndTag"`` by default)

    :arg attributes: attribute mapping stored under ``"data"``; a fresh empty
        dict is used when omitted

    :arg selfClosing: value stored under ``"selfClosing"``

    :returns: dict with the keys ``type``, ``name``, ``data`` and
        ``selfClosing``
    """
    data = {} if attributes is None else attributes
    return {
        "type": tokenTypes[type],
        "name": name,
        "data": data,
        "selfClosing": selfClosing,
    }
2791
2792
class ParseError(Exception):
    """Error in parsed document"""
    # Plain Exception subclass: adds no attributes or behaviour of its own.