1from __future__ import absolute_import, division, unicode_literals 
    2from six import with_metaclass, viewkeys 
    3 
    4import types 
    5 
    6from . import _inputstream 
    7from . import _tokenizer 
    8 
    9from . import treebuilders 
    10from .treebuilders.base import Marker 
    11 
    12from . import _utils 
    13from .constants import ( 
    14    spaceCharacters, asciiUpper2Lower, 
    15    specialElements, headingElements, cdataElements, rcdataElements, 
    16    tokenTypes, tagTokenTypes, 
    17    namespaces, 
    18    htmlIntegrationPointElements, mathmlTextIntegrationPointElements, 
    19    adjustForeignAttributes as adjustForeignAttributesMap, 
    20    adjustMathMLAttributes, adjustSVGAttributes, 
    21    E, 
    22    _ReparseException 
    23) 
    24 
    25 
    26def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs): 
    27    """Parse an HTML document as a string or file-like object into a tree 
    28 
    29    :arg doc: the document to parse as a string or file-like object 
    30 
    31    :arg treebuilder: the treebuilder to use when parsing 
    32 
    33    :arg namespaceHTMLElements: whether or not to namespace HTML elements 
    34 
    35    :returns: parsed tree 
    36 
    37    Example: 
    38 
    39    >>> from html5lib.html5parser import parse 
    40    >>> parse('<html><body><p>This is a doc</p></body></html>') 
    41    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> 
    42 
    43    """ 
    44    tb = treebuilders.getTreeBuilder(treebuilder) 
    45    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) 
    46    return p.parse(doc, **kwargs) 
    47 
    48 
    49def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs): 
    50    """Parse an HTML fragment as a string or file-like object into a tree 
    51 
    52    :arg doc: the fragment to parse as a string or file-like object 
    53 
    54    :arg container: the container context to parse the fragment in 
    55 
    56    :arg treebuilder: the treebuilder to use when parsing 
    57 
    58    :arg namespaceHTMLElements: whether or not to namespace HTML elements 
    59 
    60    :returns: parsed tree 
    61 
    62    Example: 
    63 
    64    >>> from html5lib.html5libparser import parseFragment 
    65    >>> parseFragment('<b>this is a fragment</b>') 
    66    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> 
    67 
    68    """ 
    69    tb = treebuilders.getTreeBuilder(treebuilder) 
    70    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) 
    71    return p.parseFragment(doc, container=container, **kwargs) 
    72 
    73 
    74def method_decorator_metaclass(function): 
    75    class Decorated(type): 
    76        def __new__(meta, classname, bases, classDict): 
    77            for attributeName, attribute in classDict.items(): 
    78                if isinstance(attribute, types.FunctionType): 
    79                    attribute = function(attribute) 
    80 
    81                classDict[attributeName] = attribute 
    82            return type.__new__(meta, classname, bases, classDict) 
    83    return Decorated 
    84 
    85 
    86class HTMLParser(object): 
    87    """HTML parser 
    88 
    89    Generates a tree structure from a stream of (possibly malformed) HTML. 
    90 
    91    """ 
    92 
    93    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): 
    94        """ 
    95        :arg tree: a treebuilder class controlling the type of tree that will be 
    96            returned. Built in treebuilders can be accessed through 
    97            html5lib.treebuilders.getTreeBuilder(treeType) 
    98 
    99        :arg strict: raise an exception when a parse error is encountered 
    100 
    101        :arg namespaceHTMLElements: whether or not to namespace HTML elements 
    102 
    103        :arg debug: whether or not to enable debug mode which logs things 
    104 
    105        Example: 
    106 
    107        >>> from html5lib.html5parser import HTMLParser 
    108        >>> parser = HTMLParser()                     # generates parser with etree builder 
    109        >>> parser = HTMLParser('lxml', strict=True)  # generates parser with lxml builder which is strict 
    110 
    111        """ 
    112 
    113        # Raise an exception on the first error encountered 
    114        self.strict = strict 
    115 
    116        if tree is None: 
    117            tree = treebuilders.getTreeBuilder("etree") 
    118        self.tree = tree(namespaceHTMLElements) 
    119        self.errors = [] 
    120 
    121        self.phases = {name: cls(self, self.tree) for name, cls in 
    122                       getPhases(debug).items()} 
    123 
    124    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): 
    125 
    126        self.innerHTMLMode = innerHTML 
    127        self.container = container 
    128        self.scripting = scripting 
    129        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) 
    130        self.reset() 
    131 
    132        try: 
    133            self.mainLoop() 
    134        except _ReparseException: 
    135            self.reset() 
    136            self.mainLoop() 
    137 
    138    def reset(self): 
    139        self.tree.reset() 
    140        self.firstStartTag = False 
    141        self.errors = [] 
    142        self.log = []  # only used with debug mode 
    143        # "quirks" / "limited quirks" / "no quirks" 
    144        self.compatMode = "no quirks" 
    145 
    146        if self.innerHTMLMode: 
    147            self.innerHTML = self.container.lower() 
    148 
    149            if self.innerHTML in cdataElements: 
    150                self.tokenizer.state = self.tokenizer.rcdataState 
    151            elif self.innerHTML in rcdataElements: 
    152                self.tokenizer.state = self.tokenizer.rawtextState 
    153            elif self.innerHTML == 'plaintext': 
    154                self.tokenizer.state = self.tokenizer.plaintextState 
    155            else: 
    156                # state already is data state 
    157                # self.tokenizer.state = self.tokenizer.dataState 
    158                pass 
    159            self.phase = self.phases["beforeHtml"] 
    160            self.phase.insertHtmlElement() 
    161            self.resetInsertionMode() 
    162        else: 
    163            self.innerHTML = False  # pylint:disable=redefined-variable-type 
    164            self.phase = self.phases["initial"] 
    165 
    166        self.lastPhase = None 
    167 
    168        self.beforeRCDataPhase = None 
    169 
    170        self.framesetOK = True 
    171 
    172    @property 
    173    def documentEncoding(self): 
    174        """Name of the character encoding that was used to decode the input stream, or 
    175        :obj:`None` if that is not determined yet 
    176 
    177        """ 
    178        if not hasattr(self, 'tokenizer'): 
    179            return None 
    180        return self.tokenizer.stream.charEncoding[0].name 
    181 
    182    def isHTMLIntegrationPoint(self, element): 
    183        if (element.name == "annotation-xml" and 
    184                element.namespace == namespaces["mathml"]): 
    185            return ("encoding" in element.attributes and 
    186                    element.attributes["encoding"].translate( 
    187                        asciiUpper2Lower) in 
    188                    ("text/html", "application/xhtml+xml")) 
    189        else: 
    190            return (element.namespace, element.name) in htmlIntegrationPointElements 
    191 
    192    def isMathMLTextIntegrationPoint(self, element): 
    193        return (element.namespace, element.name) in mathmlTextIntegrationPointElements 
    194 
    195    def mainLoop(self): 
    196        CharactersToken = tokenTypes["Characters"] 
    197        SpaceCharactersToken = tokenTypes["SpaceCharacters"] 
    198        StartTagToken = tokenTypes["StartTag"] 
    199        EndTagToken = tokenTypes["EndTag"] 
    200        CommentToken = tokenTypes["Comment"] 
    201        DoctypeToken = tokenTypes["Doctype"] 
    202        ParseErrorToken = tokenTypes["ParseError"] 
    203 
    204        for token in self.tokenizer: 
    205            prev_token = None 
    206            new_token = token 
    207            while new_token is not None: 
    208                prev_token = new_token 
    209                currentNode = self.tree.openElements[-1] if self.tree.openElements else None 
    210                currentNodeNamespace = currentNode.namespace if currentNode else None 
    211                currentNodeName = currentNode.name if currentNode else None 
    212 
    213                type = new_token["type"] 
    214 
    215                if type == ParseErrorToken: 
    216                    self.parseError(new_token["data"], new_token.get("datavars", {})) 
    217                    new_token = None 
    218                else: 
    219                    if (len(self.tree.openElements) == 0 or 
    220                        currentNodeNamespace == self.tree.defaultNamespace or 
    221                        (self.isMathMLTextIntegrationPoint(currentNode) and 
    222                         ((type == StartTagToken and 
    223                           token["name"] not in frozenset(["mglyph", "malignmark"])) or 
    224                          type in (CharactersToken, SpaceCharactersToken))) or 
    225                        (currentNodeNamespace == namespaces["mathml"] and 
    226                         currentNodeName == "annotation-xml" and 
    227                         type == StartTagToken and 
    228                         token["name"] == "svg") or 
    229                        (self.isHTMLIntegrationPoint(currentNode) and 
    230                         type in (StartTagToken, CharactersToken, SpaceCharactersToken))): 
    231                        phase = self.phase 
    232                    else: 
    233                        phase = self.phases["inForeignContent"] 
    234 
    235                    if type == CharactersToken: 
    236                        new_token = phase.processCharacters(new_token) 
    237                    elif type == SpaceCharactersToken: 
    238                        new_token = phase.processSpaceCharacters(new_token) 
    239                    elif type == StartTagToken: 
    240                        new_token = phase.processStartTag(new_token) 
    241                    elif type == EndTagToken: 
    242                        new_token = phase.processEndTag(new_token) 
    243                    elif type == CommentToken: 
    244                        new_token = phase.processComment(new_token) 
    245                    elif type == DoctypeToken: 
    246                        new_token = phase.processDoctype(new_token) 
    247 
    248            if (type == StartTagToken and prev_token["selfClosing"] and 
    249                    not prev_token["selfClosingAcknowledged"]): 
    250                self.parseError("non-void-element-with-trailing-solidus", 
    251                                {"name": prev_token["name"]}) 
    252 
    253        # When the loop finishes it's EOF 
    254        reprocess = True 
    255        phases = [] 
    256        while reprocess: 
    257            phases.append(self.phase) 
    258            reprocess = self.phase.processEOF() 
    259            if reprocess: 
    260                assert self.phase not in phases 
    261 
    262    def parse(self, stream, *args, **kwargs): 
    263        """Parse a HTML document into a well-formed tree 
    264 
    265        :arg stream: a file-like object or string containing the HTML to be parsed 
    266 
    267            The optional encoding parameter must be a string that indicates 
    268            the encoding.  If specified, that encoding will be used, 
    269            regardless of any BOM or later declaration (such as in a meta 
    270            element). 
    271 
    272        :arg scripting: treat noscript elements as if JavaScript was turned on 
    273 
    274        :returns: parsed tree 
    275 
    276        Example: 
    277 
    278        >>> from html5lib.html5parser import HTMLParser 
    279        >>> parser = HTMLParser() 
    280        >>> parser.parse('<html><body><p>This is a doc</p></body></html>') 
    281        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> 
    282 
    283        """ 
    284        self._parse(stream, False, None, *args, **kwargs) 
    285        return self.tree.getDocument() 
    286 
    287    def parseFragment(self, stream, *args, **kwargs): 
    288        """Parse a HTML fragment into a well-formed tree fragment 
    289 
    290        :arg container: name of the element we're setting the innerHTML 
    291            property if set to None, default to 'div' 
    292 
    293        :arg stream: a file-like object or string containing the HTML to be parsed 
    294 
    295            The optional encoding parameter must be a string that indicates 
    296            the encoding.  If specified, that encoding will be used, 
    297            regardless of any BOM or later declaration (such as in a meta 
    298            element) 
    299 
    300        :arg scripting: treat noscript elements as if JavaScript was turned on 
    301 
    302        :returns: parsed tree 
    303 
    304        Example: 
    305 
    306        >>> from html5lib.html5libparser import HTMLParser 
    307        >>> parser = HTMLParser() 
    308        >>> parser.parseFragment('<b>this is a fragment</b>') 
    309        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> 
    310 
    311        """ 
    312        self._parse(stream, True, *args, **kwargs) 
    313        return self.tree.getFragment() 
    314 
    315    def parseError(self, errorcode="XXX-undefined-error", datavars=None): 
    316        # XXX The idea is to make errorcode mandatory. 
    317        if datavars is None: 
    318            datavars = {} 
    319        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) 
    320        if self.strict: 
    321            raise ParseError(E[errorcode] % datavars) 
    322 
    323    def adjustMathMLAttributes(self, token): 
    324        adjust_attributes(token, adjustMathMLAttributes) 
    325 
    326    def adjustSVGAttributes(self, token): 
    327        adjust_attributes(token, adjustSVGAttributes) 
    328 
    329    def adjustForeignAttributes(self, token): 
    330        adjust_attributes(token, adjustForeignAttributesMap) 
    331 
    332    def reparseTokenNormal(self, token): 
    333        # pylint:disable=unused-argument 
    334        self.parser.phase() 
    335 
    336    def resetInsertionMode(self): 
    337        # The name of this method is mostly historical. (It's also used in the 
    338        # specification.) 
    339        last = False 
    340        newModes = { 
    341            "select": "inSelect", 
    342            "td": "inCell", 
    343            "th": "inCell", 
    344            "tr": "inRow", 
    345            "tbody": "inTableBody", 
    346            "thead": "inTableBody", 
    347            "tfoot": "inTableBody", 
    348            "caption": "inCaption", 
    349            "colgroup": "inColumnGroup", 
    350            "table": "inTable", 
    351            "head": "inBody", 
    352            "body": "inBody", 
    353            "frameset": "inFrameset", 
    354            "html": "beforeHead" 
    355        } 
    356        for node in self.tree.openElements[::-1]: 
    357            nodeName = node.name 
    358            new_phase = None 
    359            if node == self.tree.openElements[0]: 
    360                assert self.innerHTML 
    361                last = True 
    362                nodeName = self.innerHTML 
    363            # Check for conditions that should only happen in the innerHTML 
    364            # case 
    365            if nodeName in ("select", "colgroup", "head", "html"): 
    366                assert self.innerHTML 
    367 
    368            if not last and node.namespace != self.tree.defaultNamespace: 
    369                continue 
    370 
    371            if nodeName in newModes: 
    372                new_phase = self.phases[newModes[nodeName]] 
    373                break 
    374            elif last: 
    375                new_phase = self.phases["inBody"] 
    376                break 
    377 
    378        self.phase = new_phase 
    379 
    380    def parseRCDataRawtext(self, token, contentType): 
    381        # Generic RCDATA/RAWTEXT Parsing algorithm 
    382        assert contentType in ("RAWTEXT", "RCDATA") 
    383 
    384        self.tree.insertElement(token) 
    385 
    386        if contentType == "RAWTEXT": 
    387            self.tokenizer.state = self.tokenizer.rawtextState 
    388        else: 
    389            self.tokenizer.state = self.tokenizer.rcdataState 
    390 
    391        self.originalPhase = self.phase 
    392 
    393        self.phase = self.phases["text"] 
    394 
    395 
    396@_utils.memoize 
    397def getPhases(debug): 
    def log(function):
        """Logger that records which phase processes each token

        Wraps ``function`` so that, when its name starts with ``process`` and
        it is called with at least one positional argument, an entry is
        appended to ``self.parser.log`` before the call is made. The wrapped
        function is always called and its result returned.
        """
        # Reverse lookup: numeric token type -> its name
        type_names = {value: key for key, value in tokenTypes.items()}

        def wrapped(self, *args, **kwargs):
            if function.__name__.startswith("process") and args:
                token = args[0]
                token_type = token['type']
                info = {"type": type_names[token_type]}
                if token_type in tagTokenTypes:
                    info["name"] = token['name']

                parser = self.parser
                entry = (parser.tokenizer.state.__name__,
                         parser.phase.__class__.__name__,
                         self.__class__.__name__,
                         function.__name__,
                         info)
                parser.log.append(entry)
            return function(self, *args, **kwargs)
        return wrapped
    418 
    def getMetaclass(use_metaclass, metaclass_func):
        """Return the metaclass to build phase classes with.

        When ``use_metaclass`` is false this is plain ``type``; otherwise it
        is the metaclass produced by ``method_decorator_metaclass`` for
        ``metaclass_func``.
        """
        if not use_metaclass:
            return type
        return method_decorator_metaclass(metaclass_func)
    424 
    425    # pylint:disable=unused-argument 
    class Phase(with_metaclass(getMetaclass(debug, log))):
        """Base class for helper object that implements each phase of processing

        Every instance holds its parser and treebuilder, plus two per-instance
        caches that map a tag name to the start/end tag handler already
        looked up for it.
        """
        __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")

        def __init__(self, parser, tree):
            self.parser, self.tree = parser, tree
            self.__startTagCache = {}
            self.__endTagCache = {}

        def processEOF(self):
            """Handle end of input; has to be provided by subclasses."""
            raise NotImplementedError

        def processComment(self, token):
            """Insert the comment under the most recently opened element."""
            # Correct for most phases; the others override this method.
            parent = self.tree.openElements[-1]
            self.tree.insertComment(token, parent)

        def processDoctype(self, token):
            """Report the doctype as a parse error."""
            self.parser.parseError("unexpected-doctype")

        def processCharacters(self, token):
            """Insert the token's character data into the tree."""
            text = token["data"]
            self.tree.insertText(text)

        def processSpaceCharacters(self, token):
            """Insert the token's whitespace into the tree."""
            text = token["data"]
            self.tree.insertText(text)

        def processStartTag(self, token):
            """Dispatch a start tag to the handler registered for its name."""
            # The cache lives here and not in BoundMethodDispatcher because
            # caching there needs a circular reference to the Phase, which has
            # a significant GC cost (CPython 2.7, 3.8) when parsing many short
            # inputs.
            tagName = token["name"]
            cache = self.__startTagCache
            # `in` beats try/except KeyError on Py2 in general, and on Py3
            # when cache hits are rare (typically short inputs).
            if tagName in cache:
                handler = cache[tagName]
            else:
                dispatcher = self.startTagHandler
                handler = dispatcher[tagName]
                cache[tagName] = handler
                # Cap the cache size in case of loads of unknown tags. The
                # handler was fetched above, so evicting it is harmless.
                while len(cache) > len(dispatcher) * 1.1:
                    # eviction is random on Py < 3.7 and FIFO on Py >= 3.7
                    cache.pop(next(iter(cache)))
            return handler(token)

        def startTagHtml(self, token):
            """Copy attributes of an ``html`` start tag onto the first open
            element, without overwriting attributes it already has."""
            if not self.parser.firstStartTag and token["name"] == "html":
                self.parser.parseError("non-html-root")
            # XXX Need a check here to see if the first start tag token emitted is
            # this token... If it's not, invoke self.parser.parseError().
            for attr, value in token["data"].items():
                rootAttributes = self.tree.openElements[0].attributes
                if attr not in rootAttributes:
                    rootAttributes[attr] = value
            self.parser.firstStartTag = False

        def processEndTag(self, token):
            """Dispatch an end tag to the handler registered for its name."""
            # See processStartTag for why the cache is kept on the Phase.
            tagName = token["name"]
            cache = self.__endTagCache
            # `in` beats try/except KeyError on Py2 in general, and on Py3
            # when cache hits are rare (typically short inputs).
            if tagName in cache:
                handler = cache[tagName]
            else:
                dispatcher = self.endTagHandler
                handler = dispatcher[tagName]
                cache[tagName] = handler
                # Cap the cache size in case of loads of unknown tags. The
                # handler was fetched above, so evicting it is harmless.
                while len(cache) > len(dispatcher) * 1.1:
                    # eviction is random on Py < 3.7 and FIFO on Py >= 3.7
                    cache.pop(next(iter(cache)))
            return handler(token)
    497 
    class InitialPhase(Phase):
        """Phase that expects a doctype.

        A doctype token decides the parser's compatibility mode; characters,
        start tags, end tags and EOF are reported as "expected-doctype-..."
        errors and force quirks mode. In every case the parser moves on to
        the "beforeHtml" phase.
        """
        __slots__ = tuple()

        def processSpaceCharacters(self, token):
            """Ignore whitespace."""
            pass

        def processComment(self, token):
            """Insert the comment directly under the document."""
            self.tree.insertComment(token, self.tree.document)

        def processDoctype(self, token):
            """Insert the doctype and derive ``parser.compatMode`` from it."""
            # Public identifier prefixes that select quirks mode.
            quirksPrefixes = (
                "+//silmaril//dtd html pro v0r11 19970101//",
                "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
                "-//as//dtd html 3.0 aswedit + extensions//",
                "-//ietf//dtd html 2.0 level 1//",
                "-//ietf//dtd html 2.0 level 2//",
                "-//ietf//dtd html 2.0 strict level 1//",
                "-//ietf//dtd html 2.0 strict level 2//",
                "-//ietf//dtd html 2.0 strict//",
                "-//ietf//dtd html 2.0//",
                "-//ietf//dtd html 2.1e//",
                "-//ietf//dtd html 3.0//",
                "-//ietf//dtd html 3.2 final//",
                "-//ietf//dtd html 3.2//",
                "-//ietf//dtd html 3//",
                "-//ietf//dtd html level 0//",
                "-//ietf//dtd html level 1//",
                "-//ietf//dtd html level 2//",
                "-//ietf//dtd html level 3//",
                "-//ietf//dtd html strict level 0//",
                "-//ietf//dtd html strict level 1//",
                "-//ietf//dtd html strict level 2//",
                "-//ietf//dtd html strict level 3//",
                "-//ietf//dtd html strict//",
                "-//ietf//dtd html//",
                "-//metrius//dtd metrius presentational//",
                "-//microsoft//dtd internet explorer 2.0 html strict//",
                "-//microsoft//dtd internet explorer 2.0 html//",
                "-//microsoft//dtd internet explorer 2.0 tables//",
                "-//microsoft//dtd internet explorer 3.0 html strict//",
                "-//microsoft//dtd internet explorer 3.0 html//",
                "-//microsoft//dtd internet explorer 3.0 tables//",
                "-//netscape comm. corp.//dtd html//",
                "-//netscape comm. corp.//dtd strict html//",
                "-//o'reilly and associates//dtd html 2.0//",
                "-//o'reilly and associates//dtd html extended 1.0//",
                "-//o'reilly and associates//dtd html extended relaxed 1.0//",
                "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
                "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
                "-//spyglass//dtd html 2.0 extended//",
                "-//sq//dtd html 2.0 hotmetal + extensions//",
                "-//sun microsystems corp.//dtd hotjava html//",
                "-//sun microsystems corp.//dtd hotjava strict html//",
                "-//w3c//dtd html 3 1995-03-24//",
                "-//w3c//dtd html 3.2 draft//",
                "-//w3c//dtd html 3.2 final//",
                "-//w3c//dtd html 3.2//",
                "-//w3c//dtd html 3.2s draft//",
                "-//w3c//dtd html 4.0 frameset//",
                "-//w3c//dtd html 4.0 transitional//",
                "-//w3c//dtd html experimental 19960712//",
                "-//w3c//dtd html experimental 970421//",
                "-//w3c//dtd w3 html//",
                "-//w3o//dtd w3 html 3.0//",
                "-//webtechs//dtd mozilla html 2.0//",
                "-//webtechs//dtd mozilla html//",
            )
            # Public identifiers that select quirks mode on an exact match.
            quirksPublicIds = ("-//w3o//dtd w3 html strict 3.0//en//",
                               "-/w3c/dtd html 4.0 transitional/en",
                               "html")
            # Quirks without a system identifier, limited quirks with one.
            html401Prefixes = ("-//w3c//dtd html 4.01 frameset//",
                               "-//w3c//dtd html 4.01 transitional//")
            limitedQuirksPrefixes = ("-//w3c//dtd xhtml 1.0 frameset//",
                                     "-//w3c//dtd xhtml 1.0 transitional//")
            quirksSystemId = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

            name = token["name"]
            publicId = token["publicId"]
            systemId = token["systemId"]
            correct = token["correct"]

            unexpectedSystemId = (systemId is not None and
                                  systemId != "about:legacy-compat")
            if name != "html" or publicId is not None or unexpectedSystemId:
                self.parser.parseError("unknown-doctype")

            if publicId is None:
                publicId = ""

            self.tree.insertDoctype(token)

            # Comparisons below are done on the ASCII-lowercased identifier.
            if publicId != "":
                publicId = publicId.translate(asciiUpper2Lower)

            isHtml401 = publicId.startswith(html401Prefixes)

            if (not correct or name != "html" or
                    publicId.startswith(quirksPrefixes) or
                    publicId in quirksPublicIds or
                    (isHtml401 and systemId is None) or
                    (systemId and systemId.lower() == quirksSystemId)):
                self.parser.compatMode = "quirks"
            elif (publicId.startswith(limitedQuirksPrefixes) or
                  (isHtml401 and systemId is not None)):
                self.parser.compatMode = "limited quirks"

            self.parser.phase = self.parser.phases["beforeHtml"]

        def anythingElse(self):
            """Switch to quirks mode and move on to the "beforeHtml" phase."""
            parser = self.parser
            parser.compatMode = "quirks"
            parser.phase = parser.phases["beforeHtml"]

        def processCharacters(self, token):
            """Report the missing doctype; the token is reprocessed."""
            self.parser.parseError("expected-doctype-but-got-chars")
            self.anythingElse()
            return token

        def processStartTag(self, token):
            """Report the missing doctype; the token is reprocessed."""
            datavars = {"name": token["name"]}
            self.parser.parseError("expected-doctype-but-got-start-tag",
                                   datavars)
            self.anythingElse()
            return token

        def processEndTag(self, token):
            """Report the missing doctype; the token is reprocessed."""
            datavars = {"name": token["name"]}
            self.parser.parseError("expected-doctype-but-got-end-tag",
                                   datavars)
            self.anythingElse()
            return token

        def processEOF(self):
            """Report the missing doctype and ask for EOF to be reprocessed."""
            self.parser.parseError("expected-doctype-but-got-eof")
            self.anythingElse()
            return True
    627 
    class BeforeHtmlPhase(Phase):
        """Phase that makes sure a root ``html`` element is inserted.

        Anything other than comments, whitespace and stray end tags causes an
        implied ``html`` element to be inserted, after which the token is
        handed back for reprocessing in the "beforeHead" phase.
        """
        __slots__ = tuple()

        # helper methods
        def insertHtmlElement(self):
            """Insert an implied ``html`` root and move to "beforeHead"."""
            rootToken = impliedTagToken("html", "StartTag")
            self.tree.insertRoot(rootToken)
            self.parser.phase = self.parser.phases["beforeHead"]

        # other
        def processEOF(self):
            """Insert the root and ask for EOF to be reprocessed."""
            self.insertHtmlElement()
            return True

        def processComment(self, token):
            """Insert the comment directly under the document."""
            self.tree.insertComment(token, self.tree.document)

        def processSpaceCharacters(self, token):
            """Ignore whitespace."""
            pass

        def processCharacters(self, token):
            """Insert the root; the token is reprocessed."""
            self.insertHtmlElement()
            return token

        def processStartTag(self, token):
            """Insert the root; the token is reprocessed."""
            if token["name"] == "html":
                # remember that the document really started with <html>
                self.parser.firstStartTag = True
            self.insertHtmlElement()
            return token

        def processEndTag(self, token):
            """Imply the root for head/body/html/br end tags; report and drop
            any other end tag."""
            tagName = token["name"]
            if tagName in ("head", "body", "html", "br"):
                self.insertHtmlElement()
                return token
            self.parser.parseError("unexpected-end-tag-before-html",
                                   {"name": tagName})
    664 
    class BeforeHeadPhase(Phase):
        """Phase that opens the ``head`` element, implying one when needed.

        Character data, end of input, unrecognised start tags and the end
        tags ``head``/``body``/``html``/``br`` all process an implied ``head``
        start tag first; other end tags are reported as parse errors.
        """
        __slots__ = ()

        def processEOF(self):
            """Process an implied ``head`` start tag; returns True."""
            implied_head = impliedTagToken("head", "StartTag")
            self.startTagHead(implied_head)
            return True

        def processSpaceCharacters(self, token):
            """Ignore whitespace in this phase."""

        def processCharacters(self, token):
            """Process an implied ``head`` start tag and return ``token``."""
            implied_head = impliedTagToken("head", "StartTag")
            self.startTagHead(implied_head)
            return token

        def startTagHtml(self, token):
            """Delegate an ``html`` start tag to the ``inBody`` phase."""
            in_body = self.parser.phases["inBody"]
            return in_body.processStartTag(token)

        def startTagHead(self, token):
            """Insert ``head``, remember it as head pointer, go to ``inHead``."""
            tree = self.tree
            tree.insertElement(token)
            # The element just inserted is on top of the open-element stack.
            tree.headPointer = tree.openElements[-1]
            parser = self.parser
            parser.phase = parser.phases["inHead"]

        def startTagOther(self, token):
            """Process an implied ``head`` start tag and return ``token``."""
            implied_head = impliedTagToken("head", "StartTag")
            self.startTagHead(implied_head)
            return token

        def endTagImplyHead(self, token):
            """Process an implied ``head`` start tag and return ``token``."""
            implied_head = impliedTagToken("head", "StartTag")
            self.startTagHead(implied_head)
            return token

        def endTagOther(self, token):
            """Report any other end tag as a parse error."""
            details = {"name": token["name"]}
            self.parser.parseError("end-tag-after-implied-root", details)

        startTagHandler = _utils.MethodDispatcher([
            ("html", startTagHtml),
            ("head", startTagHead),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            (("head", "body", "html", "br"), endTagImplyHead),
        ])
        endTagHandler.default = endTagOther
    709 
    class InHeadPhase(Phase):
        """Phase handling tokens while a ``head`` element is being filled.

        The ``anythingElse`` fallback pops the ``head`` element and switches
        the parser to the ``afterHead`` phase.
        """
        __slots__ = ()

        # the real thing
        def processEOF(self):
            """Run the fallback at end of input; returns True."""
            self.anythingElse()
            return True

        def processCharacters(self, token):
            """Run the fallback and return ``token`` unchanged."""
            self.anythingElse()
            return token

        def startTagHtml(self, token):
            """Delegate an ``html`` start tag to the ``inBody`` phase."""
            in_body = self.parser.phases["inBody"]
            return in_body.processStartTag(token)

        def startTagHead(self, token):
            """Report a second ``head`` start tag as a parse error."""
            self.parser.parseError("two-heads-are-not-better-than-one")

        def startTagBaseLinkCommand(self, token):
            """Insert the element, pop it at once, acknowledge self-closing."""
            tree = self.tree
            tree.insertElement(token)
            tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

        def startTagMeta(self, token):
            """Insert a ``meta`` element and apply any encoding it declares.

            The encoding is only changed while the stream's current encoding
            is marked ``"tentative"``. A ``charset`` attribute wins; otherwise
            a ``content`` attribute is parsed when ``http-equiv`` equals
            ``content-type`` (case-insensitively).
            """
            tree = self.tree
            tree.insertElement(token)
            tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

            attrs = token["data"]
            stream = self.parser.tokenizer.stream
            if stream.charEncoding[1] != "tentative":
                return
            if "charset" in attrs:
                stream.changeEncoding(attrs["charset"])
                return
            declares_content_type = (
                "content" in attrs and
                "http-equiv" in attrs and
                attrs["http-equiv"].lower() == "content-type")
            if declares_content_type:
                # Encoding it as UTF-8 here is a hack, as really we should pass
                # the abstract Unicode string, and just use the
                # ContentAttrParser on that, but using UTF-8 allows all chars
                # to be encoded and, as an ASCII superset, works.
                raw = _inputstream.EncodingBytes(attrs["content"].encode("utf-8"))
                codec = _inputstream.ContentAttrParser(raw).parse()
                stream.changeEncoding(codec)

        def startTagTitle(self, token):
            """Parse the ``title`` contents as RCDATA."""
            self.parser.parseRCDataRawtext(token, "RCDATA")

        def startTagNoFramesStyle(self, token):
            """Parse ``noframes``/``style`` contents as RAWTEXT."""
            # Need to decide whether to implement the scripting-disabled case
            self.parser.parseRCDataRawtext(token, "RAWTEXT")

        def startTagNoscript(self, token):
            """Handle ``noscript`` according to ``parser.scripting``.

            With scripting on the contents are parsed as RAWTEXT; otherwise
            the element is inserted and the ``inHeadNoscript`` phase takes
            over.
            """
            parser = self.parser
            if parser.scripting:
                parser.parseRCDataRawtext(token, "RAWTEXT")
                return
            self.tree.insertElement(token)
            parser.phase = parser.phases["inHeadNoscript"]

        def startTagScript(self, token):
            """Insert ``script`` and switch tokenizer and parser to text mode.

            The current phase is saved in ``parser.originalPhase`` before the
            ``text`` phase is selected.
            """
            self.tree.insertElement(token)
            parser = self.parser
            tokenizer = parser.tokenizer
            tokenizer.state = tokenizer.scriptDataState
            parser.originalPhase = parser.phase
            parser.phase = parser.phases["text"]

        def startTagOther(self, token):
            """Run the fallback and return ``token`` unchanged."""
            self.anythingElse()
            return token

        def endTagHead(self, token):
            """Pop the ``head`` element and move to ``afterHead``."""
            parser = self.parser
            popped = parser.tree.openElements.pop()
            assert popped.name == "head", "Expected head got %s" % popped.name
            parser.phase = parser.phases["afterHead"]

        def endTagHtmlBodyBr(self, token):
            """Run the fallback and return ``token`` unchanged."""
            self.anythingElse()
            return token

        def endTagOther(self, token):
            """Report any other end tag as a parse error."""
            details = {"name": token["name"]}
            self.parser.parseError("unexpected-end-tag", details)

        def anythingElse(self):
            """Act as if a ``head`` end tag had been seen."""
            implied_end = impliedTagToken("head")
            self.endTagHead(implied_end)

        startTagHandler = _utils.MethodDispatcher([
            ("html", startTagHtml),
            ("title", startTagTitle),
            (("noframes", "style"), startTagNoFramesStyle),
            ("noscript", startTagNoscript),
            ("script", startTagScript),
            (("base", "basefont", "bgsound", "command", "link"),
             startTagBaseLinkCommand),
            ("meta", startTagMeta),
            ("head", startTagHead),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            ("head", endTagHead),
            (("br", "html", "body"), endTagHtmlBodyBr),
        ])
        endTagHandler.default = endTagOther
    811 
    class InHeadNoscriptPhase(Phase):
        """Phase handling tokens inside a ``noscript`` element in the head.

        The ``anythingElse`` fallback pops the ``noscript`` element and
        returns the parser to the ``inHead`` phase.
        """
        __slots__ = ()

        def processEOF(self):
            """Report the error, run the fallback and return True."""
            self.parser.parseError("eof-in-head-noscript")
            self.anythingElse()
            return True

        def processComment(self, token):
            """Delegate comments to the ``inHead`` phase."""
            in_head = self.parser.phases["inHead"]
            return in_head.processComment(token)

        def processCharacters(self, token):
            """Report the error, run the fallback and return ``token``."""
            self.parser.parseError("char-in-head-noscript")
            self.anythingElse()
            return token

        def processSpaceCharacters(self, token):
            """Delegate whitespace to the ``inHead`` phase."""
            in_head = self.parser.phases["inHead"]
            return in_head.processSpaceCharacters(token)

        def startTagHtml(self, token):
            """Delegate an ``html`` start tag to the ``inBody`` phase."""
            in_body = self.parser.phases["inBody"]
            return in_body.processStartTag(token)

        def startTagBaseLinkCommand(self, token):
            """Delegate head-content start tags to the ``inHead`` phase."""
            in_head = self.parser.phases["inHead"]
            return in_head.processStartTag(token)

        def startTagHeadNoscript(self, token):
            """Report ``head``/``noscript`` start tags as parse errors."""
            details = {"name": token["name"]}
            self.parser.parseError("unexpected-start-tag", details)

        def startTagOther(self, token):
            """Report the error, run the fallback and return ``token``."""
            details = {"name": token["name"]}
            self.parser.parseError("unexpected-inhead-noscript-tag", details)
            self.anythingElse()
            return token

        def endTagNoscript(self, token):
            """Pop the ``noscript`` element and go back to ``inHead``."""
            parser = self.parser
            popped = parser.tree.openElements.pop()
            assert popped.name == "noscript", "Expected noscript got %s" % popped.name
            parser.phase = parser.phases["inHead"]

        def endTagBr(self, token):
            """Report the error, run the fallback and return ``token``."""
            details = {"name": token["name"]}
            self.parser.parseError("unexpected-inhead-noscript-tag", details)
            self.anythingElse()
            return token

        def endTagOther(self, token):
            """Report any other end tag as a parse error."""
            details = {"name": token["name"]}
            self.parser.parseError("unexpected-end-tag", details)

        def anythingElse(self):
            """Act as if a ``noscript`` end tag had been seen."""
            # Caller must raise parse error first!
            implied_end = impliedTagToken("noscript")
            self.endTagNoscript(implied_end)

        startTagHandler = _utils.MethodDispatcher([
            ("html", startTagHtml),
            (("basefont", "bgsound", "link", "meta", "noframes", "style"),
             startTagBaseLinkCommand),
            (("head", "noscript"), startTagHeadNoscript),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            ("noscript", endTagNoscript),
            ("br", endTagBr),
        ])
        endTagHandler.default = endTagOther
    874 
    class AfterHeadPhase(Phase):
        """Phase handling tokens between the head and the body or frameset.

        The ``anythingElse`` fallback inserts an implied ``body`` element and
        switches the parser to the ``inBody`` phase.
        """
        __slots__ = ()

        def processEOF(self):
            """Run the fallback at end of input; returns True."""
            self.anythingElse()
            return True

        def processCharacters(self, token):
            """Run the fallback and return ``token`` unchanged."""
            self.anythingElse()
            return token

        def startTagHtml(self, token):
            """Delegate an ``html`` start tag to the ``inBody`` phase."""
            in_body = self.parser.phases["inBody"]
            return in_body.processStartTag(token)

        def startTagBody(self, token):
            """Insert an explicit ``body``, clear ``framesetOK``, go ``inBody``."""
            parser = self.parser
            parser.framesetOK = False
            self.tree.insertElement(token)
            parser.phase = parser.phases["inBody"]

        def startTagFrameset(self, token):
            """Insert ``frameset`` and move to the ``inFrameset`` phase."""
            self.tree.insertElement(token)
            parser = self.parser
            parser.phase = parser.phases["inFrameset"]

        def startTagFromHead(self, token):
            """Process head-only content that appears after the head.

            Reports a parse error, temporarily pushes the head pointer back on
            the open-element stack, lets the ``inHead`` phase process the
            token, then removes the last ``head`` entry from the stack.
            """
            details = {"name": token["name"]}
            self.parser.parseError("unexpected-start-tag-out-of-my-head", details)
            stack = self.tree.openElements
            stack.append(self.tree.headPointer)
            self.parser.phases["inHead"].processStartTag(token)
            # Search from the top of the stack; we stop right after removing.
            for open_node in reversed(stack):
                if open_node.name == "head":
                    stack.remove(open_node)
                    break

        def startTagHead(self, token):
            """Report a stray ``head`` start tag as a parse error."""
            details = {"name": token["name"]}
            self.parser.parseError("unexpected-start-tag", details)

        def startTagOther(self, token):
            """Run the fallback and return ``token`` unchanged."""
            self.anythingElse()
            return token

        def endTagHtmlBodyBr(self, token):
            """Run the fallback and return ``token`` unchanged."""
            self.anythingElse()
            return token

        def endTagOther(self, token):
            """Report any other end tag as a parse error."""
            details = {"name": token["name"]}
            self.parser.parseError("unexpected-end-tag", details)

        def anythingElse(self):
            """Insert an implied ``body``, go ``inBody``, set ``framesetOK``."""
            implied_body = impliedTagToken("body", "StartTag")
            self.tree.insertElement(implied_body)
            parser = self.parser
            parser.phase = parser.phases["inBody"]
            parser.framesetOK = True

        startTagHandler = _utils.MethodDispatcher([
            ("html", startTagHtml),
            ("body", startTagBody),
            ("frameset", startTagFrameset),
            (("base", "basefont", "bgsound", "link", "meta", "noframes",
              "script", "style", "title"),
             startTagFromHead),
            ("head", startTagHead),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            (("body", "html", "br"), endTagHtmlBodyBr),
        ])
        endTagHandler.default = endTagOther
    940 
    941    class InBodyPhase(Phase): 
    942        # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody 
    943        # the really-really-really-very crazy mode 
    944        __slots__ = ("processSpaceCharacters",) 
    945 
    946        def __init__(self, *args, **kwargs): 
    947            super(InBodyPhase, self).__init__(*args, **kwargs) 
    948            # Set this to the default handler 
    949            self.processSpaceCharacters = self.processSpaceCharactersNonPre 
    950 
    951        def isMatchingFormattingElement(self, node1, node2): 
    952            return (node1.name == node2.name and 
    953                    node1.namespace == node2.namespace and 
    954                    node1.attributes == node2.attributes) 
    955 
    956        # helper 
    957        def addFormattingElement(self, token): 
    958            self.tree.insertElement(token) 
    959            element = self.tree.openElements[-1] 
    960 
    961            matchingElements = [] 
    962            for node in self.tree.activeFormattingElements[::-1]: 
    963                if node is Marker: 
    964                    break 
    965                elif self.isMatchingFormattingElement(node, element): 
    966                    matchingElements.append(node) 
    967 
    968            assert len(matchingElements) <= 3 
    969            if len(matchingElements) == 3: 
    970                self.tree.activeFormattingElements.remove(matchingElements[-1]) 
    971            self.tree.activeFormattingElements.append(element) 
    972 
    973        # the real deal 
    974        def processEOF(self): 
    975            allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", 
    976                                          "tfoot", "th", "thead", "tr", "body", 
    977                                          "html")) 
    978            for node in self.tree.openElements[::-1]: 
    979                if node.name not in allowed_elements: 
    980                    self.parser.parseError("expected-closing-tag-but-got-eof") 
    981                    break 
    982            # Stop parsing 
    983 
    984        def processSpaceCharactersDropNewline(self, token): 
    985            # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we 
    986            # want to drop leading newlines 
    987            data = token["data"] 
    988            self.processSpaceCharacters = self.processSpaceCharactersNonPre 
    989            if (data.startswith("\n") and 
    990                self.tree.openElements[-1].name in ("pre", "listing", "textarea") and 
    991                    not self.tree.openElements[-1].hasContent()): 
    992                data = data[1:] 
    993            if data: 
    994                self.tree.reconstructActiveFormattingElements() 
    995                self.tree.insertText(data) 
    996 
    997        def processCharacters(self, token): 
    998            if token["data"] == "\u0000": 
    999                # The tokenizer should always emit null on its own 
    1000                return 
    1001            self.tree.reconstructActiveFormattingElements() 
    1002            self.tree.insertText(token["data"]) 
    1003            # This must be bad for performance 
    1004            if (self.parser.framesetOK and 
    1005                any([char not in spaceCharacters 
    1006                     for char in token["data"]])): 
    1007                self.parser.framesetOK = False 
    1008 
    1009        def processSpaceCharactersNonPre(self, token): 
    1010            self.tree.reconstructActiveFormattingElements() 
    1011            self.tree.insertText(token["data"]) 
    1012 
    1013        def startTagProcessInHead(self, token): 
    1014            return self.parser.phases["inHead"].processStartTag(token) 
    1015 
    1016        def startTagBody(self, token): 
    1017            self.parser.parseError("unexpected-start-tag", {"name": "body"}) 
    1018            if (len(self.tree.openElements) == 1 or 
    1019                    self.tree.openElements[1].name != "body"): 
    1020                assert self.parser.innerHTML 
    1021            else: 
    1022                self.parser.framesetOK = False 
    1023                for attr, value in token["data"].items(): 
    1024                    if attr not in self.tree.openElements[1].attributes: 
    1025                        self.tree.openElements[1].attributes[attr] = value 
    1026 
    1027        def startTagFrameset(self, token): 
    1028            self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) 
    1029            if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): 
    1030                assert self.parser.innerHTML 
    1031            elif not self.parser.framesetOK: 
    1032                pass 
    1033            else: 
    1034                if self.tree.openElements[1].parent: 
    1035                    self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) 
    1036                while self.tree.openElements[-1].name != "html": 
    1037                    self.tree.openElements.pop() 
    1038                self.tree.insertElement(token) 
    1039                self.parser.phase = self.parser.phases["inFrameset"] 
    1040 
    1041        def startTagCloseP(self, token): 
    1042            if self.tree.elementInScope("p", variant="button"): 
    1043                self.endTagP(impliedTagToken("p")) 
    1044            self.tree.insertElement(token) 
    1045 
    1046        def startTagPreListing(self, token): 
    1047            if self.tree.elementInScope("p", variant="button"): 
    1048                self.endTagP(impliedTagToken("p")) 
    1049            self.tree.insertElement(token) 
    1050            self.parser.framesetOK = False 
    1051            self.processSpaceCharacters = self.processSpaceCharactersDropNewline 
    1052 
    1053        def startTagForm(self, token): 
    1054            if self.tree.formPointer: 
    1055                self.parser.parseError("unexpected-start-tag", {"name": "form"}) 
    1056            else: 
    1057                if self.tree.elementInScope("p", variant="button"): 
    1058                    self.endTagP(impliedTagToken("p")) 
    1059                self.tree.insertElement(token) 
    1060                self.tree.formPointer = self.tree.openElements[-1] 
    1061 
    1062        def startTagListItem(self, token): 
    1063            self.parser.framesetOK = False 
    1064 
    1065            stopNamesMap = {"li": ["li"], 
    1066                            "dt": ["dt", "dd"], 
    1067                            "dd": ["dt", "dd"]} 
    1068            stopNames = stopNamesMap[token["name"]] 
    1069            for node in reversed(self.tree.openElements): 
    1070                if node.name in stopNames: 
    1071                    self.parser.phase.processEndTag( 
    1072                        impliedTagToken(node.name, "EndTag")) 
    1073                    break 
    1074                if (node.nameTuple in specialElements and 
    1075                        node.name not in ("address", "div", "p")): 
    1076                    break 
    1077 
    1078            if self.tree.elementInScope("p", variant="button"): 
    1079                self.parser.phase.processEndTag( 
    1080                    impliedTagToken("p", "EndTag")) 
    1081 
    1082            self.tree.insertElement(token) 
    1083 
    1084        def startTagPlaintext(self, token): 
    1085            if self.tree.elementInScope("p", variant="button"): 
    1086                self.endTagP(impliedTagToken("p")) 
    1087            self.tree.insertElement(token) 
    1088            self.parser.tokenizer.state = self.parser.tokenizer.plaintextState 
    1089 
    1090        def startTagHeading(self, token): 
    1091            if self.tree.elementInScope("p", variant="button"): 
    1092                self.endTagP(impliedTagToken("p")) 
    1093            if self.tree.openElements[-1].name in headingElements: 
    1094                self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) 
    1095                self.tree.openElements.pop() 
    1096            self.tree.insertElement(token) 
    1097 
    1098        def startTagA(self, token): 
    1099            afeAElement = self.tree.elementInActiveFormattingElements("a") 
    1100            if afeAElement: 
    1101                self.parser.parseError("unexpected-start-tag-implies-end-tag", 
    1102                                       {"startName": "a", "endName": "a"}) 
    1103                self.endTagFormatting(impliedTagToken("a")) 
    1104                if afeAElement in self.tree.openElements: 
    1105                    self.tree.openElements.remove(afeAElement) 
    1106                if afeAElement in self.tree.activeFormattingElements: 
    1107                    self.tree.activeFormattingElements.remove(afeAElement) 
    1108            self.tree.reconstructActiveFormattingElements() 
    1109            self.addFormattingElement(token) 
    1110 
    1111        def startTagFormatting(self, token): 
    1112            self.tree.reconstructActiveFormattingElements() 
    1113            self.addFormattingElement(token) 
    1114 
    1115        def startTagNobr(self, token): 
    1116            self.tree.reconstructActiveFormattingElements() 
    1117            if self.tree.elementInScope("nobr"): 
    1118                self.parser.parseError("unexpected-start-tag-implies-end-tag", 
    1119                                       {"startName": "nobr", "endName": "nobr"}) 
    1120                self.processEndTag(impliedTagToken("nobr")) 
    1121                # XXX Need tests that trigger the following 
    1122                self.tree.reconstructActiveFormattingElements() 
    1123            self.addFormattingElement(token) 
    1124 
    1125        def startTagButton(self, token): 
    1126            if self.tree.elementInScope("button"): 
    1127                self.parser.parseError("unexpected-start-tag-implies-end-tag", 
    1128                                       {"startName": "button", "endName": "button"}) 
    1129                self.processEndTag(impliedTagToken("button")) 
    1130                return token 
    1131            else: 
    1132                self.tree.reconstructActiveFormattingElements() 
    1133                self.tree.insertElement(token) 
    1134                self.parser.framesetOK = False 
    1135 
    1136        def startTagAppletMarqueeObject(self, token): 
    1137            self.tree.reconstructActiveFormattingElements() 
    1138            self.tree.insertElement(token) 
    1139            self.tree.activeFormattingElements.append(Marker) 
    1140            self.parser.framesetOK = False 
    1141 
    1142        def startTagXmp(self, token): 
    1143            if self.tree.elementInScope("p", variant="button"): 
    1144                self.endTagP(impliedTagToken("p")) 
    1145            self.tree.reconstructActiveFormattingElements() 
    1146            self.parser.framesetOK = False 
    1147            self.parser.parseRCDataRawtext(token, "RAWTEXT") 
    1148 
    1149        def startTagTable(self, token): 
    1150            if self.parser.compatMode != "quirks": 
    1151                if self.tree.elementInScope("p", variant="button"): 
    1152                    self.processEndTag(impliedTagToken("p")) 
    1153            self.tree.insertElement(token) 
    1154            self.parser.framesetOK = False 
    1155            self.parser.phase = self.parser.phases["inTable"] 
    1156 
    1157        def startTagVoidFormatting(self, token): 
    1158            self.tree.reconstructActiveFormattingElements() 
    1159            self.tree.insertElement(token) 
    1160            self.tree.openElements.pop() 
    1161            token["selfClosingAcknowledged"] = True 
    1162            self.parser.framesetOK = False 
    1163 
    1164        def startTagInput(self, token): 
    1165            framesetOK = self.parser.framesetOK 
    1166            self.startTagVoidFormatting(token) 
    1167            if ("type" in token["data"] and 
    1168                    token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): 
    1169                # input type=hidden doesn't change framesetOK 
    1170                self.parser.framesetOK = framesetOK 
    1171 
    1172        def startTagParamSource(self, token): 
    1173            self.tree.insertElement(token) 
    1174            self.tree.openElements.pop() 
    1175            token["selfClosingAcknowledged"] = True 
    1176 
    1177        def startTagHr(self, token): 
    1178            if self.tree.elementInScope("p", variant="button"): 
    1179                self.endTagP(impliedTagToken("p")) 
    1180            self.tree.insertElement(token) 
    1181            self.tree.openElements.pop() 
    1182            token["selfClosingAcknowledged"] = True 
    1183            self.parser.framesetOK = False 
    1184 
    1185        def startTagImage(self, token): 
    1186            # No really... 
    1187            self.parser.parseError("unexpected-start-tag-treated-as", 
    1188                                   {"originalName": "image", "newName": "img"}) 
    1189            self.processStartTag(impliedTagToken("img", "StartTag", 
    1190                                                 attributes=token["data"], 
    1191                                                 selfClosing=token["selfClosing"])) 
    1192 
    1193        def startTagIsIndex(self, token): 
    1194            self.parser.parseError("deprecated-tag", {"name": "isindex"}) 
    1195            if self.tree.formPointer: 
    1196                return 
    1197            form_attrs = {} 
    1198            if "action" in token["data"]: 
    1199                form_attrs["action"] = token["data"]["action"] 
    1200            self.processStartTag(impliedTagToken("form", "StartTag", 
    1201                                                 attributes=form_attrs)) 
    1202            self.processStartTag(impliedTagToken("hr", "StartTag")) 
    1203            self.processStartTag(impliedTagToken("label", "StartTag")) 
    1204            # XXX Localization ... 
    1205            if "prompt" in token["data"]: 
    1206                prompt = token["data"]["prompt"] 
    1207            else: 
    1208                prompt = "This is a searchable index. Enter search keywords: " 
    1209            self.processCharacters( 
    1210                {"type": tokenTypes["Characters"], "data": prompt}) 
    1211            attributes = token["data"].copy() 
    1212            if "action" in attributes: 
    1213                del attributes["action"] 
    1214            if "prompt" in attributes: 
    1215                del attributes["prompt"] 
    1216            attributes["name"] = "isindex" 
    1217            self.processStartTag(impliedTagToken("input", "StartTag", 
    1218                                                 attributes=attributes, 
    1219                                                 selfClosing=token["selfClosing"])) 
    1220            self.processEndTag(impliedTagToken("label")) 
    1221            self.processStartTag(impliedTagToken("hr", "StartTag")) 
    1222            self.processEndTag(impliedTagToken("form")) 
    1223 
    1224        def startTagTextarea(self, token): 
    1225            self.tree.insertElement(token) 
    1226            self.parser.tokenizer.state = self.parser.tokenizer.rcdataState 
    1227            self.processSpaceCharacters = self.processSpaceCharactersDropNewline 
    1228            self.parser.framesetOK = False 
    1229 
    1230        def startTagIFrame(self, token): 
    1231            self.parser.framesetOK = False 
    1232            self.startTagRawtext(token) 
    1233 
    1234        def startTagNoscript(self, token): 
    1235            if self.parser.scripting: 
    1236                self.startTagRawtext(token) 
    1237            else: 
    1238                self.startTagOther(token) 
    1239 
    1240        def startTagRawtext(self, token): 
    1241            """iframe, noembed noframes, noscript(if scripting enabled)""" 
    1242            self.parser.parseRCDataRawtext(token, "RAWTEXT") 
    1243 
    1244        def startTagOpt(self, token): 
    1245            if self.tree.openElements[-1].name == "option": 
    1246                self.parser.phase.processEndTag(impliedTagToken("option")) 
    1247            self.tree.reconstructActiveFormattingElements() 
    1248            self.parser.tree.insertElement(token) 
    1249 
    1250        def startTagSelect(self, token): 
    1251            self.tree.reconstructActiveFormattingElements() 
    1252            self.tree.insertElement(token) 
    1253            self.parser.framesetOK = False 
    1254            if self.parser.phase in (self.parser.phases["inTable"], 
    1255                                     self.parser.phases["inCaption"], 
    1256                                     self.parser.phases["inColumnGroup"], 
    1257                                     self.parser.phases["inTableBody"], 
    1258                                     self.parser.phases["inRow"], 
    1259                                     self.parser.phases["inCell"]): 
    1260                self.parser.phase = self.parser.phases["inSelectInTable"] 
    1261            else: 
    1262                self.parser.phase = self.parser.phases["inSelect"] 
    1263 
    1264        def startTagRpRt(self, token): 
    1265            if self.tree.elementInScope("ruby"): 
    1266                self.tree.generateImpliedEndTags() 
    1267                if self.tree.openElements[-1].name != "ruby": 
    1268                    self.parser.parseError() 
    1269            self.tree.insertElement(token) 
    1270 
    1271        def startTagMath(self, token): 
    1272            self.tree.reconstructActiveFormattingElements() 
    1273            self.parser.adjustMathMLAttributes(token) 
    1274            self.parser.adjustForeignAttributes(token) 
    1275            token["namespace"] = namespaces["mathml"] 
    1276            self.tree.insertElement(token) 
    1277            # Need to get the parse error right for the case where the token 
    1278            # has a namespace not equal to the xmlns attribute 
    1279            if token["selfClosing"]: 
    1280                self.tree.openElements.pop() 
    1281                token["selfClosingAcknowledged"] = True 
    1282 
    1283        def startTagSvg(self, token): 
    1284            self.tree.reconstructActiveFormattingElements() 
    1285            self.parser.adjustSVGAttributes(token) 
    1286            self.parser.adjustForeignAttributes(token) 
    1287            token["namespace"] = namespaces["svg"] 
    1288            self.tree.insertElement(token) 
    1289            # Need to get the parse error right for the case where the token 
    1290            # has a namespace not equal to the xmlns attribute 
    1291            if token["selfClosing"]: 
    1292                self.tree.openElements.pop() 
    1293                token["selfClosingAcknowledged"] = True 
    1294 
    1295        def startTagMisplaced(self, token): 
    1296            """ Elements that should be children of other elements that have a 
    1297            different insertion mode; here they are ignored 
    1298            "caption", "col", "colgroup", "frame", "frameset", "head", 
    1299            "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", 
    1300            "tr", "noscript" 
    1301            """ 
    1302            self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) 
    1303 
    1304        def startTagOther(self, token): 
    1305            self.tree.reconstructActiveFormattingElements() 
    1306            self.tree.insertElement(token) 
    1307 
    1308        def endTagP(self, token): 
    1309            if not self.tree.elementInScope("p", variant="button"): 
    1310                self.startTagCloseP(impliedTagToken("p", "StartTag")) 
    1311                self.parser.parseError("unexpected-end-tag", {"name": "p"}) 
    1312                self.endTagP(impliedTagToken("p", "EndTag")) 
    1313            else: 
    1314                self.tree.generateImpliedEndTags("p") 
    1315                if self.tree.openElements[-1].name != "p": 
    1316                    self.parser.parseError("unexpected-end-tag", {"name": "p"}) 
    1317                node = self.tree.openElements.pop() 
    1318                while node.name != "p": 
    1319                    node = self.tree.openElements.pop() 
    1320 
    1321        def endTagBody(self, token): 
    1322            if not self.tree.elementInScope("body"): 
    1323                self.parser.parseError() 
    1324                return 
    1325            elif self.tree.openElements[-1].name != "body": 
    1326                for node in self.tree.openElements[2:]: 
    1327                    if node.name not in frozenset(("dd", "dt", "li", "optgroup", 
    1328                                                   "option", "p", "rp", "rt", 
    1329                                                   "tbody", "td", "tfoot", 
    1330                                                   "th", "thead", "tr", "body", 
    1331                                                   "html")): 
    1332                        # Not sure this is the correct name for the parse error 
    1333                        self.parser.parseError( 
    1334                            "expected-one-end-tag-but-got-another", 
    1335                            {"gotName": "body", "expectedName": node.name}) 
    1336                        break 
    1337            self.parser.phase = self.parser.phases["afterBody"] 
    1338 
    1339        def endTagHtml(self, token): 
    1340            # We repeat the test for the body end tag token being ignored here 
    1341            if self.tree.elementInScope("body"): 
    1342                self.endTagBody(impliedTagToken("body")) 
    1343                return token 
    1344 
    1345        def endTagBlock(self, token): 
    1346            # Put us back in the right whitespace handling mode 
    1347            if token["name"] == "pre": 
    1348                self.processSpaceCharacters = self.processSpaceCharactersNonPre 
    1349            inScope = self.tree.elementInScope(token["name"]) 
    1350            if inScope: 
    1351                self.tree.generateImpliedEndTags() 
    1352            if self.tree.openElements[-1].name != token["name"]: 
    1353                self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 
    1354            if inScope: 
    1355                node = self.tree.openElements.pop() 
    1356                while node.name != token["name"]: 
    1357                    node = self.tree.openElements.pop() 
    1358 
    1359        def endTagForm(self, token): 
    1360            node = self.tree.formPointer 
    1361            self.tree.formPointer = None 
    1362            if node is None or not self.tree.elementInScope(node): 
    1363                self.parser.parseError("unexpected-end-tag", 
    1364                                       {"name": "form"}) 
    1365            else: 
    1366                self.tree.generateImpliedEndTags() 
    1367                if self.tree.openElements[-1] != node: 
    1368                    self.parser.parseError("end-tag-too-early-ignored", 
    1369                                           {"name": "form"}) 
    1370                self.tree.openElements.remove(node) 
    1371 
    1372        def endTagListItem(self, token): 
    1373            if token["name"] == "li": 
    1374                variant = "list" 
    1375            else: 
    1376                variant = None 
    1377            if not self.tree.elementInScope(token["name"], variant=variant): 
    1378                self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 
    1379            else: 
    1380                self.tree.generateImpliedEndTags(exclude=token["name"]) 
    1381                if self.tree.openElements[-1].name != token["name"]: 
    1382                    self.parser.parseError( 
    1383                        "end-tag-too-early", 
    1384                        {"name": token["name"]}) 
    1385                node = self.tree.openElements.pop() 
    1386                while node.name != token["name"]: 
    1387                    node = self.tree.openElements.pop() 
    1388 
    1389        def endTagHeading(self, token): 
    1390            for item in headingElements: 
    1391                if self.tree.elementInScope(item): 
    1392                    self.tree.generateImpliedEndTags() 
    1393                    break 
    1394            if self.tree.openElements[-1].name != token["name"]: 
    1395                self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 
    1396 
    1397            for item in headingElements: 
    1398                if self.tree.elementInScope(item): 
    1399                    item = self.tree.openElements.pop() 
    1400                    while item.name not in headingElements: 
    1401                        item = self.tree.openElements.pop() 
    1402                    break 
    1403 
        def endTagFormatting(self, token):
            """The much-feared adoption agency algorithm

            Handles the end tag of a formatting element by re-parenting
            misnested content.  The outer loop runs at most eight times; each
            pass either returns early or moves the "furthest block" under a
            clone of the formatting element and updates both the stack of open
            elements and the list of active formatting elements.

            :arg token: the end tag token; only ``token["name"]`` is read here

            :returns: ``None``
            """
            # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
            # XXX Better parseError messages appreciated.

            # Step 1
            outerLoopCounter = 0

            # Step 2
            while outerLoopCounter < 8:

                # Step 3
                outerLoopCounter += 1

                # Step 4:

                # Let the formatting element be the last element in
                # the list of active formatting elements that:
                # - is between the end of the list and the last scope
                # marker in the list, if any, or the start of the list
                # otherwise, and
                # - has the same tag name as the token.
                formattingElement = self.tree.elementInActiveFormattingElements(
                    token["name"])
                if (not formattingElement or
                    (formattingElement in self.tree.openElements and
                     not self.tree.elementInScope(formattingElement.name))):
                    # If there is no such node, then abort these steps
                    # and instead act as described in the "any other
                    # end tag" entry below.
                    # Note that an element which is open but not in scope
                    # is also routed to endTagOther by this condition.
                    self.endTagOther(token)
                    return

                # Otherwise, if there is such a node, but that node is
                # not in the stack of open elements, then this is a
                # parse error; remove the element from the list, and
                # abort these steps.
                elif formattingElement not in self.tree.openElements:
                    self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
                    self.tree.activeFormattingElements.remove(formattingElement)
                    return

                # Otherwise, if there is such a node, and that node is
                # also in the stack of open elements, but the element
                # is not in scope, then this is a parse error; ignore
                # the token, and abort these steps.
                # NOTE(review): this branch looks unreachable, because the
                # first condition above already catches "in openElements and
                # not in scope" -- confirm before relying on this error code.
                elif not self.tree.elementInScope(formattingElement.name):
                    self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
                    return

                # Otherwise, there is a formatting element and that
                # element is in the stack and is in scope. If the
                # element is not the current node, this is a parse
                # error. In any case, proceed with the algorithm as
                # written in the following steps.
                else:
                    if formattingElement != self.tree.openElements[-1]:
                        self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

                # Step 5:

                # Let the furthest block be the topmost node in the
                # stack of open elements that is lower in the stack
                # than the formatting element, and is an element in
                # the special category. There might not be one.
                afeIndex = self.tree.openElements.index(formattingElement)
                furthestBlock = None
                # The slice starts at the formatting element itself, so it is
                # tested against specialElements as well.
                for element in self.tree.openElements[afeIndex:]:
                    if element.nameTuple in specialElements:
                        furthestBlock = element
                        break

                # Step 6:

                # If there is no furthest block, then the UA must
                # first pop all the nodes from the bottom of the stack
                # of open elements, from the current node up to and
                # including the formatting element, then remove the
                # formatting element from the list of active
                # formatting elements, and finally abort these steps.
                if furthestBlock is None:
                    element = self.tree.openElements.pop()
                    while element != formattingElement:
                        element = self.tree.openElements.pop()
                    self.tree.activeFormattingElements.remove(element)
                    return

                # Step 7
                # The common ancestor is the element immediately before the
                # formatting element in the stack of open elements.
                commonAncestor = self.tree.openElements[afeIndex - 1]

                # Step 8:
                # The bookmark is supposed to help us identify where to reinsert
                # nodes in step 15. We have to ensure that we reinsert nodes after
                # the node before the active formatting element. Note the bookmark
                # can move in step 9.7
                bookmark = self.tree.activeFormattingElements.index(formattingElement)

                # Step 9
                lastNode = node = furthestBlock
                innerLoopCounter = 0

                # Walk up the stack from the furthest block towards the
                # formatting element, for at most three iterations.
                index = self.tree.openElements.index(node)
                while innerLoopCounter < 3:
                    innerLoopCounter += 1
                    # Node is element before node in open elements
                    index -= 1
                    node = self.tree.openElements[index]
                    if node not in self.tree.activeFormattingElements:
                        # Not a formatting element: drop it from the stack and
                        # look at the next element up.
                        self.tree.openElements.remove(node)
                        continue
                    # Step 9.6
                    if node == formattingElement:
                        break
                    # Step 9.7
                    if lastNode == furthestBlock:
                        bookmark = self.tree.activeFormattingElements.index(node) + 1
                    # Step 9.8
                    clone = node.cloneNode()
                    # Replace node with clone
                    self.tree.activeFormattingElements[
                        self.tree.activeFormattingElements.index(node)] = clone
                    self.tree.openElements[
                        self.tree.openElements.index(node)] = clone
                    node = clone
                    # Step 9.9
                    # Remove lastNode from its parents, if any
                    if lastNode.parent:
                        lastNode.parent.removeChild(lastNode)
                    node.appendChild(lastNode)
                    # Step 9.10
                    lastNode = node

                # Step 10
                # Foster parent lastNode if commonAncestor is a
                # table, tbody, tfoot, thead, or tr we need to foster
                # parent the lastNode
                if lastNode.parent:
                    lastNode.parent.removeChild(lastNode)

                if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
                    parent, insertBefore = self.tree.getTableMisnestedNodePosition()
                    parent.insertBefore(lastNode, insertBefore)
                else:
                    commonAncestor.appendChild(lastNode)

                # Step 11
                clone = formattingElement.cloneNode()

                # Step 12
                furthestBlock.reparentChildren(clone)

                # Step 13
                furthestBlock.appendChild(clone)

                # Step 14
                # The clone takes the formatting element's place in the list
                # of active formatting elements, at the bookmarked position.
                self.tree.activeFormattingElements.remove(formattingElement)
                self.tree.activeFormattingElements.insert(bookmark, clone)

                # Step 15
                # In the stack of open elements the clone goes directly after
                # the furthest block.
                self.tree.openElements.remove(formattingElement)
                self.tree.openElements.insert(
                    self.tree.openElements.index(furthestBlock) + 1, clone)
    1566 
    1567        def endTagAppletMarqueeObject(self, token): 
    1568            if self.tree.elementInScope(token["name"]): 
    1569                self.tree.generateImpliedEndTags() 
    1570            if self.tree.openElements[-1].name != token["name"]: 
    1571                self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 
    1572 
    1573            if self.tree.elementInScope(token["name"]): 
    1574                element = self.tree.openElements.pop() 
    1575                while element.name != token["name"]: 
    1576                    element = self.tree.openElements.pop() 
    1577                self.tree.clearActiveFormattingElements() 
    1578 
    1579        def endTagBr(self, token): 
    1580            self.parser.parseError("unexpected-end-tag-treated-as", 
    1581                                   {"originalName": "br", "newName": "br element"}) 
    1582            self.tree.reconstructActiveFormattingElements() 
    1583            self.tree.insertElement(impliedTagToken("br", "StartTag")) 
    1584            self.tree.openElements.pop() 
    1585 
    1586        def endTagOther(self, token): 
    1587            for node in self.tree.openElements[::-1]: 
    1588                if node.name == token["name"]: 
    1589                    self.tree.generateImpliedEndTags(exclude=token["name"]) 
    1590                    if self.tree.openElements[-1].name != token["name"]: 
    1591                        self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 
    1592                    while self.tree.openElements.pop() != node: 
    1593                        pass 
    1594                    break 
    1595                else: 
    1596                    if node.nameTuple in specialElements: 
    1597                        self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 
    1598                        break 
    1599 
    1600        startTagHandler = _utils.MethodDispatcher([ 
    1601            ("html", Phase.startTagHtml), 
    1602            (("base", "basefont", "bgsound", "command", "link", "meta", 
    1603              "script", "style", "title"), 
    1604             startTagProcessInHead), 
    1605            ("body", startTagBody), 
    1606            ("frameset", startTagFrameset), 
    1607            (("address", "article", "aside", "blockquote", "center", "details", 
    1608              "dir", "div", "dl", "fieldset", "figcaption", "figure", 
    1609              "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", 
    1610              "section", "summary", "ul"), 
    1611             startTagCloseP), 
    1612            (headingElements, startTagHeading), 
    1613            (("pre", "listing"), startTagPreListing), 
    1614            ("form", startTagForm), 
    1615            (("li", "dd", "dt"), startTagListItem), 
    1616            ("plaintext", startTagPlaintext), 
    1617            ("a", startTagA), 
    1618            (("b", "big", "code", "em", "font", "i", "s", "small", "strike", 
    1619              "strong", "tt", "u"), startTagFormatting), 
    1620            ("nobr", startTagNobr), 
    1621            ("button", startTagButton), 
    1622            (("applet", "marquee", "object"), startTagAppletMarqueeObject), 
    1623            ("xmp", startTagXmp), 
    1624            ("table", startTagTable), 
    1625            (("area", "br", "embed", "img", "keygen", "wbr"), 
    1626             startTagVoidFormatting), 
    1627            (("param", "source", "track"), startTagParamSource), 
    1628            ("input", startTagInput), 
    1629            ("hr", startTagHr), 
    1630            ("image", startTagImage), 
    1631            ("isindex", startTagIsIndex), 
    1632            ("textarea", startTagTextarea), 
    1633            ("iframe", startTagIFrame), 
    1634            ("noscript", startTagNoscript), 
    1635            (("noembed", "noframes"), startTagRawtext), 
    1636            ("select", startTagSelect), 
    1637            (("rp", "rt"), startTagRpRt), 
    1638            (("option", "optgroup"), startTagOpt), 
    1639            (("math"), startTagMath), 
    1640            (("svg"), startTagSvg), 
    1641            (("caption", "col", "colgroup", "frame", "head", 
    1642              "tbody", "td", "tfoot", "th", "thead", 
    1643              "tr"), startTagMisplaced) 
    1644        ]) 
    1645        startTagHandler.default = startTagOther 
    1646 
    1647        endTagHandler = _utils.MethodDispatcher([ 
    1648            ("body", endTagBody), 
    1649            ("html", endTagHtml), 
    1650            (("address", "article", "aside", "blockquote", "button", "center", 
    1651              "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", 
    1652              "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", 
    1653              "section", "summary", "ul"), endTagBlock), 
    1654            ("form", endTagForm), 
    1655            ("p", endTagP), 
    1656            (("dd", "dt", "li"), endTagListItem), 
    1657            (headingElements, endTagHeading), 
    1658            (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", 
    1659              "strike", "strong", "tt", "u"), endTagFormatting), 
    1660            (("applet", "marquee", "object"), endTagAppletMarqueeObject), 
    1661            ("br", endTagBr), 
    1662        ]) 
    1663        endTagHandler.default = endTagOther 
    1664 
    class TextPhase(Phase):
        """Phase used while consuming the text content of an element.

        Characters are inserted as text; an end tag (or EOF) pops the current
        element and restores ``parser.originalPhase``.  Start tags are not
        expected here (the assertion message names RCDATA/RAWTEXT mode).
        """
        __slots__ = ()

        def processCharacters(self, token):
            """Insert the token's character data into the tree."""
            self.tree.insertText(token["data"])

        def processEOF(self):
            """Report the unclosed element, pop it and restore the original
            phase.  Returns ``True``.
            """
            openElements = self.tree.openElements
            self.parser.parseError("expected-named-closing-tag-but-got-eof",
                                   {"name": openElements[-1].name})
            openElements.pop()
            self.parser.phase = self.parser.originalPhase
            return True

        def startTagOther(self, token):
            """Start tags must never be dispatched to this phase."""
            assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

        def endTagScript(self, token):
            """Pop the current node, which must be ``script``, and restore the
            original phase.
            """
            popped = self.tree.openElements.pop()
            assert popped.name == "script"
            self.parser.phase = self.parser.originalPhase
            # The rest of this method is all stuff that only happens if
            # document.write works

        def endTagOther(self, token):
            """Pop the current node and restore the original phase."""
            self.tree.openElements.pop()
            self.parser.phase = self.parser.originalPhase

        # No start tag has a dedicated handler in this phase.
        startTagHandler = _utils.MethodDispatcher([])
        startTagHandler.default = startTagOther
        endTagHandler = _utils.MethodDispatcher([
            ("script", endTagScript),
        ])
        endTagHandler.default = endTagOther
    1697 
    class InTablePhase(Phase):
        """Phase for tokens seen directly inside a ``table`` element.

        Table structure tags switch to the matching table sub-phase.  Anything
        else is handed to the "inBody" phase with ``tree.insertFromTable`` set
        for the duration of the call; the flag is cleared again even when the
        delegated handler raises.
        """
        # http://www.whatwg.org/specs/web-apps/current-work/#in-table
        __slots__ = tuple()

        # helper methods
        def clearStackToTableContext(self):
            """Pop open elements until the current node is ``table`` or
            ``html``.
            """
            # "clear the stack back to a table context"
            while self.tree.openElements[-1].name not in ("table", "html"):
                self.tree.openElements.pop()
            # When the current node is <html> it's an innerHTML case

        # processing methods
        def processEOF(self):
            """Report EOF inside a table unless the current node is ``html``
            (which is only expected when parsing a fragment).
            """
            if self.tree.openElements[-1].name != "html":
                self.parser.parseError("eof-in-table")
            else:
                assert self.parser.innerHTML
            # Stop parsing

        def processSpaceCharacters(self, token):
            """Switch to the "inTableText" phase, remembering the phase to
            return to, and let it handle the whitespace token.
            """
            originalPhase = self.parser.phase
            self.parser.phase = self.parser.phases["inTableText"]
            self.parser.phase.originalPhase = originalPhase
            self.parser.phase.processSpaceCharacters(token)

        def processCharacters(self, token):
            """Switch to the "inTableText" phase, remembering the phase to
            return to, and let it handle the character token.
            """
            originalPhase = self.parser.phase
            self.parser.phase = self.parser.phases["inTableText"]
            self.parser.phase.originalPhase = originalPhase
            self.parser.phase.processCharacters(token)

        def insertText(self, token):
            """Insert non-whitespace text found in a table via the "inBody"
            phase with ``tree.insertFromTable`` set.
            """
            # If we get here there must be at least one non-whitespace character
            # Do the table magic!
            self.tree.insertFromTable = True
            try:
                self.parser.phases["inBody"].processCharacters(token)
            finally:
                # Never leave the flag set if the delegated handler raises.
                self.tree.insertFromTable = False

        def startTagCaption(self, token):
            """Insert a ``caption`` element, push a formatting marker and
            switch to the "inCaption" phase.
            """
            self.clearStackToTableContext()
            self.tree.activeFormattingElements.append(Marker)
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inCaption"]

        def startTagColgroup(self, token):
            """Insert a ``colgroup`` element and switch to the "inColumnGroup"
            phase.
            """
            self.clearStackToTableContext()
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inColumnGroup"]

        def startTagCol(self, token):
            """Imply a ``colgroup`` start tag, then return ``token`` so that it
            is processed again.
            """
            self.startTagColgroup(impliedTagToken("colgroup", "StartTag"))
            return token

        def startTagRowGroup(self, token):
            """Insert a ``tbody``/``tfoot``/``thead`` element and switch to the
            "inTableBody" phase.
            """
            self.clearStackToTableContext()
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inTableBody"]

        def startTagImplyTbody(self, token):
            """Imply a ``tbody`` start tag, then return ``token`` so that it is
            processed again.
            """
            self.startTagRowGroup(impliedTagToken("tbody", "StartTag"))
            return token

        def startTagTable(self, token):
            """Treat a nested ``<table>`` as an implied ``</table>``; outside of
            fragment parsing the token is returned for reprocessing.
            """
            self.parser.parseError("unexpected-start-tag-implies-end-tag",
                                   {"startName": "table", "endName": "table"})
            self.parser.phase.processEndTag(impliedTagToken("table"))
            if not self.parser.innerHTML:
                return token

        def startTagStyleScript(self, token):
            """Delegate ``style``/``script`` start tags to the "inHead" phase."""
            return self.parser.phases["inHead"].processStartTag(token)

        def startTagInput(self, token):
            """Insert and immediately pop a hidden ``input`` (with a parse
            error); any other ``input`` is handled by startTagOther.
            """
            if ("type" in token["data"] and
                    token["data"]["type"].translate(asciiUpper2Lower) == "hidden"):
                self.parser.parseError("unexpected-hidden-input-in-table")
                self.tree.insertElement(token)
                # XXX associate with form
                self.tree.openElements.pop()
            else:
                self.startTagOther(token)

        def startTagForm(self, token):
            """Report a parse error; if no form pointer is set, insert the
            ``form`` element, remember it as the form pointer and pop it.
            """
            self.parser.parseError("unexpected-form-in-table")
            if self.tree.formPointer is None:
                self.tree.insertElement(token)
                self.tree.formPointer = self.tree.openElements[-1]
                self.tree.openElements.pop()

        def startTagOther(self, token):
            """Report a parse error and process the start tag via the "inBody"
            phase with ``tree.insertFromTable`` set.
            """
            self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})
            # Do the table magic!
            self.tree.insertFromTable = True
            try:
                self.parser.phases["inBody"].processStartTag(token)
            finally:
                # Never leave the flag set if the delegated handler raises.
                self.tree.insertFromTable = False

        def endTagTable(self, token):
            """Close the ``table`` element if one is in table scope and reset
            the insertion mode; otherwise report a parse error (fragment case).
            """
            if self.tree.elementInScope("table", variant="table"):
                self.tree.generateImpliedEndTags()
                if self.tree.openElements[-1].name != "table":
                    self.parser.parseError("end-tag-too-early-named",
                                           {"gotName": "table",
                                            "expectedName": self.tree.openElements[-1].name})
                while self.tree.openElements[-1].name != "table":
                    self.tree.openElements.pop()
                self.tree.openElements.pop()
                self.parser.resetInsertionMode()
            else:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()

        def endTagIgnore(self, token):
            """Report and ignore an end tag that is not allowed here."""
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        def endTagOther(self, token):
            """Report a parse error and process the end tag via the "inBody"
            phase with ``tree.insertFromTable`` set.
            """
            self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})
            # Do the table magic!
            self.tree.insertFromTable = True
            try:
                self.parser.phases["inBody"].processEndTag(token)
            finally:
                # Never leave the flag set if the delegated handler raises.
                self.tree.insertFromTable = False

        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            ("caption", startTagCaption),
            ("colgroup", startTagColgroup),
            ("col", startTagCol),
            (("tbody", "tfoot", "thead"), startTagRowGroup),
            (("td", "th", "tr"), startTagImplyTbody),
            ("table", startTagTable),
            (("style", "script"), startTagStyleScript),
            ("input", startTagInput),
            ("form", startTagForm)
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            ("table", endTagTable),
            (("body", "caption", "col", "colgroup", "html", "tbody", "td",
              "tfoot", "th", "thead", "tr"), endTagIgnore)
        ])
        endTagHandler.default = endTagOther
    1842 
    class InTableTextPhase(Phase):
        """Phase that buffers character tokens seen inside a table.

        Tokens are collected in ``characterTokens`` and flushed when any
        non-character token (or EOF) arrives; the parser then returns to
        ``originalPhase``.
        """
        __slots__ = ("originalPhase", "characterTokens")

        def __init__(self, *args, **kwargs):
            super(InTableTextPhase, self).__init__(*args, **kwargs)
            self.originalPhase = None
            self.characterTokens = []

        def flushCharacters(self):
            """Insert the buffered text and empty the buffer.

            Text containing any non-space character is handed to the "inTable"
            phase's ``insertText``; non-empty pure whitespace is inserted
            directly into the tree.
            """
            text = "".join(tok["data"] for tok in self.characterTokens)
            if any(char not in spaceCharacters for char in text):
                self.parser.phases["inTable"].insertText(
                    {"type": tokenTypes["Characters"], "data": text})
            elif text:
                self.tree.insertText(text)
            self.characterTokens = []

        def processComment(self, token):
            """Flush, restore the original phase and return the token for
            reprocessing.
            """
            self.flushCharacters()
            self.parser.phase = self.originalPhase
            return token

        def processEOF(self):
            """Flush, restore the original phase and return ``True``."""
            self.flushCharacters()
            self.parser.phase = self.originalPhase
            return True

        def processCharacters(self, token):
            """Buffer the token, unless its data is a single NUL character."""
            if token["data"] != "\u0000":
                self.characterTokens.append(token)

        def processSpaceCharacters(self, token):
            """Buffer a whitespace token."""
            self.characterTokens.append(token)

        def processStartTag(self, token):
            """Flush, restore the original phase and return the token for
            reprocessing.
            """
            self.flushCharacters()
            self.parser.phase = self.originalPhase
            return token

        def processEndTag(self, token):
            """Flush, restore the original phase and return the token for
            reprocessing.
            """
            self.flushCharacters()
            self.parser.phase = self.originalPhase
            return token
    1889 
    class InCaptionPhase(Phase):
        """Token handlers for the "in caption" phase.

        See http://www.whatwg.org/specs/web-apps/current-work/#in-caption
        """
        __slots__ = ()

        def ignoreEndTagCaption(self):
            """Return True when no "caption" element is in table scope."""
            captionInScope = self.tree.elementInScope("caption", variant="table")
            return not captionInScope

        def processEOF(self):
            """Hand EOF to the "inBody" phase; its result is not propagated."""
            bodyPhase = self.parser.phases["inBody"]
            bodyPhase.processEOF()

        def processCharacters(self, token):
            """Delegate character tokens to the "inBody" phase."""
            bodyPhase = self.parser.phases["inBody"]
            return bodyPhase.processCharacters(token)

        def startTagTableElement(self, token):
            """Report a parse error, imply </caption>, then hand the token back.

            The token is returned only when the implied end tag was not
            ignored; otherwise nothing is returned.
            """
            self.parser.parseError()
            # The scope check must be made before the implied end tag is
            # processed, since processing it pops elements off the stack.
            captionMissing = self.ignoreEndTagCaption()
            self.parser.phase.processEndTag(impliedTagToken("caption"))
            if captionMissing:
                return None
            return token

        def startTagOther(self, token):
            """Delegate any other start tag to the "inBody" phase."""
            bodyPhase = self.parser.phases["inBody"]
            return bodyPhase.processStartTag(token)

        def endTagCaption(self, token):
            """Close the open caption and switch the parser to "inTable"."""
            if self.ignoreEndTagCaption():
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()
                return

            # AT this code is quite similar to endTagTable in "InTable"
            self.tree.generateImpliedEndTags()
            stack = self.tree.openElements
            currentName = stack[-1].name
            if currentName != "caption":
                self.parser.parseError("expected-one-end-tag-but-got-another",
                                       {"gotName": "caption",
                                        "expectedName": currentName})
            # Pop everything up to and including the caption element.
            while stack.pop().name != "caption":
                pass
            self.tree.clearActiveFormattingElements()
            self.parser.phase = self.parser.phases["inTable"]

        def endTagTable(self, token):
            """Report a parse error, imply </caption>, then hand </table> back.

            The token is returned only when the implied end tag was not
            ignored.
            """
            self.parser.parseError()
            captionMissing = self.ignoreEndTagCaption()
            self.parser.phase.processEndTag(impliedTagToken("caption"))
            return None if captionMissing else token

        def endTagIgnore(self, token):
            """Report the end tag as unexpected and otherwise ignore it."""
            tagName = token["name"]
            self.parser.parseError("unexpected-end-tag", {"name": tagName})

        def endTagOther(self, token):
            """Delegate any other end tag to the "inBody" phase."""
            bodyPhase = self.parser.phases["inBody"]
            return bodyPhase.processEndTag(token)

        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"),
             startTagTableElement),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            ("caption", endTagCaption),
            ("table", endTagTable),
            (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
              "thead", "tr"),
             endTagIgnore),
        ])
        endTagHandler.default = endTagOther
    1959 
    class InColumnGroupPhase(Phase):
        """Token handlers for the "in column group" phase.

        See http://www.whatwg.org/specs/web-apps/current-work/#in-column
        """
        __slots__ = ()

        def ignoreEndTagColgroup(self):
            """Return True when the last open element is named "html"."""
            currentNode = self.tree.openElements[-1]
            return currentNode.name == "html"

        def _impliedColgroupEnd(self, token):
            """Act as if </colgroup> had been seen, then hand ``token`` back.

            ``token`` is returned only when the implied end tag was not
            ignored; otherwise None is returned.
            """
            # Check before closing: closing pops the stack and changes the answer.
            ignored = self.ignoreEndTagColgroup()
            self.endTagColgroup(impliedTagToken("colgroup"))
            return None if ignored else token

        def processEOF(self):
            """Close the column group on EOF and return True, unless only "html" is current."""
            if self.tree.openElements[-1].name == "html":
                assert self.parser.innerHTML
                return None
            ignored = self.ignoreEndTagColgroup()
            self.endTagColgroup(impliedTagToken("colgroup"))
            return None if ignored else True

        def processCharacters(self, token):
            """Imply </colgroup> and hand the character token back."""
            return self._impliedColgroupEnd(token)

        def startTagCol(self, token):
            """Insert a "col" element and pop it again straight away."""
            self.tree.insertElement(token)
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

        def startTagOther(self, token):
            """Imply </colgroup> and hand the start tag back."""
            return self._impliedColgroupEnd(token)

        def endTagColgroup(self, token):
            """Pop the current element and switch the parser to "inTable"."""
            if not self.ignoreEndTagColgroup():
                self.tree.openElements.pop()
                self.parser.phase = self.parser.phases["inTable"]
                return
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

        def endTagCol(self, token):
            """Report </col> as a parse error."""
            self.parser.parseError("no-end-tag", {"name": "col"})

        def endTagOther(self, token):
            """Imply </colgroup> and hand the end tag back."""
            return self._impliedColgroupEnd(token)

        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            ("col", startTagCol),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            ("colgroup", endTagColgroup),
            ("col", endTagCol),
        ])
        endTagHandler.default = endTagOther
    2023 
    class InTableBodyPhase(Phase):
        """Token handlers for the "in table body" phase.

        See http://www.whatwg.org/specs/web-apps/current-work/#in-table0
        """
        __slots__ = ()

        # helper methods
        def clearStackToTableBodyContext(self):
            """Pop open elements until a tbody, tfoot, thead or html element is current."""
            stack = self.tree.openElements
            stopNames = ("tbody", "tfoot", "thead", "html")
            while stack[-1].name not in stopNames:
                # self.parser.parseError("unexpected-implied-end-tag-in-table",
                #  {"name": self.tree.openElements[-1].name})
                stack.pop()
            if stack[-1].name == "html":
                assert self.parser.innerHTML

        def _closeRowGroupThen(self, token):
            """Close the open row group and hand ``token`` back.

            When no tbody, thead or tfoot is in table scope a parse error is
            reported and nothing is returned.
            """
            tree = self.tree
            rowGroupOpen = (tree.elementInScope("tbody", variant="table") or
                            tree.elementInScope("thead", variant="table") or
                            tree.elementInScope("tfoot", variant="table"))
            if not rowGroupOpen:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()
                return None
            self.clearStackToTableBodyContext()
            # After clearing, the current node names the row group to close.
            currentName = self.tree.openElements[-1].name
            self.endTagTableRowGroup(impliedTagToken(currentName))
            return token

        # the rest
        def processEOF(self):
            """Hand EOF to the "inTable" phase; its result is not propagated."""
            tablePhase = self.parser.phases["inTable"]
            tablePhase.processEOF()

        def processSpaceCharacters(self, token):
            """Delegate space characters to the "inTable" phase."""
            tablePhase = self.parser.phases["inTable"]
            return tablePhase.processSpaceCharacters(token)

        def processCharacters(self, token):
            """Delegate character tokens to the "inTable" phase."""
            tablePhase = self.parser.phases["inTable"]
            return tablePhase.processCharacters(token)

        def startTagTr(self, token):
            """Insert a row element and switch the parser to "inRow"."""
            self.clearStackToTableBodyContext()
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inRow"]

        def startTagTableCell(self, token):
            """Report the stray cell, imply a <tr>, then hand the cell token back."""
            tagName = token["name"]
            self.parser.parseError("unexpected-cell-in-table-body",
                                   {"name": tagName})
            self.startTagTr(impliedTagToken("tr", "StartTag"))
            return token

        def startTagTableOther(self, token):
            """Close the current row group, then hand the start tag back."""
            # XXX AT Any ideas on how to share this with endTagTable?
            return self._closeRowGroupThen(token)

        def startTagOther(self, token):
            """Delegate any other start tag to the "inTable" phase."""
            tablePhase = self.parser.phases["inTable"]
            return tablePhase.processStartTag(token)

        def endTagTableRowGroup(self, token):
            """Close the named row group and switch the parser to "inTable"."""
            tagName = token["name"]
            if not self.tree.elementInScope(tagName, variant="table"):
                self.parser.parseError("unexpected-end-tag-in-table-body",
                                       {"name": tagName})
                return
            self.clearStackToTableBodyContext()
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]

        def endTagTable(self, token):
            """Close the current row group, then hand </table> back."""
            return self._closeRowGroupThen(token)

        def endTagIgnore(self, token):
            """Report the end tag as unexpected and otherwise ignore it."""
            tagName = token["name"]
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": tagName})

        def endTagOther(self, token):
            """Delegate any other end tag to the "inTable" phase."""
            tablePhase = self.parser.phases["inTable"]
            return tablePhase.processEndTag(token)

        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            ("tr", startTagTr),
            (("td", "th"), startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
             startTagTableOther),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            (("tbody", "tfoot", "thead"), endTagTableRowGroup),
            ("table", endTagTable),
            (("body", "caption", "col", "colgroup", "html", "td", "th", "tr"),
             endTagIgnore),
        ])
        endTagHandler.default = endTagOther
    2121 
    class InRowPhase(Phase):
        """Token handlers for the "in row" phase.

        See http://www.whatwg.org/specs/web-apps/current-work/#in-row
        """
        __slots__ = ()

        # helper methods (XXX unify this with other table helper methods)
        def clearStackToTableRowContext(self):
            """Pop open elements until a tr or html element is current.

            A parse error is reported for every element popped.
            """
            stack = self.tree.openElements
            while True:
                topName = stack[-1].name
                if topName in ("tr", "html"):
                    break
                self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                       {"name": topName})
                stack.pop()

        def ignoreEndTagTr(self):
            """Return True when no "tr" element is in table scope."""
            rowInScope = self.tree.elementInScope("tr", variant="table")
            return not rowInScope

        # the rest
        def processEOF(self):
            """Hand EOF to the "inTable" phase; its result is not propagated."""
            tablePhase = self.parser.phases["inTable"]
            tablePhase.processEOF()

        def processSpaceCharacters(self, token):
            """Delegate space characters to the "inTable" phase."""
            tablePhase = self.parser.phases["inTable"]
            return tablePhase.processSpaceCharacters(token)

        def processCharacters(self, token):
            """Delegate character tokens to the "inTable" phase."""
            tablePhase = self.parser.phases["inTable"]
            return tablePhase.processCharacters(token)

        def startTagTableCell(self, token):
            """Insert a cell element and switch the parser to "inCell"."""
            self.clearStackToTableRowContext()
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inCell"]
            self.tree.activeFormattingElements.append(Marker)

        def startTagTableOther(self, token):
            """Imply </tr>, then hand the start tag back unless that was ignored."""
            rowMissing = self.ignoreEndTagTr()
            self.endTagTr(impliedTagToken("tr"))
            # XXX how are we sure it's always ignored in the innerHTML case?
            if rowMissing:
                return None
            return token

        def startTagOther(self, token):
            """Delegate any other start tag to the "inTable" phase."""
            tablePhase = self.parser.phases["inTable"]
            return tablePhase.processStartTag(token)

        def endTagTr(self, token):
            """Close the open row and switch the parser to "inTableBody"."""
            if self.ignoreEndTagTr():
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()
                return
            self.clearStackToTableRowContext()
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTableBody"]

        def endTagTable(self, token):
            """Imply </tr>, then hand </table> back unless that was ignored."""
            rowMissing = self.ignoreEndTagTr()
            self.endTagTr(impliedTagToken("tr"))
            # Reprocess the current tag if the tr end tag was not ignored
            # XXX how are we sure it's always ignored in the innerHTML case?
            return None if rowMissing else token

        def endTagTableRowGroup(self, token):
            """Imply </tr> and hand the row-group end tag back when it is in table scope."""
            if not self.tree.elementInScope(token["name"], variant="table"):
                self.parser.parseError()
                return None
            self.endTagTr(impliedTagToken("tr"))
            return token

        def endTagIgnore(self, token):
            """Report the end tag as unexpected and otherwise ignore it."""
            tagName = token["name"]
            self.parser.parseError("unexpected-end-tag-in-table-row",
                                   {"name": tagName})

        def endTagOther(self, token):
            """Delegate any other end tag to the "inTable" phase."""
            tablePhase = self.parser.phases["inTable"]
            return tablePhase.processEndTag(token)

        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            (("td", "th"), startTagTableCell),
            (("caption", "col", "colgroup", "tbody", "tfoot", "thead", "tr"),
             startTagTableOther),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            ("tr", endTagTr),
            ("table", endTagTable),
            (("tbody", "tfoot", "thead"), endTagTableRowGroup),
            (("body", "caption", "col", "colgroup", "html", "td", "th"),
             endTagIgnore),
        ])
        endTagHandler.default = endTagOther
    2210 
    class InCellPhase(Phase):
        """Token handlers for the "in cell" phase.

        See http://www.whatwg.org/specs/web-apps/current-work/#in-cell
        """
        __slots__ = ()

        # helper
        def closeCell(self):
            """Imply the end tag of whichever cell (td first, then th) is in table scope."""
            for cellName in ("td", "th"):
                if self.tree.elementInScope(cellName, variant="table"):
                    self.endTagTableCell(impliedTagToken(cellName))
                    break

        # the rest
        def processEOF(self):
            """Hand EOF to the "inBody" phase; its result is not propagated."""
            bodyPhase = self.parser.phases["inBody"]
            bodyPhase.processEOF()

        def processCharacters(self, token):
            """Delegate character tokens to the "inBody" phase."""
            bodyPhase = self.parser.phases["inBody"]
            return bodyPhase.processCharacters(token)

        def startTagTableOther(self, token):
            """Close the open cell, then hand the table-structure start tag back."""
            tree = self.tree
            cellOpen = (tree.elementInScope("td", variant="table") or
                        tree.elementInScope("th", variant="table"))
            if not cellOpen:
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()
                return None
            self.closeCell()
            return token

        def startTagOther(self, token):
            """Delegate any other start tag to the "inBody" phase."""
            bodyPhase = self.parser.phases["inBody"]
            return bodyPhase.processStartTag(token)

        def endTagTableCell(self, token):
            """Close the named cell and switch the parser to "inRow"."""
            cellName = token["name"]
            if not self.tree.elementInScope(cellName, variant="table"):
                self.parser.parseError("unexpected-end-tag", {"name": cellName})
                return

            self.tree.generateImpliedEndTags(cellName)
            stack = self.tree.openElements
            if stack[-1].name != cellName:
                self.parser.parseError("unexpected-cell-end-tag",
                                       {"name": cellName})
            # Pop up to and including the cell; when the cell is already the
            # current node this removes exactly one element.
            while stack.pop().name != cellName:
                pass
            self.tree.clearActiveFormattingElements()
            self.parser.phase = self.parser.phases["inRow"]

        def endTagIgnore(self, token):
            """Report the end tag as unexpected and otherwise ignore it."""
            tagName = token["name"]
            self.parser.parseError("unexpected-end-tag", {"name": tagName})

        def endTagImply(self, token):
            """Close the open cell and hand the end tag back when its element is in table scope."""
            if not self.tree.elementInScope(token["name"], variant="table"):
                # sometimes innerHTML case
                self.parser.parseError()
                return None
            self.closeCell()
            return token

        def endTagOther(self, token):
            """Delegate any other end tag to the "inBody" phase."""
            bodyPhase = self.parser.phases["inBody"]
            return bodyPhase.processEndTag(token)

        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
              "thead", "tr"),
             startTagTableOther),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            (("td", "th"), endTagTableCell),
            (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
            (("table", "tbody", "tfoot", "thead", "tr"), endTagImply),
        ])
        endTagHandler.default = endTagOther
    2286 
    class InSelectPhase(Phase):
        """Token handlers for the "in select" phase.

        See http://www.whatwg.org/specs/web-apps/current-work/#in-select
        """
        __slots__ = ()

        def processEOF(self):
            """Report EOF inside a select unless only "html" is current."""
            if self.tree.openElements[-1].name == "html":
                assert self.parser.innerHTML
            else:
                self.parser.parseError("eof-in-select")

        def processCharacters(self, token):
            """Insert character data; a token that is exactly U+0000 is dropped."""
            text = token["data"]
            if text != "\u0000":
                self.tree.insertText(text)

        def startTagOption(self, token):
            """Insert an option, first closing an option that is the current node."""
            stack = self.tree.openElements
            # We need to imply </option> if <option> is the current node.
            if stack[-1].name == "option":
                stack.pop()
            self.tree.insertElement(token)

        def startTagOptgroup(self, token):
            """Insert an optgroup, first closing a current option and then a current optgroup."""
            stack = self.tree.openElements
            for impliedName in ("option", "optgroup"):
                if stack[-1].name == impliedName:
                    stack.pop()
            self.tree.insertElement(token)

        def startTagSelect(self, token):
            """Treat a nested <select> as a parse error followed by </select>."""
            self.parser.parseError("unexpected-select-in-select")
            self.endTagSelect(impliedTagToken("select"))

        def startTagInput(self, token):
            """Report the tag, close the select if in select scope, then hand the token back."""
            self.parser.parseError("unexpected-input-in-select")
            if not self.tree.elementInScope("select", variant="select"):
                assert self.parser.innerHTML
                return None
            self.endTagSelect(impliedTagToken("select"))
            return token

        def startTagScript(self, token):
            """Delegate <script> to the "inHead" phase."""
            headPhase = self.parser.phases["inHead"]
            return headPhase.processStartTag(token)

        def startTagOther(self, token):
            """Report any other start tag as unexpected and ignore it."""
            tagName = token["name"]
            self.parser.parseError("unexpected-start-tag-in-select",
                                   {"name": tagName})

        def endTagOption(self, token):
            """Pop a current option; otherwise report a parse error."""
            stack = self.tree.openElements
            if stack[-1].name != "option":
                self.parser.parseError("unexpected-end-tag-in-select",
                                       {"name": "option"})
                return
            stack.pop()

        def endTagOptgroup(self, token):
            """Pop a current optgroup (and an option directly inside it); otherwise report a parse error."""
            stack = self.tree.openElements
            # </optgroup> implicitly closes <option>
            if stack[-1].name == "option" and stack[-2].name == "optgroup":
                stack.pop()
            # It also closes </optgroup>
            if stack[-1].name == "optgroup":
                stack.pop()
                return
            # But nothing else
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})

        def endTagSelect(self, token):
            """Pop through the select element and reset the insertion mode."""
            if not self.tree.elementInScope("select", variant="select"):
                # innerHTML case
                assert self.parser.innerHTML
                self.parser.parseError()
                return
            stack = self.tree.openElements
            # Pop up to and including the select element.
            while stack.pop().name != "select":
                pass
            self.parser.resetInsertionMode()

        def endTagOther(self, token):
            """Report any other end tag as unexpected and ignore it."""
            tagName = token["name"]
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": tagName})

        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            ("option", startTagOption),
            ("optgroup", startTagOptgroup),
            ("select", startTagSelect),
            (("input", "keygen", "textarea"), startTagInput),
            ("script", startTagScript),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            ("option", endTagOption),
            ("optgroup", endTagOptgroup),
            ("select", endTagSelect),
        ])
        endTagHandler.default = endTagOther
    2385 
    class InSelectInTablePhase(Phase):
        """Token handlers for a select inside a table.

        Table-structure tags close the select; everything else is delegated
        to the "inSelect" phase.
        """
        __slots__ = ()

        def processEOF(self):
            """Hand EOF to the "inSelect" phase; its result is not propagated."""
            selectPhase = self.parser.phases["inSelect"]
            selectPhase.processEOF()

        def processCharacters(self, token):
            """Delegate character tokens to the "inSelect" phase."""
            selectPhase = self.parser.phases["inSelect"]
            return selectPhase.processCharacters(token)

        def startTagTable(self, token):
            """Report the tag, imply </select>, then hand the start tag back."""
            tagName = token["name"]
            self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": tagName})
            self.endTagOther(impliedTagToken("select"))
            return token

        def startTagOther(self, token):
            """Delegate any other start tag to the "inSelect" phase."""
            selectPhase = self.parser.phases["inSelect"]
            return selectPhase.processStartTag(token)

        def endTagTable(self, token):
            """Report the tag; when its element is in table scope, imply </select> and hand it back."""
            tagName = token["name"]
            self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": tagName})
            if not self.tree.elementInScope(tagName, variant="table"):
                return None
            self.endTagOther(impliedTagToken("select"))
            return token

        def endTagOther(self, token):
            """Delegate any other end tag to the "inSelect" phase."""
            selectPhase = self.parser.phases["inSelect"]
            return selectPhase.processEndTag(token)

        startTagHandler = _utils.MethodDispatcher([
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
             startTagTable),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
             endTagTable),
        ])
        endTagHandler.default = endTagOther
    2423 
    class InForeignContentPhase(Phase):
        """Token handlers used while the current node is MathML or SVG content."""
        __slots__ = tuple()

        # Start tags that always break out of foreign content (see
        # processStartTag).  "font" is checked separately there because it
        # only breaks out when it has a color, face or size attribute.
        breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                      "center", "code", "dd", "div", "dl", "dt",
                                      "em", "embed", "h1", "h2", "h3",
                                      "h4", "h5", "h6", "head", "hr", "i", "img",
                                      "li", "listing", "menu", "meta", "nobr",
                                      "ol", "p", "pre", "ruby", "s", "small",
                                      "span", "strong", "strike", "sub", "sup",
                                      "table", "tt", "u", "ul", "var"])

        # Lowercase SVG tag name -> mixed-case spelling.  Built once when the
        # class is created rather than on every adjustSVGTagNames call.
        svgTagNameReplacements = {"altglyph": "altGlyph",
                                  "altglyphdef": "altGlyphDef",
                                  "altglyphitem": "altGlyphItem",
                                  "animatecolor": "animateColor",
                                  "animatemotion": "animateMotion",
                                  "animatetransform": "animateTransform",
                                  "clippath": "clipPath",
                                  "feblend": "feBlend",
                                  "fecolormatrix": "feColorMatrix",
                                  "fecomponenttransfer": "feComponentTransfer",
                                  "fecomposite": "feComposite",
                                  "feconvolvematrix": "feConvolveMatrix",
                                  "fediffuselighting": "feDiffuseLighting",
                                  "fedisplacementmap": "feDisplacementMap",
                                  "fedistantlight": "feDistantLight",
                                  "feflood": "feFlood",
                                  "fefunca": "feFuncA",
                                  "fefuncb": "feFuncB",
                                  "fefuncg": "feFuncG",
                                  "fefuncr": "feFuncR",
                                  "fegaussianblur": "feGaussianBlur",
                                  "feimage": "feImage",
                                  "femerge": "feMerge",
                                  "femergenode": "feMergeNode",
                                  "femorphology": "feMorphology",
                                  "feoffset": "feOffset",
                                  "fepointlight": "fePointLight",
                                  "fespecularlighting": "feSpecularLighting",
                                  "fespotlight": "feSpotLight",
                                  "fetile": "feTile",
                                  "feturbulence": "feTurbulence",
                                  "foreignobject": "foreignObject",
                                  "glyphref": "glyphRef",
                                  "lineargradient": "linearGradient",
                                  "radialgradient": "radialGradient",
                                  "textpath": "textPath"}

        def adjustSVGTagNames(self, token):
            """Restore the mixed-case spelling of a lowercase SVG tag name.

            ``token["name"]`` is rewritten in place when it has an entry in
            ``svgTagNameReplacements``; any other name is left untouched.
            """
            name = token["name"]
            if name in self.svgTagNameReplacements:
                token["name"] = self.svgTagNameReplacements[name]

        def processCharacters(self, token):
            """Process character data in foreign content.

            A token whose data is exactly U+0000 is rewritten to U+FFFD.
            Otherwise, data containing any non-space character clears
            ``parser.framesetOK``.  The token is then passed on to
            ``Phase.processCharacters``.
            """
            if token["data"] == "\u0000":
                token["data"] = "\uFFFD"
            elif (self.parser.framesetOK and
                  any(char not in spaceCharacters for char in token["data"])):
                self.parser.framesetOK = False
            Phase.processCharacters(self, token)

        def processStartTag(self, token):
            """Insert a foreign element, or break out of foreign content.

            Breakout tags (``breakoutElements``, or "font" with a color, face
            or size attribute) report a parse error, pop foreign elements off
            the stack and return the token.  Any other tag is inserted in the
            current node's namespace after attribute (and, for SVG, tag name)
            adjustment, and nothing is returned.
            """
            currentNode = self.tree.openElements[-1]
            if (token["name"] in self.breakoutElements or
                (token["name"] == "font" and
                 set(token["data"].keys()) & {"color", "face", "size"})):
                self.parser.parseError("unexpected-html-element-in-foreign-content",
                                       {"name": token["name"]})
                # Pop until the current node is in the default namespace or is
                # an HTML / MathML text integration point.
                while (self.tree.openElements[-1].namespace !=
                       self.tree.defaultNamespace and
                       not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                       not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                    self.tree.openElements.pop()
                return token

            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                # A self-closing foreign element is popped again immediately.
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

        def processEndTag(self, token):
            """Close the matching foreign element, or defer to the current phase.

            The open-element stack is walked from the current node downwards.
            If a node whose lowercased name equals the token name is found,
            everything up to and including it is popped and None is returned.
            If a node in the default namespace is reached first, the token is
            handed to ``parser.phase.processEndTag`` and its result returned.
            """
            nodeIndex = len(self.tree.openElements) - 1
            node = self.tree.openElements[-1]
            if node.name.translate(asciiUpper2Lower) != token["name"]:
                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

            while True:
                if node.name.translate(asciiUpper2Lower) == token["name"]:
                    # XXX this isn't in the spec but it seems necessary
                    if self.parser.phase == self.parser.phases["inTableText"]:
                        self.parser.phase.flushCharacters()
                        self.parser.phase = self.parser.phase.originalPhase
                    while self.tree.openElements.pop() != node:
                        assert self.tree.openElements
                    return None

                # Step one element down the stack.
                nodeIndex -= 1
                node = self.tree.openElements[nodeIndex]
                if node.namespace == self.tree.defaultNamespace:
                    return self.parser.phase.processEndTag(token)
    2537 
    class AfterBodyPhase(Phase):
        """Token handlers registered under the "afterBody" phase name."""
        __slots__ = ()

        def processEOF(self):
            """Do nothing at end of file."""
            # Stop parsing

        def processComment(self, token):
            # The comment has to be appended to the <html> element (the first
            # entry of the open-element stack), not to whatever is currently
            # open.
            htmlElement = self.tree.openElements[0]
            self.tree.insertComment(token, htmlElement)

        def processCharacters(self, token):
            """Report a parse error, switch to "inBody" and return the token."""
            parser = self.parser
            parser.parseError("unexpected-char-after-body")
            parser.phase = parser.phases["inBody"]
            return token

        def startTagHtml(self, token):
            """Delegate an <html> start tag to the "inBody" phase."""
            inBody = self.parser.phases["inBody"]
            return inBody.processStartTag(token)

        def startTagOther(self, token):
            """Report a parse error, switch to "inBody" and return the token."""
            parser = self.parser
            parser.parseError("unexpected-start-tag-after-body",
                              {"name": token["name"]})
            parser.phase = parser.phases["inBody"]
            return token

        def endTagHtml(self, name):
            """Move to "afterAfterBody", or report an error for innerHTML.

            NOTE(review): the parameter is called ``name`` but is presumably
            passed the token like the sibling handlers -- confirm against the
            dispatcher.  It is not used here.
            """
            parser = self.parser
            if not parser.innerHTML:
                parser.phase = parser.phases["afterAfterBody"]
                return
            parser.parseError("unexpected-end-tag-after-body-innerhtml")

        def endTagOther(self, token):
            """Report a parse error, switch to "inBody" and return the token."""
            parser = self.parser
            parser.parseError("unexpected-end-tag-after-body",
                              {"name": token["name"]})
            parser.phase = parser.phases["inBody"]
            return token

        startTagHandler = _utils.MethodDispatcher([("html", startTagHtml)])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([
            ("html", endTagHtml)
        ])
        endTagHandler.default = endTagOther
    2583 
    class InFramesetPhase(Phase):
        """Token handlers registered under the "inFrameset" phase name."""
        # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
        __slots__ = ()

        def processEOF(self):
            """Report EOF inside a frameset unless only <html> is open."""
            if self.tree.openElements[-1].name == "html":
                # Only expected when parsing a fragment (innerHTML).
                assert self.parser.innerHTML
            else:
                self.parser.parseError("eof-in-frameset")

        def processCharacters(self, token):
            """Report the characters as a parse error; nothing is inserted."""
            self.parser.parseError("unexpected-char-in-frameset")

        def startTagFrameset(self, token):
            """Insert the <frameset> element and leave it open."""
            self.tree.insertElement(token)

        def startTagFrame(self, token):
            """Insert the <frame> element and pop it straight away."""
            tree = self.tree
            tree.insertElement(token)
            tree.openElements.pop()

        def startTagNoframes(self, token):
            """Delegate a <noframes> start tag to the "inBody" phase."""
            inBody = self.parser.phases["inBody"]
            return inBody.processStartTag(token)

        def startTagOther(self, token):
            """Report any other start tag as a parse error."""
            self.parser.parseError("unexpected-start-tag-in-frameset",
                                   {"name": token["name"]})

        def endTagFrameset(self, token):
            """Close the current <frameset>, switching phase when none is left."""
            tree = self.tree
            if tree.openElements[-1].name == "html":
                # innerHTML case
                self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
            else:
                tree.openElements.pop()

            if self.parser.innerHTML:
                return
            if tree.openElements[-1].name != "frameset":
                # Not in innerHTML mode and the current node is no longer a
                # "frameset" element, so switch.
                self.parser.phase = self.parser.phases["afterFrameset"]

        def endTagOther(self, token):
            """Report any other end tag as a parse error."""
            self.parser.parseError("unexpected-end-tag-in-frameset",
                                   {"name": token["name"]})

        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            ("frameset", startTagFrameset),
            ("frame", startTagFrame),
            ("noframes", startTagNoframes),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([("frameset", endTagFrameset)])
        endTagHandler.default = endTagOther
    2639 
    class AfterFramesetPhase(Phase):
        """Token handlers registered under the "afterFrameset" phase name."""
        # http://www.whatwg.org/specs/web-apps/current-work/#after3
        __slots__ = ()

        def processEOF(self):
            """Do nothing at end of file."""
            # Stop parsing

        def processCharacters(self, token):
            """Report the characters as a parse error; nothing is inserted."""
            self.parser.parseError("unexpected-char-after-frameset")

        def startTagNoframes(self, token):
            """Delegate a <noframes> start tag to the "inHead" phase."""
            inHead = self.parser.phases["inHead"]
            return inHead.processStartTag(token)

        def startTagOther(self, token):
            """Report any other start tag as a parse error."""
            self.parser.parseError("unexpected-start-tag-after-frameset",
                                   {"name": token["name"]})

        def endTagHtml(self, token):
            """Switch the parser to the "afterAfterFrameset" phase."""
            parser = self.parser
            parser.phase = parser.phases["afterAfterFrameset"]

        def endTagOther(self, token):
            """Report any other end tag as a parse error."""
            self.parser.parseError("unexpected-end-tag-after-frameset",
                                   {"name": token["name"]})

        startTagHandler = _utils.MethodDispatcher([
            ("html", Phase.startTagHtml),
            ("noframes", startTagNoframes),
        ])
        startTagHandler.default = startTagOther

        endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
        endTagHandler.default = endTagOther
    2675 
    class AfterAfterBodyPhase(Phase):
        """Token handlers registered under the "afterAfterBody" phase name."""
        __slots__ = ()

        def processEOF(self):
            """Do nothing at end of file."""

        def processComment(self, token):
            """Insert the comment as a child of the document node."""
            tree = self.tree
            tree.insertComment(token, tree.document)

        def processSpaceCharacters(self, token):
            """Delegate whitespace to the "inBody" phase."""
            inBody = self.parser.phases["inBody"]
            return inBody.processSpaceCharacters(token)

        def processCharacters(self, token):
            """Report a parse error, switch to "inBody" and return the token."""
            parser = self.parser
            parser.parseError("expected-eof-but-got-char")
            parser.phase = parser.phases["inBody"]
            return token

        def startTagHtml(self, token):
            """Delegate an <html> start tag to the "inBody" phase."""
            inBody = self.parser.phases["inBody"]
            return inBody.processStartTag(token)

        def startTagOther(self, token):
            """Report a parse error, switch to "inBody" and return the token."""
            parser = self.parser
            parser.parseError("expected-eof-but-got-start-tag",
                              {"name": token["name"]})
            parser.phase = parser.phases["inBody"]
            return token

        def processEndTag(self, token):
            """Report a parse error, switch to "inBody" and return the token."""
            parser = self.parser
            parser.parseError("expected-eof-but-got-end-tag",
                              {"name": token["name"]})
            parser.phase = parser.phases["inBody"]
            return token

        startTagHandler = _utils.MethodDispatcher([("html", startTagHtml)])
        startTagHandler.default = startTagOther
    2712 
    class AfterAfterFramesetPhase(Phase):
        """Token handlers registered under the "afterAfterFrameset" phase name."""
        __slots__ = ()

        def processEOF(self):
            """Do nothing at end of file."""

        def processComment(self, token):
            """Insert the comment as a child of the document node."""
            tree = self.tree
            tree.insertComment(token, tree.document)

        def processSpaceCharacters(self, token):
            """Delegate whitespace to the "inBody" phase."""
            inBody = self.parser.phases["inBody"]
            return inBody.processSpaceCharacters(token)

        def processCharacters(self, token):
            """Report the characters as a parse error; nothing is inserted."""
            self.parser.parseError("expected-eof-but-got-char")

        def startTagHtml(self, token):
            """Delegate an <html> start tag to the "inBody" phase."""
            inBody = self.parser.phases["inBody"]
            return inBody.processStartTag(token)

        def startTagNoFrames(self, token):
            """Delegate a <noframes> start tag to the "inHead" phase."""
            inHead = self.parser.phases["inHead"]
            return inHead.processStartTag(token)

        def startTagOther(self, token):
            """Report any other start tag as a parse error."""
            self.parser.parseError("expected-eof-but-got-start-tag",
                                   {"name": token["name"]})

        def processEndTag(self, token):
            """Report every end tag as a parse error."""
            self.parser.parseError("expected-eof-but-got-end-tag",
                                   {"name": token["name"]})

        startTagHandler = _utils.MethodDispatcher([
            ("html", startTagHtml),
            ("noframes", startTagNoFrames),
        ])
        startTagHandler.default = startTagOther
    2747 
    2748    # pylint:enable=unused-argument 
    2749 
    2750    return { 
    2751        "initial": InitialPhase, 
    2752        "beforeHtml": BeforeHtmlPhase, 
    2753        "beforeHead": BeforeHeadPhase, 
    2754        "inHead": InHeadPhase, 
    2755        "inHeadNoscript": InHeadNoscriptPhase, 
    2756        "afterHead": AfterHeadPhase, 
    2757        "inBody": InBodyPhase, 
    2758        "text": TextPhase, 
    2759        "inTable": InTablePhase, 
    2760        "inTableText": InTableTextPhase, 
    2761        "inCaption": InCaptionPhase, 
    2762        "inColumnGroup": InColumnGroupPhase, 
    2763        "inTableBody": InTableBodyPhase, 
    2764        "inRow": InRowPhase, 
    2765        "inCell": InCellPhase, 
    2766        "inSelect": InSelectPhase, 
    2767        "inSelectInTable": InSelectInTablePhase, 
    2768        "inForeignContent": InForeignContentPhase, 
    2769        "afterBody": AfterBodyPhase, 
    2770        "inFrameset": InFramesetPhase, 
    2771        "afterFrameset": AfterFramesetPhase, 
    2772        "afterAfterBody": AfterAfterBodyPhase, 
    2773        "afterAfterFrameset": AfterAfterFramesetPhase, 
    2774        # XXX after after frameset 
    2775    } 
    2776 
    2777 
    2778def adjust_attributes(token, replacements): 
    2779    needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) 
    2780    if needs_adjustment: 
    2781        token['data'] = type(token['data'])((replacements.get(k, k), v) 
    2782                                            for k, v in token['data'].items()) 
    2783 
    2784 
    2785def impliedTagToken(name, type="EndTag", attributes=None, 
    2786                    selfClosing=False): 
    2787    if attributes is None: 
    2788        attributes = {} 
    2789    return {"type": tokenTypes[type], "name": name, "data": attributes, 
    2790            "selfClosing": selfClosing} 
    2791 
    2792 
    2793class ParseError(Exception): 
    2794    """Error in parsed document""" 
    2795    pass