Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/html5lib/html5parser.py: 94%

1from __future__ import absolute_import, division, unicode_literals

2from six import with_metaclass, viewkeys

4import types

6from . import _inputstream

7from . import _tokenizer

9from . import treebuilders

10from .treebuilders.base import Marker

12from . import _utils

13from .constants import (

14 spaceCharacters, asciiUpper2Lower,

15 specialElements, headingElements, cdataElements, rcdataElements,

16 tokenTypes, tagTokenTypes,

17 namespaces,

18 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,

19 adjustForeignAttributes as adjustForeignAttributesMap,

20 adjustMathMLAttributes, adjustSVGAttributes,

21 E,

22 _ReparseException

23)

def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse a complete HTML document into a tree

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: name of the treebuilder to use when parsing; it is
        resolved through ``treebuilders.getTreeBuilder``

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any additional keyword arguments are forwarded unchanged to
    ``HTMLParser.parse``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body>This is a doc</body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builderClass = treebuilders.getTreeBuilder(treebuilder)
    parser = HTMLParser(builderClass,
                        namespaceHTMLElements=namespaceHTMLElements)
    return parser.parse(doc, **kwargs)

def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Any additional keyword arguments are forwarded unchanged to
    ``HTMLParser.parseFragment``.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('this is a fragment')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, **kwargs)

def method_decorator_metaclass(function):
    """Build a metaclass that decorates every plain function of a class

    :arg function: decorator applied to each entry of the class namespace
        that is a ``types.FunctionType``; other entries are left as they are

    :returns: a ``type`` subclass suitable for use as a metaclass

    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Only existing keys are rebound, so the namespace never changes
            # size while it is being iterated.
            for attributeName, attribute in classDict.items():
                classDict[attributeName] = (
                    function(attribute)
                    if isinstance(attribute, types.FunctionType)
                    else attribute)
            return type.__new__(meta, classname, bases, classDict)
    return Decorated

class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned. Built in treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType). A string is
            resolved through that same function, and ``None`` selects the
            "etree" builder.

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()                     # generates parser with etree builder
        >>> parser = HTMLParser('lxml', strict=True)  # generates parser with lxml builder which is strict

        """

        # Raise an exception on the first error encountered
        self.strict = strict

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        elif isinstance(tree, str):
            tree = treebuilders.getTreeBuilder(tree)

        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        # One phase object per insertion mode, keyed by phase name.
        self.phases = {name: cls(self, self.tree) for name, cls in
                       getPhases(debug).items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Tokenize ``stream`` and run the tree construction loop over it

        :arg stream: input handed to ``_tokenizer.HTMLTokenizer``

        :arg innerHTML: whether to parse in fragment (innerHTML) mode

        :arg container: name of the context element for fragment mode

        :arg scripting: stored on the parser as ``self.scripting``

        Remaining keyword arguments are passed to the tokenizer. If the main
        loop raises ``_ReparseException`` the parser is reset and the main
        loop is run a second time.

        """

        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            self.reset()
            self.mainLoop()

    def reset(self):
        """Return the tree and all per-parse state to their starting values"""
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            self.innerHTML = self.container.lower()

            # NOTE(review): membership in ``cdataElements`` selects the RCDATA
            # tokenizer state and ``rcdataElements`` the RAWTEXT state;
            # presumably the constant names are historical - confirm against
            # the constants module before "fixing" this.
            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0].name

    def isHTMLIntegrationPoint(self, element):
        """Tell whether ``element`` is an HTML integration point

        A MathML ``annotation-xml`` element qualifies only when its
        ``encoding`` attribute, ASCII-lowercased, is ``text/html`` or
        ``application/xhtml+xml``; every other element is looked up by
        ``(namespace, name)`` in ``htmlIntegrationPointElements``.

        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Tell whether ``element`` is a MathML text integration point"""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Feed every token from the tokenizer to the appropriate phase

        A phase handler may return a token, in which case that token is
        processed again (possibly by a different phase) until a handler
        returns ``None``. Once the tokenizer is exhausted, ``processEOF`` is
        called on the current phase for as long as it asks for reprocessing.

        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        # Built once here instead of once per processed token.
        mathmlTextExemptTags = frozenset(["mglyph", "malignmark"])

        for token in self.tokenizer:
            prev_token = None
            new_token = token
            while new_token is not None:
                prev_token = new_token
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                tokenType = new_token["type"]

                if tokenType == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # The tag name is read from ``new_token`` - the token
                    # actually being dispatched and the one ``tokenType``
                    # came from - rather than from the outer loop variable.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((tokenType == StartTagToken and
                           new_token["name"] not in mathmlTextExemptTags) or
                          tokenType in (CharactersToken, SpaceCharactersToken))) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         tokenType == StartTagToken and
                         new_token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         tokenType in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if tokenType == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif tokenType == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif tokenType == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif tokenType == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif tokenType == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif tokenType == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

            # prev_token is the last token that was dispatched for this
            # tokenizer token.
            if (tokenType == StartTagToken and prev_token["selfClosing"] and
                    not prev_token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # Guard against bouncing between phases forever.
                assert self.phase not in phases

    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body>This is a doc</body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment

        :arg container: name of the element we're setting the innerHTML
            property if set to None, default to 'div'

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('this is a fragment')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error and, in strict mode, raise it

        :arg errorcode: key identifying the error; used to look up the
            message in ``E`` when raising

        :arg datavars: mapping substituted into the message; ``None`` is
            treated as an empty dict

        The error is appended to ``self.errors`` as a
        ``(position, errorcode, datavars)`` tuple.

        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def adjustMathMLAttributes(self, token):
        """Apply the MathML attribute adjustment table to ``token``"""
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        """Apply the SVG attribute adjustment table to ``token``"""
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        """Apply the foreign attribute adjustment table to ``token``"""
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # pylint:disable=unused-argument
        # NOTE(review): nothing in this class assigns ``self.parser``, so this
        # call looks like it would raise AttributeError; presumably the method
        # is unused. Left unchanged because the intended behaviour cannot be
        # determined from here - confirm and remove or repair.
        self.parser.phase()

    def resetInsertionMode(self):
        """Choose ``self.phase`` from the stack of open elements

        The stack is walked from the most recently opened element downwards.
        The bottom-most element is treated as the fragment context
        (``self.innerHTML``); elements outside the default namespace are
        skipped unless they are that bottom-most element.

        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Insert ``token``'s element and switch to the "text" phase

        :arg token: start tag token of the element being inserted

        :arg contentType: ``"RAWTEXT"`` or ``"RCDATA"``; selects the
            tokenizer state used for the element's content

        The phase that was current is remembered in ``self.originalPhase``.

        """
        # Generic RCDATA/RAWTEXT Parsing algorithm
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        self.originalPhase = self.phase

        self.phase = self.phases["text"]

397

398

399@_utils.memoize

400def getPhases(debug):

def log(function):
    """Wrap a phase method so that token processing is recorded

    Calls to functions whose name starts with ``process`` and that receive
    at least one positional argument append a tuple to ``self.parser.log``
    holding the tokenizer state name, the parser's current phase class name,
    the handling class name, the function name and a small dict describing
    the token. Every call is then forwarded to ``function`` unchanged.

    """
    type_names = dict((number, label) for label, number in tokenTypes.items())

    def wrapped(self, *args, **kwargs):
        if function.__name__.startswith("process") and args:
            token = args[0]
            tokenType = token['type']
            info = {"type": type_names[tokenType]}
            if tokenType in tagTokenTypes:
                info["name"] = token['name']

            parser = self.parser
            entry = (parser.tokenizer.state.__name__,
                     parser.phase.__class__.__name__,
                     self.__class__.__name__,
                     function.__name__,
                     info)
            parser.log.append(entry)
        return function(self, *args, **kwargs)
    return wrapped

421

def getMetaclass(use_metaclass, metaclass_func):
    """Pick the metaclass for the phase classes

    :arg use_metaclass: when true, build a decorating metaclass

    :arg metaclass_func: decorator handed to ``method_decorator_metaclass``

    :returns: the decorating metaclass, or plain ``type`` when
        ``use_metaclass`` is false

    """
    if not use_metaclass:
        return type
    return method_decorator_metaclass(metaclass_func)

427

428 # pylint:disable=unused-argument

class Phase(with_metaclass(getMetaclass(debug, log))):
    """Base class for helper object that implements each phase of processing

    Start and end tags are dispatched by tag name through the
    ``startTagHandler`` / ``endTagHandler`` attributes, which this base class
    reads but does not define (presumably each concrete phase supplies them).
    """
    __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        # Per-instance caches mapping tag name -> resolved handler.
        self.__startTagCache = {}
        self.__endTagCache = {}

    def processEOF(self):
        raise NotImplementedError

    def processComment(self, token):
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        self.tree.insertComment(token, self.tree.openElements[-1])

    def processDoctype(self, token):
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        """Look up the start tag handler for ``token["name"]`` and call it"""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tagName = token["name"]
        cache = self.__startTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tagName in cache:
            handler = cache[tagName]
        else:
            handler = cache[tagName] = self.startTagHandler[tagName]
            # bound the cache size in case we get loads of unknown tags
            while len(cache) > len(self.startTagHandler) * 1.1:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        return handler(token)

    def startTagHtml(self, token):
        """Merge the attributes of a stray ``<html>`` tag into the root"""
        if not self.parser.firstStartTag and token["name"] == "html":
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        for attr, value in token["data"].items():
            # Attributes already present on the root element win.
            if attr not in self.tree.openElements[0].attributes:
                self.tree.openElements[0].attributes[attr] = value
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        """Look up the end tag handler for ``token["name"]`` and call it"""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tagName = token["name"]
        cache = self.__endTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if tagName in cache:
            handler = cache[tagName]
        else:
            handler = cache[tagName] = self.endTagHandler[tagName]
            # bound the cache size in case we get loads of unknown tags
            while len(cache) > len(self.endTagHandler) * 1.1:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        return handler(token)

500

class InitialPhase(Phase):
    """Phase in effect before any doctype or content has been seen"""
    __slots__ = ()

    def processSpaceCharacters(self, token):
        pass

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        """Insert the doctype and derive the parser's compat mode from it"""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        if (name != "html" or publicId is not None or
                systemId is not None and systemId != "about:legacy-compat"):
            self.parser.parseError("unknown-doctype")

        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        # Public identifier prefixes that force quirks mode.
        quirksPrefixes = (
            "+//silmaril//dtd html pro v0r11 19970101//",
            "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
            "-//as//dtd html 3.0 aswedit + extensions//",
            "-//ietf//dtd html 2.0 level 1//",
            "-//ietf//dtd html 2.0 level 2//",
            "-//ietf//dtd html 2.0 strict level 1//",
            "-//ietf//dtd html 2.0 strict level 2//",
            "-//ietf//dtd html 2.0 strict//",
            "-//ietf//dtd html 2.0//",
            "-//ietf//dtd html 2.1e//",
            "-//ietf//dtd html 3.0//",
            "-//ietf//dtd html 3.2 final//",
            "-//ietf//dtd html 3.2//",
            "-//ietf//dtd html 3//",
            "-//ietf//dtd html level 0//",
            "-//ietf//dtd html level 1//",
            "-//ietf//dtd html level 2//",
            "-//ietf//dtd html level 3//",
            "-//ietf//dtd html strict level 0//",
            "-//ietf//dtd html strict level 1//",
            "-//ietf//dtd html strict level 2//",
            "-//ietf//dtd html strict level 3//",
            "-//ietf//dtd html strict//",
            "-//ietf//dtd html//",
            "-//metrius//dtd metrius presentational//",
            "-//microsoft//dtd internet explorer 2.0 html strict//",
            "-//microsoft//dtd internet explorer 2.0 html//",
            "-//microsoft//dtd internet explorer 2.0 tables//",
            "-//microsoft//dtd internet explorer 3.0 html strict//",
            "-//microsoft//dtd internet explorer 3.0 html//",
            "-//microsoft//dtd internet explorer 3.0 tables//",
            "-//netscape comm. corp.//dtd html//",
            "-//netscape comm. corp.//dtd strict html//",
            "-//o'reilly and associates//dtd html 2.0//",
            "-//o'reilly and associates//dtd html extended 1.0//",
            "-//o'reilly and associates//dtd html extended relaxed 1.0//",
            "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
            "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
            "-//spyglass//dtd html 2.0 extended//",
            "-//sq//dtd html 2.0 hotmetal + extensions//",
            "-//sun microsystems corp.//dtd hotjava html//",
            "-//sun microsystems corp.//dtd hotjava strict html//",
            "-//w3c//dtd html 3 1995-03-24//",
            "-//w3c//dtd html 3.2 draft//",
            "-//w3c//dtd html 3.2 final//",
            "-//w3c//dtd html 3.2//",
            "-//w3c//dtd html 3.2s draft//",
            "-//w3c//dtd html 4.0 frameset//",
            "-//w3c//dtd html 4.0 transitional//",
            "-//w3c//dtd html experimental 19960712//",
            "-//w3c//dtd html experimental 970421//",
            "-//w3c//dtd w3 html//",
            "-//w3o//dtd w3 html 3.0//",
            "-//webtechs//dtd mozilla html 2.0//",
            "-//webtechs//dtd mozilla html//",
        )
        # Public identifiers that force quirks mode on an exact match.
        quirksPublicIds = (
            "-//w3o//dtd w3 html strict 3.0//en//",
            "-/w3c/dtd html 4.0 transitional/en",
            "html",
        )
        # HTML 4.01 frameset/transitional: quirks without a system
        # identifier, limited quirks with one.
        html401Prefixes = (
            "-//w3c//dtd html 4.01 frameset//",
            "-//w3c//dtd html 4.01 transitional//",
        )
        limitedQuirksPrefixes = (
            "-//w3c//dtd xhtml 1.0 frameset//",
            "-//w3c//dtd xhtml 1.0 transitional//",
        )
        isHtml401 = publicId.startswith(html401Prefixes)

        if (not correct or token["name"] != "html" or
                publicId.startswith(quirksPrefixes) or
                publicId in quirksPublicIds or
                (isHtml401 and systemId is None) or
                (systemId and
                 systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):
            self.parser.compatMode = "quirks"
        elif (publicId.startswith(limitedQuirksPrefixes) or
              (isHtml401 and systemId is not None)):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        """No doctype was seen: fall back to quirks mode and move on"""
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        return token

    def processStartTag(self, token):
        self.parser.parseError("expected-doctype-but-got-start-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEndTag(self, token):
        self.parser.parseError("expected-doctype-but-got-end-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEOF(self):
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True

630

class BeforeHtmlPhase(Phase):
    """Phase in effect until the root ``html`` element has been created"""
    __slots__ = ()

    # helper methods
    def insertHtmlElement(self):
        """Insert an implied ``html`` root and switch to "beforeHead" """
        self.tree.insertRoot(impliedTagToken("html", "StartTag"))
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        # Only these end tags imply the root element; the token is then
        # handed back for reprocessing. Anything else is an error and the
        # token is dropped.
        if token["name"] in ("head", "body", "html", "br"):
            self.insertHtmlElement()
            return token
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": token["name"]})

667

class BeforeHeadPhase(Phase):
    """Phase in effect after the root element and before ``head``"""
    __slots__ = ()

    def processEOF(self):
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return True

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return token

    def startTagHtml(self, token):
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagHead(self, token):
        """Insert ``head``, remember it as the head pointer, go to "inHead" """
        self.tree.insertElement(token)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        # Imply a head element, then have the token reprocessed.
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return token

    def endTagImplyHead(self, token):
        # Imply a head element, then have the token reprocessed.
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return token

    def endTagOther(self, token):
        self.parser.parseError("end-tag-after-implied-root",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("head", "body", "html", "br"), endTagImplyHead)
    ])
    endTagHandler.default = endTagOther

712

class InHeadPhase(Phase):
    """Phase in effect while the ``head`` element is open"""
    __slots__ = ()

    # the real thing
    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagHead(self, token):
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        # Insert the element and immediately take it off the stack again.
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        """Insert ``meta`` and, while the encoding is tentative, apply it"""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        attributes = token["data"]
        if self.parser.tokenizer.stream.charEncoding[1] != "tentative":
            return

        if "charset" in attributes:
            self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
        elif ("content" in attributes and
              "http-equiv" in attributes and
              attributes["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            content = attributes["content"].encode("utf-8")
            data = _inputstream.EncodingBytes(content)
            codec = _inputstream.ContentAttrParser(data).parse()
            self.parser.tokenizer.stream.changeEncoding(codec)

    def startTagTitle(self, token):
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        if not self.parser.scripting:
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inHeadNoscript"]
        else:
            self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagScript(self, token):
        """Insert ``script`` and switch tokenizer and parser to text mode"""
        self.tree.insertElement(token)
        tokenizer = self.parser.tokenizer
        tokenizer.state = tokenizer.scriptDataState
        self.parser.originalPhase = self.parser.phase
        self.parser.phase = self.parser.phases["text"]

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHead(self, token):
        node = self.parser.tree.openElements.pop()
        assert node.name == "head", "Expected head got %s" % node.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Close ``head`` as if its end tag had been seen"""
        self.endTagHead(impliedTagToken("head"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("title", startTagTitle),
        (("noframes", "style"), startTagNoFramesStyle),
        ("noscript", startTagNoscript),
        ("script", startTagScript),
        (("base", "basefont", "bgsound", "command", "link"),
         startTagBaseLinkCommand),
        ("meta", startTagMeta),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("head", endTagHead),
        (("br", "html", "body"), endTagHtmlBodyBr)
    ])
    endTagHandler.default = endTagOther

814

class InHeadNoscriptPhase(Phase):
    """Phase in effect inside a ``noscript`` element in ``head``"""
    __slots__ = ()

    def processEOF(self):
        self.parser.parseError("eof-in-head-noscript")
        self.anythingElse()
        return True

    def processComment(self, token):
        inHead = self.parser.phases["inHead"]
        return inHead.processComment(token)

    def processCharacters(self, token):
        self.parser.parseError("char-in-head-noscript")
        self.anythingElse()
        return token

    def processSpaceCharacters(self, token):
        inHead = self.parser.phases["inHead"]
        return inHead.processSpaceCharacters(token)

    def startTagHtml(self, token):
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        # Handled exactly as the "inHead" phase would handle it.
        inHead = self.parser.phases["inHead"]
        return inHead.processStartTag(token)

    def startTagHeadNoscript(self, token):
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagNoscript(self, token):
        node = self.parser.tree.openElements.pop()
        assert node.name == "noscript", "Expected noscript got %s" % node.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Close ``noscript`` as if its end tag had been seen"""
        # Caller must raise parse error first!
        self.endTagNoscript(impliedTagToken("noscript"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
        (("head", "noscript"), startTagHeadNoscript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("noscript", endTagNoscript),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther

877

class AfterHeadPhase(Phase):
    """Phase in effect after ``head`` is closed and before ``body``"""
    __slots__ = ()

    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        inBody = self.parser.phases["inBody"]
        return inBody.processStartTag(token)

    def startTagBody(self, token):
        self.parser.framesetOK = False
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, token):
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        """Process a head-only start tag that arrived after ``</head>``"""
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        # Temporarily reopen the head element so the "inHead" phase can
        # handle the token, then take the head off the stack again.
        openElements = self.tree.openElements
        openElements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        for node in self.tree.openElements[::-1]:
            if node.name == "head":
                self.tree.openElements.remove(node)
                break

    def startTagHead(self, token):
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Insert an implied ``body`` and switch to the "inBody" phase"""
        self.tree.insertElement(impliedTagToken("body", "StartTag"))
        self.parser.phase = self.parser.phases["inBody"]
        self.parser.framesetOK = True

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"),
         startTagFromHead),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("body", "html", "br"), endTagHtmlBodyBr)
    ])
    endTagHandler.default = endTagOther

943

944 class InBodyPhase(Phase):

945 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody

946 # the really-really-really-very crazy mode

947 __slots__ = ("processSpaceCharacters",)

948

def __init__(self, *args, **kwargs):
    """Initialise the phase and install the default whitespace handler."""
    super(InBodyPhase, self).__init__(*args, **kwargs)
    # Default handler; startTagPreListing and startTagTextarea swap in the
    # newline-dropping variant for the next whitespace token.
    defaultHandler = self.processSpaceCharactersNonPre
    self.processSpaceCharacters = defaultHandler

953

def isMatchingFormattingElement(self, node1, node2):
    """Return whether both nodes agree on name, namespace and attributes."""
    if not (node1.name == node2.name):
        return False
    if not (node1.namespace == node2.namespace):
        return False
    return node1.attributes == node2.attributes

958

959 # helper

def addFormattingElement(self, token):
    """Insert ``token`` and record it in the active formatting elements.

    The active formatting list is scanned backwards as far as the most
    recent Marker; if three matching entries are already present, the one
    collected last during that backward scan is removed before the new
    element is appended.
    """
    tree = self.tree
    tree.insertElement(token)
    newElement = tree.openElements[-1]

    duplicates = []
    for candidate in reversed(tree.activeFormattingElements):
        if candidate is Marker:
            break
        if self.isMatchingFormattingElement(candidate, newElement):
            duplicates.append(candidate)

    assert len(duplicates) <= 3
    if len(duplicates) == 3:
        tree.activeFormattingElements.remove(duplicates[-1])
    tree.activeFormattingElements.append(newElement)

975

976 # the real deal

def processEOF(self):
    """Report one parse error if an unexpected element is still open.

    Falls through without a return value.
    """
    mayStayOpen = frozenset(("dd", "dt", "li", "p", "tbody", "td",
                             "tfoot", "th", "thead", "tr", "body",
                             "html"))
    openNodes = reversed(self.tree.openElements)
    if any(node.name not in mayStayOpen for node in openNodes):
        self.parser.parseError("expected-closing-tag-but-got-eof")
    # Stop parsing

986

def processSpaceCharactersDropNewline(self, token):
    """Handle the whitespace token that follows <pre>, <listing>, <textarea>.

    One leading newline is discarded when the current node is one of those
    elements and has no content yet.  The regular whitespace handler is
    reinstated first, so only a single token gets this treatment.
    """
    text = token["data"]
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if text.startswith("\n"):
        current = self.tree.openElements[-1]
        if (current.name in ("pre", "listing", "textarea") and
                not self.tree.openElements[-1].hasContent()):
            text = text[1:]
    if not text:
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(text)

999

def processCharacters(self, token):
    """Insert character data; clear framesetOK if it holds non-space text.

    A token consisting solely of U+0000 is dropped.
    """
    text = token["data"]
    if text == "\u0000":
        # The tokenizer should always emit null on its own
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(token["data"])
    # This must be bad for performance
    if not self.parser.framesetOK:
        return
    for char in token["data"]:
        if char not in spaceCharacters:
            self.parser.framesetOK = False
            break

1011

def processSpaceCharactersNonPre(self, token):
    """Insert whitespace unchanged after reconstructing formatting elements."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertText(token["data"])

1015

def startTagProcessInHead(self, token):
    """Delegate the start tag to the "inHead" phase and return its result."""
    inHead = self.parser.phases["inHead"]
    return inHead.processStartTag(token)

1018

def startTagBody(self, token):
    """Handle a repeated <body>: copy its new attributes onto the open body.

    Always a parse error.  Attributes already present on the open body
    element (second entry of the stack) are left untouched.
    """
    self.parser.parseError("unexpected-start-tag", {"name": "body"})
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        assert self.parser.innerHTML
        return
    self.parser.framesetOK = False
    for attr, value in token["data"].items():
        if attr not in self.tree.openElements[1].attributes:
            self.tree.openElements[1].attributes[attr] = value

1029

def startTagFrameset(self, token):
    """Handle <frameset> inside the body.

    Always a parse error.  Only when framesetOK is still set is the body
    element detached from its parent, the stack popped back to <html>, the
    frameset inserted and the parser moved to "inFrameset".
    """
    self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        assert self.parser.innerHTML
        return
    if not self.parser.framesetOK:
        return
    if stack[1].parent:
        stack[1].parent.removeChild(stack[1])
    while stack[-1].name != "html":
        stack.pop()
    self.tree.insertElement(token)
    self.parser.phase = self.parser.phases["inFrameset"]

1043

def startTagCloseP(self, token):
    """Close a <p> that is in button scope, then insert the element."""
    pIsOpen = self.tree.elementInScope("p", variant="button")
    if pIsOpen:
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)

1048

def startTagPreListing(self, token):
    """Open <pre>/<listing>, closing a <p> in button scope first.

    Also clears framesetOK and arms the handler that drops a leading
    newline from the next whitespace token.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    self.parser.framesetOK = False
    newlineDropper = self.processSpaceCharactersDropNewline
    self.processSpaceCharacters = newlineDropper

1055

def startTagForm(self, token):
    """Open a <form> unless the form pointer is already set.

    A nested form is reported as a parse error and ignored.  Otherwise a
    <p> in button scope is closed, the form inserted and remembered as the
    tree's form pointer.
    """
    if self.tree.formPointer:
        self.parser.parseError("unexpected-start-tag", {"name": "form"})
        return
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    self.tree.formPointer = self.tree.openElements[-1]

1064

def startTagListItem(self, token):
    """Open <li>, <dd> or <dt>, implicitly closing an open sibling item.

    The stack is walked from the current node upwards: the first node whose
    name is in the stop list gets an implied end tag; the walk also stops
    at any special element other than address, div or p.  A <p> in button
    scope is closed before the new element is inserted.
    """
    self.parser.framesetOK = False

    closers = {"li": ["li"],
               "dt": ["dt", "dd"],
               "dd": ["dt", "dd"]}[token["name"]]
    for node in reversed(self.tree.openElements):
        if node.name in closers:
            impliedEnd = impliedTagToken(node.name, "EndTag")
            self.parser.phase.processEndTag(impliedEnd)
            break
        isSpecial = node.nameTuple in specialElements
        if isSpecial and node.name not in ("address", "div", "p"):
            break

    if self.tree.elementInScope("p", variant="button"):
        self.parser.phase.processEndTag(impliedTagToken("p", "EndTag"))

    self.tree.insertElement(token)

1086

def startTagPlaintext(self, token):
    """Insert <plaintext> and put the tokenizer into its plaintext state."""
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.plaintextState

1092

def startTagHeading(self, token):
    """Open a heading; a heading that is the current node is popped first.

    Nesting directly inside another heading is a parse error.  A <p> in
    button scope is closed beforehand.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    currentIsHeading = tree.openElements[-1].name in headingElements
    if currentIsHeading:
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
        tree.openElements.pop()
    tree.insertElement(token)

1100

def startTagA(self, token):
    """Open an <a>, first closing any <a> still in the formatting list.

    An already active <a> is a parse error: it is closed through
    endTagFormatting and, if still present afterwards, removed from both
    the stack of open elements and the active formatting elements.
    """
    tree = self.tree
    activeA = tree.elementInActiveFormattingElements("a")
    if activeA:
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "a", "endName": "a"})
        self.endTagFormatting(impliedTagToken("a"))
        for collection in (tree.openElements, tree.activeFormattingElements):
            if activeA in collection:
                collection.remove(activeA)
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)

1113

def startTagFormatting(self, token):
    """Reconstruct the active formatting elements, then add this one."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)

1117

def startTagNobr(self, token):
    """Open <nobr>, implicitly closing a <nobr> that is already in scope."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    nobrInScope = tree.elementInScope("nobr")
    if nobrInScope:
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "nobr", "endName": "nobr"})
        self.processEndTag(impliedTagToken("nobr"))
        # XXX Need tests that trigger the following
        tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)

1127

def startTagButton(self, token):
    """Open a <button>.

    When a button is already in scope this is a parse error: an implied
    </button> is processed and the token is returned.  Otherwise the
    element is inserted, framesetOK is cleared and None is returned.
    """
    if not self.tree.elementInScope("button"):
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        return None
    self.parser.parseError("unexpected-start-tag-implies-end-tag",
                           {"startName": "button", "endName": "button"})
    self.processEndTag(impliedTagToken("button"))
    return token

1138

def startTagAppletMarqueeObject(self, token):
    """Insert the element and push a Marker onto the formatting list."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.activeFormattingElements.append(Marker)
    self.parser.framesetOK = False

1144

def startTagXmp(self, token):
    """Open <xmp> and parse its content as RAWTEXT; clears framesetOK."""
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.framesetOK = False
    parser.parseRCDataRawtext(token, "RAWTEXT")

1151

def startTagTable(self, token):
    """Insert <table> and switch to the "inTable" phase.

    Outside quirks mode a <p> in button scope is closed first.
    """
    parser = self.parser
    if (parser.compatMode != "quirks" and
            self.tree.elementInScope("p", variant="button")):
        self.processEndTag(impliedTagToken("p"))
    self.tree.insertElement(token)
    parser.framesetOK = False
    parser.phase = parser.phases["inTable"]

1159

def startTagVoidFormatting(self, token):
    """Insert a void element and immediately pop it off the stack.

    Marks the token's self-closing flag as acknowledged and clears
    framesetOK.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False

1166

def startTagInput(self, token):
    """Insert <input>; a hidden input leaves framesetOK as it was."""
    previous = self.parser.framesetOK
    self.startTagVoidFormatting(token)
    attrs = token["data"]
    isHidden = ("type" in attrs and
                attrs["type"].translate(asciiUpper2Lower) == "hidden")
    if isHidden:
        # input type=hidden doesn't change framesetOK
        self.parser.framesetOK = previous

1174

def startTagParamSource(self, token):
    """Insert the element, pop it again and acknowledge self-closing."""
    tree = self.tree
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

1179

def startTagHr(self, token):
    """Insert <hr> as a void element after closing a <p> in button scope."""
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False

1187

def startTagImage(self, token):
    """Treat <image> as <img>, keeping attributes and self-closing flag."""
    # No really...
    self.parser.parseError("unexpected-start-tag-treated-as",
                           {"originalName": "image", "newName": "img"})
    replacement = impliedTagToken("img", "StartTag",
                                  attributes=token["data"],
                                  selfClosing=token["selfClosing"])
    self.processStartTag(replacement)

1195

def startTagIsIndex(self, token):
    """Expand the deprecated <isindex> into form/hr/label/input markup.

    Nothing is generated when a form pointer is already set.  The "action"
    attribute goes onto the implied form, "prompt" (or a default text)
    becomes the label text, and every remaining attribute plus
    name="isindex" is passed to the implied input.
    """
    self.parser.parseError("deprecated-tag", {"name": "isindex"})
    if self.tree.formPointer:
        return
    data = token["data"]

    formAttributes = {}
    if "action" in data:
        formAttributes["action"] = data["action"]
    self.processStartTag(impliedTagToken("form", "StartTag",
                                         attributes=formAttributes))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processStartTag(impliedTagToken("label", "StartTag"))

    # XXX Localization ...
    if "prompt" in data:
        prompt = data["prompt"]
    else:
        prompt = "This is a searchable index. Enter search keywords: "
    self.processCharacters(
        {"type": tokenTypes["Characters"], "data": prompt})

    inputAttributes = data.copy()
    for consumed in ("action", "prompt"):
        if consumed in inputAttributes:
            del inputAttributes[consumed]
    inputAttributes["name"] = "isindex"
    self.processStartTag(impliedTagToken("input", "StartTag",
                                         attributes=inputAttributes,
                                         selfClosing=token["selfClosing"]))
    self.processEndTag(impliedTagToken("label"))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processEndTag(impliedTagToken("form"))

1226

def startTagTextarea(self, token):
    """Insert <textarea> and switch the tokenizer to its RCDATA state.

    Also arms the newline-dropping whitespace handler and clears
    framesetOK.
    """
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.rcdataState
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
    self.parser.framesetOK = False

1232

def startTagIFrame(self, token):
    """Clear framesetOK, then handle <iframe> as a raw-text element."""
    parser = self.parser
    parser.framesetOK = False
    self.startTagRawtext(token)

1236

def startTagNoscript(self, token):
    """Handle <noscript> as raw text when scripting is on, else normally."""
    if self.parser.scripting:
        handler = self.startTagRawtext
    else:
        handler = self.startTagOther
    handler(token)

1242

def startTagRawtext(self, token):
    """Parse the element's content as RAWTEXT.

    Used for iframe, noembed, noframes and noscript (if scripting enabled).
    """
    parser = self.parser
    parser.parseRCDataRawtext(token, "RAWTEXT")

1246

def startTagOpt(self, token):
    """Insert <option>/<optgroup>, closing an <option> that is current."""
    currentNode = self.tree.openElements[-1]
    if currentNode.name == "option":
        self.parser.phase.processEndTag(impliedTagToken("option"))
    self.tree.reconstructActiveFormattingElements()
    self.parser.tree.insertElement(token)

1252

def startTagSelect(self, token):
    """Insert <select> and pick the matching select phase.

    "inSelectInTable" is chosen when the parser's current phase is one of
    the table-related phases, "inSelect" otherwise.
    """
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertElement(token)
    parser = self.parser
    parser.framesetOK = False
    tablePhases = (parser.phases["inTable"],
                   parser.phases["inCaption"],
                   parser.phases["inColumnGroup"],
                   parser.phases["inTableBody"],
                   parser.phases["inRow"],
                   parser.phases["inCell"])
    if parser.phase in tablePhases:
        nextPhase = "inSelectInTable"
    else:
        nextPhase = "inSelect"
    parser.phase = parser.phases[nextPhase]

1266

def startTagRpRt(self, token):
    """Insert <rp>/<rt>.

    With a <ruby> in scope, implied end tags are generated first and a
    parse error is reported if the current node is then not the <ruby>.
    """
    tree = self.tree
    if tree.elementInScope("ruby"):
        tree.generateImpliedEndTags()
        current = tree.openElements[-1]
        if current.name != "ruby":
            self.parser.parseError()
    tree.insertElement(token)

1273

def startTagMath(self, token):
    """Insert a <math> element in the MathML namespace.

    Attributes are adjusted for MathML and foreign content first; a
    self-closing token is popped again immediately and acknowledged.
    """
    parser, tree = self.parser, self.tree
    tree.reconstructActiveFormattingElements()
    parser.adjustMathMLAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["mathml"]
    tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

1285

def startTagSvg(self, token):
    """Insert an <svg> element in the SVG namespace.

    Attributes are adjusted for SVG and foreign content first; a
    self-closing token is popped again immediately and acknowledged.
    """
    parser, tree = self.parser, self.tree
    tree.reconstructActiveFormattingElements()
    parser.adjustSVGAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["svg"]
    tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

1297

def startTagMisplaced(self, token):
    """Ignore a start tag that only makes sense in another phase.

    Reports a parse error and drops the token.  This class's start tag
    dispatcher routes "caption", "col", "colgroup", "frame", "head",
    "tbody", "td", "tfoot", "th", "thead" and "tr" here.
    """
    details = {"name": token["name"]}
    self.parser.parseError("unexpected-start-tag-ignored", details)

1306

def startTagOther(self, token):
    """Default start tag handling: reconstruct formatting, then insert."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)

1310

def endTagP(self, token):
    """Close the nearest <p> in button scope.

    Without such a <p> an empty one is opened first (parse error) and then
    closed by calling this method again.  Otherwise implied end tags other
    than "p" are generated and the stack is popped up to and including
    the <p>.
    """
    if not self.tree.elementInScope("p", variant="button"):
        self.startTagCloseP(impliedTagToken("p", "StartTag"))
        self.parser.parseError("unexpected-end-tag", {"name": "p"})
        self.endTagP(impliedTagToken("p", "EndTag"))
        return
    self.tree.generateImpliedEndTags("p")
    if self.tree.openElements[-1].name != "p":
        self.parser.parseError("unexpected-end-tag", {"name": "p"})
    while self.tree.openElements.pop().name != "p":
        pass

1323

def endTagBody(self, token):
    """Handle </body>: switch to "afterBody" if a body is in scope.

    Without a body in scope this is a parse error and the token is
    ignored.  If the current node is not the body, the open elements from
    index 2 onwards are checked and one parse error is reported for the
    first element that is not allowed to be left open.
    """
    if not self.tree.elementInScope("body"):
        self.parser.parseError()
        return
    if self.tree.openElements[-1].name != "body":
        # Built once per call rather than once per loop iteration.
        mayStayOpen = frozenset(("dd", "dt", "li", "optgroup",
                                 "option", "p", "rp", "rt",
                                 "tbody", "td", "tfoot",
                                 "th", "thead", "tr", "body",
                                 "html"))
        for node in self.tree.openElements[2:]:
            if node.name not in mayStayOpen:
                # Not sure this is the correct name for the parse error
                self.parser.parseError(
                    "expected-one-end-tag-but-got-another",
                    {"gotName": "body", "expectedName": node.name})
                break
    self.parser.phase = self.parser.phases["afterBody"]

1341

def endTagHtml(self, token):
    """Handle </html> by implying </body> first.

    Returns the token when a body is in scope, otherwise None.
    """
    # We repeat the test for the body end tag token being ignored here
    if not self.tree.elementInScope("body"):
        return None
    self.endTagBody(impliedTagToken("body"))
    return token

1347

def endTagBlock(self, token):
    """Close a block-level element named by the end tag.

    If the element is in scope, implied end tags are generated and the
    stack is popped up to and including it.  A parse error is reported
    whenever the current node does not carry the token's name.
    """
    name = token["name"]
    # Put us back in the right whitespace handling mode
    if name == "pre":
        self.processSpaceCharacters = self.processSpaceCharactersNonPre
    inScope = self.tree.elementInScope(name)
    if inScope:
        self.tree.generateImpliedEndTags()
    if self.tree.openElements[-1].name != name:
        self.parser.parseError("end-tag-too-early", {"name": name})
    if inScope:
        while self.tree.openElements.pop().name != name:
            pass

1361

def endTagForm(self, token):
    """Close the form referenced by the form pointer.

    The pointer is always cleared.  If it was unset or the form is not in
    scope, the tag is a parse error and ignored; otherwise implied end tags
    are generated and just that node is removed from the stack.
    """
    tree = self.tree
    form = tree.formPointer
    tree.formPointer = None
    if form is None or not tree.elementInScope(form):
        self.parser.parseError("unexpected-end-tag",
                               {"name": "form"})
        return
    tree.generateImpliedEndTags()
    if tree.openElements[-1] != form:
        self.parser.parseError("end-tag-too-early-ignored",
                               {"name": "form"})
    tree.openElements.remove(form)

1374

def endTagListItem(self, token):
    """Close an <li>, <dd> or <dt>.

    "li" is looked up with the "list" scope variant, the others with the
    default one.  Out of scope is a parse error; otherwise implied end tags
    (except for this name) are generated and the stack is popped up to and
    including the element.
    """
    name = token["name"]
    variant = "list" if name == "li" else None
    if not self.tree.elementInScope(name, variant=variant):
        self.parser.parseError("unexpected-end-tag", {"name": name})
        return
    self.tree.generateImpliedEndTags(exclude=name)
    if self.tree.openElements[-1].name != name:
        self.parser.parseError(
            "end-tag-too-early",
            {"name": name})
    while self.tree.openElements.pop().name != name:
        pass

1391

def endTagHeading(self, token):
    """Close the nearest open heading, whatever its level.

    If any heading is in scope, implied end tags are generated and the
    stack is popped up to and including the first heading encountered.  A
    parse error is reported when the current node is not the named heading.
    """
    tree = self.tree
    if any(tree.elementInScope(heading) for heading in headingElements):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != token["name"]:
        self.parser.parseError("end-tag-too-early", {"name": token["name"]})

    if any(tree.elementInScope(heading) for heading in headingElements):
        while tree.openElements.pop().name not in headingElements:
            pass

1406

def endTagFormatting(self, token):
    """The much-feared adoption agency algorithm

    Handles the end tag of a formatting element (the names routed here by
    this class's end tag dispatcher).  The outer loop runs at most eight
    times and the inner loop at most three times per outer iteration.

    :arg token: the end tag token; only ``token["name"]`` is read here
    """
    # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
    # XXX Better parseError messages appreciated.

    # Step 1
    outerLoopCounter = 0

    # Step 2
    while outerLoopCounter < 8:

        # Step 3
        outerLoopCounter += 1

        # Step 4:

        # Let the formatting element be the last element in
        # the list of active formatting elements that:
        # - is between the end of the list and the last scope
        # marker in the list, if any, or the start of the list
        # otherwise, and
        # - has the same tag name as the token.
        formattingElement = self.tree.elementInActiveFormattingElements(
            token["name"])
        if (not formattingElement or
            (formattingElement in self.tree.openElements and
             not self.tree.elementInScope(formattingElement.name))):
            # If there is no such node, then abort these steps
            # and instead act as described in the "any other
            # end tag" entry below.
            self.endTagOther(token)
            return

        # Otherwise, if there is such a node, but that node is
        # not in the stack of open elements, then this is a
        # parse error; remove the element from the list, and
        # abort these steps.
        elif formattingElement not in self.tree.openElements:
            self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
            self.tree.activeFormattingElements.remove(formattingElement)
            return

        # Otherwise, if there is such a node, and that node is
        # also in the stack of open elements, but the element
        # is not in scope, then this is a parse error; ignore
        # the token, and abort these steps.
        # NOTE(review): the first branch above already catches "in the
        # stack but not in scope", so this branch looks unreachable as
        # long as elementInScope gives the same answer twice -- confirm.
        elif not self.tree.elementInScope(formattingElement.name):
            self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
            return

        # Otherwise, there is a formatting element and that
        # element is in the stack and is in scope. If the
        # element is not the current node, this is a parse
        # error. In any case, proceed with the algorithm as
        # written in the following steps.
        else:
            if formattingElement != self.tree.openElements[-1]:
                self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

        # Step 5:

        # Let the furthest block be the topmost node in the
        # stack of open elements that is lower in the stack
        # than the formatting element, and is an element in
        # the special category. There might not be one.
        afeIndex = self.tree.openElements.index(formattingElement)
        furthestBlock = None
        for element in self.tree.openElements[afeIndex:]:
            if element.nameTuple in specialElements:
                furthestBlock = element
                break

        # Step 6:

        # If there is no furthest block, then the UA must
        # first pop all the nodes from the bottom of the stack
        # of open elements, from the current node up to and
        # including the formatting element, then remove the
        # formatting element from the list of active
        # formatting elements, and finally abort these steps.
        if furthestBlock is None:
            element = self.tree.openElements.pop()
            while element != formattingElement:
                element = self.tree.openElements.pop()
            self.tree.activeFormattingElements.remove(element)
            return

        # Step 7
        # The element immediately above the formatting element on the stack.
        commonAncestor = self.tree.openElements[afeIndex - 1]

        # Step 8:
        # The bookmark is supposed to help us identify where to reinsert
        # nodes in step 15. We have to ensure that we reinsert nodes after
        # the node before the active formatting element. Note the bookmark
        # can move in step 9.7
        bookmark = self.tree.activeFormattingElements.index(formattingElement)

        # Step 9
        lastNode = node = furthestBlock
        innerLoopCounter = 0

        index = self.tree.openElements.index(node)
        while innerLoopCounter < 3:
            innerLoopCounter += 1
            # Node is element before node in open elements
            index -= 1
            node = self.tree.openElements[index]
            if node not in self.tree.activeFormattingElements:
                # Not a formatting element: drop it from the stack and
                # carry on with the entry above it.
                self.tree.openElements.remove(node)
                continue
            # Step 9.6
            if node == formattingElement:
                break
            # Step 9.7
            if lastNode == furthestBlock:
                bookmark = self.tree.activeFormattingElements.index(node) + 1
            # Step 9.8
            clone = node.cloneNode()
            # Replace node with clone
            self.tree.activeFormattingElements[
                self.tree.activeFormattingElements.index(node)] = clone
            self.tree.openElements[
                self.tree.openElements.index(node)] = clone
            node = clone
            # Step 9.9
            # Remove lastNode from its parents, if any
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)
            node.appendChild(lastNode)
            # Step 9.10
            lastNode = node

        # Step 10
        # Foster parent lastNode if commonAncestor is a
        # table, tbody, tfoot, thead, or tr we need to foster
        # parent the lastNode
        if lastNode.parent:
            lastNode.parent.removeChild(lastNode)

        if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
            parent, insertBefore = self.tree.getTableMisnestedNodePosition()
            parent.insertBefore(lastNode, insertBefore)
        else:
            commonAncestor.appendChild(lastNode)

        # Step 11
        clone = formattingElement.cloneNode()

        # Step 12
        furthestBlock.reparentChildren(clone)

        # Step 13
        furthestBlock.appendChild(clone)

        # Step 14
        self.tree.activeFormattingElements.remove(formattingElement)
        self.tree.activeFormattingElements.insert(bookmark, clone)

        # Step 15
        self.tree.openElements.remove(formattingElement)
        self.tree.openElements.insert(
            self.tree.openElements.index(furthestBlock) + 1, clone)

1569

def endTagAppletMarqueeObject(self, token):
    """Close <applet>, <marquee> or <object>.

    If the element is in scope, implied end tags are generated, the stack
    is popped up to and including it, and clearActiveFormattingElements is
    called.  A parse error is reported when the current node has a
    different name.
    """
    name = token["name"]
    tree = self.tree
    if tree.elementInScope(name):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != name:
        self.parser.parseError("end-tag-too-early", {"name": name})

    if not tree.elementInScope(name):
        return
    while tree.openElements.pop().name != name:
        pass
    tree.clearActiveFormattingElements()

1581

def endTagBr(self, token):
    """Treat </br> as a <br> start tag (parse error)."""
    self.parser.parseError("unexpected-end-tag-treated-as",
                           {"originalName": "br", "newName": "br element"})
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(impliedTagToken("br", "StartTag"))
    tree.openElements.pop()

1588

def endTagOther(self, token):
    """Default end tag handling.

    Walks the stack from the current node upwards.  At the first node with
    the token's name, implied end tags are generated and the stack is
    popped up to and including that node.  Hitting a special element first
    is a parse error and the token is ignored.
    """
    name = token["name"]
    for node in self.tree.openElements[::-1]:
        if node.name != name:
            if node.nameTuple in specialElements:
                self.parser.parseError("unexpected-end-tag", {"name": name})
                break
            continue
        self.tree.generateImpliedEndTags(exclude=name)
        if self.tree.openElements[-1].name != name:
            self.parser.parseError("unexpected-end-tag", {"name": name})
        while self.tree.openElements.pop() != node:
            pass
        break

1602

# Start tag name -> handler table for the "in body" phase; names not listed
# fall through to startTagOther.
startTagHandler = _utils.MethodDispatcher([
    ("html", Phase.startTagHtml),
    (("base", "basefont", "bgsound", "command", "link", "meta",
      "script", "style", "title"), startTagProcessInHead),
    ("body", startTagBody),
    ("frameset", startTagFrameset),
    (("address", "article", "aside", "blockquote", "center", "details",
      "dir", "div", "dl", "fieldset", "figcaption", "figure",
      "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
      "section", "summary", "ul"), startTagCloseP),
    (headingElements, startTagHeading),
    (("pre", "listing"), startTagPreListing),
    ("form", startTagForm),
    (("li", "dd", "dt"), startTagListItem),
    ("plaintext", startTagPlaintext),
    ("a", startTagA),
    (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
      "strong", "tt", "u"), startTagFormatting),
    ("nobr", startTagNobr),
    ("button", startTagButton),
    (("applet", "marquee", "object"), startTagAppletMarqueeObject),
    ("xmp", startTagXmp),
    ("table", startTagTable),
    (("area", "br", "embed", "img", "keygen", "wbr"),
     startTagVoidFormatting),
    (("param", "source", "track"), startTagParamSource),
    ("input", startTagInput),
    ("hr", startTagHr),
    ("image", startTagImage),
    ("isindex", startTagIsIndex),
    ("textarea", startTagTextarea),
    ("iframe", startTagIFrame),
    ("noscript", startTagNoscript),
    (("noembed", "noframes"), startTagRawtext),
    ("select", startTagSelect),
    (("rp", "rt"), startTagRpRt),
    (("option", "optgroup"), startTagOpt),
    ("math", startTagMath),
    ("svg", startTagSvg),
    (("caption", "col", "colgroup", "frame", "head",
      "tbody", "td", "tfoot", "th", "thead",
      "tr"), startTagMisplaced),
])
startTagHandler.default = startTagOther

1649

# End tag name -> handler table for the "in body" phase; names not listed
# fall through to endTagOther.
endTagHandler = _utils.MethodDispatcher([
    ("body", endTagBody),
    ("html", endTagHtml),
    (("address", "article", "aside", "blockquote", "button", "center",
      "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption",
      "figure", "footer", "header", "hgroup", "listing", "main", "menu",
      "nav", "ol", "pre", "section", "summary", "ul"), endTagBlock),
    ("form", endTagForm),
    ("p", endTagP),
    (("dd", "dt", "li"), endTagListItem),
    (headingElements, endTagHeading),
    (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
      "strike", "strong", "tt", "u"), endTagFormatting),
    (("applet", "marquee", "object"), endTagAppletMarqueeObject),
    ("br", endTagBr),
])
endTagHandler.default = endTagOther

1667

class TextPhase(Phase):
    """Phase for the text content of RCDATA/RAWTEXT elements.

    Every way out of this phase pops the current node and restores
    ``parser.originalPhase``.
    """
    __slots__ = tuple()

    def processCharacters(self, token):
        """Insert the character data as is."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Report the unclosed element, pop it and return True."""
        unclosed = self.tree.openElements[-1]
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosed.name})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Start tags are not expected in this phase."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Pop the <script> element and go back to the original phase."""
        popped = self.tree.openElements.pop()
        assert popped.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Pop the current node and go back to the original phase."""
        parser = self.parser
        self.tree.openElements.pop()
        parser.phase = parser.originalPhase

    startTagHandler = _utils.MethodDispatcher([])
    startTagHandler.default = startTagOther
    endTagHandler = _utils.MethodDispatcher([
        ("script", endTagScript),
    ])
    endTagHandler.default = endTagOther

1700

1701 class InTablePhase(Phase):

1702 # http://www.whatwg.org/specs/web-apps/current-work/#in-table

1703 __slots__ = tuple()

1704

1705 # helper methods

def clearStackToTableContext(self):
    """Pop open elements until the current node is <table> or <html>."""
    # "clear the stack back to a table context"
    stack = self.tree.openElements
    while stack[-1].name not in ("table", "html"):
        stack.pop()
    # When the current node is <html> it's an innerHTML case

1713

1714 # processing methods

def processEOF(self):
    """Report EOF inside a table unless the current node is <html>.

    Falls through without a return value.
    """
    if self.tree.openElements[-1].name == "html":
        assert self.parser.innerHTML
    else:
        self.parser.parseError("eof-in-table")
    # Stop parsing

1721

def processSpaceCharacters(self, token):
    """Switch to "inTableText", remembering the phase to return to."""
    parser = self.parser
    previousPhase = parser.phase
    parser.phase = parser.phases["inTableText"]
    parser.phase.originalPhase = previousPhase
    parser.phase.processSpaceCharacters(token)

1727

def processCharacters(self, token):
    """Switch to "inTableText", remembering the phase to return to."""
    parser = self.parser
    previousPhase = parser.phase
    parser.phase = parser.phases["inTableText"]
    parser.phase.originalPhase = previousPhase
    parser.phase.processCharacters(token)

1733

def insertText(self, token):
    """Insert non-whitespace table text through the "inBody" phase.

    ``tree.insertFromTable`` is set for the duration of the delegated call
    and reset to False afterwards, even if that call raises, so the tree
    builder is never left in table-insertion mode by accident.
    """
    # If we get here there must be at least one non-whitespace character
    # Do the table magic!
    self.tree.insertFromTable = True
    try:
        self.parser.phases["inBody"].processCharacters(token)
    finally:
        self.tree.insertFromTable = False

1740

def startTagCaption(self, token):
    """Insert <caption> after pushing a Marker; switch to "inCaption"."""
    self.clearStackToTableContext()
    tree = self.tree
    tree.activeFormattingElements.append(Marker)
    tree.insertElement(token)
    parser = self.parser
    parser.phase = parser.phases["inCaption"]

1746

def startTagColgroup(self, token):
    """Insert <colgroup> and switch to the "inColumnGroup" phase."""
    self.clearStackToTableContext()
    self.tree.insertElement(token)
    parser = self.parser
    parser.phase = parser.phases["inColumnGroup"]

1751

def startTagCol(self, token):
    """Imply a <colgroup> start tag and hand the <col> token back."""
    impliedColgroup = impliedTagToken("colgroup", "StartTag")
    self.startTagColgroup(impliedColgroup)
    return token

1755

def startTagRowGroup(self, token):
    """Insert a row group element and switch to "inTableBody"."""
    self.clearStackToTableContext()
    self.tree.insertElement(token)
    parser = self.parser
    parser.phase = parser.phases["inTableBody"]

1760

def startTagImplyTbody(self, token):
    """Imply a <tbody> start tag, then return ``token`` to the caller
    (returned tokens are handed back for reprocessing)."""
    impliedTbody = impliedTagToken("tbody", "StartTag")
    self.startTagRowGroup(impliedTbody)
    return token

1764

def startTagTable(self, token):
    """Handle a <table> start tag seen while a table is already open.

    Reports a parse error and feeds an implied </table> to the parser's
    current phase.  The token is returned unless the parser is in innerHTML
    mode, in which case None is returned.
    """
    errorVars = {"startName": "table", "endName": "table"}
    self.parser.parseError("unexpected-start-tag-implies-end-tag", errorVars)
    self.parser.phase.processEndTag(impliedTagToken("table"))
    if self.parser.innerHTML:
        return None
    return token

1771

def startTagStyleScript(self, token):
    """Delegate <style>/<script> start tags to the "inHead" phase and
    return whatever that phase returns."""
    headPhase = self.parser.phases["inHead"]
    return headPhase.processStartTag(token)

1774

def startTagInput(self, token):
    """Handle an <input> start tag inside a table.

    An input whose ``type`` attribute is "hidden" (compared after
    ``asciiUpper2Lower`` translation) is reported as a parse error, inserted
    and immediately popped off the open-elements stack.  Any other input is
    passed to ``startTagOther``.
    """
    attributes = token["data"]
    isHidden = ("type" in attributes and
                attributes["type"].translate(asciiUpper2Lower) == "hidden")
    if not isHidden:
        self.startTagOther(token)
        return
    self.parser.parseError("unexpected-hidden-input-in-table")
    self.tree.insertElement(token)
    # XXX associate with form
    self.tree.openElements.pop()

1784

def startTagForm(self, token):
    """Handle a <form> start tag inside a table.

    Always reports "unexpected-form-in-table".  When no form pointer is set
    yet, the element is inserted, recorded as ``self.tree.formPointer`` and
    removed from the open-elements stack again.
    """
    self.parser.parseError("unexpected-form-in-table")
    tree = self.tree
    if tree.formPointer is not None:
        return
    tree.insertElement(token)
    # The freshly inserted form is the last open element; take it off the
    # stack and remember it as the form pointer.
    tree.formPointer = tree.openElements.pop()

1791

def startTagOther(self, token):
    """Handle any other start tag inside a table.

    Reports a parse error naming the tag, then processes the token with the
    "inBody" phase while ``self.tree.insertFromTable`` is switched on.
    """
    errorVars = {"name": token["name"]}
    self.parser.parseError("unexpected-start-tag-implies-table-voodoo", errorVars)
    tree = self.tree
    bodyPhase = self.parser.phases["inBody"]
    # Do the table magic!
    tree.insertFromTable = True
    bodyPhase.processStartTag(token)
    tree.insertFromTable = False

1798

def endTagTable(self, token):
    """Handle a </table> end tag.

    With a table in table scope: generate implied end tags, report
    "end-tag-too-early-named" if the current node is not the table, pop the
    stack up to and including the table, and reset the insertion mode.
    Otherwise (asserted to be the innerHTML case) only a parse error is
    reported.
    """
    tree = self.tree
    parser = self.parser
    if not tree.elementInScope("table", variant="table"):
        # innerHTML case
        assert parser.innerHTML
        parser.parseError()
        return
    tree.generateImpliedEndTags()
    currentName = tree.openElements[-1].name
    if currentName != "table":
        parser.parseError("end-tag-too-early-named",
                          {"gotName": "table",
                           "expectedName": currentName})
    # Drop everything above the table, then the table itself.
    while tree.openElements[-1].name != "table":
        tree.openElements.pop()
    tree.openElements.pop()
    parser.resetInsertionMode()

1814

def endTagIgnore(self, token):
    """Report "unexpected-end-tag" for the token's name and do nothing else."""
    errorVars = {"name": token["name"]}
    self.parser.parseError("unexpected-end-tag", errorVars)

1817

def endTagOther(self, token):
    """Handle any other end tag inside a table.

    Reports a parse error naming the tag, then processes the token with the
    "inBody" phase while ``self.tree.insertFromTable`` is switched on.
    """
    errorVars = {"name": token["name"]}
    self.parser.parseError("unexpected-end-tag-implies-table-voodoo", errorVars)
    tree = self.tree
    bodyPhase = self.parser.phases["inBody"]
    # Do the table magic!
    tree.insertFromTable = True
    bodyPhase.processEndTag(token)
    tree.insertFromTable = False

1824

# Start-tag dispatch table for this phase: each entry pairs a tag name (or a
# tuple of tag names) with the handler method defined above.
startTagHandler = _utils.MethodDispatcher([
    ("html", Phase.startTagHtml),
    ("caption", startTagCaption),
    ("colgroup", startTagColgroup),
    ("col", startTagCol),
    (("tbody", "tfoot", "thead"), startTagRowGroup),
    (("td", "th", "tr"), startTagImplyTbody),
    ("table", startTagTable),
    (("style", "script"), startTagStyleScript),
    ("input", startTagInput),
    ("form", startTagForm)
])
# startTagOther is installed as the dispatcher's default handler.
startTagHandler.default = startTagOther

# End-tag dispatch table for this phase, built the same way.
endTagHandler = _utils.MethodDispatcher([
    ("table", endTagTable),
    (("body", "caption", "col", "colgroup", "html", "tbody", "td",
      "tfoot", "th", "thead", "tr"), endTagIgnore)
])
# endTagOther is installed as the dispatcher's default handler.
endTagHandler.default = endTagOther

1845

class InTableTextPhase(Phase):
    """Phase that buffers character tokens and flushes them in one go.

    ``originalPhase`` is the phase the parser is switched back to once a
    non-character token (or EOF) arrives; ``characterTokens`` is the list
    of tokens buffered so far.
    """
    __slots__ = ("originalPhase", "characterTokens")

    def __init__(self, *args, **kwargs):
        super(InTableTextPhase, self).__init__(*args, **kwargs)
        self.characterTokens = []
        self.originalPhase = None

    def flushCharacters(self):
        """Emit the buffered text and empty the buffer.

        Text containing any non-space character goes through the "inTable"
        phase's ``insertText``; non-empty whitespace-only text is inserted
        directly into the tree; empty text is dropped.
        """
        text = "".join(pending["data"] for pending in self.characterTokens)
        hasNonSpace = any(char not in spaceCharacters for char in text)
        if hasNonSpace:
            merged = {"type": tokenTypes["Characters"], "data": text}
            self.parser.phases["inTable"].insertText(merged)
        elif text:
            self.tree.insertText(text)
        self.characterTokens = []

    def processComment(self, token):
        """Flush, restore the original phase and return the token."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

    def processEOF(self):
        """Flush, restore the original phase and return True."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return True

    def processCharacters(self, token):
        """Buffer the token unless its data is exactly a NUL character."""
        if token["data"] != "\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        """Buffer a whitespace token."""
        # pretty sure we should never reach here
        self.characterTokens.append(token)
        # assert False

    def processStartTag(self, token):
        """Flush, restore the original phase and return the token."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

    def processEndTag(self, token):
        """Flush, restore the original phase and return the token."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

1892

class InCaptionPhase(Phase):
    """Token handlers used while a <caption> element is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    __slots__ = tuple()

    def ignoreEndTagCaption(self):
        """Return True when no <caption> is in table scope."""
        captionOpen = self.tree.elementInScope("caption", variant="table")
        return not captionOpen

    def processEOF(self):
        """Delegate EOF handling to the "inBody" phase."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableElement(self, token):
        """Imply </caption> for a table-structure start tag.

        Returns the token unless the implied end tag was ignored.
        """
        self.parser.parseError()
        # XXX Have to duplicate logic here to find out if the tag is ignored
        wasIgnored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        return None if wasIgnored else token

    def startTagOther(self, token):
        """Delegate every other start tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagCaption(self, token):
        """Close the open <caption> and go back to the "inTable" phase.

        When no caption is in table scope (asserted to be the innerHTML
        case) only a parse error is reported.
        """
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        # AT this code is quite similar to endTagTable in "InTable"
        tree = self.tree
        tree.generateImpliedEndTags()
        currentName = tree.openElements[-1].name
        if currentName != "caption":
            self.parser.parseError("expected-one-end-tag-but-got-another",
                                   {"gotName": "caption",
                                    "expectedName": currentName})
        # Pop up to and including the caption element.
        while tree.openElements[-1].name != "caption":
            tree.openElements.pop()
        tree.openElements.pop()
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Imply </caption> for a </table> end tag.

        Returns the token unless the implied end tag was ignored.
        """
        self.parser.parseError()
        wasIgnored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        return None if wasIgnored else token

    def endTagIgnore(self, token):
        """Report "unexpected-end-tag" and otherwise ignore the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", errorVars)

    def endTagOther(self, token):
        """Delegate every other end tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableElement)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("caption", endTagCaption),
        ("table", endTagTable),
        (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
          "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther

1962

class InColumnGroupPhase(Phase):
    """Token handlers used while a <colgroup> element is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column
    __slots__ = tuple()

    def ignoreEndTagColgroup(self):
        """Return True when the current node is <html>."""
        currentNode = self.tree.openElements[-1]
        return currentNode.name == "html"

    def processEOF(self):
        """Handle EOF: imply </colgroup> unless the current node is <html>.

        Returns True when the implied end tag was not ignored, else None.
        """
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return None
        wasIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        if wasIgnored:
            return None
        return True

    def processCharacters(self, token):
        """Imply </colgroup>; return the token unless that was ignored."""
        wasIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if wasIgnored else token

    def startTagCol(self, token):
        """Insert a <col>, pop it straight away and acknowledge the
        self-closing flag on the token."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        """Imply </colgroup>; return the token unless that was ignored."""
        wasIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if wasIgnored else token

    def endTagColgroup(self, token):
        """Pop the colgroup and go back to the "inTable" phase.

        When the current node is <html> (asserted to be the innerHTML case)
        only a parse error is reported.
        """
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
            return
        # innerHTML case
        assert self.parser.innerHTML
        self.parser.parseError()

    def endTagCol(self, token):
        """Report "no-end-tag" for </col>."""
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        """Imply </colgroup>; return the token unless that was ignored."""
        wasIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        return None if wasIgnored else token

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("col", startTagCol)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("colgroup", endTagColgroup),
        ("col", endTagCol)
    ])
    endTagHandler.default = endTagOther

2026

class InTableBodyPhase(Phase):
    """Token handlers used while a <tbody>, <thead> or <tfoot> is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    __slots__ = tuple()

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until the current node is a row group or
        <html>; ending on <html> is asserted to be the innerHTML case."""
        boundary = ("tbody", "tfoot", "thead", "html")
        while self.tree.openElements[-1].name not in boundary:
            self.tree.openElements.pop()
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the "inTable" phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace tokens to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character tokens to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTr(self, token):
        """Open a <tr> and switch the parser to the "inRow" phase."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        """Report a stray cell, imply a <tr> start tag and return the
        token to the caller."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-cell-in-table-body", errorVars)
        self.startTagTr(impliedTagToken("tr", "StartTag"))
        return token

    def startTagTableOther(self, token):
        """Close the current row group for a table-structure start tag.

        Returns the token when a row group was in table scope; otherwise
        (asserted to be the innerHTML case) reports a parse error.
        """
        # XXX AT Any ideas on how to share this with endTagTable?
        rowGroupOpen = any(self.tree.elementInScope(name, variant="table")
                           for name in ("tbody", "thead", "tfoot"))
        if not rowGroupOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        currentName = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    def startTagOther(self, token):
        """Delegate every other start tag to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTableRowGroup(self, token):
        """Close the named row group and go back to the "inTable" phase,
        or report a parse error when it is not in table scope."""
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": token["name"]})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Close the current row group for a </table> end tag.

        Returns the token when a row group was in table scope; otherwise
        (asserted to be the innerHTML case) reports a parse error.
        """
        rowGroupOpen = any(self.tree.elementInScope(name, variant="table")
                           for name in ("tbody", "thead", "tfoot"))
        if not rowGroupOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        currentName = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    def endTagIgnore(self, token):
        """Report "unexpected-end-tag-in-table-body" and ignore the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-body", errorVars)

    def endTagOther(self, token):
        """Delegate every other end tag to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("tr", startTagTr),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
         startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "td", "th",
          "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther

2124

class InRowPhase(Phase):
    """Token handlers used while a <tr> element is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    __slots__ = tuple()

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements until the current node is <tr> or <html>,
        reporting a parse error for each element removed."""
        while True:
            currentName = self.tree.openElements[-1].name
            if currentName in ("tr", "html"):
                break
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": currentName})
            self.tree.openElements.pop()

    def ignoreEndTagTr(self):
        """Return True when no <tr> is in table scope."""
        rowOpen = self.tree.elementInScope("tr", variant="table")
        return not rowOpen

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the "inTable" phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace tokens to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character tokens to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTableCell(self, token):
        """Open a <td>/<th>: clear to row context, insert the element,
        switch to the "inCell" phase and push a formatting ``Marker``."""
        self.clearStackToTableRowContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        self.tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        """Imply </tr>; return the token unless that was ignored."""
        wasIgnored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # XXX how are we sure it's always ignored in the innerHTML case?
        return None if wasIgnored else token

    def startTagOther(self, token):
        """Delegate every other start tag to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTr(self, token):
        """Close the open <tr> and go back to the "inTableBody" phase.

        When no <tr> is in table scope (asserted to be the innerHTML case)
        only a parse error is reported.
        """
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, token):
        """Imply </tr>; return the token unless that was ignored."""
        wasIgnored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        return None if wasIgnored else token

    def endTagTableRowGroup(self, token):
        """Imply </tr> and return the token when the named row group is in
        table scope; otherwise report a parse error."""
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return None
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        """Report "unexpected-end-tag-in-table-row" and ignore the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-table-row", errorVars)

    def endTagOther(self, token):
        """Delegate every other end tag to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
          "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("tr", endTagTr),
        ("table", endTagTable),
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        (("body", "caption", "col", "colgroup", "html", "td", "th"),
         endTagIgnore)
    ])
    endTagHandler.default = endTagOther

2213

class InCellPhase(Phase):
    """Token handlers used while a <td> or <th> element is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    __slots__ = tuple()

    # helper
    def closeCell(self):
        """Close whichever of <td> / <th> is in table scope (<td> is
        checked first); do nothing when neither is."""
        for cellName in ("td", "th"):
            if self.tree.elementInScope(cellName, variant="table"):
                self.endTagTableCell(impliedTagToken(cellName))
                break

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the "inBody" phase."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableOther(self, token):
        """Close the open cell for a table-structure start tag.

        Returns the token when a cell was in table scope; otherwise
        (asserted to be the innerHTML case) reports a parse error.
        """
        cellOpen = (self.tree.elementInScope("td", variant="table") or
                    self.tree.elementInScope("th", variant="table"))
        if not cellOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def startTagOther(self, token):
        """Delegate every other start tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell and go back to the "inRow" phase.

        Reports "unexpected-cell-end-tag" when the cell is not the current
        node after implied end tags, and "unexpected-end-tag" when the cell
        is not in table scope at all.
        """
        cellName = token["name"]
        if not self.tree.elementInScope(cellName, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": cellName})
            return
        self.tree.generateImpliedEndTags(cellName)
        if self.tree.openElements[-1].name != cellName:
            self.parser.parseError("unexpected-cell-end-tag",
                                   {"name": cellName})
        # Pop up to and including the cell; when the cell is already the
        # current node this removes exactly one element.
        popped = self.tree.openElements.pop()
        while popped.name != cellName:
            popped = self.tree.openElements.pop()
        self.tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        """Report "unexpected-end-tag" and ignore the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag", errorVars)

    def endTagImply(self, token):
        """Close the open cell and return the token when the named element
        is in table scope; otherwise report a parse error."""
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def endTagOther(self, token):
        """Delegate every other end tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("td", "th"), endTagTableCell),
        (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
        (("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
    ])
    endTagHandler.default = endTagOther

2289

class InSelectPhase(Phase):
    """Token handlers used while a <select> element is open."""
    __slots__ = tuple()

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        """Report "eof-in-select" unless the current node is <html>, which
        is asserted to be the innerHTML case."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        """Insert the token's text, dropping a lone NUL character."""
        if token["data"] != "\u0000":
            self.tree.insertText(token["data"])

    def startTagOption(self, token):
        """Insert an <option>, first closing an <option> that is the
        current node."""
        # We need to imply </option> if <option> is the current node.
        if self.tree.openElements[-1].name == "option":
            self.tree.openElements.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        """Insert an <optgroup>, first closing a current <option> and then
        a current <optgroup>."""
        for impliedName in ("option", "optgroup"):
            if self.tree.openElements[-1].name == impliedName:
                self.tree.openElements.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        """Treat a nested <select> start tag as </select> plus an error."""
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        """Report the tag, close the select if it is in select scope and
        return the token; otherwise assert the innerHTML case."""
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return None
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        """Delegate <script> start tags to the "inHead" phase."""
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagOther(self, token):
        """Report "unexpected-start-tag-in-select" and ignore the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-select", errorVars)

    def endTagOption(self, token):
        """Pop a current <option>, or report a parse error."""
        if self.tree.openElements[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
            return
        self.tree.openElements.pop()

    def endTagOptgroup(self, token):
        """Close the current <optgroup>, including an <option> directly
        inside it; report a parse error if no optgroup is current."""
        stack = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if stack[-1].name == "option" and stack[-2].name == "optgroup":
            stack.pop()
        # It also closes </optgroup>, but nothing else
        if stack[-1].name != "optgroup":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})
            return
        stack.pop()

    def endTagSelect(self, token):
        """Pop up to and including the <select> and reset the insertion
        mode; outside select scope (asserted to be the innerHTML case) only
        a parse error is reported."""
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        while True:
            popped = self.tree.openElements.pop()
            if popped.name == "select":
                break
        self.parser.resetInsertionMode()

    def endTagOther(self, token):
        """Report "unexpected-end-tag-in-select" and ignore the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-select", errorVars)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("option", startTagOption),
        ("optgroup", startTagOptgroup),
        ("select", startTagSelect),
        (("input", "keygen", "textarea"), startTagInput),
        ("script", startTagScript)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("option", endTagOption),
        ("optgroup", endTagOptgroup),
        ("select", endTagSelect)
    ])
    endTagHandler.default = endTagOther

2388

class InSelectInTablePhase(Phase):
    """Token handlers for a <select> that is open inside a table; most
    tokens are delegated to the "inSelect" phase."""
    __slots__ = tuple()

    def processEOF(self):
        """Delegate EOF handling to the "inSelect" phase."""
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        """Delegate character tokens to the "inSelect" phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processCharacters(token)

    def startTagTable(self, token):
        """Report the table-structure start tag, imply </select> and return
        the token to the caller."""
        errorVars = {"name": token["name"]}
        self.parser.parseError(
            "unexpected-table-element-start-tag-in-select-in-table", errorVars)
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        """Delegate every other start tag to the "inSelect" phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processStartTag(token)

    def endTagTable(self, token):
        """Report the table-structure end tag; when the named element is in
        table scope, imply </select> and return the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError(
            "unexpected-table-element-end-tag-in-select-in-table", errorVars)
        if not self.tree.elementInScope(token["name"], variant="table"):
            return None
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        """Delegate every other end tag to the "inSelect" phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         startTagTable)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         endTagTable)
    ])
    endTagHandler.default = endTagOther

2426

class InForeignContentPhase(Phase):
    """Token handlers used while the current node is in a foreign (MathML
    or SVG) namespace."""
    __slots__ = tuple()

    # Start tags with these names break out of foreign content.
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # Lower-case SVG tag name -> mixed-case replacement.  Built once at class
    # creation instead of on every adjustSVGTagNames() call.
    _svgTagNameReplacements = {"altglyph": "altGlyph",
                               "altglyphdef": "altGlyphDef",
                               "altglyphitem": "altGlyphItem",
                               "animatecolor": "animateColor",
                               "animatemotion": "animateMotion",
                               "animatetransform": "animateTransform",
                               "clippath": "clipPath",
                               "feblend": "feBlend",
                               "fecolormatrix": "feColorMatrix",
                               "fecomponenttransfer": "feComponentTransfer",
                               "fecomposite": "feComposite",
                               "feconvolvematrix": "feConvolveMatrix",
                               "fediffuselighting": "feDiffuseLighting",
                               "fedisplacementmap": "feDisplacementMap",
                               "fedistantlight": "feDistantLight",
                               "feflood": "feFlood",
                               "fefunca": "feFuncA",
                               "fefuncb": "feFuncB",
                               "fefuncg": "feFuncG",
                               "fefuncr": "feFuncR",
                               "fegaussianblur": "feGaussianBlur",
                               "feimage": "feImage",
                               "femerge": "feMerge",
                               "femergenode": "feMergeNode",
                               "femorphology": "feMorphology",
                               "feoffset": "feOffset",
                               "fepointlight": "fePointLight",
                               "fespecularlighting": "feSpecularLighting",
                               "fespotlight": "feSpotLight",
                               "fetile": "feTile",
                               "feturbulence": "feTurbulence",
                               "foreignobject": "foreignObject",
                               "glyphref": "glyphRef",
                               "lineargradient": "linearGradient",
                               "radialgradient": "radialGradient",
                               "textpath": "textPath"}

    def adjustSVGTagNames(self, token):
        """Rewrite ``token["name"]`` in place to its mixed-case SVG form.

        :arg token: tag token; its name is left untouched when it has no
            entry in the replacement table
        """
        replacement = self._svgTagNameReplacements.get(token["name"])
        if replacement is not None:
            token["name"] = replacement

    def processCharacters(self, token):
        """Process a character token in foreign content.

        A token that is exactly NUL has its data replaced with U+FFFD;
        otherwise any non-space character clears ``parser.framesetOK``.
        The token is then handled by the base ``Phase.processCharacters``.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Process a start tag in foreign content.

        Breakout elements (and <font> with a color, face or size attribute)
        report a parse error, pop the stack back to an HTML element or an
        integration point, and return the token to the caller.  Any other
        tag is adjusted for the current node's namespace and inserted there;
        None is returned in that case.
        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
            (token["name"] == "font" and
             set(token["data"].keys()) & {"color", "face", "size"})):
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        else:
            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Process an end tag in foreign content.

        Walks the open-elements stack from the current node downwards.  On
        the first node whose lower-cased name equals the tag name, the stack
        is popped through that node and None is returned.  On reaching a
        node in the default namespace first, the token is given to the
        parser's current phase and that phase's result is returned.
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token

2540

class AfterBodyPhase(Phase):
    """Token handlers used after the body has been closed."""
    __slots__ = tuple()

    def processEOF(self):
        """Do nothing at EOF."""
        # Stop parsing
        pass

    def processComment(self, token):
        """Insert the comment under the first open element."""
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        htmlElement = self.tree.openElements[0]
        self.tree.insertComment(token, htmlElement)

    def processCharacters(self, token):
        """Report the stray text, switch back to "inBody" and return the
        token to the caller."""
        parser = self.parser
        parser.parseError("unexpected-char-after-body")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate <html> start tags to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def startTagOther(self, token):
        """Report the stray start tag, switch back to "inBody" and return
        the token to the caller."""
        parser = self.parser
        parser.parseError("unexpected-start-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    def endTagHtml(self, name):
        """Move to "afterAfterBody", or report a parse error when in
        innerHTML mode."""
        parser = self.parser
        if not parser.innerHTML:
            parser.phase = parser.phases["afterAfterBody"]
            return
        parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        """Report the stray end tag, switch back to "inBody" and return
        the token to the caller."""
        parser = self.parser
        parser.parseError("unexpected-end-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
    endTagHandler.default = endTagOther

2586

class InFramesetPhase(Phase):
    """Token handlers used while a <frameset> element is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    __slots__ = tuple()

    def processEOF(self):
        """Report "eof-in-frameset" unless the current node is <html>,
        which is asserted to be the innerHTML case."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-frameset")

    def processCharacters(self, token):
        """Report "unexpected-char-in-frameset" and drop the token."""
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        """Insert a nested <frameset> element."""
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        """Insert a <frame> element and pop it straight away."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()

    def startTagNoframes(self, token):
        """Delegate <noframes> start tags to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def startTagOther(self, token):
        """Report "unexpected-start-tag-in-frameset" and ignore the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-frameset", errorVars)

    def endTagFrameset(self, token):
        """Close the current frameset.

        A current node of <html> only produces a parse error; otherwise the
        current node is popped.  Afterwards, outside innerHTML mode, the
        parser moves to "afterFrameset" once the current node is no longer
        a frameset.
        """
        stack = self.tree.openElements
        if stack[-1].name != "html":
            stack.pop()
        else:
            # innerHTML case
            self.parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        if self.parser.innerHTML:
            return
        if stack[-1].name != "frameset":
            # If we're not in innerHTML mode and the current node is not a
            # "frameset" element (anymore) then switch.
            self.parser.phase = self.parser.phases["afterFrameset"]

    def endTagOther(self, token):
        """Report "unexpected-end-tag-in-frameset" and ignore the token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-frameset", errorVars)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("frameset", startTagFrameset),
        ("frame", startTagFrame),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("frameset", endTagFrameset)
    ])
    endTagHandler.default = endTagOther

2642

class AfterFramesetPhase(Phase):
    """Token handlers used after the outermost frameset has been closed."""
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    __slots__ = tuple()

    def processEOF(self):
        """Do nothing at EOF."""
        # Stop parsing
        pass

    def processCharacters(self, token):
        """Report "unexpected-char-after-frameset" and drop the token."""
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        """Delegate <noframes> start tags to the "inHead" phase."""
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagOther(self, token):
        """Report "unexpected-start-tag-after-frameset" and ignore the
        token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset", errorVars)

    def endTagHtml(self, token):
        """Switch the parser to the "afterAfterFrameset" phase."""
        parser = self.parser
        parser.phase = parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        """Report "unexpected-end-tag-after-frameset" and ignore the
        token."""
        errorVars = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset", errorVars)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("html", endTagHtml)
    ])
    endTagHandler.default = endTagOther

2678

class AfterAfterBodyPhase(Phase):
    """Phase registered under the name "afterAfterBody".

    Comments are attached to the document and whitespace is delegated to
    the "inBody" phase.  Any other character token, start tag (except
    ``html``) or end tag is recorded as a parse error, after which the
    parser is switched back to "inBody" and the token is returned
    (presumably so the caller reprocesses it in that phase).
    """
    __slots__ = ()

    def processEOF(self):
        """Do nothing at end of input."""

    def processComment(self, token):
        """Insert the comment with the document itself as its parent."""
        tree = self.tree
        tree.insertComment(token, tree.document)

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inBody" phase; return its result."""
        in_body = self.parser.phases["inBody"]
        return in_body.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Record a parse error, switch to "inBody" and return *token*."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-char")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate ``html`` to the "inBody" phase; return its result."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagOther(self, token):
        """Record a parse error, switch to "inBody" and return *token*."""
        parser = self.parser
        details = {"name": token["name"]}
        parser.parseError("expected-eof-but-got-start-tag", details)
        parser.phase = parser.phases["inBody"]
        return token

    def processEndTag(self, token):
        """Record a parse error, switch to "inBody" and return *token*."""
        parser = self.parser
        details = {"name": token["name"]}
        parser.parseError("expected-eof-but-got-end-tag", details)
        parser.phase = parser.phases["inBody"]
        return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
    ])
    startTagHandler.default = startTagOther

2715

class AfterAfterFramesetPhase(Phase):
    """Phase registered under the name "afterAfterFrameset".

    Comments are attached to the document, whitespace and ``html`` start
    tags are delegated to the "inBody" phase and ``noframes`` start tags
    to the "inHead" phase.  Every other character token, start tag or end
    tag is only recorded as a parse error; the phase is left unchanged.
    """
    __slots__ = ()

    def processEOF(self):
        """Do nothing at end of input."""

    def processComment(self, token):
        """Insert the comment with the document itself as its parent."""
        tree = self.tree
        tree.insertComment(token, tree.document)

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inBody" phase; return its result."""
        in_body = self.parser.phases["inBody"]
        return in_body.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Record a parse error for a character token."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        """Delegate ``html`` to the "inBody" phase; return its result."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagNoFrames(self, token):
        """Delegate ``noframes`` to the "inHead" phase; return its result."""
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagOther(self, token):
        """Record a parse error naming the unexpected start tag."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)

    def processEndTag(self, token):
        """Record a parse error naming the unexpected end tag."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("noframes", startTagNoFrames),
    ])
    startTagHandler.default = startTagOther

2750

2751 # pylint:enable=unused-argument

2752

2753 return {

2754 "initial": InitialPhase,

2755 "beforeHtml": BeforeHtmlPhase,

2756 "beforeHead": BeforeHeadPhase,

2757 "inHead": InHeadPhase,

2758 "inHeadNoscript": InHeadNoscriptPhase,

2759 "afterHead": AfterHeadPhase,

2760 "inBody": InBodyPhase,

2761 "text": TextPhase,

2762 "inTable": InTablePhase,

2763 "inTableText": InTableTextPhase,

2764 "inCaption": InCaptionPhase,

2765 "inColumnGroup": InColumnGroupPhase,

2766 "inTableBody": InTableBodyPhase,

2767 "inRow": InRowPhase,

2768 "inCell": InCellPhase,

2769 "inSelect": InSelectPhase,

2770 "inSelectInTable": InSelectInTablePhase,

2771 "inForeignContent": InForeignContentPhase,

2772 "afterBody": AfterBodyPhase,

2773 "inFrameset": InFramesetPhase,

2774 "afterFrameset": AfterFramesetPhase,

2775 "afterAfterBody": AfterAfterBodyPhase,

2776 "afterAfterFrameset": AfterAfterFramesetPhase,

2777 # XXX after after frameset

2778 }

2779

2780

def adjust_attributes(token, replacements):
    """Rename attributes of *token* in place according to *replacements*.

    When at least one key of ``token['data']`` is also a key of
    *replacements*, ``token['data']`` is rebuilt as a new mapping of the
    same type in which every key ``k`` becomes ``replacements.get(k, k)``
    and values are kept as they are.  Otherwise the token is untouched.

    :arg token: token dict whose ``'data'`` entry maps attribute names to
        values

    :arg replacements: mapping from attribute name to replacement name

    :returns: ``None``

    """
    attributes = token['data']
    if not (viewkeys(attributes) & viewkeys(replacements)):
        # No attribute name is affected: keep the original mapping object.
        return
    renamed = ((replacements.get(name, name), value)
               for name, value in attributes.items())
    token['data'] = type(attributes)(renamed)

2786

2787

def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dict for a tag that did not come from the input.

    :arg name: the tag name

    :arg type: key into ``tokenTypes`` selecting the token type

    :arg attributes: the token's ``"data"`` mapping; a fresh empty dict is
        used when this is ``None``

    :arg selfClosing: value stored under ``"selfClosing"``

    :returns: dict with the keys ``"type"``, ``"name"``, ``"data"`` and
        ``"selfClosing"``

    """
    token = {
        "type": tokenTypes[type],
        "name": name,
        # A new dict per call avoids sharing one mutable default.
        "data": {} if attributes is None else attributes,
        "selfClosing": selfClosing,
    }
    return token

2794

2795

class ParseError(Exception):
    """Error in parsed document.

    A plain :class:`Exception` subclass; it adds no attributes or
    behaviour of its own.
    """