Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/html5lib/html5parser.py: 94%

1from __future__ import absolute_import, division, unicode_literals

2from six import viewkeys

4from . import _inputstream

5from . import _tokenizer

7from . import treebuilders

8from .treebuilders.base import Marker

10from . import _utils

11from .constants import (

12 spaceCharacters, asciiUpper2Lower,

13 specialElements, headingElements, cdataElements, rcdataElements,

14 tokenTypes,

15 namespaces,

16 htmlIntegrationPointElements, mathmlTextIntegrationPointElements,

17 adjustForeignAttributes as adjustForeignAttributesMap,

18 adjustMathMLAttributes, adjustSVGAttributes,

19 E,

20 _ReparseException

21)

def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML document as a string or file-like object into a tree

    A treebuilder class is looked up by name, a fresh :class:`HTMLParser` is
    created around it, and any extra keyword arguments are forwarded to
    :meth:`HTMLParser.parse`.

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body>This is a doc</body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder_cls = treebuilders.getTreeBuilder(treebuilder)
    return HTMLParser(
        builder_cls, namespaceHTMLElements=namespaceHTMLElements
    ).parse(doc, **kwargs)

def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    A treebuilder class is looked up by name, a fresh :class:`HTMLParser` is
    created around it, and any extra keyword arguments are forwarded to
    :meth:`HTMLParser.parseFragment`.

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('this is a fragment')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, **kwargs)

72class HTMLParser(object):

73 """HTML parser

75 Generates a tree structure from a stream of (possibly malformed) HTML.

77 """

79 def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):

80 """

81 :arg tree: a treebuilder class controlling the type of tree that will be

82 returned. Built in treebuilders can be accessed through

83 html5lib.treebuilders.getTreeBuilder(treeType)

85 :arg strict: raise an exception when a parse error is encountered

87 :arg namespaceHTMLElements: whether or not to namespace HTML elements

89 :arg debug: whether or not to enable debug mode which logs things

91 Example:

93 >>> from html5lib.html5parser import HTMLParser

94 >>> parser = HTMLParser() # generates parser with etree builder

95 >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict

97 """

99 # Raise an exception on the first error encountered

100 self.strict = strict

101 self.debug = debug

102

103 if tree is None:

104 tree = treebuilders.getTreeBuilder("etree")

105 elif isinstance(tree, str):

106 tree = treebuilders.getTreeBuilder(tree)

107

108 self.tree = tree(namespaceHTMLElements)

109 self.errors = []

110

111 self.phases = {name: cls(self, self.tree) for name, cls in

112 _phases.items()}

113

114 def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):

115

116 self.innerHTMLMode = innerHTML

117 self.container = container

118 self.scripting = scripting

119 self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)

120 self.reset()

121

122 try:

123 self.mainLoop()

124 except _ReparseException:

125 self.reset()

126 self.mainLoop()

127

128 def reset(self):

129 self.tree.reset()

130 self.firstStartTag = False

131 self.errors = []

132 self.log = [] # only used with debug mode

133 # "quirks" / "limited quirks" / "no quirks"

134 self.compatMode = "no quirks"

135

136 if self.innerHTMLMode:

137 self.innerHTML = self.container.lower()

138

139 if self.innerHTML in cdataElements:

140 self.tokenizer.state = self.tokenizer.rcdataState

141 elif self.innerHTML in rcdataElements:

142 self.tokenizer.state = self.tokenizer.rawtextState

143 elif self.innerHTML == 'plaintext':

144 self.tokenizer.state = self.tokenizer.plaintextState

145 else:

146 # state already is data state

147 # self.tokenizer.state = self.tokenizer.dataState

148 pass

149 self.phase = self.phases["beforeHtml"]

150 self.phase.insertHtmlElement()

151 self.resetInsertionMode()

152 else:

153 self.innerHTML = False # pylint:disable=redefined-variable-type

154 self.phase = self.phases["initial"]

155

156 self.lastPhase = None

157

158 self.beforeRCDataPhase = None

159

160 self.framesetOK = True

161

162 @property

163 def documentEncoding(self):

164 """Name of the character encoding that was used to decode the input stream, or

165 :obj:`None` if that is not determined yet

166

167 """

168 if not hasattr(self, 'tokenizer'):

169 return None

170 return self.tokenizer.stream.charEncoding[0].name

171

172 def isHTMLIntegrationPoint(self, element):

173 if (element.name == "annotation-xml" and

174 element.namespace == namespaces["mathml"]):

175 return ("encoding" in element.attributes and

176 element.attributes["encoding"].translate(

177 asciiUpper2Lower) in

178 ("text/html", "application/xhtml+xml"))

179 else:

180 return (element.namespace, element.name) in htmlIntegrationPointElements

181

182 def isMathMLTextIntegrationPoint(self, element):

183 return (element.namespace, element.name) in mathmlTextIntegrationPointElements

184

185 def mainLoop(self):

186 CharactersToken = tokenTypes["Characters"]

187 SpaceCharactersToken = tokenTypes["SpaceCharacters"]

188 StartTagToken = tokenTypes["StartTag"]

189 EndTagToken = tokenTypes["EndTag"]

190 CommentToken = tokenTypes["Comment"]

191 DoctypeToken = tokenTypes["Doctype"]

192 ParseErrorToken = tokenTypes["ParseError"]

193

194 type_names = {value: key for key, value in tokenTypes.items()}

195 debug = self.debug

196

197 for token in self.tokenizer:

198 prev_token = None

199 new_token = token

200 while new_token is not None:

201 prev_token = new_token

202 currentNode = self.tree.openElements[-1] if self.tree.openElements else None

203 currentNodeNamespace = currentNode.namespace if currentNode else None

204 currentNodeName = currentNode.name if currentNode else None

205

206 type = new_token["type"]

207

208 if type == ParseErrorToken:

209 self.parseError(new_token["data"], new_token.get("datavars", {}))

210 new_token = None

211 else:

212 if (len(self.tree.openElements) == 0 or

213 currentNodeNamespace == self.tree.defaultNamespace or

214 (self.isMathMLTextIntegrationPoint(currentNode) and

215 ((type == StartTagToken and

216 token["name"] not in frozenset(["mglyph", "malignmark"])) or

217 type in (CharactersToken, SpaceCharactersToken))) or

218 (currentNodeNamespace == namespaces["mathml"] and

219 currentNodeName == "annotation-xml" and

220 type == StartTagToken and

221 token["name"] == "svg") or

222 (self.isHTMLIntegrationPoint(currentNode) and

223 type in (StartTagToken, CharactersToken, SpaceCharactersToken))):

224 phase = self.phase

225 else:

226 phase = self.phases["inForeignContent"]

227

228 if debug:

229 info = {"type": type_names[type]}

230 if type in (StartTagToken, EndTagToken):

231 info["name"] = new_token['name']

232

233 self.log.append((self.tokenizer.state.__name__,

234 self.phase.__class__.__name__,

235 phase.__class__.__name__,

236 "process" + info["type"],

237 info))

238

239 if type == CharactersToken:

240 new_token = phase.processCharacters(new_token)

241 elif type == SpaceCharactersToken:

242 new_token = phase.processSpaceCharacters(new_token)

243 elif type == StartTagToken:

244 new_token = phase.processStartTag(new_token)

245 elif type == EndTagToken:

246 new_token = phase.processEndTag(new_token)

247 elif type == CommentToken:

248 new_token = phase.processComment(new_token)

249 elif type == DoctypeToken:

250 new_token = phase.processDoctype(new_token)

251

252 if (type == StartTagToken and prev_token["selfClosing"] and

253 not prev_token["selfClosingAcknowledged"]):

254 self.parseError("non-void-element-with-trailing-solidus",

255 {"name": prev_token["name"]})

256

257 # When the loop finishes it's EOF

258 reprocess = True

259 phases = []

260 while reprocess:

261 phases.append(self.phase)

262 reprocess = self.phase.processEOF()

263 if reprocess:

264 assert self.phase not in phases

265

266 def parse(self, stream, *args, **kwargs):

267 """Parse a HTML document into a well-formed tree

268

269 :arg stream: a file-like object or string containing the HTML to be parsed

270

271 The optional encoding parameter must be a string that indicates

272 the encoding. If specified, that encoding will be used,

273 regardless of any BOM or later declaration (such as in a meta

274 element).

275

276 :arg scripting: treat noscript elements as if JavaScript was turned on

277

278 :returns: parsed tree

279

280 Example:

281

282 >>> from html5lib.html5parser import HTMLParser

283 >>> parser = HTMLParser()

284 >>> parser.parse('<html><body>This is a doc</body></html>')

285 <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

286

287 """

288 self._parse(stream, False, None, *args, **kwargs)

289 return self.tree.getDocument()

290

291 def parseFragment(self, stream, *args, **kwargs):

292 """Parse a HTML fragment into a well-formed tree fragment

293

294 :arg container: name of the element we're setting the innerHTML

295 property if set to None, default to 'div'

296

297 :arg stream: a file-like object or string containing the HTML to be parsed

298

299 The optional encoding parameter must be a string that indicates

300 the encoding. If specified, that encoding will be used,

301 regardless of any BOM or later declaration (such as in a meta

302 element)

303

304 :arg scripting: treat noscript elements as if JavaScript was turned on

305

306 :returns: parsed tree

307

308 Example:

309

310 >>> from html5lib.html5libparser import HTMLParser

311 >>> parser = HTMLParser()

312 >>> parser.parseFragment('this is a fragment')

313 <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

314

315 """

316 self._parse(stream, True, *args, **kwargs)

317 return self.tree.getFragment()

318

319 def parseError(self, errorcode="XXX-undefined-error", datavars=None):

320 # XXX The idea is to make errorcode mandatory.

321 if datavars is None:

322 datavars = {}

323 self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))

324 if self.strict:

325 raise ParseError(E[errorcode] % datavars)

326

327 def adjustMathMLAttributes(self, token):

328 adjust_attributes(token, adjustMathMLAttributes)

329

330 def adjustSVGAttributes(self, token):

331 adjust_attributes(token, adjustSVGAttributes)

332

333 def adjustForeignAttributes(self, token):

334 adjust_attributes(token, adjustForeignAttributesMap)

335

336 def reparseTokenNormal(self, token):

337 # pylint:disable=unused-argument

338 self.parser.phase()

339

340 def resetInsertionMode(self):

341 # The name of this method is mostly historical. (It's also used in the

342 # specification.)

343 last = False

344 newModes = {

345 "select": "inSelect",

346 "td": "inCell",

347 "th": "inCell",

348 "tr": "inRow",

349 "tbody": "inTableBody",

350 "thead": "inTableBody",

351 "tfoot": "inTableBody",

352 "caption": "inCaption",

353 "colgroup": "inColumnGroup",

354 "table": "inTable",

355 "head": "inBody",

356 "body": "inBody",

357 "frameset": "inFrameset",

358 "html": "beforeHead"

359 }

360 for node in self.tree.openElements[::-1]:

361 nodeName = node.name

362 new_phase = None

363 if node == self.tree.openElements[0]:

364 assert self.innerHTML

365 last = True

366 nodeName = self.innerHTML

367 # Check for conditions that should only happen in the innerHTML

368 # case

369 if nodeName in ("select", "colgroup", "head", "html"):

370 assert self.innerHTML

371

372 if not last and node.namespace != self.tree.defaultNamespace:

373 continue

374

375 if nodeName in newModes:

376 new_phase = self.phases[newModes[nodeName]]

377 break

378 elif last:

379 new_phase = self.phases["inBody"]

380 break

381

382 self.phase = new_phase

383

384 def parseRCDataRawtext(self, token, contentType):

385 # Generic RCDATA/RAWTEXT Parsing algorithm

386 assert contentType in ("RAWTEXT", "RCDATA")

387

388 self.tree.insertElement(token)

389

390 if contentType == "RAWTEXT":

391 self.tokenizer.state = self.tokenizer.rawtextState

392 else:

393 self.tokenizer.state = self.tokenizer.rcdataState

394

395 self.originalPhase = self.phase

396

397 self.phase = self.phases["text"]

398

399

class Phase(object):
    """Base class for helper object that implements each phase of processing

    Subclasses are expected to provide ``startTagHandler`` and
    ``endTagHandler`` mappings from tag name to handler; the two caches
    below memoise lookups in those mappings per phase instance.
    """
    __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        self.__startTagCache = {}
        self.__endTagCache = {}

    def processEOF(self):
        """Handle end of input; every concrete phase must override this."""
        raise NotImplementedError

    def processComment(self, token):
        """Attach the comment to the current (innermost open) element."""
        # For most phases the following is correct. Where it's not it will be
        # overridden.
        current = self.tree.openElements[-1]
        self.tree.insertComment(token, current)

    def processDoctype(self, token):
        """Report a doctype token as a parse error."""
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        """Insert the token's character data as text."""
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        """Insert the token's whitespace as text."""
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        """Dispatch a start tag to its handler and return the handler's result."""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tag = token["name"]
        cache = self.__startTagCache
        handler = cache.get(tag)
        if handler is None:
            handler = cache[tag] = self.startTagHandler[tag]
            # bound the cache size in case we get loads of unknown tags
            while len(cache) > len(self.startTagHandler) * 1.1:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        return handler(token)

    def startTagHtml(self, token):
        """Merge attributes of an ``html`` start tag into the root element.

        Only attributes the root element does not already have are copied.
        """
        if not self.parser.firstStartTag and token["name"] == "html":
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        root_attributes = self.tree.openElements[0].attributes
        for attr, value in token["data"].items():
            if attr not in root_attributes:
                root_attributes[attr] = value
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        """Dispatch an end tag to its handler and return the handler's result."""
        # Note the caching is done here rather than BoundMethodDispatcher as doing it there
        # requires a circular reference to the Phase, and this ends up with a significant
        # (CPython 2.7, 3.8) GC cost when parsing many short inputs
        tag = token["name"]
        cache = self.__endTagCache
        handler = cache.get(tag)
        if handler is None:
            handler = cache[tag] = self.endTagHandler[tag]
            # bound the cache size in case we get loads of unknown tags
            while len(cache) > len(self.endTagHandler) * 1.1:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        return handler(token)

471

472

class InitialPhase(Phase):
    """Phase active before any doctype has been seen.

    A doctype token sets the parser's compatibility mode; any other
    non-whitespace, non-comment token forces quirks mode and is handed on
    to the "beforeHtml" phase for reprocessing.
    """
    __slots__ = ()

    # Public identifier prefixes (already lower-cased) that select quirks mode.
    _QUIRKS_PUBLIC_ID_PREFIXES = (
        "+//silmaril//dtd html pro v0r11 19970101//",
        "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
        "-//as//dtd html 3.0 aswedit + extensions//",
        "-//ietf//dtd html 2.0 level 1//",
        "-//ietf//dtd html 2.0 level 2//",
        "-//ietf//dtd html 2.0 strict level 1//",
        "-//ietf//dtd html 2.0 strict level 2//",
        "-//ietf//dtd html 2.0 strict//",
        "-//ietf//dtd html 2.0//",
        "-//ietf//dtd html 2.1e//",
        "-//ietf//dtd html 3.0//",
        "-//ietf//dtd html 3.2 final//",
        "-//ietf//dtd html 3.2//",
        "-//ietf//dtd html 3//",
        "-//ietf//dtd html level 0//",
        "-//ietf//dtd html level 1//",
        "-//ietf//dtd html level 2//",
        "-//ietf//dtd html level 3//",
        "-//ietf//dtd html strict level 0//",
        "-//ietf//dtd html strict level 1//",
        "-//ietf//dtd html strict level 2//",
        "-//ietf//dtd html strict level 3//",
        "-//ietf//dtd html strict//",
        "-//ietf//dtd html//",
        "-//metrius//dtd metrius presentational//",
        "-//microsoft//dtd internet explorer 2.0 html strict//",
        "-//microsoft//dtd internet explorer 2.0 html//",
        "-//microsoft//dtd internet explorer 2.0 tables//",
        "-//microsoft//dtd internet explorer 3.0 html strict//",
        "-//microsoft//dtd internet explorer 3.0 html//",
        "-//microsoft//dtd internet explorer 3.0 tables//",
        "-//netscape comm. corp.//dtd html//",
        "-//netscape comm. corp.//dtd strict html//",
        "-//o'reilly and associates//dtd html 2.0//",
        "-//o'reilly and associates//dtd html extended 1.0//",
        "-//o'reilly and associates//dtd html extended relaxed 1.0//",
        "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
        "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
        "-//spyglass//dtd html 2.0 extended//",
        "-//sq//dtd html 2.0 hotmetal + extensions//",
        "-//sun microsystems corp.//dtd hotjava html//",
        "-//sun microsystems corp.//dtd hotjava strict html//",
        "-//w3c//dtd html 3 1995-03-24//",
        "-//w3c//dtd html 3.2 draft//",
        "-//w3c//dtd html 3.2 final//",
        "-//w3c//dtd html 3.2//",
        "-//w3c//dtd html 3.2s draft//",
        "-//w3c//dtd html 4.0 frameset//",
        "-//w3c//dtd html 4.0 transitional//",
        "-//w3c//dtd html experimental 19960712//",
        "-//w3c//dtd html experimental 970421//",
        "-//w3c//dtd w3 html//",
        "-//w3o//dtd w3 html 3.0//",
        "-//webtechs//dtd mozilla html 2.0//",
        "-//webtechs//dtd mozilla html//",
    )
    # Public identifiers that select quirks mode on an exact match.
    _QUIRKS_PUBLIC_IDS = (
        "-//w3o//dtd w3 html strict 3.0//en//",
        "-/w3c/dtd html 4.0 transitional/en",
        "html",
    )
    # Quirks when the system identifier is missing, limited quirks otherwise.
    _HTML401_LOOSE_PREFIXES = (
        "-//w3c//dtd html 4.01 frameset//",
        "-//w3c//dtd html 4.01 transitional//",
    )
    # Always limited quirks.
    _XHTML10_LOOSE_PREFIXES = (
        "-//w3c//dtd xhtml 1.0 frameset//",
        "-//w3c//dtd xhtml 1.0 transitional//",
    )
    # System identifier that selects quirks mode (compared lower-cased).
    _QUIRKS_SYSTEM_ID = "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"

    def processSpaceCharacters(self, token):
        """Whitespace before the doctype is dropped."""
        pass

    def processComment(self, token):
        """Comments seen here are attached to the document itself."""
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        """Insert the doctype, pick the compat mode, move to "beforeHtml"."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        system_id_acceptable = systemId is None or systemId == "about:legacy-compat"
        if name != "html" or publicId is not None or not system_id_acceptable:
            self.parser.parseError("unknown-doctype")

        # Normalise the public identifier to a lower-cased string for matching.
        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        if publicId:
            publicId = publicId.translate(asciiUpper2Lower)

        html401_loose = publicId.startswith(self._HTML401_LOOSE_PREFIXES)
        quirks = (
            not correct
            or name != "html"
            or publicId.startswith(self._QUIRKS_PUBLIC_ID_PREFIXES)
            or publicId in self._QUIRKS_PUBLIC_IDS
            or (html401_loose and systemId is None)
            or (systemId and systemId.lower() == self._QUIRKS_SYSTEM_ID)
        )
        if quirks:
            self.parser.compatMode = "quirks"
        elif (publicId.startswith(self._XHTML10_LOOSE_PREFIXES)
                or (html401_loose and systemId is not None)):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        """No doctype was found: force quirks mode and move to "beforeHtml"."""
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        """Report the missing doctype and return the token for reprocessing."""
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        return token

    def processStartTag(self, token):
        """Report the missing doctype and return the token for reprocessing."""
        self.parser.parseError("expected-doctype-but-got-start-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEndTag(self, token):
        """Report the missing doctype and return the token for reprocessing."""
        self.parser.parseError("expected-doctype-but-got-end-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEOF(self):
        """Report the missing doctype and ask for EOF to be reprocessed."""
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        return True

602

603

class BeforeHtmlPhase(Phase):
    """Phase active until the root ``html`` element has been created."""
    __slots__ = ()

    # helper methods
    def insertHtmlElement(self):
        """Insert an implied ``html`` root and switch to "beforeHead"."""
        root_token = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(root_token)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        """Create the root element and ask for EOF to be reprocessed."""
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        """Comments seen here are attached to the document itself."""
        self.tree.insertComment(token, self.tree.document)

    def processSpaceCharacters(self, token):
        """Whitespace before the root element is dropped."""
        pass

    def processCharacters(self, token):
        """Create the root element and return the token for reprocessing."""
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        """Create the root element and return the token for reprocessing.

        An explicit ``html`` start tag is remembered on the parser via
        ``firstStartTag``.
        """
        if token["name"] == "html":
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        """Imply the root for head/body/html/br end tags; reject all others."""
        tag = token["name"]
        if tag in ("head", "body", "html", "br"):
            self.insertHtmlElement()
            return token
        # Any other end tag is reported and dropped (nothing is returned).
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": tag})

640

641

class BeforeHeadPhase(Phase):
    """Phase active between the root element and the ``head`` element."""
    __slots__ = ()

    def processEOF(self):
        """Imply a ``head`` element and ask for EOF to be reprocessed."""
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return True

    def processSpaceCharacters(self, token):
        """Whitespace before ``head`` is dropped."""
        pass

    def processCharacters(self, token):
        """Imply a ``head`` element and return the token for reprocessing."""
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return token

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        """Insert ``head``, remember it as the head pointer, go to "inHead"."""
        tree = self.tree
        tree.insertElement(token)
        tree.headPointer = tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        """Imply a ``head`` element and return the token for reprocessing."""
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return token

    def endTagImplyHead(self, token):
        """Imply a ``head`` element and return the token for reprocessing."""
        self.startTagHead(impliedTagToken("head", "StartTag"))
        return token

    def endTagOther(self, token):
        """Report and drop any other end tag."""
        self.parser.parseError("end-tag-after-implied-root",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("head", "body", "html", "br"), endTagImplyHead)
    ])
    endTagHandler.default = endTagOther

686

687

class InHeadPhase(Phase):
    """Phase active while the ``head`` element is the insertion point."""
    __slots__ = ()

    # the real thing
    def processEOF(self):
        """Close ``head`` and ask for EOF to be reprocessed."""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Close ``head`` and return the token for reprocessing."""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        """A second ``head`` start tag is reported and dropped."""
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        """Insert the element, pop it straight away, acknowledge self-closing."""
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        """Insert ``meta`` and, while the encoding is tentative, act on it.

        The encoding is taken from a ``charset`` attribute if present,
        otherwise from ``content`` when ``http-equiv`` is ``content-type``.
        """
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        attributes = token["data"]
        if self.parser.tokenizer.stream.charEncoding[1] != "tentative":
            return

        if "charset" in attributes:
            self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
            return

        declares_content_type = (
            "content" in attributes and
            "http-equiv" in attributes and
            attributes["http-equiv"].lower() == "content-type"
        )
        if declares_content_type:
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            raw_content = attributes["content"].encode("utf-8")
            content_parser = _inputstream.ContentAttrParser(
                _inputstream.EncodingBytes(raw_content))
            codec = content_parser.parse()
            self.parser.tokenizer.stream.changeEncoding(codec)

    def startTagTitle(self, token):
        """Parse ``title`` content as RCDATA."""
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        """Parse ``noframes`` / ``style`` content as RAWTEXT."""
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        """Handle ``noscript`` according to the parser's scripting flag."""
        if not self.parser.scripting:
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inHeadNoscript"]
        else:
            self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagScript(self, token):
        """Insert ``script`` and switch tokenizer and parser to script text."""
        parser = self.parser
        self.tree.insertElement(token)
        parser.tokenizer.state = parser.tokenizer.scriptDataState
        parser.originalPhase = parser.phase
        parser.phase = parser.phases["text"]

    def startTagOther(self, token):
        """Close ``head`` and return the token for reprocessing."""
        self.anythingElse()
        return token

    def endTagHead(self, token):
        """Pop the ``head`` element and move to "afterHead"."""
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "head", "Expected head got %s" % popped.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        """Close ``head`` and return the token for reprocessing."""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report and drop any other end tag."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a ``head`` end tag had been seen."""
        self.endTagHead(impliedTagToken("head"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("title", startTagTitle),
        (("noframes", "style"), startTagNoFramesStyle),
        ("noscript", startTagNoscript),
        ("script", startTagScript),
        (("base", "basefont", "bgsound", "command", "link"),
         startTagBaseLinkCommand),
        ("meta", startTagMeta),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("head", endTagHead),
        (("br", "html", "body"), endTagHtmlBodyBr)
    ])
    endTagHandler.default = endTagOther

789

790

class InHeadNoscriptPhase(Phase):
    """Phase active inside a ``noscript`` element within ``head``."""
    __slots__ = ()

    def processEOF(self):
        """Report the error, close ``noscript``, ask for EOF reprocessing."""
        self.parser.parseError("eof-in-head-noscript")
        self.anythingElse()
        return True

    def processComment(self, token):
        """Delegate comments to the "inHead" phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processComment(token)

    def processCharacters(self, token):
        """Report the error, close ``noscript``, return the token to reprocess."""
        self.parser.parseError("char-in-head-noscript")
        self.anythingElse()
        return token

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the "inHead" phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processSpaceCharacters(token)

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        """Delegate head-content start tags to the "inHead" phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagHeadNoscript(self, token):
        """Report and drop a nested ``head`` or ``noscript`` start tag."""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Report the error, close ``noscript``, return the token to reprocess."""
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagNoscript(self, token):
        """Pop the ``noscript`` element and return to "inHead"."""
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "noscript", "Expected noscript got %s" % popped.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        """Report the error, close ``noscript``, return the token to reprocess."""
        self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report and drop any other end tag."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Act as if a ``noscript`` end tag had been seen."""
        # Caller must raise parse error first!
        self.endTagNoscript(impliedTagToken("noscript"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
        (("head", "noscript"), startTagHeadNoscript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("noscript", endTagNoscript),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther

853

854

class AfterHeadPhase(Phase):
    """Phase active after ``head`` has closed and before ``body`` opens."""
    __slots__ = ()

    def processEOF(self):
        """Imply a ``body`` element and ask for EOF to be reprocessed."""
        self.anythingElse()
        return True

    def processCharacters(self, token):
        """Imply a ``body`` element and return the token for reprocessing."""
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the "inBody" phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBody(self, token):
        """Insert an explicit ``body``, clear framesetOK, go to "inBody"."""
        parser = self.parser
        parser.framesetOK = False
        self.tree.insertElement(token)
        parser.phase = parser.phases["inBody"]

    def startTagFrameset(self, token):
        """Insert ``frameset`` and go to "inFrameset"."""
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        """Process head-only content that turned up after ``head`` closed.

        The saved head element is pushed back on the stack, the token is
        handled by the "inHead" phase, and the innermost element named
        ``head`` is then removed from the stack again.
        """
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        self.tree.openElements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        innermost_head = next(
            (node for node in self.tree.openElements[::-1]
             if node.name == "head"),
            None)
        if innermost_head is not None:
            self.tree.openElements.remove(innermost_head)

    def startTagHead(self, token):
        """Report and drop a stray ``head`` start tag."""
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        """Imply a ``body`` element and return the token for reprocessing."""
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        """Imply a ``body`` element and return the token for reprocessing."""
        self.anythingElse()
        return token

    def endTagOther(self, token):
        """Report and drop any other end tag."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        """Insert an implied ``body``, go to "inBody", set framesetOK."""
        body_token = impliedTagToken("body", "StartTag")
        self.tree.insertElement(body_token)
        self.parser.phase = self.parser.phases["inBody"]
        self.parser.framesetOK = True

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"),
         startTagFromHead),
        ("head", startTagHead)
    ])
    startTagHandler.default = startTagOther
    endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
                                              endTagHtmlBodyBr)])
    endTagHandler.default = endTagOther

920

921

922class InBodyPhase(Phase):

923 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody

924 # the really-really-really-very crazy mode

925 __slots__ = ("processSpaceCharacters",)

926

927 def __init__(self, *args, **kwargs):

928 super(InBodyPhase, self).__init__(*args, **kwargs)

929 # Set this to the default handler

930 self.processSpaceCharacters = self.processSpaceCharactersNonPre

931

932 def isMatchingFormattingElement(self, node1, node2):

933 return (node1.name == node2.name and

934 node1.namespace == node2.namespace and

935 node1.attributes == node2.attributes)

936

937 # helper

def addFormattingElement(self, token):
    """Insert an element for *token* and record it as an active formatting element.

    Entries after the last Marker that match the new element (same name,
    namespace and attributes) are collected; when three already exist, the
    earliest of them is removed before the new element is appended.
    """
    tree = self.tree
    tree.insertElement(token)
    newElement = tree.openElements[-1]

    duplicates = []
    # Walk from the newest entry backwards, stopping at the scope marker.
    for candidate in reversed(tree.activeFormattingElements):
        if candidate is Marker:
            break
        if self.isMatchingFormattingElement(candidate, newElement):
            duplicates.append(candidate)

    duplicateCount = len(duplicates)
    assert duplicateCount <= 3
    if duplicateCount == 3:
        # duplicates was built newest-first, so the last item is the oldest.
        tree.activeFormattingElements.remove(duplicates[-1])
    tree.activeFormattingElements.append(newElement)

953

954 # the real deal

def processEOF(self):
    """Emit a single parse error at EOF if any open element is one that
    may not be left unclosed; returns None (stop parsing)."""
    may_stay_open = frozenset((
        "dd", "dt", "li", "p", "tbody", "td", "tfoot", "th", "thead", "tr",
        "body", "html",
    ))
    stack = self.tree.openElements
    # Scan from the current node outwards; one offending node is enough.
    if any(node.name not in may_stay_open for node in reversed(stack)):
        self.parser.parseError("expected-closing-tag-but-got-eof")

964

def processSpaceCharactersDropNewline(self, token):
    """One-shot whitespace handler that drops a leading newline.

    The handler always restores ``processSpaceCharactersNonPre``. The first
    "\\n" is stripped only when the current node is an empty <pre>,
    <listing> or <textarea>; any remaining text is inserted normally.
    """
    text = token["data"]
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if text.startswith("\n"):
        current = self.tree.openElements[-1]
        if (current.name in ("pre", "listing", "textarea") and
                not current.hasContent()):
            text = text[1:]
    if not text:
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(text)

977

def processCharacters(self, token):
    """Insert character data, clearing ``framesetOK`` on non-space text.

    A token consisting solely of U+0000 is ignored.
    """
    text = token["data"]
    if text == "\u0000":
        # The tokenizer should always emit null on its own
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(text)
    # Only scan the text while the flag can still change.
    if self.parser.framesetOK:
        if not all(char in spaceCharacters for char in text):
            self.parser.framesetOK = False

989

def processSpaceCharactersNonPre(self, token):
    """Default whitespace handler: reconstruct formatting, then insert the text."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertText(token["data"])

993

def startTagProcessInHead(self, token):
    """Delegate the start tag to the "inHead" phase and return its result."""
    head_phase = self.parser.phases["inHead"]
    return head_phase.processStartTag(token)

996

def startTagBody(self, token):
    """Handle a repeated ``<body>``: always a parse error.

    When the second open element is a body, ``framesetOK`` is cleared and
    attributes the existing body lacks are copied from the token.
    Otherwise the token is ignored (asserted to be the innerHTML case).
    """
    self.parser.parseError("unexpected-start-tag", {"name": "body"})
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        assert self.parser.innerHTML
        return
    self.parser.framesetOK = False
    for attr, value in token["data"].items():
        # Existing attributes win; only missing ones are added.
        if attr not in self.tree.openElements[1].attributes:
            self.tree.openElements[1].attributes[attr] = value

1007

def startTagFrameset(self, token):
    """Handle ``<frameset>`` in body: a parse error, honoured only if allowed.

    If the second open element is a body and ``framesetOK`` is still set,
    that body is detached from its parent, the stack is popped back to
    <html>, the frameset is inserted and the phase becomes "inFrameset".
    """
    parser = self.parser
    parser.parseError("unexpected-start-tag", {"name": "frameset"})
    if (len(self.tree.openElements) == 1 or
            self.tree.openElements[1].name != "body"):
        assert parser.innerHTML
        return
    if not parser.framesetOK:
        return
    body = self.tree.openElements[1]
    if body.parent:
        body.parent.removeChild(body)
    while self.tree.openElements[-1].name != "html":
        self.tree.openElements.pop()
    self.tree.insertElement(token)
    parser.phase = parser.phases["inFrameset"]

1021

def startTagCloseP(self, token):
    """Close a <p> that is in button scope, then insert the new element."""
    tree = self.tree
    p_is_open = tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)

1026

def startTagPreListing(self, token):
    """Handle ``<pre>`` / ``<listing>``.

    Closes a <p> in button scope, inserts the element, clears
    ``framesetOK`` and arms the newline-dropping whitespace handler.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    self.parser.framesetOK = False
    # The next whitespace token goes through the drop-newline handler.
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline

1033

def startTagForm(self, token):
    """Handle ``<form>``: a parse error (token ignored) if a form pointer
    is already set; otherwise close a <p> in button scope, insert the form
    and remember it as the form pointer."""
    tree = self.tree
    if tree.formPointer:
        self.parser.parseError("unexpected-start-tag", {"name": "form"})
        return
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tree.formPointer = tree.openElements[-1]

1042

def startTagListItem(self, token):
    """Handle ``<li>``, ``<dd>`` and ``<dt>``.

    Walks the open-element stack from the current node outwards and
    implicitly ends the nearest sibling item (li closes li; dt/dd close
    dt/dd). The walk stops at any special element other than address, div
    or p. A <p> in button scope is then closed and the element inserted.
    """
    self.parser.framesetOK = False

    closes = {
        "li": ("li",),
        "dt": ("dt", "dd"),
        "dd": ("dt", "dd"),
    }[token["name"]]

    for node in reversed(self.tree.openElements):
        if node.name in closes:
            implied_end = impliedTagToken(node.name, "EndTag")
            self.parser.phase.processEndTag(implied_end)
            break
        is_barrier = (node.nameTuple in specialElements and
                      node.name not in ("address", "div", "p"))
        if is_barrier:
            break

    if self.tree.elementInScope("p", variant="button"):
        self.parser.phase.processEndTag(impliedTagToken("p", "EndTag"))

    self.tree.insertElement(token)

1064

def startTagPlaintext(self, token):
    """Handle ``<plaintext>``: close a <p> in button scope, insert the
    element and switch the tokenizer to its plaintext state."""
    if self.tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.plaintextState

1070

def startTagHeading(self, token):
    """Handle a heading start tag.

    Closes a <p> in button scope; if the current node is itself a heading
    it is popped with a parse error before the new heading is inserted.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    current = tree.openElements[-1]
    if current.name in headingElements:
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
        tree.openElements.pop()
    tree.insertElement(token)

1078

def startTagA(self, token):
    """Handle ``<a>``.

    If an <a> is already among the active formatting elements, it is
    closed via ``endTagFormatting`` (with a parse error) and any leftover
    reference is removed from both lists before the new <a> is added.
    """
    tree = self.tree
    openAnchor = tree.elementInActiveFormattingElements("a")
    if openAnchor:
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "a", "endName": "a"})
        self.endTagFormatting(impliedTagToken("a"))
        # endTagFormatting may leave the old element behind; purge it.
        for holder in (tree.openElements, tree.activeFormattingElements):
            if openAnchor in holder:
                holder.remove(openAnchor)
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)

1091

def startTagFormatting(self, token):
    """Reconstruct active formatting elements, then add *token* as one."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)

1095

def startTagNobr(self, token):
    """Handle ``<nobr>``; an in-scope <nobr> is closed first with a parse error."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    if tree.elementInScope("nobr"):
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "nobr", "endName": "nobr"})
        self.processEndTag(impliedTagToken("nobr"))
        # XXX Need tests that trigger the following
        tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)

1105

def startTagButton(self, token):
    """Handle ``<button>``.

    With a button already in scope: parse error, close it and return the
    token for reprocessing. Otherwise insert the element and clear
    ``framesetOK`` (returns None).
    """
    if not self.tree.elementInScope("button"):
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        return None
    self.parser.parseError("unexpected-start-tag-implies-end-tag",
                           {"startName": "button", "endName": "button"})
    self.processEndTag(impliedTagToken("button"))
    return token

1116

def startTagAppletMarqueeObject(self, token):
    """Insert the element, push a Marker onto the active formatting
    elements and clear ``framesetOK``."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.activeFormattingElements.append(Marker)
    self.parser.framesetOK = False

1122

def startTagXmp(self, token):
    """Handle ``<xmp>``: close a <p> in button scope, clear ``framesetOK``
    and parse the content as RAWTEXT."""
    tree, parser = self.tree, self.parser
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.reconstructActiveFormattingElements()
    parser.framesetOK = False
    parser.parseRCDataRawtext(token, "RAWTEXT")

1129

def startTagTable(self, token):
    """Handle ``<table>``: outside quirks mode an in-scope <p> is closed
    first; then insert the table and switch to the "inTable" phase."""
    parser = self.parser
    not_quirks = parser.compatMode != "quirks"
    if not_quirks and self.tree.elementInScope("p", variant="button"):
        self.processEndTag(impliedTagToken("p"))
    self.tree.insertElement(token)
    parser.framesetOK = False
    parser.phase = parser.phases["inTable"]

1137

def startTagVoidFormatting(self, token):
    """Insert a void element and pop it straight off the stack.

    The token's self-closing flag is acknowledged and ``framesetOK``
    is cleared.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False

1144

def startTagInput(self, token):
    """Handle ``<input>`` like other void elements, except that a
    ``type=hidden`` input (ASCII case-insensitive) leaves ``framesetOK``
    as it was."""
    savedFramesetOK = self.parser.framesetOK
    self.startTagVoidFormatting(token)
    attrs = token["data"]
    if "type" not in attrs:
        return
    if attrs["type"].translate(asciiUpper2Lower) == "hidden":
        # input type=hidden doesn't change framesetOK
        self.parser.framesetOK = savedFramesetOK

1152

def startTagParamSource(self, token):
    """Insert a void ``param``/``source``/``track`` element, pop it and
    acknowledge the self-closing flag."""
    tree = self.tree
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

1157

def startTagHr(self, token):
    """Handle ``<hr>``: close a <p> in button scope, insert and pop the
    element, acknowledge self-closing and clear ``framesetOK``."""
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False

1165

def startTagImage(self, token):
    """Treat ``<image>`` as ``<img>``: report a parse error, then process
    an implied img start tag carrying the same attributes and
    self-closing flag."""
    # No really...
    self.parser.parseError("unexpected-start-tag-treated-as",
                           {"originalName": "image", "newName": "img"})
    imgToken = impliedTagToken("img", "StartTag",
                               attributes=token["data"],
                               selfClosing=token["selfClosing"])
    self.processStartTag(imgToken)

1173

def startTagIsIndex(self, token):
    """Expand the deprecated ``<isindex>`` into form/hr/label/input markup.

    Always reports "deprecated-tag". Nothing else happens when a form
    pointer is already set. Otherwise a sequence of implied tokens is
    processed: <form> (carrying the ``action`` attribute if present), <hr>,
    <label>, the prompt text, an <input name="isindex"> with the remaining
    attributes, </label>, <hr> and </form>.
    """
    self.parser.parseError("deprecated-tag", {"name": "isindex"})
    if self.tree.formPointer:
        return

    source = token["data"]
    formAttributes = {}
    if "action" in source:
        formAttributes["action"] = source["action"]
    self.processStartTag(impliedTagToken("form", "StartTag",
                                         attributes=formAttributes))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processStartTag(impliedTagToken("label", "StartTag"))

    # XXX Localization ...
    if "prompt" in token["data"]:
        promptText = token["data"]["prompt"]
    else:
        promptText = "This is a searchable index. Enter search keywords: "
    self.processCharacters({"type": tokenTypes["Characters"],
                            "data": promptText})

    # The input gets every attribute except the two consumed above.
    inputAttributes = token["data"].copy()
    for consumed in ("action", "prompt"):
        if consumed in inputAttributes:
            del inputAttributes[consumed]
    inputAttributes["name"] = "isindex"
    self.processStartTag(impliedTagToken("input", "StartTag",
                                         attributes=inputAttributes,
                                         selfClosing=token["selfClosing"]))

    self.processEndTag(impliedTagToken("label"))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processEndTag(impliedTagToken("form"))

1204

def startTagTextarea(self, token):
    """Insert ``<textarea>``, put the tokenizer in its RCDATA state, arm
    the newline-dropping whitespace handler and clear ``framesetOK``."""
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.rcdataState
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
    self.parser.framesetOK = False

1210

def startTagIFrame(self, token):
    """Clear ``framesetOK`` and then treat ``<iframe>`` as a RAWTEXT element."""
    parser = self.parser
    parser.framesetOK = False
    self.startTagRawtext(token)

1214

def startTagNoscript(self, token):
    """Handle ``<noscript>``: RAWTEXT when the parser has scripting
    enabled, an ordinary element otherwise."""
    if not self.parser.scripting:
        self.startTagOther(token)
        return
    self.startTagRawtext(token)

1220

def startTagRawtext(self, token):
    """Parse the element's content as RAWTEXT.

    Used for iframe, noembed, noframes and noscript (if scripting enabled).
    """
    content_model = "RAWTEXT"
    self.parser.parseRCDataRawtext(token, content_model)

1224

def startTagOpt(self, token):
    """Handle ``<option>`` / ``<optgroup>``: implicitly end an <option>
    that is the current node, then insert the new element."""
    current = self.tree.openElements[-1]
    if current.name == "option":
        self.parser.phase.processEndTag(impliedTagToken("option"))
    self.tree.reconstructActiveFormattingElements()
    # Insertion goes through the parser's tree reference, as before.
    self.parser.tree.insertElement(token)

1230

def startTagSelect(self, token):
    """Insert ``<select>`` and pick the follow-up phase.

    "inSelectInTable" is chosen when the parser is currently in one of
    the table-related phases, "inSelect" otherwise.
    """
    parser = self.parser
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertElement(token)
    parser.framesetOK = False

    tablePhaseNames = ("inTable", "inCaption", "inColumnGroup",
                       "inTableBody", "inRow", "inCell")
    tablePhases = tuple([parser.phases[name] for name in tablePhaseNames])
    if parser.phase in tablePhases:
        nextPhase = "inSelectInTable"
    else:
        nextPhase = "inSelect"
    parser.phase = parser.phases[nextPhase]

1244

def startTagRpRt(self, token):
    """Handle ``<rp>`` / ``<rt>``: with a <ruby> in scope, implied end
    tags are generated and a parse error is raised unless <ruby> is then
    the current node. The element is inserted either way."""
    tree = self.tree
    if tree.elementInScope("ruby"):
        tree.generateImpliedEndTags()
        current = tree.openElements[-1]
        if current.name != "ruby":
            self.parser.parseError()
    tree.insertElement(token)

1251

def startTagMath(self, token):
    """Insert a ``<math>`` element in the MathML namespace.

    Attributes are adjusted first; a self-closing token is popped
    immediately and its flag acknowledged.
    """
    tree, parser = self.tree, self.parser
    tree.reconstructActiveFormattingElements()
    parser.adjustMathMLAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["mathml"]
    tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

1263

def startTagSvg(self, token):
    """Insert an ``<svg>`` element in the SVG namespace.

    Attributes are adjusted first; a self-closing token is popped
    immediately and its flag acknowledged.
    """
    tree, parser = self.tree, self.parser
    tree.reconstructActiveFormattingElements()
    parser.adjustSVGAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["svg"]
    tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

1275

def startTagMisplaced(self, token):
    """Ignore a start tag that belongs to a different insertion mode.

    Registered for tags such as caption, col, colgroup, frame, head,
    tbody, td, tfoot, th, thead and tr; the only effect is a parse error.
    """
    ignored = {"name": token["name"]}
    self.parser.parseError("unexpected-start-tag-ignored", ignored)

1284

def startTagOther(self, token):
    """Default start-tag handling: reconstruct formatting, insert the element."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)

1288

def endTagP(self, token):
    """Handle ``</p>``.

    With a <p> in button scope: generate implied end tags (except p),
    report a parse error if the current node is not the <p>, and pop up to
    and including it. Without one: synthesise an empty <p>, report the
    error and close it again.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        tree.generateImpliedEndTags("p")
        if tree.openElements[-1].name != "p":
            self.parser.parseError("unexpected-end-tag", {"name": "p"})
        while tree.openElements.pop().name != "p":
            pass
        return
    self.startTagCloseP(impliedTagToken("p", "StartTag"))
    self.parser.parseError("unexpected-end-tag", {"name": "p"})
    self.endTagP(impliedTagToken("p", "EndTag"))

1301

def endTagBody(self, token):
    """Handle ``</body>``.

    Without a body in scope the token is ignored with a parse error.
    Otherwise, if the current node is not the body, the first open element
    (from the third stack entry on) that is not allowed to remain open is
    reported, and the parser moves to the "afterBody" phase.

    :arg token: the end tag token (its contents are not inspected)
    """
    if not self.tree.elementInScope("body"):
        self.parser.parseError()
        return
    if self.tree.openElements[-1].name != "body":
        # Built once per call instead of once per inspected node.
        may_stay_open = frozenset(("dd", "dt", "li", "optgroup",
                                   "option", "p", "rp", "rt",
                                   "tbody", "td", "tfoot",
                                   "th", "thead", "tr", "body",
                                   "html"))
        for node in self.tree.openElements[2:]:
            if node.name not in may_stay_open:
                # Not sure this is the correct name for the parse error
                self.parser.parseError(
                    "expected-one-end-tag-but-got-another",
                    {"gotName": "body", "expectedName": node.name})
                break
    self.parser.phase = self.parser.phases["afterBody"]

1319

def endTagHtml(self, token):
    """Handle ``</html>``: act as if ``</body>`` had been seen and return
    the token for reprocessing; ignored (returns None) when no body is in
    scope."""
    # We repeat the test for the body end tag token being ignored here
    if not self.tree.elementInScope("body"):
        return None
    self.endTagBody(impliedTagToken("body"))
    return token

1325

def endTagBlock(self, token):
    """Handle the end tag of a block-level element.

    Implied end tags are generated when the element is in scope; a parse
    error is reported if the current node is not that element; and, when
    in scope, the stack is popped up to and including it.
    """
    name = token["name"]
    tree = self.tree
    # Put us back in the right whitespace handling mode
    if name == "pre":
        self.processSpaceCharacters = self.processSpaceCharactersNonPre
    found = tree.elementInScope(name)
    if found:
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != name:
        self.parser.parseError("end-tag-too-early", {"name": name})
    if not found:
        return
    while tree.openElements.pop().name != name:
        pass

1339

def endTagForm(self, token):
    """Handle ``</form>``.

    The form pointer is always cleared. If it was unset or the form is
    not in scope, the tag is an error and ignored. Otherwise implied end
    tags are generated and the form is removed from the stack, with a
    parse error when it was not the current node.
    """
    tree = self.tree
    form = tree.formPointer
    tree.formPointer = None
    if form is None or not tree.elementInScope(form):
        self.parser.parseError("unexpected-end-tag",
                               {"name": "form"})
        return
    tree.generateImpliedEndTags()
    if tree.openElements[-1] != form:
        self.parser.parseError("end-tag-too-early-ignored",
                               {"name": "form"})
    tree.openElements.remove(form)

1352

def endTagListItem(self, token):
    """Handle ``</li>``, ``</dd>`` and ``</dt>``.

    ``li`` is looked up in list scope, the others in the default scope.
    An element not in scope is a parse error and the tag is ignored.
    Otherwise implied end tags (excluding this name) are generated and the
    stack is popped through the element.
    """
    name = token["name"]
    tree = self.tree
    variant = "list" if name == "li" else None
    if not tree.elementInScope(name, variant=variant):
        self.parser.parseError("unexpected-end-tag", {"name": name})
        return
    tree.generateImpliedEndTags(exclude=name)
    if tree.openElements[-1].name != name:
        self.parser.parseError(
            "end-tag-too-early",
            {"name": name})
    while tree.openElements.pop().name != name:
        pass

1369

def endTagHeading(self, token):
    """Handle a heading end tag.

    If any heading element is in scope, implied end tags are generated.
    A parse error is reported when the current node's name differs from
    the token's. Then, if any heading is (still) in scope, the stack is
    popped up to and including the nearest heading element.
    """
    tree = self.tree

    if any(tree.elementInScope(heading) for heading in headingElements):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != token["name"]:
        self.parser.parseError("end-tag-too-early", {"name": token["name"]})

    # Re-checked on purpose: the stack may have changed above.
    if any(tree.elementInScope(heading) for heading in headingElements):
        while tree.openElements.pop().name not in headingElements:
            pass

1384

def endTagFormatting(self, token):
    """Run the adoption agency algorithm for a formatting end tag.

    The outer loop runs at most eight times; each pass either finishes
    the algorithm (returning) or re-parents one furthest block under a
    clone of the formatting element.
    """
    # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
    # XXX Better parseError messages appreciated.
    tree = self.tree
    parser = self.parser
    tagName = token["name"]

    # Steps 1-3: bounded outer loop.
    for _outerPass in range(8):

        # Step 4: the formatting element is the last entry in the list
        # of active formatting elements, after the last marker, that has
        # the token's tag name.
        fmtElement = tree.elementInActiveFormattingElements(tagName)
        if (not fmtElement or
                (fmtElement in tree.openElements and
                 not tree.elementInScope(fmtElement.name))):
            # No usable formatting element: behave like "any other end
            # tag".
            self.endTagOther(token)
            return

        if fmtElement not in tree.openElements:
            # Listed as active but no longer open: parse error; drop it
            # from the list and stop.
            parser.parseError("adoption-agency-1.2", {"name": tagName})
            tree.activeFormattingElements.remove(fmtElement)
            return

        if not tree.elementInScope(fmtElement.name):
            # Open but not in scope: parse error; ignore the token.
            parser.parseError("adoption-agency-4.4", {"name": tagName})
            return

        # Open and in scope; not being the current node is only a parse
        # error, the algorithm continues regardless.
        if fmtElement != tree.openElements[-1]:
            parser.parseError("adoption-agency-1.3", {"name": tagName})

        # Step 5: the furthest block is the first special-category
        # element found from the formatting element's stack position
        # towards the current node. There might not be one.
        fmtIndex = tree.openElements.index(fmtElement)
        furthest = next(
            (candidate for candidate in tree.openElements[fmtIndex:]
             if candidate.nameTuple in specialElements),
            None)

        # Step 6: without a furthest block, pop through the formatting
        # element, remove it from the active list and finish.
        if furthest is None:
            popped = tree.openElements.pop()
            while popped != fmtElement:
                popped = tree.openElements.pop()
            tree.activeFormattingElements.remove(popped)
            return

        # Step 7
        ancestor = tree.openElements[fmtIndex - 1]

        # Step 8: the bookmark records where the replacement is inserted
        # into the active formatting list in step 14. It can move in
        # step 9.7.
        bookmark = tree.activeFormattingElements.index(fmtElement)

        # Step 9
        lastNode = node = furthest
        position = tree.openElements.index(node)
        for _innerPass in range(3):
            # Node is element before node in open elements
            position -= 1
            node = tree.openElements[position]
            if node not in tree.activeFormattingElements:
                tree.openElements.remove(node)
                continue
            # Step 9.6
            if node == fmtElement:
                break
            # Step 9.7
            if lastNode == furthest:
                bookmark = tree.activeFormattingElements.index(node) + 1
            # Step 9.8: replace node with a clone in both lists.
            replacement = node.cloneNode()
            activeSlot = tree.activeFormattingElements.index(node)
            tree.activeFormattingElements[activeSlot] = replacement
            openSlot = tree.openElements.index(node)
            tree.openElements[openSlot] = replacement
            node = replacement
            # Step 9.9: remove lastNode from its parent, if any.
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)
            node.appendChild(lastNode)
            # Step 9.10
            lastNode = node

        # Step 10: foster-parent lastNode when the common ancestor is a
        # table, tbody, tfoot, thead or tr; otherwise append it there.
        if lastNode.parent:
            lastNode.parent.removeChild(lastNode)

        if ancestor.name in ("table", "tbody", "tfoot", "thead", "tr"):
            fosterParent, beforeNode = tree.getTableMisnestedNodePosition()
            fosterParent.insertBefore(lastNode, beforeNode)
        else:
            ancestor.appendChild(lastNode)

        # Step 11
        fmtClone = fmtElement.cloneNode()

        # Step 12
        furthest.reparentChildren(fmtClone)

        # Step 13
        furthest.appendChild(fmtClone)

        # Step 14
        tree.activeFormattingElements.remove(fmtElement)
        tree.activeFormattingElements.insert(bookmark, fmtClone)

        # Step 15
        tree.openElements.remove(fmtElement)
        tree.openElements.insert(
            tree.openElements.index(furthest) + 1, fmtClone)

1547

def endTagAppletMarqueeObject(self, token):
    """Handle ``</applet>``, ``</marquee>`` and ``</object>``.

    Implied end tags are generated when the element is in scope; a parse
    error is reported if it is not the current node; when in scope the
    stack is popped through it and the active formatting elements are
    cleared.
    """
    name = token["name"]
    tree = self.tree
    if tree.elementInScope(name):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != name:
        self.parser.parseError("end-tag-too-early", {"name": name})

    if not tree.elementInScope(name):
        return
    while tree.openElements.pop().name != name:
        pass
    tree.clearActiveFormattingElements()

1559

def endTagBr(self, token):
    """Treat ``</br>`` as a ``<br>`` element: parse error, then insert an
    implied br start tag and pop it again."""
    self.parser.parseError("unexpected-end-tag-treated-as",
                           {"originalName": "br", "newName": "br element"})
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    impliedBr = impliedTagToken("br", "StartTag")
    tree.insertElement(impliedBr)
    tree.openElements.pop()

1566

def endTagOther(self, token):
    """Handle any other end tag.

    Searches the stack from the current node outwards for an element with
    the token's name. If found, implied end tags are generated and the
    stack is popped through that element. Hitting a special element first
    is a parse error and the token is ignored.
    """
    name = token["name"]
    tree = self.tree
    for node in reversed(tree.openElements):
        if node.name != name:
            if node.nameTuple in specialElements:
                self.parser.parseError("unexpected-end-tag", {"name": name})
                break
            continue
        # Found the matching element; the loop ends after this branch, so
        # mutating the stack here is safe.
        tree.generateImpliedEndTags(exclude=name)
        if tree.openElements[-1].name != name:
            self.parser.parseError("unexpected-end-tag", {"name": name})
        while tree.openElements.pop() != node:
            pass
        break

1580

# Start-tag dispatch for the "in body" phase: tag name(s) -> handler.
# Names not listed fall through to startTagOther.
startTagHandler = _utils.MethodDispatcher([
    ("html", Phase.startTagHtml),
    (
        ("base", "basefont", "bgsound", "command", "link", "meta",
         "script", "style", "title"),
        startTagProcessInHead,
    ),
    ("body", startTagBody),
    ("frameset", startTagFrameset),
    (
        ("address", "article", "aside", "blockquote", "center", "details",
         "dir", "div", "dl", "fieldset", "figcaption", "figure",
         "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
         "section", "summary", "ul"),
        startTagCloseP,
    ),
    (headingElements, startTagHeading),
    (("pre", "listing"), startTagPreListing),
    ("form", startTagForm),
    (("li", "dd", "dt"), startTagListItem),
    ("plaintext", startTagPlaintext),
    ("a", startTagA),
    (
        ("b", "big", "code", "em", "font", "i", "s", "small", "strike",
         "strong", "tt", "u"),
        startTagFormatting,
    ),
    ("nobr", startTagNobr),
    ("button", startTagButton),
    (("applet", "marquee", "object"), startTagAppletMarqueeObject),
    ("xmp", startTagXmp),
    ("table", startTagTable),
    (
        ("area", "br", "embed", "img", "keygen", "wbr"),
        startTagVoidFormatting,
    ),
    (("param", "source", "track"), startTagParamSource),
    ("input", startTagInput),
    ("hr", startTagHr),
    ("image", startTagImage),
    ("isindex", startTagIsIndex),
    ("textarea", startTagTextarea),
    ("iframe", startTagIFrame),
    ("noscript", startTagNoscript),
    (("noembed", "noframes"), startTagRawtext),
    ("select", startTagSelect),
    (("rp", "rt"), startTagRpRt),
    (("option", "optgroup"), startTagOpt),
    ("math", startTagMath),
    ("svg", startTagSvg),
    (
        ("caption", "col", "colgroup", "frame", "head",
         "tbody", "td", "tfoot", "th", "thead",
         "tr"),
        startTagMisplaced,
    ),
])
startTagHandler.default = startTagOther

1627

# End-tag dispatch for the "in body" phase: tag name(s) -> handler.
# Names not listed fall through to endTagOther.
endTagHandler = _utils.MethodDispatcher([
    ("body", endTagBody),
    ("html", endTagHtml),
    (
        ("address", "article", "aside", "blockquote", "button", "center",
         "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption",
         "figure", "footer", "header", "hgroup", "listing", "main", "menu",
         "nav", "ol", "pre", "section", "summary", "ul"),
        endTagBlock,
    ),
    ("form", endTagForm),
    ("p", endTagP),
    (("dd", "dt", "li"), endTagListItem),
    (headingElements, endTagHeading),
    (
        ("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
         "strike", "strong", "tt", "u"),
        endTagFormatting,
    ),
    (("applet", "marquee", "object"), endTagAppletMarqueeObject),
    ("br", endTagBr),
])
endTagHandler.default = endTagOther

1645

1646

class TextPhase(Phase):
    """Phase used while inside RCDATA/RAWTEXT content.

    Characters are inserted verbatim; an end tag (or EOF) pops the
    current element and restores the parser's ``originalPhase``.
    """
    __slots__ = ()

    def processCharacters(self, token):
        """Insert the token's text into the tree."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Report the unclosed element, pop it, restore the original
        phase and return True."""
        unclosed = self.tree.openElements[-1]
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosed.name})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Start tags are not expected in this phase; this asserts."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Pop the current node (asserted to be a script) and restore the
        original phase."""
        popped = self.tree.openElements.pop()
        assert popped.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Pop the current node and restore the original phase."""
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase

    # No start tag has a dedicated handler; everything hits the default.
    startTagHandler = _utils.MethodDispatcher([])
    startTagHandler.default = startTagOther
    endTagHandler = _utils.MethodDispatcher([
        ("script", endTagScript),
    ])
    endTagHandler.default = endTagOther

1679

1680

1681class InTablePhase(Phase):

1682 # http://www.whatwg.org/specs/web-apps/current-work/#in-table

1683 __slots__ = tuple()

1684

1685 # helper methods

def clearStackToTableContext(self):
    """Pop open elements until the current node is a table or html element."""
    # "clear the stack back to a table context"
    stack = self.tree.openElements
    while True:
        if stack[-1].name in ("table", "html"):
            break
        stack.pop()
    # When the current node is <html> it's an innerHTML case

1693

1694 # processing methods

def processEOF(self):
    """At EOF report "eof-in-table" unless the current node is <html>
    (asserted to be the innerHTML case); returns None (stop parsing)."""
    current = self.tree.openElements[-1]
    if current.name == "html":
        assert self.parser.innerHTML
    else:
        self.parser.parseError("eof-in-table")
    # Stop parsing

1701

def processSpaceCharacters(self, token):
    """Switch to the "inTableText" phase, remembering the phase to return
    to, and let it process the whitespace token."""
    parser = self.parser
    returnPhase = parser.phase
    parser.phase = parser.phases["inTableText"]
    parser.phase.originalPhase = returnPhase
    parser.phase.processSpaceCharacters(token)

1707

def processCharacters(self, token):
    """Switch to the "inTableText" phase, remembering the phase to return
    to, and let it process the character token."""
    parser = self.parser
    returnPhase = parser.phase
    parser.phase = parser.phases["inTableText"]
    parser.phase.originalPhase = returnPhase
    parser.phase.processCharacters(token)

1713

def insertText(self, token):
    """Process the characters with the "inBody" rules while the tree's
    ``insertFromTable`` flag is set; the flag is cleared afterwards."""
    # If we get here there must be at least one non-whitespace character
    # Do the table magic!
    bodyPhase = self.parser.phases
    self.tree.insertFromTable = True
    bodyPhase["inBody"].processCharacters(token)
    self.tree.insertFromTable = False

1720

def startTagCaption(self, token):
    """Handle ``<caption>``: clear to table context, push a Marker, insert
    the element and switch to the "inCaption" phase."""
    parser = self.parser
    self.clearStackToTableContext()
    self.tree.activeFormattingElements.append(Marker)
    self.tree.insertElement(token)
    parser.phase = parser.phases["inCaption"]

1726

def startTagColgroup(self, token):
    """Handle ``<colgroup>``: clear to table context, insert the element
    and switch to the "inColumnGroup" phase."""
    parser = self.parser
    self.clearStackToTableContext()
    self.tree.insertElement(token)
    parser.phase = parser.phases["inColumnGroup"]

1731

def startTagCol(self, token):
    """Handle ``<col>``: open an implied <colgroup>, then return the token
    for reprocessing."""
    impliedColgroup = impliedTagToken("colgroup", "StartTag")
    self.startTagColgroup(impliedColgroup)
    return token

1735

def startTagRowGroup(self, token):
    """Handle ``<tbody>``/``<tfoot>``/``<thead>``: clear to table context,
    insert the element and switch to the "inTableBody" phase."""
    parser = self.parser
    self.clearStackToTableContext()
    self.tree.insertElement(token)
    parser.phase = parser.phases["inTableBody"]

1740

def startTagImplyTbody(self, token):
    """Handle ``<td>``/``<th>``/``<tr>``: open an implied <tbody>, then
    return the token for reprocessing."""
    impliedTbody = impliedTagToken("tbody", "StartTag")
    self.startTagRowGroup(impliedTbody)
    return token

1744

def startTagTable(self, token):
    """Handle a nested ``<table>``: parse error, process an implied
    ``</table>`` and, unless parsing innerHTML, return the token for
    reprocessing."""
    parser = self.parser
    parser.parseError("unexpected-start-tag-implies-end-tag",
                      {"startName": "table", "endName": "table"})
    parser.phase.processEndTag(impliedTagToken("table"))
    if parser.innerHTML:
        return None
    return token

1751

def startTagStyleScript(self, token):
    """Delegate ``<style>``/``<script>`` to the "inHead" phase and return
    its result."""
    headPhase = self.parser.phases["inHead"]
    return headPhase.processStartTag(token)

1754

def startTagInput(self, token):
    """Handle ``<input>`` inside a table.

    A ``type=hidden`` input (ASCII case-insensitive) is a parse error but
    is inserted in place and popped immediately. Any other input goes
    through ``startTagOther``.

    :arg token: the start tag token; ``token["data"]`` holds its attributes
    """
    attributes = token["data"]
    if ("type" in attributes and
            attributes["type"].translate(asciiUpper2Lower) == "hidden"):
        self.parser.parseError("unexpected-hidden-input-in-table")
        self.tree.insertElement(token)
        # XXX associate with form
        self.tree.openElements.pop()
        # The "in table" rules say to acknowledge the self-closing flag
        # here; previously it was left unset on this path.
        token["selfClosingAcknowledged"] = True
    else:
        self.startTagOther(token)

1764

1765 def startTagForm(self, token):

1766 self.parser.parseError("unexpected-form-in-table")

1767 if self.tree.formPointer is None:

1768 self.tree.insertElement(token)

1769 self.tree.formPointer = self.tree.openElements[-1]

1770 self.tree.openElements.pop()

1771

1772 def startTagOther(self, token):

1773 self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]})

1774 # Do the table magic!

1775 self.tree.insertFromTable = True

1776 self.parser.phases["inBody"].processStartTag(token)

1777 self.tree.insertFromTable = False

1778

1779 def endTagTable(self, token):

1780 if self.tree.elementInScope("table", variant="table"):

1781 self.tree.generateImpliedEndTags()

1782 if self.tree.openElements[-1].name != "table":

1783 self.parser.parseError("end-tag-too-early-named",

1784 {"gotName": "table",

1785 "expectedName": self.tree.openElements[-1].name})

1786 while self.tree.openElements[-1].name != "table":

1787 self.tree.openElements.pop()

1788 self.tree.openElements.pop()

1789 self.parser.resetInsertionMode()

1790 else:

1791 # innerHTML case

1792 assert self.parser.innerHTML

1793 self.parser.parseError()

1794

1795 def endTagIgnore(self, token):

1796 self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

1797

1798 def endTagOther(self, token):

1799 self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]})

1800 # Do the table magic!

1801 self.tree.insertFromTable = True

1802 self.parser.phases["inBody"].processEndTag(token)

1803 self.tree.insertFromTable = False

1804

1805 startTagHandler = _utils.MethodDispatcher([

1806 ("html", Phase.startTagHtml),

1807 ("caption", startTagCaption),

1808 ("colgroup", startTagColgroup),

1809 ("col", startTagCol),

1810 (("tbody", "tfoot", "thead"), startTagRowGroup),

1811 (("td", "th", "tr"), startTagImplyTbody),

1812 ("table", startTagTable),

1813 (("style", "script"), startTagStyleScript),

1814 ("input", startTagInput),

1815 ("form", startTagForm)

1816 ])

1817 startTagHandler.default = startTagOther

1818

1819 endTagHandler = _utils.MethodDispatcher([

1820 ("table", endTagTable),

1821 (("body", "caption", "col", "colgroup", "html", "tbody", "td",

1822 "tfoot", "th", "thead", "tr"), endTagIgnore)

1823 ])

1824 endTagHandler.default = endTagOther

1825

1826

class InTableTextPhase(Phase):
    """Phase that buffers character tokens until a different token arrives.

    ``originalPhase`` holds the phase to switch back to once the buffered
    text has been flushed; ``characterTokens`` is the list of pending
    character tokens.
    """
    __slots__ = ("originalPhase", "characterTokens")

    def __init__(self, *args, **kwargs):
        super(InTableTextPhase, self).__init__(*args, **kwargs)
        self.originalPhase = None
        self.characterTokens = []

    def _flushAndRestore(self):
        """Flush the pending text and switch back to the original phase."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase

    def flushCharacters(self):
        """Insert the buffered text into the tree and empty the buffer.

        Text containing any non-space character is routed through the
        "inTable" phase's ``insertText``; non-empty whitespace-only text is
        inserted directly.
        """
        text = "".join(pending["data"] for pending in self.characterTokens)
        hasNonSpace = any(char not in spaceCharacters for char in text)
        if hasNonSpace:
            merged = {"type": tokenTypes["Characters"], "data": text}
            self.parser.phases["inTable"].insertText(merged)
        elif text:
            self.tree.insertText(text)
        self.characterTokens = []

    def processComment(self, token):
        """Flush, leave this phase and hand the comment back."""
        self._flushAndRestore()
        return token

    def processEOF(self):
        """Flush, leave this phase and request that EOF be processed again."""
        self._flushAndRestore()
        return True

    def processCharacters(self, token):
        """Buffer a character token, dropping a lone U+0000."""
        if token["data"] != "\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        """Buffer a whitespace token."""
        # pretty sure we should never reach here
        self.characterTokens.append(token)

    def processStartTag(self, token):
        """Flush, leave this phase and hand the start tag back."""
        self._flushAndRestore()
        return token

    def processEndTag(self, token):
        """Flush, leave this phase and hand the end tag back."""
        self._flushAndRestore()
        return token

1873

1874

class InCaptionPhase(Phase):
    """Token handling while a ``<caption>`` is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    __slots__ = tuple()

    def ignoreEndTagCaption(self):
        """Return True when no ``caption`` element is in table scope."""
        captionOpen = self.tree.elementInScope("caption", variant="table")
        return not captionOpen

    def _closeCaptionThenReprocess(self, token):
        """Imply ``</caption>``; return the token unless that was ignored."""
        parser = self.parser
        parser.parseError()
        # XXX Have to duplicate logic here to find out if the tag is ignored
        wasIgnored = self.ignoreEndTagCaption()
        parser.phase.processEndTag(impliedTagToken("caption"))
        if wasIgnored:
            return None
        return token

    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableElement(self, token):
        """A table-structure start tag closes the caption first."""
        return self._closeCaptionThenReprocess(token)

    def startTagOther(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagCaption(self, token):
        """Close the caption and go back to the "inTable" phase."""
        parser = self.parser
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert parser.innerHTML
            parser.parseError()
            return

        # AT this code is quite similar to endTagTable in "InTable"
        builder = self.tree
        builder.generateImpliedEndTags()
        stack = builder.openElements
        currentName = stack[-1].name
        if currentName != "caption":
            parser.parseError("expected-one-end-tag-but-got-another",
                              {"gotName": "caption",
                               "expectedName": currentName})
        while stack[-1].name != "caption":
            stack.pop()
        stack.pop()
        builder.clearActiveFormattingElements()
        parser.phase = parser.phases["inTable"]

    def endTagTable(self, token):
        """``</table>`` closes the caption first, then is processed again."""
        return self._closeCaptionThenReprocess(token)

    def endTagIgnore(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td",
          "tfoot", "th", "thead", "tr"), startTagTableElement)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("caption", endTagCaption),
        ("table", endTagTable),
        (("body", "col", "colgroup", "html", "tbody",
          "td", "tfoot", "th", "thead", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther

1944

1945

class InColumnGroupPhase(Phase):
    """Token handling while a ``<colgroup>`` is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column
    __slots__ = tuple()

    def ignoreEndTagColgroup(self):
        """Return True when the current node is the ``html`` element."""
        currentNode = self.tree.openElements[-1]
        return currentNode.name == "html"

    def _closeColgroupThenReprocess(self, token):
        """Imply ``</colgroup>``; return the token unless that was ignored."""
        wasIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        if wasIgnored:
            return None
        return token

    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return None
        wasIgnored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        if wasIgnored:
            return None
        return True

    def processCharacters(self, token):
        return self._closeColgroupThenReprocess(token)

    def startTagCol(self, token):
        """Insert a ``col`` element and pop it straight away."""
        builder = self.tree
        builder.insertElement(token)
        builder.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        return self._closeColgroupThenReprocess(token)

    def endTagColgroup(self, token):
        """Pop the colgroup and return to "inTable", unless ignored."""
        parser = self.parser
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            parser.phase = parser.phases["inTable"]
            return
        # innerHTML case
        assert parser.innerHTML
        parser.parseError()

    def endTagCol(self, token):
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        return self._closeColgroupThenReprocess(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("col", startTagCol)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("colgroup", endTagColgroup),
        ("col", endTagCol)
    ])
    endTagHandler.default = endTagOther

2009

2010

class InTableBodyPhase(Phase):
    """Token handling inside ``<tbody>``, ``<thead>`` or ``<tfoot>``."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    __slots__ = tuple()

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until a row group or ``html`` is on top."""
        stack = self.tree.openElements
        while stack[-1].name not in ("tbody", "tfoot", "thead", "html"):
            # self.parser.parseError("unexpected-implied-end-tag-in-table",
            #  {"name": self.tree.openElements[-1].name})
            stack.pop()
        if stack[-1].name == "html":
            assert self.parser.innerHTML

    def _closeRowGroupThenReprocess(self, token):
        """Close the open row group, if any, and hand the token back.

        Shared by startTagTableOther and endTagTable.
        """
        builder = self.tree
        rowGroupOpen = any(builder.elementInScope(name, variant="table")
                           for name in ("tbody", "thead", "tfoot"))
        if not rowGroupOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        currentName = builder.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTr(self, token):
        """Open a row and switch to the "inRow" phase."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        parser = self.parser
        parser.phase = parser.phases["inRow"]

    def startTagTableCell(self, token):
        """A cell outside a row implies ``<tr>``; the cell is reprocessed."""
        self.parser.parseError("unexpected-cell-in-table-body",
                               {"name": token["name"]})
        self.startTagTr(impliedTagToken("tr", "StartTag"))
        return token

    def startTagTableOther(self, token):
        # XXX AT Any ideas on how to share this with endTagTable?
        return self._closeRowGroupThenReprocess(token)

    def startTagOther(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTableRowGroup(self, token):
        """Close the named row group and go back to "inTable"."""
        name = token["name"]
        parser = self.parser
        if not self.tree.elementInScope(name, variant="table"):
            parser.parseError("unexpected-end-tag-in-table-body",
                              {"name": name})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        parser.phase = parser.phases["inTable"]

    def endTagTable(self, token):
        return self._closeRowGroupThenReprocess(token)

    def endTagIgnore(self, token):
        self.parser.parseError("unexpected-end-tag-in-table-body",
                               {"name": token["name"]})

    def endTagOther(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("tr", startTagTr),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup",
          "tbody", "tfoot", "thead"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup",
          "html", "td", "th", "tr"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther

2108

2109

class InRowPhase(Phase):
    """Token handling while a ``<tr>`` is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    __slots__ = tuple()

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop open elements, with a parse error each, down to tr/html."""
        stack = self.tree.openElements
        while stack[-1].name not in ("tr", "html"):
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": stack[-1].name})
            stack.pop()

    def ignoreEndTagTr(self):
        """Return True when no ``tr`` element is in table scope."""
        rowOpen = self.tree.elementInScope("tr", variant="table")
        return not rowOpen

    def _closeRowThenReprocess(self, token):
        """Imply ``</tr>``; return the token unless that was ignored."""
        wasIgnored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        if wasIgnored:
            return None
        return token

    # the rest
    def processEOF(self):
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTableCell(self, token):
        """Open a cell, switch to "inCell" and push a formatting marker."""
        self.clearStackToTableRowContext()
        builder = self.tree
        builder.insertElement(token)
        parser = self.parser
        parser.phase = parser.phases["inCell"]
        builder.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        return self._closeRowThenReprocess(token)

    def startTagOther(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTr(self, token):
        """Close the row and go back to the "inTableBody" phase."""
        parser = self.parser
        if self.ignoreEndTagTr():
            # innerHTML case
            assert parser.innerHTML
            parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        parser.phase = parser.phases["inTableBody"]

    def endTagTable(self, token):
        return self._closeRowThenReprocess(token)

    def endTagTableRowGroup(self, token):
        """A row-group end tag closes the row first when the group is open."""
        groupOpen = self.tree.elementInScope(token["name"], variant="table")
        if not groupOpen:
            self.parser.parseError()
            return None
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        self.parser.parseError("unexpected-end-tag-in-table-row",
                               {"name": token["name"]})

    def endTagOther(self, token):
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup",
          "tbody", "tfoot", "thead", "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("tr", endTagTr),
        ("table", endTagTable),
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        (("body", "caption", "col",
          "colgroup", "html", "td", "th"), endTagIgnore)
    ])
    endTagHandler.default = endTagOther

2198

2199

class InCellPhase(Phase):
    """Token handling while a ``<td>`` or ``<th>`` is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    __slots__ = tuple()

    # helper
    def closeCell(self):
        """Imply the end tag of whichever cell type is in table scope."""
        builder = self.tree
        if builder.elementInScope("td", variant="table"):
            self.endTagTableCell(impliedTagToken("td"))
        elif builder.elementInScope("th", variant="table"):
            self.endTagTableCell(impliedTagToken("th"))

    # the rest
    def processEOF(self):
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableOther(self, token):
        """A table-structure start tag closes the cell and is reprocessed."""
        builder = self.tree
        cellOpen = (builder.elementInScope("td", variant="table") or
                    builder.elementInScope("th", variant="table"))
        if not cellOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def startTagOther(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell and go back to the "inRow" phase."""
        name = token["name"]
        builder = self.tree
        parser = self.parser
        if not builder.elementInScope(name, variant="table"):
            parser.parseError("unexpected-end-tag", {"name": name})
            return

        builder.generateImpliedEndTags(name)
        stack = builder.openElements
        if stack[-1].name != name:
            parser.parseError("unexpected-cell-end-tag",
                              {"name": name})
            # Pop up to and including the matching cell element.
            while stack.pop().name != name:
                pass
        else:
            stack.pop()
        builder.clearActiveFormattingElements()
        parser.phase = parser.phases["inRow"]

    def endTagIgnore(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagImply(self, token):
        """Close the cell and reprocess when the named element is open."""
        targetOpen = self.tree.elementInScope(token["name"], variant="table")
        if not targetOpen:
            # sometimes innerHTML case
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def endTagOther(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td",
          "tfoot", "th", "thead", "tr"), startTagTableOther)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("td", "th"), endTagTableCell),
        (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
        (("table", "tbody", "tfoot", "thead", "tr"), endTagImply)
    ])
    endTagHandler.default = endTagOther

2275

2276

class InSelectPhase(Phase):
    """Token handling while a ``<select>`` is open."""
    __slots__ = tuple()

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        """Insert the text, dropping a lone U+0000."""
        text = token["data"]
        if text == "\u0000":
            return
        self.tree.insertText(text)

    def startTagOption(self, token):
        stack = self.tree.openElements
        # We need to imply </option> if <option> is the current node.
        if stack[-1].name == "option":
            stack.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        """Imply ``</option>`` and ``</optgroup>`` before the new group."""
        stack = self.tree.openElements
        if stack[-1].name == "option":
            stack.pop()
        if stack[-1].name == "optgroup":
            stack.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        """A nested ``<select>`` start tag acts like ``</select>``."""
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        """Close the select and reprocess the token when a select is open."""
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return None
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagOther(self, token):
        self.parser.parseError("unexpected-start-tag-in-select",
                               {"name": token["name"]})

    def endTagOption(self, token):
        stack = self.tree.openElements
        if stack[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
            return
        stack.pop()

    def endTagOptgroup(self, token):
        stack = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if stack[-1].name == "option" and stack[-2].name == "optgroup":
            stack.pop()
        # It also closes </optgroup>
        if stack[-1].name == "optgroup":
            stack.pop()
        # But nothing else
        else:
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})

    def endTagSelect(self, token):
        """Pop through the ``select`` element and reset the insertion mode."""
        parser = self.parser
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert parser.innerHTML
            parser.parseError()
            return
        stack = self.tree.openElements
        while stack.pop().name != "select":
            pass
        parser.resetInsertionMode()

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag-in-select",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("option", startTagOption),
        ("optgroup", startTagOptgroup),
        ("select", startTagSelect),
        (("input", "keygen", "textarea"), startTagInput),
        ("script", startTagScript)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("option", endTagOption),
        ("optgroup", endTagOptgroup),
        ("select", endTagSelect)
    ])
    endTagHandler.default = endTagOther

2375

2376

class InSelectInTablePhase(Phase):
    """Token handling for a ``<select>`` that sits inside a table.

    Table-related tags close the select; everything else is delegated to
    the "inSelect" phase.
    """
    __slots__ = tuple()

    def processEOF(self):
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processCharacters(token)

    def startTagTable(self, token):
        """Close the select, then hand the table start tag back."""
        self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]})
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processStartTag(token)

    def endTagTable(self, token):
        """Close the select and reprocess when the named element is open."""
        name = token["name"]
        self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": name})
        if not self.tree.elementInScope(name, variant="table"):
            return None
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot",
          "thead", "tr", "td", "th"), startTagTable)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot",
          "thead", "tr", "td", "th"), endTagTable)
    ])
    endTagHandler.default = endTagOther

2414

2415

class InForeignContentPhase(Phase):
    """Token handling while the current node is in a foreign namespace.

    ``breakoutElements`` lists the HTML start tags that pop the parser out
    of foreign content.  ``_svgTagNameReplacements`` maps lowercased SVG
    element names to their mixed-case spelling; it is built once at class
    creation instead of on every ``adjustSVGTagNames`` call.
    """
    __slots__ = tuple()

    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    _svgTagNameReplacements = {"altglyph": "altGlyph",
                               "altglyphdef": "altGlyphDef",
                               "altglyphitem": "altGlyphItem",
                               "animatecolor": "animateColor",
                               "animatemotion": "animateMotion",
                               "animatetransform": "animateTransform",
                               "clippath": "clipPath",
                               "feblend": "feBlend",
                               "fecolormatrix": "feColorMatrix",
                               "fecomponenttransfer": "feComponentTransfer",
                               "fecomposite": "feComposite",
                               "feconvolvematrix": "feConvolveMatrix",
                               "fediffuselighting": "feDiffuseLighting",
                               "fedisplacementmap": "feDisplacementMap",
                               "fedistantlight": "feDistantLight",
                               "feflood": "feFlood",
                               "fefunca": "feFuncA",
                               "fefuncb": "feFuncB",
                               "fefuncg": "feFuncG",
                               "fefuncr": "feFuncR",
                               "fegaussianblur": "feGaussianBlur",
                               "feimage": "feImage",
                               "femerge": "feMerge",
                               "femergenode": "feMergeNode",
                               "femorphology": "feMorphology",
                               "feoffset": "feOffset",
                               "fepointlight": "fePointLight",
                               "fespecularlighting": "feSpecularLighting",
                               "fespotlight": "feSpotLight",
                               "fetile": "feTile",
                               "feturbulence": "feTurbulence",
                               "foreignobject": "foreignObject",
                               "glyphref": "glyphRef",
                               "lineargradient": "linearGradient",
                               "radialgradient": "radialGradient",
                               "textpath": "textPath"}

    def adjustSVGTagNames(self, token):
        """Rewrite ``token["name"]`` in place to its mixed-case SVG form.

        Names without an entry in the replacement table are left untouched.

        :arg token: the start tag token to adjust

        """
        replacement = self._svgTagNameReplacements.get(token["name"])
        if replacement is not None:
            token["name"] = replacement

    def processCharacters(self, token):
        """Insert text, replacing a lone U+0000 with U+FFFD.

        Any non-space character also clears ``parser.framesetOK``.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Insert a foreign element, or break out to HTML content.

        :arg token: the start tag token

        :returns: the token for reprocessing when it is a breakout element
            (or a ``font`` tag with a color/face/size attribute); otherwise
            nothing

        """
        currentNode = self.tree.openElements[-1]
        if (token["name"] in self.breakoutElements or
            (token["name"] == "font" and
             set(token["data"].keys()) & {"color", "face", "size"})):
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            # Pop until the current node is in the default namespace or is
            # an integration point.
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        else:
            if currentNode.namespace == namespaces["mathml"]:
                self.parser.adjustMathMLAttributes(token)
            elif currentNode.namespace == namespaces["svg"]:
                self.adjustSVGTagNames(token)
                self.parser.adjustSVGAttributes(token)
            self.parser.adjustForeignAttributes(token)
            token["namespace"] = currentNode.namespace
            self.tree.insertElement(token)
            if token["selfClosing"]:
                self.tree.openElements.pop()
                token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Close the matching foreign element, or defer to the current phase.

        Walks the stack of open elements from the top.  A node whose
        lowercased name equals the token name is popped together with
        everything above it.  If a node in the default namespace is reached
        first, the token is handed to the parser's current phase instead.

        :arg token: the end tag token

        :returns: ``None`` when a matching element was popped, otherwise
            whatever the current phase returns for the token

        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                new_token = None
                break
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace != self.tree.defaultNamespace:
                continue
            else:
                new_token = self.parser.phase.processEndTag(token)
                break
        return new_token

2529

2530

class AfterBodyPhase(Phase):
    """Token handling after ``</body>`` has been seen.

    Characters and any tag other than ``html`` report a parse error, switch
    back to the "inBody" phase and are handed back for reprocessing.
    """
    __slots__ = tuple()

    def _backToBody(self, errorCode, *errorArgs):
        """Report ``errorCode`` and make "inBody" the current phase."""
        parser = self.parser
        parser.parseError(errorCode, *errorArgs)
        parser.phase = parser.phases["inBody"]

    def processEOF(self):
        # Stop parsing
        return None

    def processComment(self, token):
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        htmlElement = self.tree.openElements[0]
        self.tree.insertComment(token, htmlElement)

    def processCharacters(self, token):
        self._backToBody("unexpected-char-after-body")
        return token

    def startTagHtml(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def startTagOther(self, token):
        self._backToBody("unexpected-start-tag-after-body",
                         {"name": token["name"]})
        return token

    def endTagHtml(self, name):
        """Move on to "afterAfterBody", except in innerHTML mode."""
        parser = self.parser
        if not parser.innerHTML:
            parser.phase = parser.phases["afterAfterBody"]
        else:
            parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        self._backToBody("unexpected-end-tag-after-body",
                         {"name": token["name"]})
        return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("html", endTagHtml)
    ])
    endTagHandler.default = endTagOther

2576

2577

class InFramesetPhase(Phase):
    """Token handling while a ``<frameset>`` is open."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    __slots__ = tuple()

    def processEOF(self):
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-frameset")

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        """Insert a ``frame`` element and pop it straight away."""
        builder = self.tree
        builder.insertElement(token)
        builder.openElements.pop()

    def startTagNoframes(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def startTagOther(self, token):
        self.parser.parseError("unexpected-start-tag-in-frameset",
                               {"name": token["name"]})

    def endTagFrameset(self, token):
        """Pop the current frameset; maybe switch to "afterFrameset"."""
        parser = self.parser
        stack = self.tree.openElements
        if stack[-1].name != "html":
            stack.pop()
        else:
            # innerHTML case
            parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        # If we're not in innerHTML mode and the current node is not a
        # "frameset" element (anymore) then switch.
        if not parser.innerHTML and stack[-1].name != "frameset":
            parser.phase = parser.phases["afterFrameset"]

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag-in-frameset",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("frameset", startTagFrameset),
        ("frame", startTagFrame),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("frameset", endTagFrameset)
    ])
    endTagHandler.default = endTagOther

2633

2634

class AfterFramesetPhase(Phase):
    """Token handling after the outermost ``</frameset>``."""
    # http://www.whatwg.org/specs/web-apps/current-work/#after3
    __slots__ = tuple()

    def processEOF(self):
        # Stop parsing
        return None

    def processCharacters(self, token):
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset",
                               details)

    def endTagHtml(self, token):
        """``</html>`` moves the parser to "afterAfterFrameset"."""
        parser = self.parser
        parser.phase = parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset",
                               details)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("noframes", startTagNoframes)
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("html", endTagHtml)
    ])
    endTagHandler.default = endTagOther

2670

2671

class AfterAfterBodyPhase(Phase):
    """Token handling after ``</html>`` has been seen following the body.

    Characters and tags (other than an ``html`` start tag) report a parse
    error, switch back to the "inBody" phase and are handed back for
    reprocessing.
    """
    __slots__ = tuple()

    def _backToBody(self, errorCode, *errorArgs):
        """Report ``errorCode`` and make "inBody" the current phase."""
        parser = self.parser
        parser.parseError(errorCode, *errorArgs)
        parser.phase = parser.phases["inBody"]

    def processEOF(self):
        return None

    def processComment(self, token):
        """Attach the comment to the document itself."""
        builder = self.tree
        builder.insertComment(token, builder.document)

    def processSpaceCharacters(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        self._backToBody("expected-eof-but-got-char")
        return token

    def startTagHtml(self, token):
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def startTagOther(self, token):
        self._backToBody("expected-eof-but-got-start-tag",
                         {"name": token["name"]})
        return token

    def processEndTag(self, token):
        self._backToBody("expected-eof-but-got-end-tag",
                         {"name": token["name"]})
        return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml)
    ])
    startTagHandler.default = startTagOther

2708

2709

class AfterAfterFramesetPhase(Phase):
    """Tree-construction phase registered as ``afterAfterFrameset``.

    Character data, unexpected start tags and all end tags are reported as
    parse errors; the parser phase is left unchanged and nothing is
    returned.
    """
    __slots__ = ()

    def processEOF(self):
        """Do nothing at end of input."""
        return None

    def processComment(self, token):
        """Insert the comment with the tree's document as its parent."""
        tree = self.tree
        tree.insertComment(token, tree.document)

    def processSpaceCharacters(self, token):
        """Delegate whitespace to the ``inBody`` phase and return its result."""
        in_body = self.parser.phases["inBody"]
        return in_body.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report character data as a parse error; ``token`` is not read."""
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        """Delegate an ``html`` start tag to the ``inBody`` phase."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagNoFrames(self, token):
        """Delegate a ``noframes`` start tag to the ``inHead`` phase."""
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagOther(self, token):
        """Report any other start tag as a parse error, with its name."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)

    def processEndTag(self, token):
        """Report every end tag as a parse error, with its name.

        Overridden directly, so this class defines no ``endTagHandler``
        table of its own.
        """
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)

    # Start tags with a dedicated handler; startTagOther is the default.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("noframes", startTagNoFrames),
    ])
    startTagHandler.default = startTagOther

2744

2745# pylint:enable=unused-argument

2746

2747

# Lookup table from phase name to the Phase subclass implementing it.
# The keys match the names used in ``self.parser.phases[...]`` lookups by
# the phase classes (presumably the parser builds ``phases`` from this
# table -- confirm against HTMLParser).
_phases = {
    # Document prologue and head
    "initial": InitialPhase,
    "beforeHtml": BeforeHtmlPhase,
    "beforeHead": BeforeHeadPhase,
    "inHead": InHeadPhase,
    "inHeadNoscript": InHeadNoscriptPhase,
    "afterHead": AfterHeadPhase,
    # Body and raw text
    "inBody": InBodyPhase,
    "text": TextPhase,
    # Tables
    "inTable": InTablePhase,
    "inTableText": InTableTextPhase,
    "inCaption": InCaptionPhase,
    "inColumnGroup": InColumnGroupPhase,
    "inTableBody": InTableBodyPhase,
    "inRow": InRowPhase,
    "inCell": InCellPhase,
    # Select elements
    "inSelect": InSelectPhase,
    "inSelectInTable": InSelectInTablePhase,
    # Foreign content
    "inForeignContent": InForeignContentPhase,
    # After body / framesets
    "afterBody": AfterBodyPhase,
    "inFrameset": InFramesetPhase,
    "afterFrameset": AfterFramesetPhase,
    "afterAfterBody": AfterAfterBodyPhase,
    "afterAfterFrameset": AfterAfterFramesetPhase,
}

2774

2775

def adjust_attributes(token, replacements):
    """Rename attribute keys of ``token['data']`` according to a mapping.

    When at least one key of ``token['data']`` is also a key of
    ``replacements``, ``token['data']`` is rebound to a new mapping of the
    same type, built from the existing items in iteration order with each
    matching key swapped for its replacement. Otherwise the token is left
    untouched.

    :arg token: token dict whose ``'data'`` entry is the attribute mapping

    :arg replacements: mapping of original attribute name to adjusted name

    :returns: None; ``token`` is modified in place
    """
    attrs = token['data']
    if not (viewkeys(attrs) & viewkeys(replacements)):
        # Nothing to rename: keep the existing mapping object as is.
        return
    renamed = ((replacements.get(key, key), value)
               for key, value in attrs.items())
    # Rebuild with the same mapping class as the original.
    token['data'] = type(attrs)(renamed)

2781

2782

def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dictionary.

    :arg name: tag name, stored under ``"name"``

    :arg type: key looked up in ``tokenTypes``; the result is stored under
        ``"type"`` (the parameter shadows the builtin, but its name is part
        of the call interface)

    :arg attributes: stored under ``"data"``; a fresh empty dict is used
        when this is None

    :arg selfClosing: stored under ``"selfClosing"``

    :returns: dict with the keys ``type``, ``name``, ``data`` and
        ``selfClosing``
    """
    token = {"type": tokenTypes[type], "name": name}
    # A new dict per call, so tokens never share one attribute mapping.
    token["data"] = {} if attributes is None else attributes
    token["selfClosing"] = selfClosing
    return token

2789

2790

class ParseError(Exception):
    """Exception type representing an error in a parsed document."""