Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/html5lib/html5parser.py: 94%

1535 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-25 06:18 +0000

1from __future__ import absolute_import, division, unicode_literals 

2from six import with_metaclass, viewkeys 

3 

4import types 

5 

6from . import _inputstream 

7from . import _tokenizer 

8 

9from . import treebuilders 

10from .treebuilders.base import Marker 

11 

12from . import _utils 

13from .constants import ( 

14 spaceCharacters, asciiUpper2Lower, 

15 specialElements, headingElements, cdataElements, rcdataElements, 

16 tokenTypes, tagTokenTypes, 

17 namespaces, 

18 htmlIntegrationPointElements, mathmlTextIntegrationPointElements, 

19 adjustForeignAttributes as adjustForeignAttributesMap, 

20 adjustMathMLAttributes, adjustSVGAttributes, 

21 E, 

22 _ReparseException 

23) 

24 

25 

26def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs): 

27 """Parse an HTML document as a string or file-like object into a tree 

28 

29 :arg doc: the document to parse as a string or file-like object 

30 

31 :arg treebuilder: the treebuilder to use when parsing 

32 

33 :arg namespaceHTMLElements: whether or not to namespace HTML elements 

34 

35 :returns: parsed tree 

36 

37 Example: 

38 

39 >>> from html5lib.html5parser import parse 

40 >>> parse('<html><body><p>This is a doc</p></body></html>') 

41 <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> 

42 

43 """ 

44 tb = treebuilders.getTreeBuilder(treebuilder) 

45 p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) 

46 return p.parse(doc, **kwargs) 

47 

48 

49def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs): 

50 """Parse an HTML fragment as a string or file-like object into a tree 

51 

52 :arg doc: the fragment to parse as a string or file-like object 

53 

54 :arg container: the container context to parse the fragment in 

55 

56 :arg treebuilder: the treebuilder to use when parsing 

57 

58 :arg namespaceHTMLElements: whether or not to namespace HTML elements 

59 

60 :returns: parsed tree 

61 

62 Example: 

63 

64 >>> from html5lib.html5libparser import parseFragment 

65 >>> parseFragment('<b>this is a fragment</b>') 

66 <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> 

67 

68 """ 

69 tb = treebuilders.getTreeBuilder(treebuilder) 

70 p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements) 

71 return p.parseFragment(doc, container=container, **kwargs) 

72 

73 

74def method_decorator_metaclass(function): 

75 class Decorated(type): 

76 def __new__(meta, classname, bases, classDict): 

77 for attributeName, attribute in classDict.items(): 

78 if isinstance(attribute, types.FunctionType): 

79 attribute = function(attribute) 

80 

81 classDict[attributeName] = attribute 

82 return type.__new__(meta, classname, bases, classDict) 

83 return Decorated 

84 

85 

86class HTMLParser(object): 

87 """HTML parser 

88 

89 Generates a tree structure from a stream of (possibly malformed) HTML. 

90 

91 """ 

92 

93 def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False): 

94 """ 

95 :arg tree: a treebuilder class controlling the type of tree that will be 

96 returned. Built in treebuilders can be accessed through 

97 html5lib.treebuilders.getTreeBuilder(treeType) 

98 

99 :arg strict: raise an exception when a parse error is encountered 

100 

101 :arg namespaceHTMLElements: whether or not to namespace HTML elements 

102 

103 :arg debug: whether or not to enable debug mode which logs things 

104 

105 Example: 

106 

107 >>> from html5lib.html5parser import HTMLParser 

108 >>> parser = HTMLParser() # generates parser with etree builder 

109 >>> parser = HTMLParser('lxml', strict=True) # generates parser with lxml builder which is strict 

110 

111 """ 

112 

113 # Raise an exception on the first error encountered 

114 self.strict = strict 

115 

116 if tree is None: 

117 tree = treebuilders.getTreeBuilder("etree") 

118 elif isinstance(tree, str): 

119 tree = treebuilders.getTreeBuilder(tree) 

120 

121 self.tree = tree(namespaceHTMLElements) 

122 self.errors = [] 

123 

124 self.phases = {name: cls(self, self.tree) for name, cls in 

125 getPhases(debug).items()} 

126 

127 def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): 

128 

129 self.innerHTMLMode = innerHTML 

130 self.container = container 

131 self.scripting = scripting 

132 self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs) 

133 self.reset() 

134 

135 try: 

136 self.mainLoop() 

137 except _ReparseException: 

138 self.reset() 

139 self.mainLoop() 

140 

141 def reset(self): 

142 self.tree.reset() 

143 self.firstStartTag = False 

144 self.errors = [] 

145 self.log = [] # only used with debug mode 

146 # "quirks" / "limited quirks" / "no quirks" 

147 self.compatMode = "no quirks" 

148 

149 if self.innerHTMLMode: 

150 self.innerHTML = self.container.lower() 

151 

152 if self.innerHTML in cdataElements: 

153 self.tokenizer.state = self.tokenizer.rcdataState 

154 elif self.innerHTML in rcdataElements: 

155 self.tokenizer.state = self.tokenizer.rawtextState 

156 elif self.innerHTML == 'plaintext': 

157 self.tokenizer.state = self.tokenizer.plaintextState 

158 else: 

159 # state already is data state 

160 # self.tokenizer.state = self.tokenizer.dataState 

161 pass 

162 self.phase = self.phases["beforeHtml"] 

163 self.phase.insertHtmlElement() 

164 self.resetInsertionMode() 

165 else: 

166 self.innerHTML = False # pylint:disable=redefined-variable-type 

167 self.phase = self.phases["initial"] 

168 

169 self.lastPhase = None 

170 

171 self.beforeRCDataPhase = None 

172 

173 self.framesetOK = True 

174 

175 @property 

176 def documentEncoding(self): 

177 """Name of the character encoding that was used to decode the input stream, or 

178 :obj:`None` if that is not determined yet 

179 

180 """ 

181 if not hasattr(self, 'tokenizer'): 

182 return None 

183 return self.tokenizer.stream.charEncoding[0].name 

184 

185 def isHTMLIntegrationPoint(self, element): 

186 if (element.name == "annotation-xml" and 

187 element.namespace == namespaces["mathml"]): 

188 return ("encoding" in element.attributes and 

189 element.attributes["encoding"].translate( 

190 asciiUpper2Lower) in 

191 ("text/html", "application/xhtml+xml")) 

192 else: 

193 return (element.namespace, element.name) in htmlIntegrationPointElements 

194 

195 def isMathMLTextIntegrationPoint(self, element): 

196 return (element.namespace, element.name) in mathmlTextIntegrationPointElements 

197 

198 def mainLoop(self): 

199 CharactersToken = tokenTypes["Characters"] 

200 SpaceCharactersToken = tokenTypes["SpaceCharacters"] 

201 StartTagToken = tokenTypes["StartTag"] 

202 EndTagToken = tokenTypes["EndTag"] 

203 CommentToken = tokenTypes["Comment"] 

204 DoctypeToken = tokenTypes["Doctype"] 

205 ParseErrorToken = tokenTypes["ParseError"] 

206 

207 for token in self.tokenizer: 

208 prev_token = None 

209 new_token = token 

210 while new_token is not None: 

211 prev_token = new_token 

212 currentNode = self.tree.openElements[-1] if self.tree.openElements else None 

213 currentNodeNamespace = currentNode.namespace if currentNode else None 

214 currentNodeName = currentNode.name if currentNode else None 

215 

216 type = new_token["type"] 

217 

218 if type == ParseErrorToken: 

219 self.parseError(new_token["data"], new_token.get("datavars", {})) 

220 new_token = None 

221 else: 

222 if (len(self.tree.openElements) == 0 or 

223 currentNodeNamespace == self.tree.defaultNamespace or 

224 (self.isMathMLTextIntegrationPoint(currentNode) and 

225 ((type == StartTagToken and 

226 token["name"] not in frozenset(["mglyph", "malignmark"])) or 

227 type in (CharactersToken, SpaceCharactersToken))) or 

228 (currentNodeNamespace == namespaces["mathml"] and 

229 currentNodeName == "annotation-xml" and 

230 type == StartTagToken and 

231 token["name"] == "svg") or 

232 (self.isHTMLIntegrationPoint(currentNode) and 

233 type in (StartTagToken, CharactersToken, SpaceCharactersToken))): 

234 phase = self.phase 

235 else: 

236 phase = self.phases["inForeignContent"] 

237 

238 if type == CharactersToken: 

239 new_token = phase.processCharacters(new_token) 

240 elif type == SpaceCharactersToken: 

241 new_token = phase.processSpaceCharacters(new_token) 

242 elif type == StartTagToken: 

243 new_token = phase.processStartTag(new_token) 

244 elif type == EndTagToken: 

245 new_token = phase.processEndTag(new_token) 

246 elif type == CommentToken: 

247 new_token = phase.processComment(new_token) 

248 elif type == DoctypeToken: 

249 new_token = phase.processDoctype(new_token) 

250 

251 if (type == StartTagToken and prev_token["selfClosing"] and 

252 not prev_token["selfClosingAcknowledged"]): 

253 self.parseError("non-void-element-with-trailing-solidus", 

254 {"name": prev_token["name"]}) 

255 

256 # When the loop finishes it's EOF 

257 reprocess = True 

258 phases = [] 

259 while reprocess: 

260 phases.append(self.phase) 

261 reprocess = self.phase.processEOF() 

262 if reprocess: 

263 assert self.phase not in phases 

264 

265 def parse(self, stream, *args, **kwargs): 

266 """Parse a HTML document into a well-formed tree 

267 

268 :arg stream: a file-like object or string containing the HTML to be parsed 

269 

270 The optional encoding parameter must be a string that indicates 

271 the encoding. If specified, that encoding will be used, 

272 regardless of any BOM or later declaration (such as in a meta 

273 element). 

274 

275 :arg scripting: treat noscript elements as if JavaScript was turned on 

276 

277 :returns: parsed tree 

278 

279 Example: 

280 

281 >>> from html5lib.html5parser import HTMLParser 

282 >>> parser = HTMLParser() 

283 >>> parser.parse('<html><body><p>This is a doc</p></body></html>') 

284 <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0> 

285 

286 """ 

287 self._parse(stream, False, None, *args, **kwargs) 

288 return self.tree.getDocument() 

289 

290 def parseFragment(self, stream, *args, **kwargs): 

291 """Parse a HTML fragment into a well-formed tree fragment 

292 

293 :arg container: name of the element we're setting the innerHTML 

294 property if set to None, default to 'div' 

295 

296 :arg stream: a file-like object or string containing the HTML to be parsed 

297 

298 The optional encoding parameter must be a string that indicates 

299 the encoding. If specified, that encoding will be used, 

300 regardless of any BOM or later declaration (such as in a meta 

301 element) 

302 

303 :arg scripting: treat noscript elements as if JavaScript was turned on 

304 

305 :returns: parsed tree 

306 

307 Example: 

308 

309 >>> from html5lib.html5libparser import HTMLParser 

310 >>> parser = HTMLParser() 

311 >>> parser.parseFragment('<b>this is a fragment</b>') 

312 <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090> 

313 

314 """ 

315 self._parse(stream, True, *args, **kwargs) 

316 return self.tree.getFragment() 

317 

318 def parseError(self, errorcode="XXX-undefined-error", datavars=None): 

319 # XXX The idea is to make errorcode mandatory. 

320 if datavars is None: 

321 datavars = {} 

322 self.errors.append((self.tokenizer.stream.position(), errorcode, datavars)) 

323 if self.strict: 

324 raise ParseError(E[errorcode] % datavars) 

325 

326 def adjustMathMLAttributes(self, token): 

327 adjust_attributes(token, adjustMathMLAttributes) 

328 

329 def adjustSVGAttributes(self, token): 

330 adjust_attributes(token, adjustSVGAttributes) 

331 

332 def adjustForeignAttributes(self, token): 

333 adjust_attributes(token, adjustForeignAttributesMap) 

334 

335 def reparseTokenNormal(self, token): 

336 # pylint:disable=unused-argument 

337 self.parser.phase() 

338 

339 def resetInsertionMode(self): 

340 # The name of this method is mostly historical. (It's also used in the 

341 # specification.) 

342 last = False 

343 newModes = { 

344 "select": "inSelect", 

345 "td": "inCell", 

346 "th": "inCell", 

347 "tr": "inRow", 

348 "tbody": "inTableBody", 

349 "thead": "inTableBody", 

350 "tfoot": "inTableBody", 

351 "caption": "inCaption", 

352 "colgroup": "inColumnGroup", 

353 "table": "inTable", 

354 "head": "inBody", 

355 "body": "inBody", 

356 "frameset": "inFrameset", 

357 "html": "beforeHead" 

358 } 

359 for node in self.tree.openElements[::-1]: 

360 nodeName = node.name 

361 new_phase = None 

362 if node == self.tree.openElements[0]: 

363 assert self.innerHTML 

364 last = True 

365 nodeName = self.innerHTML 

366 # Check for conditions that should only happen in the innerHTML 

367 # case 

368 if nodeName in ("select", "colgroup", "head", "html"): 

369 assert self.innerHTML 

370 

371 if not last and node.namespace != self.tree.defaultNamespace: 

372 continue 

373 

374 if nodeName in newModes: 

375 new_phase = self.phases[newModes[nodeName]] 

376 break 

377 elif last: 

378 new_phase = self.phases["inBody"] 

379 break 

380 

381 self.phase = new_phase 

382 

383 def parseRCDataRawtext(self, token, contentType): 

384 # Generic RCDATA/RAWTEXT Parsing algorithm 

385 assert contentType in ("RAWTEXT", "RCDATA") 

386 

387 self.tree.insertElement(token) 

388 

389 if contentType == "RAWTEXT": 

390 self.tokenizer.state = self.tokenizer.rawtextState 

391 else: 

392 self.tokenizer.state = self.tokenizer.rcdataState 

393 

394 self.originalPhase = self.phase 

395 

396 self.phase = self.phases["text"] 

397 

398 

399@_utils.memoize 

400def getPhases(debug): 

401 def log(function): 

402 """Logger that records which phase processes each token""" 

403 type_names = {value: key for key, value in tokenTypes.items()} 

404 

405 def wrapped(self, *args, **kwargs): 

406 if function.__name__.startswith("process") and len(args) > 0: 

407 token = args[0] 

408 info = {"type": type_names[token['type']]} 

409 if token['type'] in tagTokenTypes: 

410 info["name"] = token['name'] 

411 

412 self.parser.log.append((self.parser.tokenizer.state.__name__, 

413 self.parser.phase.__class__.__name__, 

414 self.__class__.__name__, 

415 function.__name__, 

416 info)) 

417 return function(self, *args, **kwargs) 

418 else: 

419 return function(self, *args, **kwargs) 

420 return wrapped 

421 

422 def getMetaclass(use_metaclass, metaclass_func): 

423 if use_metaclass: 

424 return method_decorator_metaclass(metaclass_func) 

425 else: 

426 return type 

427 

428 # pylint:disable=unused-argument 

429 class Phase(with_metaclass(getMetaclass(debug, log))): 

430 """Base class for helper object that implements each phase of processing 

431 """ 

432 __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") 

433 

434 def __init__(self, parser, tree): 

435 self.parser = parser 

436 self.tree = tree 

437 self.__startTagCache = {} 

438 self.__endTagCache = {} 

439 

440 def processEOF(self): 

441 raise NotImplementedError 

442 

443 def processComment(self, token): 

444 # For most phases the following is correct. Where it's not it will be 

445 # overridden. 

446 self.tree.insertComment(token, self.tree.openElements[-1]) 

447 

448 def processDoctype(self, token): 

449 self.parser.parseError("unexpected-doctype") 

450 

451 def processCharacters(self, token): 

452 self.tree.insertText(token["data"]) 

453 

454 def processSpaceCharacters(self, token): 

455 self.tree.insertText(token["data"]) 

456 

457 def processStartTag(self, token): 

458 # Note the caching is done here rather than BoundMethodDispatcher as doing it there 

459 # requires a circular reference to the Phase, and this ends up with a significant 

460 # (CPython 2.7, 3.8) GC cost when parsing many short inputs 

461 name = token["name"] 

462 # In Py2, using `in` is quicker in general than try/except KeyError 

463 # In Py3, `in` is quicker when there are few cache hits (typically short inputs) 

464 if name in self.__startTagCache: 

465 func = self.__startTagCache[name] 

466 else: 

467 func = self.__startTagCache[name] = self.startTagHandler[name] 

468 # bound the cache size in case we get loads of unknown tags 

469 while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: 

470 # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 

471 self.__startTagCache.pop(next(iter(self.__startTagCache))) 

472 return func(token) 

473 

474 def startTagHtml(self, token): 

475 if not self.parser.firstStartTag and token["name"] == "html": 

476 self.parser.parseError("non-html-root") 

477 # XXX Need a check here to see if the first start tag token emitted is 

478 # this token... If it's not, invoke self.parser.parseError(). 

479 for attr, value in token["data"].items(): 

480 if attr not in self.tree.openElements[0].attributes: 

481 self.tree.openElements[0].attributes[attr] = value 

482 self.parser.firstStartTag = False 

483 

484 def processEndTag(self, token): 

485 # Note the caching is done here rather than BoundMethodDispatcher as doing it there 

486 # requires a circular reference to the Phase, and this ends up with a significant 

487 # (CPython 2.7, 3.8) GC cost when parsing many short inputs 

488 name = token["name"] 

489 # In Py2, using `in` is quicker in general than try/except KeyError 

490 # In Py3, `in` is quicker when there are few cache hits (typically short inputs) 

491 if name in self.__endTagCache: 

492 func = self.__endTagCache[name] 

493 else: 

494 func = self.__endTagCache[name] = self.endTagHandler[name] 

495 # bound the cache size in case we get loads of unknown tags 

496 while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: 

497 # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 

498 self.__endTagCache.pop(next(iter(self.__endTagCache))) 

499 return func(token) 

500 

501 class InitialPhase(Phase): 

502 __slots__ = tuple() 

503 

504 def processSpaceCharacters(self, token): 

505 pass 

506 

507 def processComment(self, token): 

508 self.tree.insertComment(token, self.tree.document) 

509 

510 def processDoctype(self, token): 

511 name = token["name"] 

512 publicId = token["publicId"] 

513 systemId = token["systemId"] 

514 correct = token["correct"] 

515 

516 if (name != "html" or publicId is not None or 

517 systemId is not None and systemId != "about:legacy-compat"): 

518 self.parser.parseError("unknown-doctype") 

519 

520 if publicId is None: 

521 publicId = "" 

522 

523 self.tree.insertDoctype(token) 

524 

525 if publicId != "": 

526 publicId = publicId.translate(asciiUpper2Lower) 

527 

528 if (not correct or token["name"] != "html" or 

529 publicId.startswith( 

530 ("+//silmaril//dtd html pro v0r11 19970101//", 

531 "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", 

532 "-//as//dtd html 3.0 aswedit + extensions//", 

533 "-//ietf//dtd html 2.0 level 1//", 

534 "-//ietf//dtd html 2.0 level 2//", 

535 "-//ietf//dtd html 2.0 strict level 1//", 

536 "-//ietf//dtd html 2.0 strict level 2//", 

537 "-//ietf//dtd html 2.0 strict//", 

538 "-//ietf//dtd html 2.0//", 

539 "-//ietf//dtd html 2.1e//", 

540 "-//ietf//dtd html 3.0//", 

541 "-//ietf//dtd html 3.2 final//", 

542 "-//ietf//dtd html 3.2//", 

543 "-//ietf//dtd html 3//", 

544 "-//ietf//dtd html level 0//", 

545 "-//ietf//dtd html level 1//", 

546 "-//ietf//dtd html level 2//", 

547 "-//ietf//dtd html level 3//", 

548 "-//ietf//dtd html strict level 0//", 

549 "-//ietf//dtd html strict level 1//", 

550 "-//ietf//dtd html strict level 2//", 

551 "-//ietf//dtd html strict level 3//", 

552 "-//ietf//dtd html strict//", 

553 "-//ietf//dtd html//", 

554 "-//metrius//dtd metrius presentational//", 

555 "-//microsoft//dtd internet explorer 2.0 html strict//", 

556 "-//microsoft//dtd internet explorer 2.0 html//", 

557 "-//microsoft//dtd internet explorer 2.0 tables//", 

558 "-//microsoft//dtd internet explorer 3.0 html strict//", 

559 "-//microsoft//dtd internet explorer 3.0 html//", 

560 "-//microsoft//dtd internet explorer 3.0 tables//", 

561 "-//netscape comm. corp.//dtd html//", 

562 "-//netscape comm. corp.//dtd strict html//", 

563 "-//o'reilly and associates//dtd html 2.0//", 

564 "-//o'reilly and associates//dtd html extended 1.0//", 

565 "-//o'reilly and associates//dtd html extended relaxed 1.0//", 

566 "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", 

567 "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", 

568 "-//spyglass//dtd html 2.0 extended//", 

569 "-//sq//dtd html 2.0 hotmetal + extensions//", 

570 "-//sun microsystems corp.//dtd hotjava html//", 

571 "-//sun microsystems corp.//dtd hotjava strict html//", 

572 "-//w3c//dtd html 3 1995-03-24//", 

573 "-//w3c//dtd html 3.2 draft//", 

574 "-//w3c//dtd html 3.2 final//", 

575 "-//w3c//dtd html 3.2//", 

576 "-//w3c//dtd html 3.2s draft//", 

577 "-//w3c//dtd html 4.0 frameset//", 

578 "-//w3c//dtd html 4.0 transitional//", 

579 "-//w3c//dtd html experimental 19960712//", 

580 "-//w3c//dtd html experimental 970421//", 

581 "-//w3c//dtd w3 html//", 

582 "-//w3o//dtd w3 html 3.0//", 

583 "-//webtechs//dtd mozilla html 2.0//", 

584 "-//webtechs//dtd mozilla html//")) or 

585 publicId in ("-//w3o//dtd w3 html strict 3.0//en//", 

586 "-/w3c/dtd html 4.0 transitional/en", 

587 "html") or 

588 publicId.startswith( 

589 ("-//w3c//dtd html 4.01 frameset//", 

590 "-//w3c//dtd html 4.01 transitional//")) and 

591 systemId is None or 

592 systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): 

593 self.parser.compatMode = "quirks" 

594 elif (publicId.startswith( 

595 ("-//w3c//dtd xhtml 1.0 frameset//", 

596 "-//w3c//dtd xhtml 1.0 transitional//")) or 

597 publicId.startswith( 

598 ("-//w3c//dtd html 4.01 frameset//", 

599 "-//w3c//dtd html 4.01 transitional//")) and 

600 systemId is not None): 

601 self.parser.compatMode = "limited quirks" 

602 

603 self.parser.phase = self.parser.phases["beforeHtml"] 

604 

605 def anythingElse(self): 

606 self.parser.compatMode = "quirks" 

607 self.parser.phase = self.parser.phases["beforeHtml"] 

608 

609 def processCharacters(self, token): 

610 self.parser.parseError("expected-doctype-but-got-chars") 

611 self.anythingElse() 

612 return token 

613 

614 def processStartTag(self, token): 

615 self.parser.parseError("expected-doctype-but-got-start-tag", 

616 {"name": token["name"]}) 

617 self.anythingElse() 

618 return token 

619 

620 def processEndTag(self, token): 

621 self.parser.parseError("expected-doctype-but-got-end-tag", 

622 {"name": token["name"]}) 

623 self.anythingElse() 

624 return token 

625 

626 def processEOF(self): 

627 self.parser.parseError("expected-doctype-but-got-eof") 

628 self.anythingElse() 

629 return True 

630 

631 class BeforeHtmlPhase(Phase): 

632 __slots__ = tuple() 

633 

634 # helper methods 

635 def insertHtmlElement(self): 

636 self.tree.insertRoot(impliedTagToken("html", "StartTag")) 

637 self.parser.phase = self.parser.phases["beforeHead"] 

638 

639 # other 

640 def processEOF(self): 

641 self.insertHtmlElement() 

642 return True 

643 

644 def processComment(self, token): 

645 self.tree.insertComment(token, self.tree.document) 

646 

647 def processSpaceCharacters(self, token): 

648 pass 

649 

650 def processCharacters(self, token): 

651 self.insertHtmlElement() 

652 return token 

653 

654 def processStartTag(self, token): 

655 if token["name"] == "html": 

656 self.parser.firstStartTag = True 

657 self.insertHtmlElement() 

658 return token 

659 

660 def processEndTag(self, token): 

661 if token["name"] not in ("head", "body", "html", "br"): 

662 self.parser.parseError("unexpected-end-tag-before-html", 

663 {"name": token["name"]}) 

664 else: 

665 self.insertHtmlElement() 

666 return token 

667 

668 class BeforeHeadPhase(Phase): 

669 __slots__ = tuple() 

670 

671 def processEOF(self): 

672 self.startTagHead(impliedTagToken("head", "StartTag")) 

673 return True 

674 

675 def processSpaceCharacters(self, token): 

676 pass 

677 

678 def processCharacters(self, token): 

679 self.startTagHead(impliedTagToken("head", "StartTag")) 

680 return token 

681 

682 def startTagHtml(self, token): 

683 return self.parser.phases["inBody"].processStartTag(token) 

684 

685 def startTagHead(self, token): 

686 self.tree.insertElement(token) 

687 self.tree.headPointer = self.tree.openElements[-1] 

688 self.parser.phase = self.parser.phases["inHead"] 

689 

690 def startTagOther(self, token): 

691 self.startTagHead(impliedTagToken("head", "StartTag")) 

692 return token 

693 

694 def endTagImplyHead(self, token): 

695 self.startTagHead(impliedTagToken("head", "StartTag")) 

696 return token 

697 

698 def endTagOther(self, token): 

699 self.parser.parseError("end-tag-after-implied-root", 

700 {"name": token["name"]}) 

701 

702 startTagHandler = _utils.MethodDispatcher([ 

703 ("html", startTagHtml), 

704 ("head", startTagHead) 

705 ]) 

706 startTagHandler.default = startTagOther 

707 

708 endTagHandler = _utils.MethodDispatcher([ 

709 (("head", "body", "html", "br"), endTagImplyHead) 

710 ]) 

711 endTagHandler.default = endTagOther 

712 

713 class InHeadPhase(Phase): 

714 __slots__ = tuple() 

715 

716 # the real thing 

717 def processEOF(self): 

718 self.anythingElse() 

719 return True 

720 

721 def processCharacters(self, token): 

722 self.anythingElse() 

723 return token 

724 

725 def startTagHtml(self, token): 

726 return self.parser.phases["inBody"].processStartTag(token) 

727 

728 def startTagHead(self, token): 

729 self.parser.parseError("two-heads-are-not-better-than-one") 

730 

731 def startTagBaseLinkCommand(self, token): 

732 self.tree.insertElement(token) 

733 self.tree.openElements.pop() 

734 token["selfClosingAcknowledged"] = True 

735 

736 def startTagMeta(self, token): 

737 self.tree.insertElement(token) 

738 self.tree.openElements.pop() 

739 token["selfClosingAcknowledged"] = True 

740 

741 attributes = token["data"] 

742 if self.parser.tokenizer.stream.charEncoding[1] == "tentative": 

743 if "charset" in attributes: 

744 self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) 

745 elif ("content" in attributes and 

746 "http-equiv" in attributes and 

747 attributes["http-equiv"].lower() == "content-type"): 

748 # Encoding it as UTF-8 here is a hack, as really we should pass 

749 # the abstract Unicode string, and just use the 

750 # ContentAttrParser on that, but using UTF-8 allows all chars 

751 # to be encoded and as a ASCII-superset works. 

752 data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) 

753 parser = _inputstream.ContentAttrParser(data) 

754 codec = parser.parse() 

755 self.parser.tokenizer.stream.changeEncoding(codec) 

756 

757 def startTagTitle(self, token): 

758 self.parser.parseRCDataRawtext(token, "RCDATA") 

759 

760 def startTagNoFramesStyle(self, token): 

761 # Need to decide whether to implement the scripting-disabled case 

762 self.parser.parseRCDataRawtext(token, "RAWTEXT") 

763 

764 def startTagNoscript(self, token): 

765 if self.parser.scripting: 

766 self.parser.parseRCDataRawtext(token, "RAWTEXT") 

767 else: 

768 self.tree.insertElement(token) 

769 self.parser.phase = self.parser.phases["inHeadNoscript"] 

770 

771 def startTagScript(self, token): 

772 self.tree.insertElement(token) 

773 self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState 

774 self.parser.originalPhase = self.parser.phase 

775 self.parser.phase = self.parser.phases["text"] 

776 

777 def startTagOther(self, token): 

778 self.anythingElse() 

779 return token 

780 

781 def endTagHead(self, token): 

782 node = self.parser.tree.openElements.pop() 

783 assert node.name == "head", "Expected head got %s" % node.name 

784 self.parser.phase = self.parser.phases["afterHead"] 

785 

786 def endTagHtmlBodyBr(self, token): 

787 self.anythingElse() 

788 return token 

789 

790 def endTagOther(self, token): 

791 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

792 

793 def anythingElse(self): 

794 self.endTagHead(impliedTagToken("head")) 

795 

796 startTagHandler = _utils.MethodDispatcher([ 

797 ("html", startTagHtml), 

798 ("title", startTagTitle), 

799 (("noframes", "style"), startTagNoFramesStyle), 

800 ("noscript", startTagNoscript), 

801 ("script", startTagScript), 

802 (("base", "basefont", "bgsound", "command", "link"), 

803 startTagBaseLinkCommand), 

804 ("meta", startTagMeta), 

805 ("head", startTagHead) 

806 ]) 

807 startTagHandler.default = startTagOther 

808 

809 endTagHandler = _utils.MethodDispatcher([ 

810 ("head", endTagHead), 

811 (("br", "html", "body"), endTagHtmlBodyBr) 

812 ]) 

813 endTagHandler.default = endTagOther 

814 

815 class InHeadNoscriptPhase(Phase): 

816 __slots__ = tuple() 

817 

818 def processEOF(self): 

819 self.parser.parseError("eof-in-head-noscript") 

820 self.anythingElse() 

821 return True 

822 

823 def processComment(self, token): 

824 return self.parser.phases["inHead"].processComment(token) 

825 

826 def processCharacters(self, token): 

827 self.parser.parseError("char-in-head-noscript") 

828 self.anythingElse() 

829 return token 

830 

831 def processSpaceCharacters(self, token): 

832 return self.parser.phases["inHead"].processSpaceCharacters(token) 

833 

834 def startTagHtml(self, token): 

835 return self.parser.phases["inBody"].processStartTag(token) 

836 

837 def startTagBaseLinkCommand(self, token): 

838 return self.parser.phases["inHead"].processStartTag(token) 

839 

840 def startTagHeadNoscript(self, token): 

841 self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) 

842 

843 def startTagOther(self, token): 

844 self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) 

845 self.anythingElse() 

846 return token 

847 

848 def endTagNoscript(self, token): 

849 node = self.parser.tree.openElements.pop() 

850 assert node.name == "noscript", "Expected noscript got %s" % node.name 

851 self.parser.phase = self.parser.phases["inHead"] 

852 

853 def endTagBr(self, token): 

854 self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) 

855 self.anythingElse() 

856 return token 

857 

858 def endTagOther(self, token): 

859 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

860 

861 def anythingElse(self): 

862 # Caller must raise parse error first! 

863 self.endTagNoscript(impliedTagToken("noscript")) 

864 

865 startTagHandler = _utils.MethodDispatcher([ 

866 ("html", startTagHtml), 

867 (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand), 

868 (("head", "noscript"), startTagHeadNoscript), 

869 ]) 

870 startTagHandler.default = startTagOther 

871 

872 endTagHandler = _utils.MethodDispatcher([ 

873 ("noscript", endTagNoscript), 

874 ("br", endTagBr), 

875 ]) 

876 endTagHandler.default = endTagOther 

877 

878 class AfterHeadPhase(Phase): 

879 __slots__ = tuple() 

880 

881 def processEOF(self): 

882 self.anythingElse() 

883 return True 

884 

885 def processCharacters(self, token): 

886 self.anythingElse() 

887 return token 

888 

889 def startTagHtml(self, token): 

890 return self.parser.phases["inBody"].processStartTag(token) 

891 

892 def startTagBody(self, token): 

893 self.parser.framesetOK = False 

894 self.tree.insertElement(token) 

895 self.parser.phase = self.parser.phases["inBody"] 

896 

897 def startTagFrameset(self, token): 

898 self.tree.insertElement(token) 

899 self.parser.phase = self.parser.phases["inFrameset"] 

900 

901 def startTagFromHead(self, token): 

902 self.parser.parseError("unexpected-start-tag-out-of-my-head", 

903 {"name": token["name"]}) 

904 self.tree.openElements.append(self.tree.headPointer) 

905 self.parser.phases["inHead"].processStartTag(token) 

906 for node in self.tree.openElements[::-1]: 

907 if node.name == "head": 

908 self.tree.openElements.remove(node) 

909 break 

910 

911 def startTagHead(self, token): 

912 self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) 

913 

914 def startTagOther(self, token): 

915 self.anythingElse() 

916 return token 

917 

918 def endTagHtmlBodyBr(self, token): 

919 self.anythingElse() 

920 return token 

921 

922 def endTagOther(self, token): 

923 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

924 

925 def anythingElse(self): 

926 self.tree.insertElement(impliedTagToken("body", "StartTag")) 

927 self.parser.phase = self.parser.phases["inBody"] 

928 self.parser.framesetOK = True 

929 

930 startTagHandler = _utils.MethodDispatcher([ 

931 ("html", startTagHtml), 

932 ("body", startTagBody), 

933 ("frameset", startTagFrameset), 

934 (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", 

935 "style", "title"), 

936 startTagFromHead), 

937 ("head", startTagHead) 

938 ]) 

939 startTagHandler.default = startTagOther 

940 endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), 

941 endTagHtmlBodyBr)]) 

942 endTagHandler.default = endTagOther 

943 

944 class InBodyPhase(Phase): 

945 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody 

946 # the really-really-really-very crazy mode 

947 __slots__ = ("processSpaceCharacters",) 

948 

949 def __init__(self, *args, **kwargs): 

950 super(InBodyPhase, self).__init__(*args, **kwargs) 

951 # Set this to the default handler 

952 self.processSpaceCharacters = self.processSpaceCharactersNonPre 

953 

954 def isMatchingFormattingElement(self, node1, node2): 

955 return (node1.name == node2.name and 

956 node1.namespace == node2.namespace and 

957 node1.attributes == node2.attributes) 

958 

959 # helper 

960 def addFormattingElement(self, token): 

961 self.tree.insertElement(token) 

962 element = self.tree.openElements[-1] 

963 

964 matchingElements = [] 

965 for node in self.tree.activeFormattingElements[::-1]: 

966 if node is Marker: 

967 break 

968 elif self.isMatchingFormattingElement(node, element): 

969 matchingElements.append(node) 

970 

971 assert len(matchingElements) <= 3 

972 if len(matchingElements) == 3: 

973 self.tree.activeFormattingElements.remove(matchingElements[-1]) 

974 self.tree.activeFormattingElements.append(element) 

975 

976 # the real deal 

977 def processEOF(self): 

978 allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", 

979 "tfoot", "th", "thead", "tr", "body", 

980 "html")) 

981 for node in self.tree.openElements[::-1]: 

982 if node.name not in allowed_elements: 

983 self.parser.parseError("expected-closing-tag-but-got-eof") 

984 break 

985 # Stop parsing 

986 

987 def processSpaceCharactersDropNewline(self, token): 

988 # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we 

989 # want to drop leading newlines 

990 data = token["data"] 

991 self.processSpaceCharacters = self.processSpaceCharactersNonPre 

992 if (data.startswith("\n") and 

993 self.tree.openElements[-1].name in ("pre", "listing", "textarea") and 

994 not self.tree.openElements[-1].hasContent()): 

995 data = data[1:] 

996 if data: 

997 self.tree.reconstructActiveFormattingElements() 

998 self.tree.insertText(data) 

999 

1000 def processCharacters(self, token): 

1001 if token["data"] == "\u0000": 

1002 # The tokenizer should always emit null on its own 

1003 return 

1004 self.tree.reconstructActiveFormattingElements() 

1005 self.tree.insertText(token["data"]) 

1006 # This must be bad for performance 

1007 if (self.parser.framesetOK and 

1008 any(char not in spaceCharacters 

1009 for char in token["data"])): 

1010 self.parser.framesetOK = False 

1011 

1012 def processSpaceCharactersNonPre(self, token): 

1013 self.tree.reconstructActiveFormattingElements() 

1014 self.tree.insertText(token["data"]) 

1015 

1016 def startTagProcessInHead(self, token): 

1017 return self.parser.phases["inHead"].processStartTag(token) 

1018 

1019 def startTagBody(self, token): 

1020 self.parser.parseError("unexpected-start-tag", {"name": "body"}) 

1021 if (len(self.tree.openElements) == 1 or 

1022 self.tree.openElements[1].name != "body"): 

1023 assert self.parser.innerHTML 

1024 else: 

1025 self.parser.framesetOK = False 

1026 for attr, value in token["data"].items(): 

1027 if attr not in self.tree.openElements[1].attributes: 

1028 self.tree.openElements[1].attributes[attr] = value 

1029 

1030 def startTagFrameset(self, token): 

1031 self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) 

1032 if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): 

1033 assert self.parser.innerHTML 

1034 elif not self.parser.framesetOK: 

1035 pass 

1036 else: 

1037 if self.tree.openElements[1].parent: 

1038 self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) 

1039 while self.tree.openElements[-1].name != "html": 

1040 self.tree.openElements.pop() 

1041 self.tree.insertElement(token) 

1042 self.parser.phase = self.parser.phases["inFrameset"] 

1043 

1044 def startTagCloseP(self, token): 

1045 if self.tree.elementInScope("p", variant="button"): 

1046 self.endTagP(impliedTagToken("p")) 

1047 self.tree.insertElement(token) 

1048 

1049 def startTagPreListing(self, token): 

1050 if self.tree.elementInScope("p", variant="button"): 

1051 self.endTagP(impliedTagToken("p")) 

1052 self.tree.insertElement(token) 

1053 self.parser.framesetOK = False 

1054 self.processSpaceCharacters = self.processSpaceCharactersDropNewline 

1055 

1056 def startTagForm(self, token): 

1057 if self.tree.formPointer: 

1058 self.parser.parseError("unexpected-start-tag", {"name": "form"}) 

1059 else: 

1060 if self.tree.elementInScope("p", variant="button"): 

1061 self.endTagP(impliedTagToken("p")) 

1062 self.tree.insertElement(token) 

1063 self.tree.formPointer = self.tree.openElements[-1] 

1064 

1065 def startTagListItem(self, token): 

1066 self.parser.framesetOK = False 

1067 

1068 stopNamesMap = {"li": ["li"], 

1069 "dt": ["dt", "dd"], 

1070 "dd": ["dt", "dd"]} 

1071 stopNames = stopNamesMap[token["name"]] 

1072 for node in reversed(self.tree.openElements): 

1073 if node.name in stopNames: 

1074 self.parser.phase.processEndTag( 

1075 impliedTagToken(node.name, "EndTag")) 

1076 break 

1077 if (node.nameTuple in specialElements and 

1078 node.name not in ("address", "div", "p")): 

1079 break 

1080 

1081 if self.tree.elementInScope("p", variant="button"): 

1082 self.parser.phase.processEndTag( 

1083 impliedTagToken("p", "EndTag")) 

1084 

1085 self.tree.insertElement(token) 

1086 

1087 def startTagPlaintext(self, token): 

1088 if self.tree.elementInScope("p", variant="button"): 

1089 self.endTagP(impliedTagToken("p")) 

1090 self.tree.insertElement(token) 

1091 self.parser.tokenizer.state = self.parser.tokenizer.plaintextState 

1092 

1093 def startTagHeading(self, token): 

1094 if self.tree.elementInScope("p", variant="button"): 

1095 self.endTagP(impliedTagToken("p")) 

1096 if self.tree.openElements[-1].name in headingElements: 

1097 self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) 

1098 self.tree.openElements.pop() 

1099 self.tree.insertElement(token) 

1100 

1101 def startTagA(self, token): 

1102 afeAElement = self.tree.elementInActiveFormattingElements("a") 

1103 if afeAElement: 

1104 self.parser.parseError("unexpected-start-tag-implies-end-tag", 

1105 {"startName": "a", "endName": "a"}) 

1106 self.endTagFormatting(impliedTagToken("a")) 

1107 if afeAElement in self.tree.openElements: 

1108 self.tree.openElements.remove(afeAElement) 

1109 if afeAElement in self.tree.activeFormattingElements: 

1110 self.tree.activeFormattingElements.remove(afeAElement) 

1111 self.tree.reconstructActiveFormattingElements() 

1112 self.addFormattingElement(token) 

1113 

1114 def startTagFormatting(self, token): 

1115 self.tree.reconstructActiveFormattingElements() 

1116 self.addFormattingElement(token) 

1117 

1118 def startTagNobr(self, token): 

1119 self.tree.reconstructActiveFormattingElements() 

1120 if self.tree.elementInScope("nobr"): 

1121 self.parser.parseError("unexpected-start-tag-implies-end-tag", 

1122 {"startName": "nobr", "endName": "nobr"}) 

1123 self.processEndTag(impliedTagToken("nobr")) 

1124 # XXX Need tests that trigger the following 

1125 self.tree.reconstructActiveFormattingElements() 

1126 self.addFormattingElement(token) 

1127 

1128 def startTagButton(self, token): 

1129 if self.tree.elementInScope("button"): 

1130 self.parser.parseError("unexpected-start-tag-implies-end-tag", 

1131 {"startName": "button", "endName": "button"}) 

1132 self.processEndTag(impliedTagToken("button")) 

1133 return token 

1134 else: 

1135 self.tree.reconstructActiveFormattingElements() 

1136 self.tree.insertElement(token) 

1137 self.parser.framesetOK = False 

1138 

1139 def startTagAppletMarqueeObject(self, token): 

1140 self.tree.reconstructActiveFormattingElements() 

1141 self.tree.insertElement(token) 

1142 self.tree.activeFormattingElements.append(Marker) 

1143 self.parser.framesetOK = False 

1144 

1145 def startTagXmp(self, token): 

1146 if self.tree.elementInScope("p", variant="button"): 

1147 self.endTagP(impliedTagToken("p")) 

1148 self.tree.reconstructActiveFormattingElements() 

1149 self.parser.framesetOK = False 

1150 self.parser.parseRCDataRawtext(token, "RAWTEXT") 

1151 

1152 def startTagTable(self, token): 

1153 if self.parser.compatMode != "quirks": 

1154 if self.tree.elementInScope("p", variant="button"): 

1155 self.processEndTag(impliedTagToken("p")) 

1156 self.tree.insertElement(token) 

1157 self.parser.framesetOK = False 

1158 self.parser.phase = self.parser.phases["inTable"] 

1159 

1160 def startTagVoidFormatting(self, token): 

1161 self.tree.reconstructActiveFormattingElements() 

1162 self.tree.insertElement(token) 

1163 self.tree.openElements.pop() 

1164 token["selfClosingAcknowledged"] = True 

1165 self.parser.framesetOK = False 

1166 

1167 def startTagInput(self, token): 

1168 framesetOK = self.parser.framesetOK 

1169 self.startTagVoidFormatting(token) 

1170 if ("type" in token["data"] and 

1171 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): 

1172 # input type=hidden doesn't change framesetOK 

1173 self.parser.framesetOK = framesetOK 

1174 

1175 def startTagParamSource(self, token): 

1176 self.tree.insertElement(token) 

1177 self.tree.openElements.pop() 

1178 token["selfClosingAcknowledged"] = True 

1179 

1180 def startTagHr(self, token): 

1181 if self.tree.elementInScope("p", variant="button"): 

1182 self.endTagP(impliedTagToken("p")) 

1183 self.tree.insertElement(token) 

1184 self.tree.openElements.pop() 

1185 token["selfClosingAcknowledged"] = True 

1186 self.parser.framesetOK = False 

1187 

1188 def startTagImage(self, token): 

1189 # No really... 

1190 self.parser.parseError("unexpected-start-tag-treated-as", 

1191 {"originalName": "image", "newName": "img"}) 

1192 self.processStartTag(impliedTagToken("img", "StartTag", 

1193 attributes=token["data"], 

1194 selfClosing=token["selfClosing"])) 

1195 

1196 def startTagIsIndex(self, token): 

1197 self.parser.parseError("deprecated-tag", {"name": "isindex"}) 

1198 if self.tree.formPointer: 

1199 return 

1200 form_attrs = {} 

1201 if "action" in token["data"]: 

1202 form_attrs["action"] = token["data"]["action"] 

1203 self.processStartTag(impliedTagToken("form", "StartTag", 

1204 attributes=form_attrs)) 

1205 self.processStartTag(impliedTagToken("hr", "StartTag")) 

1206 self.processStartTag(impliedTagToken("label", "StartTag")) 

1207 # XXX Localization ... 

1208 if "prompt" in token["data"]: 

1209 prompt = token["data"]["prompt"] 

1210 else: 

1211 prompt = "This is a searchable index. Enter search keywords: " 

1212 self.processCharacters( 

1213 {"type": tokenTypes["Characters"], "data": prompt}) 

1214 attributes = token["data"].copy() 

1215 if "action" in attributes: 

1216 del attributes["action"] 

1217 if "prompt" in attributes: 

1218 del attributes["prompt"] 

1219 attributes["name"] = "isindex" 

1220 self.processStartTag(impliedTagToken("input", "StartTag", 

1221 attributes=attributes, 

1222 selfClosing=token["selfClosing"])) 

1223 self.processEndTag(impliedTagToken("label")) 

1224 self.processStartTag(impliedTagToken("hr", "StartTag")) 

1225 self.processEndTag(impliedTagToken("form")) 

1226 

1227 def startTagTextarea(self, token): 

1228 self.tree.insertElement(token) 

1229 self.parser.tokenizer.state = self.parser.tokenizer.rcdataState 

1230 self.processSpaceCharacters = self.processSpaceCharactersDropNewline 

1231 self.parser.framesetOK = False 

1232 

1233 def startTagIFrame(self, token): 

1234 self.parser.framesetOK = False 

1235 self.startTagRawtext(token) 

1236 

1237 def startTagNoscript(self, token): 

1238 if self.parser.scripting: 

1239 self.startTagRawtext(token) 

1240 else: 

1241 self.startTagOther(token) 

1242 

1243 def startTagRawtext(self, token): 

1244 """iframe, noembed noframes, noscript(if scripting enabled)""" 

1245 self.parser.parseRCDataRawtext(token, "RAWTEXT") 

1246 

1247 def startTagOpt(self, token): 

1248 if self.tree.openElements[-1].name == "option": 

1249 self.parser.phase.processEndTag(impliedTagToken("option")) 

1250 self.tree.reconstructActiveFormattingElements() 

1251 self.parser.tree.insertElement(token) 

1252 

1253 def startTagSelect(self, token): 

1254 self.tree.reconstructActiveFormattingElements() 

1255 self.tree.insertElement(token) 

1256 self.parser.framesetOK = False 

1257 if self.parser.phase in (self.parser.phases["inTable"], 

1258 self.parser.phases["inCaption"], 

1259 self.parser.phases["inColumnGroup"], 

1260 self.parser.phases["inTableBody"], 

1261 self.parser.phases["inRow"], 

1262 self.parser.phases["inCell"]): 

1263 self.parser.phase = self.parser.phases["inSelectInTable"] 

1264 else: 

1265 self.parser.phase = self.parser.phases["inSelect"] 

1266 

1267 def startTagRpRt(self, token): 

1268 if self.tree.elementInScope("ruby"): 

1269 self.tree.generateImpliedEndTags() 

1270 if self.tree.openElements[-1].name != "ruby": 

1271 self.parser.parseError() 

1272 self.tree.insertElement(token) 

1273 

1274 def startTagMath(self, token): 

1275 self.tree.reconstructActiveFormattingElements() 

1276 self.parser.adjustMathMLAttributes(token) 

1277 self.parser.adjustForeignAttributes(token) 

1278 token["namespace"] = namespaces["mathml"] 

1279 self.tree.insertElement(token) 

1280 # Need to get the parse error right for the case where the token 

1281 # has a namespace not equal to the xmlns attribute 

1282 if token["selfClosing"]: 

1283 self.tree.openElements.pop() 

1284 token["selfClosingAcknowledged"] = True 

1285 

1286 def startTagSvg(self, token): 

1287 self.tree.reconstructActiveFormattingElements() 

1288 self.parser.adjustSVGAttributes(token) 

1289 self.parser.adjustForeignAttributes(token) 

1290 token["namespace"] = namespaces["svg"] 

1291 self.tree.insertElement(token) 

1292 # Need to get the parse error right for the case where the token 

1293 # has a namespace not equal to the xmlns attribute 

1294 if token["selfClosing"]: 

1295 self.tree.openElements.pop() 

1296 token["selfClosingAcknowledged"] = True 

1297 

1298 def startTagMisplaced(self, token): 

1299 """ Elements that should be children of other elements that have a 

1300 different insertion mode; here they are ignored 

1301 "caption", "col", "colgroup", "frame", "frameset", "head", 

1302 "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", 

1303 "tr", "noscript" 

1304 """ 

1305 self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) 

1306 

1307 def startTagOther(self, token): 

1308 self.tree.reconstructActiveFormattingElements() 

1309 self.tree.insertElement(token) 

1310 

1311 def endTagP(self, token): 

1312 if not self.tree.elementInScope("p", variant="button"): 

1313 self.startTagCloseP(impliedTagToken("p", "StartTag")) 

1314 self.parser.parseError("unexpected-end-tag", {"name": "p"}) 

1315 self.endTagP(impliedTagToken("p", "EndTag")) 

1316 else: 

1317 self.tree.generateImpliedEndTags("p") 

1318 if self.tree.openElements[-1].name != "p": 

1319 self.parser.parseError("unexpected-end-tag", {"name": "p"}) 

1320 node = self.tree.openElements.pop() 

1321 while node.name != "p": 

1322 node = self.tree.openElements.pop() 

1323 

1324 def endTagBody(self, token): 

1325 if not self.tree.elementInScope("body"): 

1326 self.parser.parseError() 

1327 return 

1328 elif self.tree.openElements[-1].name != "body": 

1329 for node in self.tree.openElements[2:]: 

1330 if node.name not in frozenset(("dd", "dt", "li", "optgroup", 

1331 "option", "p", "rp", "rt", 

1332 "tbody", "td", "tfoot", 

1333 "th", "thead", "tr", "body", 

1334 "html")): 

1335 # Not sure this is the correct name for the parse error 

1336 self.parser.parseError( 

1337 "expected-one-end-tag-but-got-another", 

1338 {"gotName": "body", "expectedName": node.name}) 

1339 break 

1340 self.parser.phase = self.parser.phases["afterBody"] 

1341 

1342 def endTagHtml(self, token): 

1343 # We repeat the test for the body end tag token being ignored here 

1344 if self.tree.elementInScope("body"): 

1345 self.endTagBody(impliedTagToken("body")) 

1346 return token 

1347 

1348 def endTagBlock(self, token): 

1349 # Put us back in the right whitespace handling mode 

1350 if token["name"] == "pre": 

1351 self.processSpaceCharacters = self.processSpaceCharactersNonPre 

1352 inScope = self.tree.elementInScope(token["name"]) 

1353 if inScope: 

1354 self.tree.generateImpliedEndTags() 

1355 if self.tree.openElements[-1].name != token["name"]: 

1356 self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 

1357 if inScope: 

1358 node = self.tree.openElements.pop() 

1359 while node.name != token["name"]: 

1360 node = self.tree.openElements.pop() 

1361 

1362 def endTagForm(self, token): 

1363 node = self.tree.formPointer 

1364 self.tree.formPointer = None 

1365 if node is None or not self.tree.elementInScope(node): 

1366 self.parser.parseError("unexpected-end-tag", 

1367 {"name": "form"}) 

1368 else: 

1369 self.tree.generateImpliedEndTags() 

1370 if self.tree.openElements[-1] != node: 

1371 self.parser.parseError("end-tag-too-early-ignored", 

1372 {"name": "form"}) 

1373 self.tree.openElements.remove(node) 

1374 

1375 def endTagListItem(self, token): 

1376 if token["name"] == "li": 

1377 variant = "list" 

1378 else: 

1379 variant = None 

1380 if not self.tree.elementInScope(token["name"], variant=variant): 

1381 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

1382 else: 

1383 self.tree.generateImpliedEndTags(exclude=token["name"]) 

1384 if self.tree.openElements[-1].name != token["name"]: 

1385 self.parser.parseError( 

1386 "end-tag-too-early", 

1387 {"name": token["name"]}) 

1388 node = self.tree.openElements.pop() 

1389 while node.name != token["name"]: 

1390 node = self.tree.openElements.pop() 

1391 

1392 def endTagHeading(self, token): 

1393 for item in headingElements: 

1394 if self.tree.elementInScope(item): 

1395 self.tree.generateImpliedEndTags() 

1396 break 

1397 if self.tree.openElements[-1].name != token["name"]: 

1398 self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 

1399 

1400 for item in headingElements: 

1401 if self.tree.elementInScope(item): 

1402 item = self.tree.openElements.pop() 

1403 while item.name not in headingElements: 

1404 item = self.tree.openElements.pop() 

1405 break 

1406 

1407 def endTagFormatting(self, token): 

1408 """The much-feared adoption agency algorithm""" 

1409 # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867 

1410 # XXX Better parseError messages appreciated. 

1411 

1412 # Step 1 

1413 outerLoopCounter = 0 

1414 

1415 # Step 2 

1416 while outerLoopCounter < 8: 

1417 

1418 # Step 3 

1419 outerLoopCounter += 1 

1420 

1421 # Step 4: 

1422 

1423 # Let the formatting element be the last element in 

1424 # the list of active formatting elements that: 

1425 # - is between the end of the list and the last scope 

1426 # marker in the list, if any, or the start of the list 

1427 # otherwise, and 

1428 # - has the same tag name as the token. 

1429 formattingElement = self.tree.elementInActiveFormattingElements( 

1430 token["name"]) 

1431 if (not formattingElement or 

1432 (formattingElement in self.tree.openElements and 

1433 not self.tree.elementInScope(formattingElement.name))): 

1434 # If there is no such node, then abort these steps 

1435 # and instead act as described in the "any other 

1436 # end tag" entry below. 

1437 self.endTagOther(token) 

1438 return 

1439 

1440 # Otherwise, if there is such a node, but that node is 

1441 # not in the stack of open elements, then this is a 

1442 # parse error; remove the element from the list, and 

1443 # abort these steps. 

1444 elif formattingElement not in self.tree.openElements: 

1445 self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) 

1446 self.tree.activeFormattingElements.remove(formattingElement) 

1447 return 

1448 

1449 # Otherwise, if there is such a node, and that node is 

1450 # also in the stack of open elements, but the element 

1451 # is not in scope, then this is a parse error; ignore 

1452 # the token, and abort these steps. 

1453 elif not self.tree.elementInScope(formattingElement.name): 

1454 self.parser.parseError("adoption-agency-4.4", {"name": token["name"]}) 

1455 return 

1456 

1457 # Otherwise, there is a formatting element and that 

1458 # element is in the stack and is in scope. If the 

1459 # element is not the current node, this is a parse 

1460 # error. In any case, proceed with the algorithm as 

1461 # written in the following steps. 

1462 else: 

1463 if formattingElement != self.tree.openElements[-1]: 

1464 self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) 

1465 

1466 # Step 5: 

1467 

1468 # Let the furthest block be the topmost node in the 

1469 # stack of open elements that is lower in the stack 

1470 # than the formatting element, and is an element in 

1471 # the special category. There might not be one. 

1472 afeIndex = self.tree.openElements.index(formattingElement) 

1473 furthestBlock = None 

1474 for element in self.tree.openElements[afeIndex:]: 

1475 if element.nameTuple in specialElements: 

1476 furthestBlock = element 

1477 break 

1478 

1479 # Step 6: 

1480 

1481 # If there is no furthest block, then the UA must 

1482 # first pop all the nodes from the bottom of the stack 

1483 # of open elements, from the current node up to and 

1484 # including the formatting element, then remove the 

1485 # formatting element from the list of active 

1486 # formatting elements, and finally abort these steps. 

1487 if furthestBlock is None: 

1488 element = self.tree.openElements.pop() 

1489 while element != formattingElement: 

1490 element = self.tree.openElements.pop() 

1491 self.tree.activeFormattingElements.remove(element) 

1492 return 

1493 

1494 # Step 7 

1495 commonAncestor = self.tree.openElements[afeIndex - 1] 

1496 

1497 # Step 8: 

1498 # The bookmark is supposed to help us identify where to reinsert 

1499 # nodes in step 15. We have to ensure that we reinsert nodes after 

1500 # the node before the active formatting element. Note the bookmark 

1501 # can move in step 9.7 

1502 bookmark = self.tree.activeFormattingElements.index(formattingElement) 

1503 

1504 # Step 9 

1505 lastNode = node = furthestBlock 

1506 innerLoopCounter = 0 

1507 

1508 index = self.tree.openElements.index(node) 

1509 while innerLoopCounter < 3: 

1510 innerLoopCounter += 1 

1511 # Node is element before node in open elements 

1512 index -= 1 

1513 node = self.tree.openElements[index] 

1514 if node not in self.tree.activeFormattingElements: 

1515 self.tree.openElements.remove(node) 

1516 continue 

1517 # Step 9.6 

1518 if node == formattingElement: 

1519 break 

1520 # Step 9.7 

1521 if lastNode == furthestBlock: 

1522 bookmark = self.tree.activeFormattingElements.index(node) + 1 

1523 # Step 9.8 

1524 clone = node.cloneNode() 

1525 # Replace node with clone 

1526 self.tree.activeFormattingElements[ 

1527 self.tree.activeFormattingElements.index(node)] = clone 

1528 self.tree.openElements[ 

1529 self.tree.openElements.index(node)] = clone 

1530 node = clone 

1531 # Step 9.9 

1532 # Remove lastNode from its parents, if any 

1533 if lastNode.parent: 

1534 lastNode.parent.removeChild(lastNode) 

1535 node.appendChild(lastNode) 

1536 # Step 9.10 

1537 lastNode = node 

1538 

1539 # Step 10 

1540 # Foster parent lastNode if commonAncestor is a 

1541 # table, tbody, tfoot, thead, or tr we need to foster 

1542 # parent the lastNode 

1543 if lastNode.parent: 

1544 lastNode.parent.removeChild(lastNode) 

1545 

1546 if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")): 

1547 parent, insertBefore = self.tree.getTableMisnestedNodePosition() 

1548 parent.insertBefore(lastNode, insertBefore) 

1549 else: 

1550 commonAncestor.appendChild(lastNode) 

1551 

1552 # Step 11 

1553 clone = formattingElement.cloneNode() 

1554 

1555 # Step 12 

1556 furthestBlock.reparentChildren(clone) 

1557 

1558 # Step 13 

1559 furthestBlock.appendChild(clone) 

1560 

1561 # Step 14 

1562 self.tree.activeFormattingElements.remove(formattingElement) 

1563 self.tree.activeFormattingElements.insert(bookmark, clone) 

1564 

1565 # Step 15 

1566 self.tree.openElements.remove(formattingElement) 

1567 self.tree.openElements.insert( 

1568 self.tree.openElements.index(furthestBlock) + 1, clone) 

1569 

1570 def endTagAppletMarqueeObject(self, token): 

1571 if self.tree.elementInScope(token["name"]): 

1572 self.tree.generateImpliedEndTags() 

1573 if self.tree.openElements[-1].name != token["name"]: 

1574 self.parser.parseError("end-tag-too-early", {"name": token["name"]}) 

1575 

1576 if self.tree.elementInScope(token["name"]): 

1577 element = self.tree.openElements.pop() 

1578 while element.name != token["name"]: 

1579 element = self.tree.openElements.pop() 

1580 self.tree.clearActiveFormattingElements() 

1581 

1582 def endTagBr(self, token): 

1583 self.parser.parseError("unexpected-end-tag-treated-as", 

1584 {"originalName": "br", "newName": "br element"}) 

1585 self.tree.reconstructActiveFormattingElements() 

1586 self.tree.insertElement(impliedTagToken("br", "StartTag")) 

1587 self.tree.openElements.pop() 

1588 

1589 def endTagOther(self, token): 

1590 for node in self.tree.openElements[::-1]: 

1591 if node.name == token["name"]: 

1592 self.tree.generateImpliedEndTags(exclude=token["name"]) 

1593 if self.tree.openElements[-1].name != token["name"]: 

1594 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

1595 while self.tree.openElements.pop() != node: 

1596 pass 

1597 break 

1598 else: 

1599 if node.nameTuple in specialElements: 

1600 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

1601 break 

1602 

1603 startTagHandler = _utils.MethodDispatcher([ 

1604 ("html", Phase.startTagHtml), 

1605 (("base", "basefont", "bgsound", "command", "link", "meta", 

1606 "script", "style", "title"), 

1607 startTagProcessInHead), 

1608 ("body", startTagBody), 

1609 ("frameset", startTagFrameset), 

1610 (("address", "article", "aside", "blockquote", "center", "details", 

1611 "dir", "div", "dl", "fieldset", "figcaption", "figure", 

1612 "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", 

1613 "section", "summary", "ul"), 

1614 startTagCloseP), 

1615 (headingElements, startTagHeading), 

1616 (("pre", "listing"), startTagPreListing), 

1617 ("form", startTagForm), 

1618 (("li", "dd", "dt"), startTagListItem), 

1619 ("plaintext", startTagPlaintext), 

1620 ("a", startTagA), 

1621 (("b", "big", "code", "em", "font", "i", "s", "small", "strike", 

1622 "strong", "tt", "u"), startTagFormatting), 

1623 ("nobr", startTagNobr), 

1624 ("button", startTagButton), 

1625 (("applet", "marquee", "object"), startTagAppletMarqueeObject), 

1626 ("xmp", startTagXmp), 

1627 ("table", startTagTable), 

1628 (("area", "br", "embed", "img", "keygen", "wbr"), 

1629 startTagVoidFormatting), 

1630 (("param", "source", "track"), startTagParamSource), 

1631 ("input", startTagInput), 

1632 ("hr", startTagHr), 

1633 ("image", startTagImage), 

1634 ("isindex", startTagIsIndex), 

1635 ("textarea", startTagTextarea), 

1636 ("iframe", startTagIFrame), 

1637 ("noscript", startTagNoscript), 

1638 (("noembed", "noframes"), startTagRawtext), 

1639 ("select", startTagSelect), 

1640 (("rp", "rt"), startTagRpRt), 

1641 (("option", "optgroup"), startTagOpt), 

1642 (("math"), startTagMath), 

1643 (("svg"), startTagSvg), 

1644 (("caption", "col", "colgroup", "frame", "head", 

1645 "tbody", "td", "tfoot", "th", "thead", 

1646 "tr"), startTagMisplaced) 

1647 ]) 

1648 startTagHandler.default = startTagOther 

1649 

1650 endTagHandler = _utils.MethodDispatcher([ 

1651 ("body", endTagBody), 

1652 ("html", endTagHtml), 

1653 (("address", "article", "aside", "blockquote", "button", "center", 

1654 "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", 

1655 "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", 

1656 "section", "summary", "ul"), endTagBlock), 

1657 ("form", endTagForm), 

1658 ("p", endTagP), 

1659 (("dd", "dt", "li"), endTagListItem), 

1660 (headingElements, endTagHeading), 

1661 (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", 

1662 "strike", "strong", "tt", "u"), endTagFormatting), 

1663 (("applet", "marquee", "object"), endTagAppletMarqueeObject), 

1664 ("br", endTagBr), 

1665 ]) 

1666 endTagHandler.default = endTagOther 

1667 

1668 class TextPhase(Phase): 

1669 __slots__ = tuple() 

1670 

1671 def processCharacters(self, token): 

1672 self.tree.insertText(token["data"]) 

1673 

1674 def processEOF(self): 

1675 self.parser.parseError("expected-named-closing-tag-but-got-eof", 

1676 {"name": self.tree.openElements[-1].name}) 

1677 self.tree.openElements.pop() 

1678 self.parser.phase = self.parser.originalPhase 

1679 return True 

1680 

1681 def startTagOther(self, token): 

1682 assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name'] 

1683 

1684 def endTagScript(self, token): 

1685 node = self.tree.openElements.pop() 

1686 assert node.name == "script" 

1687 self.parser.phase = self.parser.originalPhase 

1688 # The rest of this method is all stuff that only happens if 

1689 # document.write works 

1690 

1691 def endTagOther(self, token): 

1692 self.tree.openElements.pop() 

1693 self.parser.phase = self.parser.originalPhase 

1694 

1695 startTagHandler = _utils.MethodDispatcher([]) 

1696 startTagHandler.default = startTagOther 

1697 endTagHandler = _utils.MethodDispatcher([ 

1698 ("script", endTagScript)]) 

1699 endTagHandler.default = endTagOther 

1700 

1701 class InTablePhase(Phase): 

1702 # http://www.whatwg.org/specs/web-apps/current-work/#in-table 

1703 __slots__ = tuple() 

1704 

1705 # helper methods 

1706 def clearStackToTableContext(self): 

1707 # "clear the stack back to a table context" 

1708 while self.tree.openElements[-1].name not in ("table", "html"): 

1709 # self.parser.parseError("unexpected-implied-end-tag-in-table", 

1710 # {"name": self.tree.openElements[-1].name}) 

1711 self.tree.openElements.pop() 

1712 # When the current node is <html> it's an innerHTML case 

1713 

1714 # processing methods 

1715 def processEOF(self): 

1716 if self.tree.openElements[-1].name != "html": 

1717 self.parser.parseError("eof-in-table") 

1718 else: 

1719 assert self.parser.innerHTML 

1720 # Stop parsing 

1721 

1722 def processSpaceCharacters(self, token): 

1723 originalPhase = self.parser.phase 

1724 self.parser.phase = self.parser.phases["inTableText"] 

1725 self.parser.phase.originalPhase = originalPhase 

1726 self.parser.phase.processSpaceCharacters(token) 

1727 

1728 def processCharacters(self, token): 

1729 originalPhase = self.parser.phase 

1730 self.parser.phase = self.parser.phases["inTableText"] 

1731 self.parser.phase.originalPhase = originalPhase 

1732 self.parser.phase.processCharacters(token) 

1733 

1734 def insertText(self, token): 

1735 # If we get here there must be at least one non-whitespace character 

1736 # Do the table magic! 

1737 self.tree.insertFromTable = True 

1738 self.parser.phases["inBody"].processCharacters(token) 

1739 self.tree.insertFromTable = False 

1740 

1741 def startTagCaption(self, token): 

1742 self.clearStackToTableContext() 

1743 self.tree.activeFormattingElements.append(Marker) 

1744 self.tree.insertElement(token) 

1745 self.parser.phase = self.parser.phases["inCaption"] 

1746 

1747 def startTagColgroup(self, token): 

1748 self.clearStackToTableContext() 

1749 self.tree.insertElement(token) 

1750 self.parser.phase = self.parser.phases["inColumnGroup"] 

1751 

1752 def startTagCol(self, token): 

1753 self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) 

1754 return token 

1755 

1756 def startTagRowGroup(self, token): 

1757 self.clearStackToTableContext() 

1758 self.tree.insertElement(token) 

1759 self.parser.phase = self.parser.phases["inTableBody"] 

1760 

1761 def startTagImplyTbody(self, token): 

1762 self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) 

1763 return token 

1764 

1765 def startTagTable(self, token): 

1766 self.parser.parseError("unexpected-start-tag-implies-end-tag", 

1767 {"startName": "table", "endName": "table"}) 

1768 self.parser.phase.processEndTag(impliedTagToken("table")) 

1769 if not self.parser.innerHTML: 

1770 return token 

1771 

1772 def startTagStyleScript(self, token): 

1773 return self.parser.phases["inHead"].processStartTag(token) 

1774 

1775 def startTagInput(self, token): 

1776 if ("type" in token["data"] and 

1777 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): 

1778 self.parser.parseError("unexpected-hidden-input-in-table") 

1779 self.tree.insertElement(token) 

1780 # XXX associate with form 

1781 self.tree.openElements.pop() 

1782 else: 

1783 self.startTagOther(token) 

1784 

1785 def startTagForm(self, token): 

1786 self.parser.parseError("unexpected-form-in-table") 

1787 if self.tree.formPointer is None: 

1788 self.tree.insertElement(token) 

1789 self.tree.formPointer = self.tree.openElements[-1] 

1790 self.tree.openElements.pop() 

1791 

1792 def startTagOther(self, token): 

1793 self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) 

1794 # Do the table magic! 

1795 self.tree.insertFromTable = True 

1796 self.parser.phases["inBody"].processStartTag(token) 

1797 self.tree.insertFromTable = False 

1798 

1799 def endTagTable(self, token): 

1800 if self.tree.elementInScope("table", variant="table"): 

1801 self.tree.generateImpliedEndTags() 

1802 if self.tree.openElements[-1].name != "table": 

1803 self.parser.parseError("end-tag-too-early-named", 

1804 {"gotName": "table", 

1805 "expectedName": self.tree.openElements[-1].name}) 

1806 while self.tree.openElements[-1].name != "table": 

1807 self.tree.openElements.pop() 

1808 self.tree.openElements.pop() 

1809 self.parser.resetInsertionMode() 

1810 else: 

1811 # innerHTML case 

1812 assert self.parser.innerHTML 

1813 self.parser.parseError() 

1814 

1815 def endTagIgnore(self, token): 

1816 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

1817 

1818 def endTagOther(self, token): 

1819 self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) 

1820 # Do the table magic! 

1821 self.tree.insertFromTable = True 

1822 self.parser.phases["inBody"].processEndTag(token) 

1823 self.tree.insertFromTable = False 

1824 

1825 startTagHandler = _utils.MethodDispatcher([ 

1826 ("html", Phase.startTagHtml), 

1827 ("caption", startTagCaption), 

1828 ("colgroup", startTagColgroup), 

1829 ("col", startTagCol), 

1830 (("tbody", "tfoot", "thead"), startTagRowGroup), 

1831 (("td", "th", "tr"), startTagImplyTbody), 

1832 ("table", startTagTable), 

1833 (("style", "script"), startTagStyleScript), 

1834 ("input", startTagInput), 

1835 ("form", startTagForm) 

1836 ]) 

1837 startTagHandler.default = startTagOther 

1838 

1839 endTagHandler = _utils.MethodDispatcher([ 

1840 ("table", endTagTable), 

1841 (("body", "caption", "col", "colgroup", "html", "tbody", "td", 

1842 "tfoot", "th", "thead", "tr"), endTagIgnore) 

1843 ]) 

1844 endTagHandler.default = endTagOther 

1845 

1846 class InTableTextPhase(Phase): 

1847 __slots__ = ("originalPhase", "characterTokens") 

1848 

1849 def __init__(self, *args, **kwargs): 

1850 super(InTableTextPhase, self).__init__(*args, **kwargs) 

1851 self.originalPhase = None 

1852 self.characterTokens = [] 

1853 

1854 def flushCharacters(self): 

1855 data = "".join([item["data"] for item in self.characterTokens]) 

1856 if any(item not in spaceCharacters for item in data): 

1857 token = {"type": tokenTypes["Characters"], "data": data} 

1858 self.parser.phases["inTable"].insertText(token) 

1859 elif data: 

1860 self.tree.insertText(data) 

1861 self.characterTokens = [] 

1862 

1863 def processComment(self, token): 

1864 self.flushCharacters() 

1865 self.parser.phase = self.originalPhase 

1866 return token 

1867 

1868 def processEOF(self): 

1869 self.flushCharacters() 

1870 self.parser.phase = self.originalPhase 

1871 return True 

1872 

1873 def processCharacters(self, token): 

1874 if token["data"] == "\u0000": 

1875 return 

1876 self.characterTokens.append(token) 

1877 

1878 def processSpaceCharacters(self, token): 

1879 # pretty sure we should never reach here 

1880 self.characterTokens.append(token) 

1881 # assert False 

1882 

1883 def processStartTag(self, token): 

1884 self.flushCharacters() 

1885 self.parser.phase = self.originalPhase 

1886 return token 

1887 

1888 def processEndTag(self, token): 

1889 self.flushCharacters() 

1890 self.parser.phase = self.originalPhase 

1891 return token 

1892 

1893 class InCaptionPhase(Phase): 

1894 # http://www.whatwg.org/specs/web-apps/current-work/#in-caption 

1895 __slots__ = tuple() 

1896 

1897 def ignoreEndTagCaption(self): 

1898 return not self.tree.elementInScope("caption", variant="table") 

1899 

1900 def processEOF(self): 

1901 self.parser.phases["inBody"].processEOF() 

1902 

1903 def processCharacters(self, token): 

1904 return self.parser.phases["inBody"].processCharacters(token) 

1905 

1906 def startTagTableElement(self, token): 

1907 self.parser.parseError() 

1908 # XXX Have to duplicate logic here to find out if the tag is ignored 

1909 ignoreEndTag = self.ignoreEndTagCaption() 

1910 self.parser.phase.processEndTag(impliedTagToken("caption")) 

1911 if not ignoreEndTag: 

1912 return token 

1913 

1914 def startTagOther(self, token): 

1915 return self.parser.phases["inBody"].processStartTag(token) 

1916 

1917 def endTagCaption(self, token): 

1918 if not self.ignoreEndTagCaption(): 

1919 # AT this code is quite similar to endTagTable in "InTable" 

1920 self.tree.generateImpliedEndTags() 

1921 if self.tree.openElements[-1].name != "caption": 

1922 self.parser.parseError("expected-one-end-tag-but-got-another", 

1923 {"gotName": "caption", 

1924 "expectedName": self.tree.openElements[-1].name}) 

1925 while self.tree.openElements[-1].name != "caption": 

1926 self.tree.openElements.pop() 

1927 self.tree.openElements.pop() 

1928 self.tree.clearActiveFormattingElements() 

1929 self.parser.phase = self.parser.phases["inTable"] 

1930 else: 

1931 # innerHTML case 

1932 assert self.parser.innerHTML 

1933 self.parser.parseError() 

1934 

1935 def endTagTable(self, token): 

1936 self.parser.parseError() 

1937 ignoreEndTag = self.ignoreEndTagCaption() 

1938 self.parser.phase.processEndTag(impliedTagToken("caption")) 

1939 if not ignoreEndTag: 

1940 return token 

1941 

1942 def endTagIgnore(self, token): 

1943 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

1944 

1945 def endTagOther(self, token): 

1946 return self.parser.phases["inBody"].processEndTag(token) 

1947 

1948 startTagHandler = _utils.MethodDispatcher([ 

1949 ("html", Phase.startTagHtml), 

1950 (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", 

1951 "thead", "tr"), startTagTableElement) 

1952 ]) 

1953 startTagHandler.default = startTagOther 

1954 

1955 endTagHandler = _utils.MethodDispatcher([ 

1956 ("caption", endTagCaption), 

1957 ("table", endTagTable), 

1958 (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", 

1959 "thead", "tr"), endTagIgnore) 

1960 ]) 

1961 endTagHandler.default = endTagOther 

1962 

1963 class InColumnGroupPhase(Phase): 

1964 # http://www.whatwg.org/specs/web-apps/current-work/#in-column 

1965 __slots__ = tuple() 

1966 

1967 def ignoreEndTagColgroup(self): 

1968 return self.tree.openElements[-1].name == "html" 

1969 

1970 def processEOF(self): 

1971 if self.tree.openElements[-1].name == "html": 

1972 assert self.parser.innerHTML 

1973 return 

1974 else: 

1975 ignoreEndTag = self.ignoreEndTagColgroup() 

1976 self.endTagColgroup(impliedTagToken("colgroup")) 

1977 if not ignoreEndTag: 

1978 return True 

1979 

1980 def processCharacters(self, token): 

1981 ignoreEndTag = self.ignoreEndTagColgroup() 

1982 self.endTagColgroup(impliedTagToken("colgroup")) 

1983 if not ignoreEndTag: 

1984 return token 

1985 

1986 def startTagCol(self, token): 

1987 self.tree.insertElement(token) 

1988 self.tree.openElements.pop() 

1989 token["selfClosingAcknowledged"] = True 

1990 

1991 def startTagOther(self, token): 

1992 ignoreEndTag = self.ignoreEndTagColgroup() 

1993 self.endTagColgroup(impliedTagToken("colgroup")) 

1994 if not ignoreEndTag: 

1995 return token 

1996 

1997 def endTagColgroup(self, token): 

1998 if self.ignoreEndTagColgroup(): 

1999 # innerHTML case 

2000 assert self.parser.innerHTML 

2001 self.parser.parseError() 

2002 else: 

2003 self.tree.openElements.pop() 

2004 self.parser.phase = self.parser.phases["inTable"] 

2005 

2006 def endTagCol(self, token): 

2007 self.parser.parseError("no-end-tag", {"name": "col"}) 

2008 

2009 def endTagOther(self, token): 

2010 ignoreEndTag = self.ignoreEndTagColgroup() 

2011 self.endTagColgroup(impliedTagToken("colgroup")) 

2012 if not ignoreEndTag: 

2013 return token 

2014 

2015 startTagHandler = _utils.MethodDispatcher([ 

2016 ("html", Phase.startTagHtml), 

2017 ("col", startTagCol) 

2018 ]) 

2019 startTagHandler.default = startTagOther 

2020 

2021 endTagHandler = _utils.MethodDispatcher([ 

2022 ("colgroup", endTagColgroup), 

2023 ("col", endTagCol) 

2024 ]) 

2025 endTagHandler.default = endTagOther 

2026 

2027 class InTableBodyPhase(Phase): 

2028 # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 

2029 __slots__ = tuple() 

2030 

2031 # helper methods 

2032 def clearStackToTableBodyContext(self): 

2033 while self.tree.openElements[-1].name not in ("tbody", "tfoot", 

2034 "thead", "html"): 

2035 # self.parser.parseError("unexpected-implied-end-tag-in-table", 

2036 # {"name": self.tree.openElements[-1].name}) 

2037 self.tree.openElements.pop() 

2038 if self.tree.openElements[-1].name == "html": 

2039 assert self.parser.innerHTML 

2040 

2041 # the rest 

2042 def processEOF(self): 

2043 self.parser.phases["inTable"].processEOF() 

2044 

2045 def processSpaceCharacters(self, token): 

2046 return self.parser.phases["inTable"].processSpaceCharacters(token) 

2047 

2048 def processCharacters(self, token): 

2049 return self.parser.phases["inTable"].processCharacters(token) 

2050 

2051 def startTagTr(self, token): 

2052 self.clearStackToTableBodyContext() 

2053 self.tree.insertElement(token) 

2054 self.parser.phase = self.parser.phases["inRow"] 

2055 

2056 def startTagTableCell(self, token): 

2057 self.parser.parseError("unexpected-cell-in-table-body", 

2058 {"name": token["name"]}) 

2059 self.startTagTr(impliedTagToken("tr", "StartTag")) 

2060 return token 

2061 

2062 def startTagTableOther(self, token): 

2063 # XXX AT Any ideas on how to share this with endTagTable? 

2064 if (self.tree.elementInScope("tbody", variant="table") or 

2065 self.tree.elementInScope("thead", variant="table") or 

2066 self.tree.elementInScope("tfoot", variant="table")): 

2067 self.clearStackToTableBodyContext() 

2068 self.endTagTableRowGroup( 

2069 impliedTagToken(self.tree.openElements[-1].name)) 

2070 return token 

2071 else: 

2072 # innerHTML case 

2073 assert self.parser.innerHTML 

2074 self.parser.parseError() 

2075 

2076 def startTagOther(self, token): 

2077 return self.parser.phases["inTable"].processStartTag(token) 

2078 

2079 def endTagTableRowGroup(self, token): 

2080 if self.tree.elementInScope(token["name"], variant="table"): 

2081 self.clearStackToTableBodyContext() 

2082 self.tree.openElements.pop() 

2083 self.parser.phase = self.parser.phases["inTable"] 

2084 else: 

2085 self.parser.parseError("unexpected-end-tag-in-table-body", 

2086 {"name": token["name"]}) 

2087 

2088 def endTagTable(self, token): 

2089 if (self.tree.elementInScope("tbody", variant="table") or 

2090 self.tree.elementInScope("thead", variant="table") or 

2091 self.tree.elementInScope("tfoot", variant="table")): 

2092 self.clearStackToTableBodyContext() 

2093 self.endTagTableRowGroup( 

2094 impliedTagToken(self.tree.openElements[-1].name)) 

2095 return token 

2096 else: 

2097 # innerHTML case 

2098 assert self.parser.innerHTML 

2099 self.parser.parseError() 

2100 

2101 def endTagIgnore(self, token): 

2102 self.parser.parseError("unexpected-end-tag-in-table-body", 

2103 {"name": token["name"]}) 

2104 

2105 def endTagOther(self, token): 

2106 return self.parser.phases["inTable"].processEndTag(token) 

2107 

2108 startTagHandler = _utils.MethodDispatcher([ 

2109 ("html", Phase.startTagHtml), 

2110 ("tr", startTagTr), 

2111 (("td", "th"), startTagTableCell), 

2112 (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), 

2113 startTagTableOther) 

2114 ]) 

2115 startTagHandler.default = startTagOther 

2116 

2117 endTagHandler = _utils.MethodDispatcher([ 

2118 (("tbody", "tfoot", "thead"), endTagTableRowGroup), 

2119 ("table", endTagTable), 

2120 (("body", "caption", "col", "colgroup", "html", "td", "th", 

2121 "tr"), endTagIgnore) 

2122 ]) 

2123 endTagHandler.default = endTagOther 

2124 

2125 class InRowPhase(Phase): 

2126 # http://www.whatwg.org/specs/web-apps/current-work/#in-row 

2127 __slots__ = tuple() 

2128 

2129 # helper methods (XXX unify this with other table helper methods) 

2130 def clearStackToTableRowContext(self): 

2131 while self.tree.openElements[-1].name not in ("tr", "html"): 

2132 self.parser.parseError("unexpected-implied-end-tag-in-table-row", 

2133 {"name": self.tree.openElements[-1].name}) 

2134 self.tree.openElements.pop() 

2135 

2136 def ignoreEndTagTr(self): 

2137 return not self.tree.elementInScope("tr", variant="table") 

2138 

2139 # the rest 

2140 def processEOF(self): 

2141 self.parser.phases["inTable"].processEOF() 

2142 

2143 def processSpaceCharacters(self, token): 

2144 return self.parser.phases["inTable"].processSpaceCharacters(token) 

2145 

2146 def processCharacters(self, token): 

2147 return self.parser.phases["inTable"].processCharacters(token) 

2148 

2149 def startTagTableCell(self, token): 

2150 self.clearStackToTableRowContext() 

2151 self.tree.insertElement(token) 

2152 self.parser.phase = self.parser.phases["inCell"] 

2153 self.tree.activeFormattingElements.append(Marker) 

2154 

2155 def startTagTableOther(self, token): 

2156 ignoreEndTag = self.ignoreEndTagTr() 

2157 self.endTagTr(impliedTagToken("tr")) 

2158 # XXX how are we sure it's always ignored in the innerHTML case? 

2159 if not ignoreEndTag: 

2160 return token 

2161 

2162 def startTagOther(self, token): 

2163 return self.parser.phases["inTable"].processStartTag(token) 

2164 

2165 def endTagTr(self, token): 

2166 if not self.ignoreEndTagTr(): 

2167 self.clearStackToTableRowContext() 

2168 self.tree.openElements.pop() 

2169 self.parser.phase = self.parser.phases["inTableBody"] 

2170 else: 

2171 # innerHTML case 

2172 assert self.parser.innerHTML 

2173 self.parser.parseError() 

2174 

2175 def endTagTable(self, token): 

2176 ignoreEndTag = self.ignoreEndTagTr() 

2177 self.endTagTr(impliedTagToken("tr")) 

2178 # Reprocess the current tag if the tr end tag was not ignored 

2179 # XXX how are we sure it's always ignored in the innerHTML case? 

2180 if not ignoreEndTag: 

2181 return token 

2182 

2183 def endTagTableRowGroup(self, token): 

2184 if self.tree.elementInScope(token["name"], variant="table"): 

2185 self.endTagTr(impliedTagToken("tr")) 

2186 return token 

2187 else: 

2188 self.parser.parseError() 

2189 

2190 def endTagIgnore(self, token): 

2191 self.parser.parseError("unexpected-end-tag-in-table-row", 

2192 {"name": token["name"]}) 

2193 

2194 def endTagOther(self, token): 

2195 return self.parser.phases["inTable"].processEndTag(token) 

2196 

2197 startTagHandler = _utils.MethodDispatcher([ 

2198 ("html", Phase.startTagHtml), 

2199 (("td", "th"), startTagTableCell), 

2200 (("caption", "col", "colgroup", "tbody", "tfoot", "thead", 

2201 "tr"), startTagTableOther) 

2202 ]) 

2203 startTagHandler.default = startTagOther 

2204 

2205 endTagHandler = _utils.MethodDispatcher([ 

2206 ("tr", endTagTr), 

2207 ("table", endTagTable), 

2208 (("tbody", "tfoot", "thead"), endTagTableRowGroup), 

2209 (("body", "caption", "col", "colgroup", "html", "td", "th"), 

2210 endTagIgnore) 

2211 ]) 

2212 endTagHandler.default = endTagOther 

2213 

2214 class InCellPhase(Phase): 

2215 # http://www.whatwg.org/specs/web-apps/current-work/#in-cell 

2216 __slots__ = tuple() 

2217 

2218 # helper 

2219 def closeCell(self): 

2220 if self.tree.elementInScope("td", variant="table"): 

2221 self.endTagTableCell(impliedTagToken("td")) 

2222 elif self.tree.elementInScope("th", variant="table"): 

2223 self.endTagTableCell(impliedTagToken("th")) 

2224 

2225 # the rest 

2226 def processEOF(self): 

2227 self.parser.phases["inBody"].processEOF() 

2228 

2229 def processCharacters(self, token): 

2230 return self.parser.phases["inBody"].processCharacters(token) 

2231 

2232 def startTagTableOther(self, token): 

2233 if (self.tree.elementInScope("td", variant="table") or 

2234 self.tree.elementInScope("th", variant="table")): 

2235 self.closeCell() 

2236 return token 

2237 else: 

2238 # innerHTML case 

2239 assert self.parser.innerHTML 

2240 self.parser.parseError() 

2241 

2242 def startTagOther(self, token): 

2243 return self.parser.phases["inBody"].processStartTag(token) 

2244 

2245 def endTagTableCell(self, token): 

2246 if self.tree.elementInScope(token["name"], variant="table"): 

2247 self.tree.generateImpliedEndTags(token["name"]) 

2248 if self.tree.openElements[-1].name != token["name"]: 

2249 self.parser.parseError("unexpected-cell-end-tag", 

2250 {"name": token["name"]}) 

2251 while True: 

2252 node = self.tree.openElements.pop() 

2253 if node.name == token["name"]: 

2254 break 

2255 else: 

2256 self.tree.openElements.pop() 

2257 self.tree.clearActiveFormattingElements() 

2258 self.parser.phase = self.parser.phases["inRow"] 

2259 else: 

2260 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

2261 

2262 def endTagIgnore(self, token): 

2263 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

2264 

2265 def endTagImply(self, token): 

2266 if self.tree.elementInScope(token["name"], variant="table"): 

2267 self.closeCell() 

2268 return token 

2269 else: 

2270 # sometimes innerHTML case 

2271 self.parser.parseError() 

2272 

2273 def endTagOther(self, token): 

2274 return self.parser.phases["inBody"].processEndTag(token) 

2275 

2276 startTagHandler = _utils.MethodDispatcher([ 

2277 ("html", Phase.startTagHtml), 

2278 (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", 

2279 "thead", "tr"), startTagTableOther) 

2280 ]) 

2281 startTagHandler.default = startTagOther 

2282 

2283 endTagHandler = _utils.MethodDispatcher([ 

2284 (("td", "th"), endTagTableCell), 

2285 (("body", "caption", "col", "colgroup", "html"), endTagIgnore), 

2286 (("table", "tbody", "tfoot", "thead", "tr"), endTagImply) 

2287 ]) 

2288 endTagHandler.default = endTagOther 

2289 

2290 class InSelectPhase(Phase): 

2291 __slots__ = tuple() 

2292 

2293 # http://www.whatwg.org/specs/web-apps/current-work/#in-select 

2294 def processEOF(self): 

2295 if self.tree.openElements[-1].name != "html": 

2296 self.parser.parseError("eof-in-select") 

2297 else: 

2298 assert self.parser.innerHTML 

2299 

2300 def processCharacters(self, token): 

2301 if token["data"] == "\u0000": 

2302 return 

2303 self.tree.insertText(token["data"]) 

2304 

2305 def startTagOption(self, token): 

2306 # We need to imply </option> if <option> is the current node. 

2307 if self.tree.openElements[-1].name == "option": 

2308 self.tree.openElements.pop() 

2309 self.tree.insertElement(token) 

2310 

2311 def startTagOptgroup(self, token): 

2312 if self.tree.openElements[-1].name == "option": 

2313 self.tree.openElements.pop() 

2314 if self.tree.openElements[-1].name == "optgroup": 

2315 self.tree.openElements.pop() 

2316 self.tree.insertElement(token) 

2317 

2318 def startTagSelect(self, token): 

2319 self.parser.parseError("unexpected-select-in-select") 

2320 self.endTagSelect(impliedTagToken("select")) 

2321 

2322 def startTagInput(self, token): 

2323 self.parser.parseError("unexpected-input-in-select") 

2324 if self.tree.elementInScope("select", variant="select"): 

2325 self.endTagSelect(impliedTagToken("select")) 

2326 return token 

2327 else: 

2328 assert self.parser.innerHTML 

2329 

2330 def startTagScript(self, token): 

2331 return self.parser.phases["inHead"].processStartTag(token) 

2332 

2333 def startTagOther(self, token): 

2334 self.parser.parseError("unexpected-start-tag-in-select", 

2335 {"name": token["name"]}) 

2336 

2337 def endTagOption(self, token): 

2338 if self.tree.openElements[-1].name == "option": 

2339 self.tree.openElements.pop() 

2340 else: 

2341 self.parser.parseError("unexpected-end-tag-in-select", 

2342 {"name": "option"}) 

2343 

2344 def endTagOptgroup(self, token): 

2345 # </optgroup> implicitly closes <option> 

2346 if (self.tree.openElements[-1].name == "option" and 

2347 self.tree.openElements[-2].name == "optgroup"): 

2348 self.tree.openElements.pop() 

2349 # It also closes </optgroup> 

2350 if self.tree.openElements[-1].name == "optgroup": 

2351 self.tree.openElements.pop() 

2352 # But nothing else 

2353 else: 

2354 self.parser.parseError("unexpected-end-tag-in-select", 

2355 {"name": "optgroup"}) 

2356 

2357 def endTagSelect(self, token): 

2358 if self.tree.elementInScope("select", variant="select"): 

2359 node = self.tree.openElements.pop() 

2360 while node.name != "select": 

2361 node = self.tree.openElements.pop() 

2362 self.parser.resetInsertionMode() 

2363 else: 

2364 # innerHTML case 

2365 assert self.parser.innerHTML 

2366 self.parser.parseError() 

2367 

2368 def endTagOther(self, token): 

2369 self.parser.parseError("unexpected-end-tag-in-select", 

2370 {"name": token["name"]}) 

2371 

2372 startTagHandler = _utils.MethodDispatcher([ 

2373 ("html", Phase.startTagHtml), 

2374 ("option", startTagOption), 

2375 ("optgroup", startTagOptgroup), 

2376 ("select", startTagSelect), 

2377 (("input", "keygen", "textarea"), startTagInput), 

2378 ("script", startTagScript) 

2379 ]) 

2380 startTagHandler.default = startTagOther 

2381 

2382 endTagHandler = _utils.MethodDispatcher([ 

2383 ("option", endTagOption), 

2384 ("optgroup", endTagOptgroup), 

2385 ("select", endTagSelect) 

2386 ]) 

2387 endTagHandler.default = endTagOther 

2388 

2389 class InSelectInTablePhase(Phase): 

2390 __slots__ = tuple() 

2391 

2392 def processEOF(self): 

2393 self.parser.phases["inSelect"].processEOF() 

2394 

2395 def processCharacters(self, token): 

2396 return self.parser.phases["inSelect"].processCharacters(token) 

2397 

2398 def startTagTable(self, token): 

2399 self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]}) 

2400 self.endTagOther(impliedTagToken("select")) 

2401 return token 

2402 

2403 def startTagOther(self, token): 

2404 return self.parser.phases["inSelect"].processStartTag(token) 

2405 

2406 def endTagTable(self, token): 

2407 self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]}) 

2408 if self.tree.elementInScope(token["name"], variant="table"): 

2409 self.endTagOther(impliedTagToken("select")) 

2410 return token 

2411 

2412 def endTagOther(self, token): 

2413 return self.parser.phases["inSelect"].processEndTag(token) 

2414 

2415 startTagHandler = _utils.MethodDispatcher([ 

2416 (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), 

2417 startTagTable) 

2418 ]) 

2419 startTagHandler.default = startTagOther 

2420 

2421 endTagHandler = _utils.MethodDispatcher([ 

2422 (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), 

2423 endTagTable) 

2424 ]) 

2425 endTagHandler.default = endTagOther 

2426 

2427 class InForeignContentPhase(Phase): 

2428 __slots__ = tuple() 

2429 

2430 breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", 

2431 "center", "code", "dd", "div", "dl", "dt", 

2432 "em", "embed", "h1", "h2", "h3", 

2433 "h4", "h5", "h6", "head", "hr", "i", "img", 

2434 "li", "listing", "menu", "meta", "nobr", 

2435 "ol", "p", "pre", "ruby", "s", "small", 

2436 "span", "strong", "strike", "sub", "sup", 

2437 "table", "tt", "u", "ul", "var"]) 

2438 

2439 def adjustSVGTagNames(self, token): 

2440 replacements = {"altglyph": "altGlyph", 

2441 "altglyphdef": "altGlyphDef", 

2442 "altglyphitem": "altGlyphItem", 

2443 "animatecolor": "animateColor", 

2444 "animatemotion": "animateMotion", 

2445 "animatetransform": "animateTransform", 

2446 "clippath": "clipPath", 

2447 "feblend": "feBlend", 

2448 "fecolormatrix": "feColorMatrix", 

2449 "fecomponenttransfer": "feComponentTransfer", 

2450 "fecomposite": "feComposite", 

2451 "feconvolvematrix": "feConvolveMatrix", 

2452 "fediffuselighting": "feDiffuseLighting", 

2453 "fedisplacementmap": "feDisplacementMap", 

2454 "fedistantlight": "feDistantLight", 

2455 "feflood": "feFlood", 

2456 "fefunca": "feFuncA", 

2457 "fefuncb": "feFuncB", 

2458 "fefuncg": "feFuncG", 

2459 "fefuncr": "feFuncR", 

2460 "fegaussianblur": "feGaussianBlur", 

2461 "feimage": "feImage", 

2462 "femerge": "feMerge", 

2463 "femergenode": "feMergeNode", 

2464 "femorphology": "feMorphology", 

2465 "feoffset": "feOffset", 

2466 "fepointlight": "fePointLight", 

2467 "fespecularlighting": "feSpecularLighting", 

2468 "fespotlight": "feSpotLight", 

2469 "fetile": "feTile", 

2470 "feturbulence": "feTurbulence", 

2471 "foreignobject": "foreignObject", 

2472 "glyphref": "glyphRef", 

2473 "lineargradient": "linearGradient", 

2474 "radialgradient": "radialGradient", 

2475 "textpath": "textPath"} 

2476 

2477 if token["name"] in replacements: 

2478 token["name"] = replacements[token["name"]] 

2479 

2480 def processCharacters(self, token): 

2481 if token["data"] == "\u0000": 

2482 token["data"] = "\uFFFD" 

2483 elif (self.parser.framesetOK and 

2484 any(char not in spaceCharacters for char in token["data"])): 

2485 self.parser.framesetOK = False 

2486 Phase.processCharacters(self, token) 

2487 

2488 def processStartTag(self, token): 

2489 currentNode = self.tree.openElements[-1] 

2490 if (token["name"] in self.breakoutElements or 

2491 (token["name"] == "font" and 

2492 set(token["data"].keys()) & {"color", "face", "size"})): 

2493 self.parser.parseError("unexpected-html-element-in-foreign-content", 

2494 {"name": token["name"]}) 

2495 while (self.tree.openElements[-1].namespace != 

2496 self.tree.defaultNamespace and 

2497 not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and 

2498 not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])): 

2499 self.tree.openElements.pop() 

2500 return token 

2501 

2502 else: 

2503 if currentNode.namespace == namespaces["mathml"]: 

2504 self.parser.adjustMathMLAttributes(token) 

2505 elif currentNode.namespace == namespaces["svg"]: 

2506 self.adjustSVGTagNames(token) 

2507 self.parser.adjustSVGAttributes(token) 

2508 self.parser.adjustForeignAttributes(token) 

2509 token["namespace"] = currentNode.namespace 

2510 self.tree.insertElement(token) 

2511 if token["selfClosing"]: 

2512 self.tree.openElements.pop() 

2513 token["selfClosingAcknowledged"] = True 

2514 

2515 def processEndTag(self, token): 

2516 nodeIndex = len(self.tree.openElements) - 1 

2517 node = self.tree.openElements[-1] 

2518 if node.name.translate(asciiUpper2Lower) != token["name"]: 

2519 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

2520 

2521 while True: 

2522 if node.name.translate(asciiUpper2Lower) == token["name"]: 

2523 # XXX this isn't in the spec but it seems necessary 

2524 if self.parser.phase == self.parser.phases["inTableText"]: 

2525 self.parser.phase.flushCharacters() 

2526 self.parser.phase = self.parser.phase.originalPhase 

2527 while self.tree.openElements.pop() != node: 

2528 assert self.tree.openElements 

2529 new_token = None 

2530 break 

2531 nodeIndex -= 1 

2532 

2533 node = self.tree.openElements[nodeIndex] 

2534 if node.namespace != self.tree.defaultNamespace: 

2535 continue 

2536 else: 

2537 new_token = self.parser.phase.processEndTag(token) 

2538 break 

2539 return new_token 

2540 

2541 class AfterBodyPhase(Phase): 

2542 __slots__ = tuple() 

2543 

2544 def processEOF(self): 

2545 # Stop parsing 

2546 pass 

2547 

2548 def processComment(self, token): 

2549 # This is needed because data is to be appended to the <html> element 

2550 # here and not to whatever is currently open. 

2551 self.tree.insertComment(token, self.tree.openElements[0]) 

2552 

2553 def processCharacters(self, token): 

2554 self.parser.parseError("unexpected-char-after-body") 

2555 self.parser.phase = self.parser.phases["inBody"] 

2556 return token 

2557 

2558 def startTagHtml(self, token): 

2559 return self.parser.phases["inBody"].processStartTag(token) 

2560 

2561 def startTagOther(self, token): 

2562 self.parser.parseError("unexpected-start-tag-after-body", 

2563 {"name": token["name"]}) 

2564 self.parser.phase = self.parser.phases["inBody"] 

2565 return token 

2566 

2567 def endTagHtml(self, name): 

2568 if self.parser.innerHTML: 

2569 self.parser.parseError("unexpected-end-tag-after-body-innerhtml") 

2570 else: 

2571 self.parser.phase = self.parser.phases["afterAfterBody"] 

2572 

2573 def endTagOther(self, token): 

2574 self.parser.parseError("unexpected-end-tag-after-body", 

2575 {"name": token["name"]}) 

2576 self.parser.phase = self.parser.phases["inBody"] 

2577 return token 

2578 

2579 startTagHandler = _utils.MethodDispatcher([ 

2580 ("html", startTagHtml) 

2581 ]) 

2582 startTagHandler.default = startTagOther 

2583 

2584 endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)]) 

2585 endTagHandler.default = endTagOther 

2586 

2587 class InFramesetPhase(Phase): 

2588 # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset 

2589 __slots__ = tuple() 

2590 

2591 def processEOF(self): 

2592 if self.tree.openElements[-1].name != "html": 

2593 self.parser.parseError("eof-in-frameset") 

2594 else: 

2595 assert self.parser.innerHTML 

2596 

2597 def processCharacters(self, token): 

2598 self.parser.parseError("unexpected-char-in-frameset") 

2599 

2600 def startTagFrameset(self, token): 

2601 self.tree.insertElement(token) 

2602 

2603 def startTagFrame(self, token): 

2604 self.tree.insertElement(token) 

2605 self.tree.openElements.pop() 

2606 

2607 def startTagNoframes(self, token): 

2608 return self.parser.phases["inBody"].processStartTag(token) 

2609 

2610 def startTagOther(self, token): 

2611 self.parser.parseError("unexpected-start-tag-in-frameset", 

2612 {"name": token["name"]}) 

2613 

2614 def endTagFrameset(self, token): 

2615 if self.tree.openElements[-1].name == "html": 

2616 # innerHTML case 

2617 self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") 

2618 else: 

2619 self.tree.openElements.pop() 

2620 if (not self.parser.innerHTML and 

2621 self.tree.openElements[-1].name != "frameset"): 

2622 # If we're not in innerHTML mode and the current node is not a 

2623 # "frameset" element (anymore) then switch. 

2624 self.parser.phase = self.parser.phases["afterFrameset"] 

2625 

2626 def endTagOther(self, token): 

2627 self.parser.parseError("unexpected-end-tag-in-frameset", 

2628 {"name": token["name"]}) 

2629 

2630 startTagHandler = _utils.MethodDispatcher([ 

2631 ("html", Phase.startTagHtml), 

2632 ("frameset", startTagFrameset), 

2633 ("frame", startTagFrame), 

2634 ("noframes", startTagNoframes) 

2635 ]) 

2636 startTagHandler.default = startTagOther 

2637 

2638 endTagHandler = _utils.MethodDispatcher([ 

2639 ("frameset", endTagFrameset) 

2640 ]) 

2641 endTagHandler.default = endTagOther 

2642 

2643 class AfterFramesetPhase(Phase): 

2644 # http://www.whatwg.org/specs/web-apps/current-work/#after3 

2645 __slots__ = tuple() 

2646 

2647 def processEOF(self): 

2648 # Stop parsing 

2649 pass 

2650 

2651 def processCharacters(self, token): 

2652 self.parser.parseError("unexpected-char-after-frameset") 

2653 

2654 def startTagNoframes(self, token): 

2655 return self.parser.phases["inHead"].processStartTag(token) 

2656 

2657 def startTagOther(self, token): 

2658 self.parser.parseError("unexpected-start-tag-after-frameset", 

2659 {"name": token["name"]}) 

2660 

2661 def endTagHtml(self, token): 

2662 self.parser.phase = self.parser.phases["afterAfterFrameset"] 

2663 

2664 def endTagOther(self, token): 

2665 self.parser.parseError("unexpected-end-tag-after-frameset", 

2666 {"name": token["name"]}) 

2667 

2668 startTagHandler = _utils.MethodDispatcher([ 

2669 ("html", Phase.startTagHtml), 

2670 ("noframes", startTagNoframes) 

2671 ]) 

2672 startTagHandler.default = startTagOther 

2673 

2674 endTagHandler = _utils.MethodDispatcher([ 

2675 ("html", endTagHtml) 

2676 ]) 

2677 endTagHandler.default = endTagOther 

2678 

2679 class AfterAfterBodyPhase(Phase): 

2680 __slots__ = tuple() 

2681 

2682 def processEOF(self): 

2683 pass 

2684 

2685 def processComment(self, token): 

2686 self.tree.insertComment(token, self.tree.document) 

2687 

2688 def processSpaceCharacters(self, token): 

2689 return self.parser.phases["inBody"].processSpaceCharacters(token) 

2690 

2691 def processCharacters(self, token): 

2692 self.parser.parseError("expected-eof-but-got-char") 

2693 self.parser.phase = self.parser.phases["inBody"] 

2694 return token 

2695 

2696 def startTagHtml(self, token): 

2697 return self.parser.phases["inBody"].processStartTag(token) 

2698 

2699 def startTagOther(self, token): 

2700 self.parser.parseError("expected-eof-but-got-start-tag", 

2701 {"name": token["name"]}) 

2702 self.parser.phase = self.parser.phases["inBody"] 

2703 return token 

2704 

2705 def processEndTag(self, token): 

2706 self.parser.parseError("expected-eof-but-got-end-tag", 

2707 {"name": token["name"]}) 

2708 self.parser.phase = self.parser.phases["inBody"] 

2709 return token 

2710 

2711 startTagHandler = _utils.MethodDispatcher([ 

2712 ("html", startTagHtml) 

2713 ]) 

2714 startTagHandler.default = startTagOther 

2715 

2716 class AfterAfterFramesetPhase(Phase): 

2717 __slots__ = tuple() 

2718 

2719 def processEOF(self): 

2720 pass 

2721 

2722 def processComment(self, token): 

2723 self.tree.insertComment(token, self.tree.document) 

2724 

2725 def processSpaceCharacters(self, token): 

2726 return self.parser.phases["inBody"].processSpaceCharacters(token) 

2727 

2728 def processCharacters(self, token): 

2729 self.parser.parseError("expected-eof-but-got-char") 

2730 

2731 def startTagHtml(self, token): 

2732 return self.parser.phases["inBody"].processStartTag(token) 

2733 

2734 def startTagNoFrames(self, token): 

2735 return self.parser.phases["inHead"].processStartTag(token) 

2736 

2737 def startTagOther(self, token): 

2738 self.parser.parseError("expected-eof-but-got-start-tag", 

2739 {"name": token["name"]}) 

2740 

2741 def processEndTag(self, token): 

2742 self.parser.parseError("expected-eof-but-got-end-tag", 

2743 {"name": token["name"]}) 

2744 

2745 startTagHandler = _utils.MethodDispatcher([ 

2746 ("html", startTagHtml), 

2747 ("noframes", startTagNoFrames) 

2748 ]) 

2749 startTagHandler.default = startTagOther 

2750 

2751 # pylint:enable=unused-argument 

2752 

2753 return { 

2754 "initial": InitialPhase, 

2755 "beforeHtml": BeforeHtmlPhase, 

2756 "beforeHead": BeforeHeadPhase, 

2757 "inHead": InHeadPhase, 

2758 "inHeadNoscript": InHeadNoscriptPhase, 

2759 "afterHead": AfterHeadPhase, 

2760 "inBody": InBodyPhase, 

2761 "text": TextPhase, 

2762 "inTable": InTablePhase, 

2763 "inTableText": InTableTextPhase, 

2764 "inCaption": InCaptionPhase, 

2765 "inColumnGroup": InColumnGroupPhase, 

2766 "inTableBody": InTableBodyPhase, 

2767 "inRow": InRowPhase, 

2768 "inCell": InCellPhase, 

2769 "inSelect": InSelectPhase, 

2770 "inSelectInTable": InSelectInTablePhase, 

2771 "inForeignContent": InForeignContentPhase, 

2772 "afterBody": AfterBodyPhase, 

2773 "inFrameset": InFramesetPhase, 

2774 "afterFrameset": AfterFramesetPhase, 

2775 "afterAfterBody": AfterAfterBodyPhase, 

2776 "afterAfterFrameset": AfterAfterFramesetPhase, 

2777 # XXX after after frameset 

2778 } 

2779 

2780 

2781def adjust_attributes(token, replacements): 

2782 needs_adjustment = viewkeys(token['data']) & viewkeys(replacements) 

2783 if needs_adjustment: 

2784 token['data'] = type(token['data'])((replacements.get(k, k), v) 

2785 for k, v in token['data'].items()) 

2786 

2787 

2788def impliedTagToken(name, type="EndTag", attributes=None, 

2789 selfClosing=False): 

2790 if attributes is None: 

2791 attributes = {} 

2792 return {"type": tokenTypes[type], "name": name, "data": attributes, 

2793 "selfClosing": selfClosing} 

2794 

2795 

2796class ParseError(Exception): 

2797 """Error in parsed document""" 

2798 pass