Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/_vendor/html5lib/html5parser.py: 44%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1533 statements  

1from __future__ import absolute_import, division, unicode_literals 

2from six import with_metaclass, viewkeys 

3 

4import types 

5 

6from . import _inputstream 

7from . import _tokenizer 

8 

9from . import treebuilders 

10from .treebuilders.base import Marker 

11 

12from . import _utils 

13from .constants import ( 

14 spaceCharacters, asciiUpper2Lower, 

15 specialElements, headingElements, cdataElements, rcdataElements, 

16 tokenTypes, tagTokenTypes, 

17 namespaces, 

18 htmlIntegrationPointElements, mathmlTextIntegrationPointElements, 

19 adjustForeignAttributes as adjustForeignAttributesMap, 

20 adjustMathMLAttributes, adjustSVGAttributes, 

21 E, 

22 _ReparseException 

23) 

24 

25 

def parse(doc, treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Turn a whole HTML document into a tree

    :arg doc: the document to parse as a string or file-like object

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Remaining keyword arguments are handed to ``HTMLParser.parse`` unchanged.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parse
    >>> parse('<html><body><p>This is a doc</p></body></html>')
    <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

    """
    builder_class = treebuilders.getTreeBuilder(treebuilder)
    return HTMLParser(
        builder_class, namespaceHTMLElements=namespaceHTMLElements
    ).parse(doc, **kwargs)

47 

48 

def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElements=True, **kwargs):
    """Parse an HTML fragment as a string or file-like object into a tree

    :arg doc: the fragment to parse as a string or file-like object

    :arg container: the container context to parse the fragment in

    :arg treebuilder: the treebuilder to use when parsing

    :arg namespaceHTMLElements: whether or not to namespace HTML elements

    Remaining keyword arguments are handed to ``HTMLParser.parseFragment``
    unchanged.

    :returns: parsed tree

    Example:

    >>> from html5lib.html5parser import parseFragment
    >>> parseFragment('<b>this is a fragment</b>')
    <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

    """
    tb = treebuilders.getTreeBuilder(treebuilder)
    p = HTMLParser(tb, namespaceHTMLElements=namespaceHTMLElements)
    return p.parseFragment(doc, container=container, **kwargs)

72 

73 

def method_decorator_metaclass(function):
    """Return a metaclass that applies ``function`` to every plain function
    found in the namespace of each class it creates.

    Only ``types.FunctionType`` members are replaced; every other namespace
    entry is left as it is. The namespace dict is updated in place before the
    class object is built.
    """
    class Decorated(type):
        def __new__(meta, classname, bases, classDict):
            # Iterate over a snapshot; only values of existing keys change.
            for member_name, member in list(classDict.items()):
                if isinstance(member, types.FunctionType):
                    classDict[member_name] = function(member)
            return type.__new__(meta, classname, bases, classDict)

    return Decorated

84 

85 

class HTMLParser(object):
    """HTML parser

    Generates a tree structure from a stream of (possibly malformed) HTML.

    """

    def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=False):
        """
        :arg tree: a treebuilder class controlling the type of tree that will be
            returned. Built in treebuilders can be accessed through
            html5lib.treebuilders.getTreeBuilder(treeType)

        :arg strict: raise an exception when a parse error is encountered

        :arg namespaceHTMLElements: whether or not to namespace HTML elements

        :arg debug: whether or not to enable debug mode which logs things

        Example:

        >>> from html5lib import treebuilders
        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()                     # generates parser with etree builder
        >>> lxml_builder = treebuilders.getTreeBuilder('lxml')
        >>> parser = HTMLParser(lxml_builder, strict=True)  # lxml builder, strict mode

        """

        # Raise an exception on the first error encountered
        self.strict = strict

        if tree is None:
            tree = treebuilders.getTreeBuilder("etree")
        # ``tree`` is a class: it is instantiated here, once per parser.
        self.tree = tree(namespaceHTMLElements)
        self.errors = []

        # One phase object per insertion mode, keyed by phase name.
        self.phases = {name: cls(self, self.tree) for name, cls in
                       getPhases(debug).items()}

    def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs):
        """Shared driver behind :meth:`parse` and :meth:`parseFragment`.

        Records the parsing options, builds a tokenizer over ``stream``
        (extra keyword arguments go to the tokenizer), resets the parser
        state and runs the main loop. If the main loop raises
        ``_ReparseException`` the state is reset and the loop is run a
        second time.
        """

        self.innerHTMLMode = innerHTML
        self.container = container
        self.scripting = scripting
        self.tokenizer = _tokenizer.HTMLTokenizer(stream, parser=self, **kwargs)
        self.reset()

        try:
            self.mainLoop()
        except _ReparseException:
            self.reset()
            self.mainLoop()

    def reset(self):
        """Return the parser (and its tree) to the state required before the
        first token is processed.

        In fragment mode this also selects the tokenizer state that matches
        the container element, inserts the implied root and picks the
        insertion mode for the container.
        """
        self.tree.reset()
        self.firstStartTag = False
        self.errors = []
        self.log = []  # only used with debug mode
        # "quirks" / "limited quirks" / "no quirks"
        self.compatMode = "no quirks"

        if self.innerHTMLMode:
            # parseFragment documents ``container=None`` as meaning "div";
            # honour that instead of failing on ``None.lower()``.
            container = "div" if self.container is None else self.container
            self.innerHTML = container.lower()

            if self.innerHTML in cdataElements:
                self.tokenizer.state = self.tokenizer.rcdataState
            elif self.innerHTML in rcdataElements:
                self.tokenizer.state = self.tokenizer.rawtextState
            elif self.innerHTML == 'plaintext':
                self.tokenizer.state = self.tokenizer.plaintextState
            else:
                # state already is data state
                # self.tokenizer.state = self.tokenizer.dataState
                pass
            self.phase = self.phases["beforeHtml"]
            self.phase.insertHtmlElement()
            self.resetInsertionMode()
        else:
            self.innerHTML = False  # pylint:disable=redefined-variable-type
            self.phase = self.phases["initial"]

        self.lastPhase = None

        self.beforeRCDataPhase = None

        self.framesetOK = True

    @property
    def documentEncoding(self):
        """Name of the character encoding that was used to decode the input stream, or
        :obj:`None` if that is not determined yet

        """
        if not hasattr(self, 'tokenizer'):
            return None
        return self.tokenizer.stream.charEncoding[0].name

    def isHTMLIntegrationPoint(self, element):
        """Return whether ``element`` is an HTML integration point.

        A MathML ``annotation-xml`` element qualifies only when its
        ``encoding`` attribute is (ASCII case-insensitively) ``text/html``
        or ``application/xhtml+xml``; any other element is looked up in
        ``htmlIntegrationPointElements`` by ``(namespace, name)``.
        """
        if (element.name == "annotation-xml" and
                element.namespace == namespaces["mathml"]):
            return ("encoding" in element.attributes and
                    element.attributes["encoding"].translate(
                        asciiUpper2Lower) in
                    ("text/html", "application/xhtml+xml"))
        else:
            return (element.namespace, element.name) in htmlIntegrationPointElements

    def isMathMLTextIntegrationPoint(self, element):
        """Return whether ``(element.namespace, element.name)`` is listed in
        ``mathmlTextIntegrationPointElements``."""
        return (element.namespace, element.name) in mathmlTextIntegrationPointElements

    def mainLoop(self):
        """Feed every token from the tokenizer to the appropriate phase, then
        run end-of-file processing.

        A phase method may return a token, in which case that token is
        processed again (by whatever phase is then current) until a phase
        returns ``None``.
        """
        CharactersToken = tokenTypes["Characters"]
        SpaceCharactersToken = tokenTypes["SpaceCharacters"]
        StartTagToken = tokenTypes["StartTag"]
        EndTagToken = tokenTypes["EndTag"]
        CommentToken = tokenTypes["Comment"]
        DoctypeToken = tokenTypes["Doctype"]
        ParseErrorToken = tokenTypes["ParseError"]

        # Loop-invariant: start tags excluded from the MathML text
        # integration point rule below.
        mathmlTextExcludedTags = frozenset(["mglyph", "malignmark"])

        for token in self.tokenizer:
            prev_token = None
            new_token = token
            while new_token is not None:
                prev_token = new_token
                currentNode = self.tree.openElements[-1] if self.tree.openElements else None
                currentNodeNamespace = currentNode.namespace if currentNode else None
                currentNodeName = currentNode.name if currentNode else None

                tokenType = new_token["type"]

                if tokenType == ParseErrorToken:
                    self.parseError(new_token["data"], new_token.get("datavars", {}))
                    new_token = None
                else:
                    # Decide between the current insertion mode and the
                    # foreign-content rules.
                    if (len(self.tree.openElements) == 0 or
                        currentNodeNamespace == self.tree.defaultNamespace or
                        (self.isMathMLTextIntegrationPoint(currentNode) and
                         ((tokenType == StartTagToken and
                           token["name"] not in mathmlTextExcludedTags) or
                          tokenType in (CharactersToken, SpaceCharactersToken))) or
                        (currentNodeNamespace == namespaces["mathml"] and
                         currentNodeName == "annotation-xml" and
                         tokenType == StartTagToken and
                         token["name"] == "svg") or
                        (self.isHTMLIntegrationPoint(currentNode) and
                         tokenType in (StartTagToken, CharactersToken, SpaceCharactersToken))):
                        phase = self.phase
                    else:
                        phase = self.phases["inForeignContent"]

                    if tokenType == CharactersToken:
                        new_token = phase.processCharacters(new_token)
                    elif tokenType == SpaceCharactersToken:
                        new_token = phase.processSpaceCharacters(new_token)
                    elif tokenType == StartTagToken:
                        new_token = phase.processStartTag(new_token)
                    elif tokenType == EndTagToken:
                        new_token = phase.processEndTag(new_token)
                    elif tokenType == CommentToken:
                        new_token = phase.processComment(new_token)
                    elif tokenType == DoctypeToken:
                        new_token = phase.processDoctype(new_token)

            # A self-closing start tag that no handler acknowledged is an error.
            if (tokenType == StartTagToken and prev_token["selfClosing"] and
                    not prev_token["selfClosingAcknowledged"]):
                self.parseError("non-void-element-with-trailing-solidus",
                                {"name": prev_token["name"]})

        # When the loop finishes it's EOF
        reprocess = True
        phases = []
        while reprocess:
            phases.append(self.phase)
            reprocess = self.phase.processEOF()
            if reprocess:
                # Guard against bouncing back to a phase already visited.
                assert self.phase not in phases

    def parse(self, stream, *args, **kwargs):
        """Parse a HTML document into a well-formed tree

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element).

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parse('<html><body><p>This is a doc</p></body></html>')
        <Element u'{http://www.w3.org/1999/xhtml}html' at 0x7feac4909db0>

        """
        self._parse(stream, False, None, *args, **kwargs)
        return self.tree.getDocument()

    def parseFragment(self, stream, *args, **kwargs):
        """Parse a HTML fragment into a well-formed tree fragment

        :arg container: name of the element we're setting the innerHTML
            property if set to None, default to 'div'

        :arg stream: a file-like object or string containing the HTML to be parsed

            The optional encoding parameter must be a string that indicates
            the encoding.  If specified, that encoding will be used,
            regardless of any BOM or later declaration (such as in a meta
            element)

        :arg scripting: treat noscript elements as if JavaScript was turned on

        :returns: parsed tree

        Example:

        >>> from html5lib.html5parser import HTMLParser
        >>> parser = HTMLParser()
        >>> parser.parseFragment('<b>this is a fragment</b>')
        <Element u'DOCUMENT_FRAGMENT' at 0x7feac484b090>

        """
        self._parse(stream, True, *args, **kwargs)
        return self.tree.getFragment()

    def parseError(self, errorcode="XXX-undefined-error", datavars=None):
        """Record a parse error at the current stream position.

        The error is appended to ``self.errors`` as
        ``(position, errorcode, datavars)``. In strict mode a ``ParseError``
        carrying the formatted message is raised afterwards.
        """
        # XXX The idea is to make errorcode mandatory.
        if datavars is None:
            datavars = {}
        self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
        if self.strict:
            raise ParseError(E[errorcode] % datavars)

    def adjustMathMLAttributes(self, token):
        """Apply the ``adjustMathMLAttributes`` table to ``token``."""
        adjust_attributes(token, adjustMathMLAttributes)

    def adjustSVGAttributes(self, token):
        """Apply the ``adjustSVGAttributes`` table to ``token``."""
        adjust_attributes(token, adjustSVGAttributes)

    def adjustForeignAttributes(self, token):
        """Apply the foreign-attribute adjustment table to ``token``."""
        adjust_attributes(token, adjustForeignAttributesMap)

    def reparseTokenNormal(self, token):
        # NOTE(review): nothing in this class assigns ``self.parser``, so this
        # call looks like it would raise AttributeError if ever reached;
        # presumably dead code - confirm before relying on it.
        # pylint:disable=unused-argument
        self.parser.phase()

    def resetInsertionMode(self):
        """Choose ``self.phase`` from the stack of open elements.

        Walks the open elements from the innermost outwards and selects the
        phase mapped to the first recognised HTML element name; the
        outermost element is treated as the fragment container.
        """
        # The name of this method is mostly historical. (It's also used in the
        # specification.)
        last = False
        newModes = {
            "select": "inSelect",
            "td": "inCell",
            "th": "inCell",
            "tr": "inRow",
            "tbody": "inTableBody",
            "thead": "inTableBody",
            "tfoot": "inTableBody",
            "caption": "inCaption",
            "colgroup": "inColumnGroup",
            "table": "inTable",
            "head": "inBody",
            "body": "inBody",
            "frameset": "inFrameset",
            "html": "beforeHead"
        }
        for node in self.tree.openElements[::-1]:
            nodeName = node.name
            new_phase = None
            if node == self.tree.openElements[0]:
                assert self.innerHTML
                last = True
                nodeName = self.innerHTML
            # Check for conditions that should only happen in the innerHTML
            # case
            if nodeName in ("select", "colgroup", "head", "html"):
                assert self.innerHTML

            # Elements outside the default namespace never select a mode.
            if not last and node.namespace != self.tree.defaultNamespace:
                continue

            if nodeName in newModes:
                new_phase = self.phases[newModes[nodeName]]
                break
            elif last:
                new_phase = self.phases["inBody"]
                break

        self.phase = new_phase

    def parseRCDataRawtext(self, token, contentType):
        """Insert the element for ``token`` and switch to text processing.

        ``contentType`` must be ``"RAWTEXT"`` or ``"RCDATA"``; it selects the
        tokenizer state. The current phase is remembered in
        ``self.originalPhase`` before the ``"text"`` phase takes over.
        """
        # Generic RCDATA/RAWTEXT Parsing algorithm
        assert contentType in ("RAWTEXT", "RCDATA")

        self.tree.insertElement(token)

        if contentType == "RAWTEXT":
            self.tokenizer.state = self.tokenizer.rawtextState
        else:
            self.tokenizer.state = self.tokenizer.rcdataState

        self.originalPhase = self.phase

        self.phase = self.phases["text"]

394 

395 

396@_utils.memoize 

397def getPhases(debug): 

def log(function):
    """Logger that records which phase processes each token"""
    # Reverse lookup: numeric token type -> its name in ``tokenTypes``.
    names_by_type = dict((code, label) for label, code in tokenTypes.items())

    def wrapped(self, *args, **kwargs):
        # Only ``process*`` methods that actually received a token are logged.
        if len(args) > 0 and function.__name__.startswith("process"):
            current = args[0]
            details = {"type": names_by_type[current['type']]}
            if current['type'] in tagTokenTypes:
                details["name"] = current['name']

            entry = (self.parser.tokenizer.state.__name__,
                     self.parser.phase.__class__.__name__,
                     self.__class__.__name__,
                     function.__name__,
                     details)
            self.parser.log.append(entry)
        return function(self, *args, **kwargs)

    return wrapped

418 

def getMetaclass(use_metaclass, metaclass_func):
    """Pick the metaclass for the phase classes.

    Returns a metaclass that decorates every method with ``metaclass_func``
    when ``use_metaclass`` is truthy, and plain ``type`` otherwise.
    """
    return method_decorator_metaclass(metaclass_func) if use_metaclass else type

424 

425 # pylint:disable=unused-argument 

class Phase(with_metaclass(getMetaclass(debug, log))):
    """Common base for the per-insertion-mode helper objects.

    Holds the owning parser and tree, supplies default token handlers and
    dispatches start/end tags through the ``startTagHandler`` /
    ``endTagHandler`` tables, with a small per-instance cache of resolved
    handlers.
    """
    __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")

    def __init__(self, parser, tree):
        self.parser = parser
        self.tree = tree
        # tag name -> resolved handler, filled lazily
        self.__startTagCache = {}
        self.__endTagCache = {}

    def processEOF(self):
        raise NotImplementedError

    def processComment(self, token):
        # Correct for most phases; phases that differ override this.
        current_node = self.tree.openElements[-1]
        self.tree.insertComment(token, current_node)

    def processDoctype(self, token):
        self.parser.parseError("unexpected-doctype")

    def processCharacters(self, token):
        self.tree.insertText(token["data"])

    def processSpaceCharacters(self, token):
        self.tree.insertText(token["data"])

    def processStartTag(self, token):
        # The cache lives here rather than in BoundMethodDispatcher because
        # caching there needs a circular reference to the Phase, which has a
        # significant (CPython 2.7, 3.8) GC cost when parsing many short inputs
        name = token["name"]
        cache = self.__startTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if name not in cache:
            func = cache[name] = self.startTagHandler[name]
            # bound the cache size in case we get loads of unknown tags
            while len(cache) > len(self.startTagHandler) * 1.1:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        else:
            func = cache[name]
        return func(token)

    def startTagHtml(self, token):
        if token["name"] == "html" and not self.parser.firstStartTag:
            self.parser.parseError("non-html-root")
        # XXX Need a check here to see if the first start tag token emitted is
        # this token... If it's not, invoke self.parser.parseError().
        # Copy over only the attributes the root element does not have yet.
        for key, val in token["data"].items():
            if key not in self.tree.openElements[0].attributes:
                self.tree.openElements[0].attributes[key] = val
        self.parser.firstStartTag = False

    def processEndTag(self, token):
        # The cache lives here rather than in BoundMethodDispatcher because
        # caching there needs a circular reference to the Phase, which has a
        # significant (CPython 2.7, 3.8) GC cost when parsing many short inputs
        name = token["name"]
        cache = self.__endTagCache
        # In Py2, using `in` is quicker in general than try/except KeyError
        # In Py3, `in` is quicker when there are few cache hits (typically short inputs)
        if name not in cache:
            func = cache[name] = self.endTagHandler[name]
            # bound the cache size in case we get loads of unknown tags
            while len(cache) > len(self.endTagHandler) * 1.1:
                # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
                cache.pop(next(iter(cache)))
        else:
            func = cache[name]
        return func(token)

497 

class InitialPhase(Phase):
    """Phase the parser starts in for whole documents: handles the doctype
    (setting the parser's ``compatMode``) and hands over to the
    ``beforeHtml`` phase."""
    __slots__ = tuple()

    def processSpaceCharacters(self, token):
        # Whitespace before the doctype is dropped.
        pass

    def processComment(self, token):
        # Comments at this point are attached to the document itself.
        self.tree.insertComment(token, self.tree.document)

    def processDoctype(self, token):
        """Insert the doctype, derive the compatibility mode from its
        identifiers and switch to the ``beforeHtml`` phase."""
        name = token["name"]
        publicId = token["publicId"]
        systemId = token["systemId"]
        correct = token["correct"]

        # ``and`` binds tighter than ``or``: the systemId test only matters
        # when the name is "html" and there is no public identifier.
        if (name != "html" or publicId is not None or
                systemId is not None and systemId != "about:legacy-compat"):
            self.parser.parseError("unknown-doctype")

        if publicId is None:
            publicId = ""

        self.tree.insertDoctype(token)

        # Lower-case (ASCII only) so the comparisons below are
        # case-insensitive.
        if publicId != "":
            publicId = publicId.translate(asciiUpper2Lower)

        # Quirks mode: bad doctype token, a known legacy public identifier
        # (prefix or exact match), an HTML 4.01 frameset/transitional
        # identifier without a system identifier, or the IBM system
        # identifier. Mixed ``and``/``or`` relies on operator precedence.
        if (not correct or token["name"] != "html" or
                publicId.startswith(
                    ("+//silmaril//dtd html pro v0r11 19970101//",
                     "-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
                     "-//as//dtd html 3.0 aswedit + extensions//",
                     "-//ietf//dtd html 2.0 level 1//",
                     "-//ietf//dtd html 2.0 level 2//",
                     "-//ietf//dtd html 2.0 strict level 1//",
                     "-//ietf//dtd html 2.0 strict level 2//",
                     "-//ietf//dtd html 2.0 strict//",
                     "-//ietf//dtd html 2.0//",
                     "-//ietf//dtd html 2.1e//",
                     "-//ietf//dtd html 3.0//",
                     "-//ietf//dtd html 3.2 final//",
                     "-//ietf//dtd html 3.2//",
                     "-//ietf//dtd html 3//",
                     "-//ietf//dtd html level 0//",
                     "-//ietf//dtd html level 1//",
                     "-//ietf//dtd html level 2//",
                     "-//ietf//dtd html level 3//",
                     "-//ietf//dtd html strict level 0//",
                     "-//ietf//dtd html strict level 1//",
                     "-//ietf//dtd html strict level 2//",
                     "-//ietf//dtd html strict level 3//",
                     "-//ietf//dtd html strict//",
                     "-//ietf//dtd html//",
                     "-//metrius//dtd metrius presentational//",
                     "-//microsoft//dtd internet explorer 2.0 html strict//",
                     "-//microsoft//dtd internet explorer 2.0 html//",
                     "-//microsoft//dtd internet explorer 2.0 tables//",
                     "-//microsoft//dtd internet explorer 3.0 html strict//",
                     "-//microsoft//dtd internet explorer 3.0 html//",
                     "-//microsoft//dtd internet explorer 3.0 tables//",
                     "-//netscape comm. corp.//dtd html//",
                     "-//netscape comm. corp.//dtd strict html//",
                     "-//o'reilly and associates//dtd html 2.0//",
                     "-//o'reilly and associates//dtd html extended 1.0//",
                     "-//o'reilly and associates//dtd html extended relaxed 1.0//",
                     "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
                     "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
                     "-//spyglass//dtd html 2.0 extended//",
                     "-//sq//dtd html 2.0 hotmetal + extensions//",
                     "-//sun microsystems corp.//dtd hotjava html//",
                     "-//sun microsystems corp.//dtd hotjava strict html//",
                     "-//w3c//dtd html 3 1995-03-24//",
                     "-//w3c//dtd html 3.2 draft//",
                     "-//w3c//dtd html 3.2 final//",
                     "-//w3c//dtd html 3.2//",
                     "-//w3c//dtd html 3.2s draft//",
                     "-//w3c//dtd html 4.0 frameset//",
                     "-//w3c//dtd html 4.0 transitional//",
                     "-//w3c//dtd html experimental 19960712//",
                     "-//w3c//dtd html experimental 970421//",
                     "-//w3c//dtd w3 html//",
                     "-//w3o//dtd w3 html 3.0//",
                     "-//webtechs//dtd mozilla html 2.0//",
                     "-//webtechs//dtd mozilla html//")) or
                publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
                             "-/w3c/dtd html 4.0 transitional/en",
                             "html") or
                publicId.startswith(
                    ("-//w3c//dtd html 4.01 frameset//",
                     "-//w3c//dtd html 4.01 transitional//")) and
                systemId is None or
                systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
            self.parser.compatMode = "quirks"
        # Limited quirks: XHTML 1.0 frameset/transitional, or HTML 4.01
        # frameset/transitional that does carry a system identifier.
        elif (publicId.startswith(
                ("-//w3c//dtd xhtml 1.0 frameset//",
                 "-//w3c//dtd xhtml 1.0 transitional//")) or
              publicId.startswith(
                  ("-//w3c//dtd html 4.01 frameset//",
                   "-//w3c//dtd html 4.01 transitional//")) and
              systemId is not None):
            self.parser.compatMode = "limited quirks"

        self.parser.phase = self.parser.phases["beforeHtml"]

    def anythingElse(self):
        # No doctype was seen: quirks mode, then continue in "beforeHtml".
        self.parser.compatMode = "quirks"
        self.parser.phase = self.parser.phases["beforeHtml"]

    def processCharacters(self, token):
        self.parser.parseError("expected-doctype-but-got-chars")
        self.anythingElse()
        # Returning the token makes the main loop reprocess it in the new phase.
        return token

    def processStartTag(self, token):
        self.parser.parseError("expected-doctype-but-got-start-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEndTag(self, token):
        self.parser.parseError("expected-doctype-but-got-end-tag",
                               {"name": token["name"]})
        self.anythingElse()
        return token

    def processEOF(self):
        self.parser.parseError("expected-doctype-but-got-eof")
        self.anythingElse()
        # True asks the main loop to run EOF processing again in the new phase.
        return True

627 

class BeforeHtmlPhase(Phase):
    """Phase active until the root ``html`` element exists; almost every
    token causes an implied root to be inserted and is then reprocessed."""
    __slots__ = tuple()

    # helper methods
    def insertHtmlElement(self):
        # Create the implied root, then continue in the "beforeHead" phase.
        implied_root = impliedTagToken("html", "StartTag")
        self.tree.insertRoot(implied_root)
        self.parser.phase = self.parser.phases["beforeHead"]

    # other
    def processEOF(self):
        self.insertHtmlElement()
        return True

    def processComment(self, token):
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        self.insertHtmlElement()
        return token

    def processStartTag(self, token):
        is_html_tag = token["name"] == "html"
        if is_html_tag:
            self.parser.firstStartTag = True
        self.insertHtmlElement()
        return token

    def processEndTag(self, token):
        tag_name = token["name"]
        if tag_name in ("head", "body", "html", "br"):
            # These end tags imply the root element and are reprocessed.
            self.insertHtmlElement()
            return token
        self.parser.parseError("unexpected-end-tag-before-html",
                               {"name": token["name"]})

664 

class BeforeHeadPhase(Phase):
    """Phase between the root element and ``head``; anything other than
    whitespace, ``<html>`` or ``<head>`` implies a ``head`` start tag."""
    __slots__ = tuple()

    def processEOF(self):
        implied_head = impliedTagToken("head", "StartTag")
        self.startTagHead(implied_head)
        return True

    def processSpaceCharacters(self, token):
        pass

    def processCharacters(self, token):
        implied_head = impliedTagToken("head", "StartTag")
        self.startTagHead(implied_head)
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        # Insert the element, remember it as the head pointer, move on.
        self.tree.insertElement(token)
        self.tree.headPointer = self.tree.openElements[-1]
        self.parser.phase = self.parser.phases["inHead"]

    def startTagOther(self, token):
        implied_head = impliedTagToken("head", "StartTag")
        self.startTagHead(implied_head)
        return token

    def endTagImplyHead(self, token):
        implied_head = impliedTagToken("head", "StartTag")
        self.startTagHead(implied_head)
        return token

    def endTagOther(self, token):
        error_vars = {"name": token["name"]}
        self.parser.parseError("end-tag-after-implied-root", error_vars)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("head", startTagHead),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("head", "body", "html", "br"), endTagImplyHead),
    ])
    endTagHandler.default = endTagOther

709 

class InHeadPhase(Phase):
    """Phase for tokens seen while the ``head`` element is open."""
    __slots__ = tuple()

    # the real thing
    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagHead(self, token):
        self.parser.parseError("two-heads-are-not-better-than-one")

    def startTagBaseLinkCommand(self, token):
        # Insert and immediately close the element.
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagMeta(self, token):
        # Insert and immediately close the element.
        self.tree.insertElement(token)
        self.tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

        stream = self.parser.tokenizer.stream
        # Only a tentative encoding may be replaced by what <meta> declares.
        if stream.charEncoding[1] != "tentative":
            return

        attrs = token["data"]
        if "charset" in attrs:
            stream.changeEncoding(attrs["charset"])
        elif ("content" in attrs and
              "http-equiv" in attrs and
              attrs["http-equiv"].lower() == "content-type"):
            # Encoding it as UTF-8 here is a hack, as really we should pass
            # the abstract Unicode string, and just use the
            # ContentAttrParser on that, but using UTF-8 allows all chars
            # to be encoded and as a ASCII-superset works.
            raw = _inputstream.EncodingBytes(attrs["content"].encode("utf-8"))
            codec = _inputstream.ContentAttrParser(raw).parse()
            stream.changeEncoding(codec)

    def startTagTitle(self, token):
        self.parser.parseRCDataRawtext(token, "RCDATA")

    def startTagNoFramesStyle(self, token):
        # Need to decide whether to implement the scripting-disabled case
        self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagNoscript(self, token):
        if not self.parser.scripting:
            self.tree.insertElement(token)
            self.parser.phase = self.parser.phases["inHeadNoscript"]
        else:
            self.parser.parseRCDataRawtext(token, "RAWTEXT")

    def startTagScript(self, token):
        self.tree.insertElement(token)
        owner = self.parser
        owner.tokenizer.state = owner.tokenizer.scriptDataState
        # Remember where to return to once the script text is consumed.
        owner.originalPhase = owner.phase
        owner.phase = owner.phases["text"]

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHead(self, token):
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "head", "Expected head got %s" % popped.name
        self.parser.phase = self.parser.phases["afterHead"]

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        # Act as if a </head> end tag had been seen.
        self.endTagHead(impliedTagToken("head"))

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("title", startTagTitle),
        (("noframes", "style"), startTagNoFramesStyle),
        ("noscript", startTagNoscript),
        ("script", startTagScript),
        (("base", "basefont", "bgsound", "command", "link"),
         startTagBaseLinkCommand),
        ("meta", startTagMeta),
        ("head", startTagHead),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("head", endTagHead),
        (("br", "html", "body"), endTagHtmlBodyBr),
    ])
    endTagHandler.default = endTagOther

811 

class InHeadNoscriptPhase(Phase):
    """Phase for tokens inside a ``noscript`` element in ``head`` (entered
    by the ``inHead`` phase when scripting is off)."""
    __slots__ = tuple()

    def processEOF(self):
        self.parser.parseError("eof-in-head-noscript")
        self.anythingElse()
        return True

    def processComment(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processComment(token)

    def processCharacters(self, token):
        self.parser.parseError("char-in-head-noscript")
        self.anythingElse()
        return token

    def processSpaceCharacters(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processSpaceCharacters(token)

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBaseLinkCommand(self, token):
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagHeadNoscript(self, token):
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        error_vars = {"name": token["name"]}
        self.parser.parseError("unexpected-inhead-noscript-tag", error_vars)
        self.anythingElse()
        return token

    def endTagNoscript(self, token):
        popped = self.parser.tree.openElements.pop()
        assert popped.name == "noscript", "Expected noscript got %s" % popped.name
        self.parser.phase = self.parser.phases["inHead"]

    def endTagBr(self, token):
        error_vars = {"name": token["name"]}
        self.parser.parseError("unexpected-inhead-noscript-tag", error_vars)
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        # Caller must raise parse error first!
        implied_end = impliedTagToken("noscript")
        self.endTagNoscript(implied_end)

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
        (("head", "noscript"), startTagHeadNoscript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("noscript", endTagNoscript),
        ("br", endTagBr),
    ])
    endTagHandler.default = endTagOther

874 

class AfterHeadPhase(Phase):
    """Phase between the end of ``head`` and the start of ``body`` or
    ``frameset``."""
    __slots__ = tuple()

    def processEOF(self):
        self.anythingElse()
        return True

    def processCharacters(self, token):
        self.anythingElse()
        return token

    def startTagHtml(self, token):
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagBody(self, token):
        self.parser.framesetOK = False
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inBody"]

    def startTagFrameset(self, token):
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inFrameset"]

    def startTagFromHead(self, token):
        self.parser.parseError("unexpected-start-tag-out-of-my-head",
                               {"name": token["name"]})
        # Temporarily reopen <head>, let the "inHead" phase handle the tag,
        # then take the innermost "head" element off the stack again.
        open_elements = self.tree.openElements
        open_elements.append(self.tree.headPointer)
        self.parser.phases["inHead"].processStartTag(token)
        for candidate in self.tree.openElements[::-1]:
            if candidate.name == "head":
                self.tree.openElements.remove(candidate)
                break

    def startTagHead(self, token):
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})

    def startTagOther(self, token):
        self.anythingElse()
        return token

    def endTagHtmlBodyBr(self, token):
        self.anythingElse()
        return token

    def endTagOther(self, token):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def anythingElse(self):
        # Imply a <body> start tag and carry on in the "inBody" phase.
        implied_body = impliedTagToken("body", "StartTag")
        self.tree.insertElement(implied_body)
        self.parser.phase = self.parser.phases["inBody"]
        self.parser.framesetOK = True

    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("body", startTagBody),
        ("frameset", startTagFrameset),
        (("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
          "style", "title"),
         startTagFromHead),
        ("head", startTagHead),
    ])
    startTagHandler.default = startTagOther
    endTagHandler = _utils.MethodDispatcher([
        (("body", "html", "br"), endTagHtmlBodyBr),
    ])
    endTagHandler.default = endTagOther

940 

941 class InBodyPhase(Phase): 

942 # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody 

943 # the really-really-really-very crazy mode 

944 __slots__ = ("processSpaceCharacters",) 

945 

def __init__(self, *args, **kwargs):
    """Initialise the phase and select the default whitespace handler.

    All positional and keyword arguments are forwarded unchanged to the
    parent class initialiser.
    """
    super(InBodyPhase, self).__init__(*args, **kwargs)
    # Set this to the default handler
    self.processSpaceCharacters = self.processSpaceCharactersNonPre

950 

def isMatchingFormattingElement(self, node1, node2):
    """Return whether two nodes have equal name, namespace and attributes."""
    if not node1.name == node2.name:
        return False
    if not node1.namespace == node2.namespace:
        return False
    return node1.attributes == node2.attributes

955 

956 # helper 

def addFormattingElement(self, token):
    """Insert the element for ``token`` and record it as an active
    formatting element.

    If three matching entries already exist after the last ``Marker``, the
    earliest of those three is removed first.
    """
    self.tree.insertElement(token)
    new_element = self.tree.openElements[-1]

    # Collect matching entries, walking back from the end to the last Marker.
    duplicates = []
    for candidate in reversed(self.tree.activeFormattingElements):
        if candidate is Marker:
            break
        if self.isMatchingFormattingElement(candidate, new_element):
            duplicates.append(candidate)

    duplicate_count = len(duplicates)
    assert duplicate_count <= 3
    if duplicate_count == 3:
        self.tree.activeFormattingElements.remove(duplicates[-1])
    self.tree.activeFormattingElements.append(new_element)

972 

973 # the real deal 

def processEOF(self):
    """Report ``expected-closing-tag-but-got-eof`` (at most once) if any
    open element is not in the allowed set below."""
    may_stay_open = frozenset(["dd", "dt", "li", "p", "tbody", "td",
                               "tfoot", "th", "thead", "tr", "body",
                               "html"])
    open_from_top = reversed(self.tree.openElements)
    if any(el.name not in may_stay_open for el in open_from_top):
        self.parser.parseError("expected-closing-tag-but-got-eof")
    # Stop parsing

983 

def processSpaceCharactersDropNewline(self, token):
    """One-shot whitespace handler that may drop a leading newline.

    The handler first restores ``processSpaceCharactersNonPre``.  A leading
    ``"\\n"`` is removed when the current node is a ``pre``, ``listing`` or
    ``textarea`` without content; any remaining text is inserted.
    """
    # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we
    # want to drop leading newlines
    text = token["data"]
    self.processSpaceCharacters = self.processSpaceCharactersNonPre
    if text.startswith("\n"):
        current = self.tree.openElements[-1]
        if current.name in ("pre", "listing", "textarea") and not current.hasContent():
            text = text[1:]
    if not text:
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(text)

996 

def processCharacters(self, token):
    """Insert character data into the tree.

    A token whose data is exactly ``"\\u0000"`` is dropped.  Otherwise the
    active formatting elements are reconstructed and the text is inserted.
    If ``framesetOK`` is still set and the text contains any character that
    is not in ``spaceCharacters``, ``framesetOK`` is cleared.
    """
    if token["data"] == "\u0000":
        # The tokenizer should always emit null on its own
        return
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertText(token["data"])
    # A generator lets any() stop at the first non-space character
    # instead of building a full list first.
    if (self.parser.framesetOK and
            any(char not in spaceCharacters
                for char in token["data"])):
        self.parser.framesetOK = False

1008 

def processSpaceCharactersNonPre(self, token):
    """Default whitespace handler: reconstruct the active formatting
    elements, then insert the token's text unchanged."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertText(token["data"])

1012 

def startTagProcessInHead(self, token):
    """Delegate the start tag to the "inHead" phase and return its result."""
    in_head = self.parser.phases["inHead"]
    return in_head.processStartTag(token)

1015 

def startTagBody(self, token):
    """Handle a ``<body>`` start tag seen while already in this phase.

    Always reports ``unexpected-start-tag``.  When the second open element
    is a ``body``, ``framesetOK`` is cleared and attributes from the token
    that the body does not yet have are copied onto it.
    """
    self.parser.parseError("unexpected-start-tag", {"name": "body"})
    stack = self.tree.openElements
    body_is_open = len(stack) != 1 and stack[1].name == "body"
    if not body_is_open:
        assert self.parser.innerHTML
        return
    self.parser.framesetOK = False
    for attr, value in token["data"].items():
        # Existing attributes on the body element win.
        if attr not in self.tree.openElements[1].attributes:
            self.tree.openElements[1].attributes[attr] = value

1026 

def startTagFrameset(self, token):
    """Handle a ``<frameset>`` start tag seen while in this phase.

    Always reports ``unexpected-start-tag``.  The token is ignored unless
    the second open element is a ``body`` and ``framesetOK`` is still set;
    in that case the body is detached from its parent, the stack is popped
    back to ``html``, the frameset is inserted and the parser switches to
    the "inFrameset" phase.
    """
    self.parser.parseError("unexpected-start-tag", {"name": "frameset"})
    stack = self.tree.openElements
    if len(stack) == 1 or stack[1].name != "body":
        assert self.parser.innerHTML
        return
    if not self.parser.framesetOK:
        return
    body = self.tree.openElements[1]
    if body.parent:
        body.parent.removeChild(body)
    while self.tree.openElements[-1].name != "html":
        self.tree.openElements.pop()
    self.tree.insertElement(token)
    self.parser.phase = self.parser.phases["inFrameset"]

1040 

def startTagCloseP(self, token):
    """Close a ``p`` that is in button scope, then insert the element."""
    p_is_open = self.tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)

1045 

def startTagPreListing(self, token):
    """Insert a ``pre``/``listing`` element.

    Closes a ``p`` in button scope first, clears ``framesetOK`` and installs
    the newline-dropping whitespace handler.
    """
    p_is_open = self.tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    self.parser.framesetOK = False
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline

1052 

def startTagForm(self, token):
    """Insert a ``form`` element and remember it as the form pointer.

    If a form pointer is already set, only an ``unexpected-start-tag``
    parse error is reported.
    """
    tree = self.tree
    if tree.formPointer:
        self.parser.parseError("unexpected-start-tag", {"name": "form"})
        return
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tree.formPointer = tree.openElements[-1]

1061 

def startTagListItem(self, token):
    """Insert an ``li``, ``dd`` or ``dt`` element.

    Walks the open elements from the top: the first element whose name is
    in the stop list for this tag gets an implied end tag; the walk also
    stops at any special element other than ``address``, ``div`` or ``p``.
    A ``p`` in button scope is then closed before the element is inserted.
    """
    self.parser.framesetOK = False

    closes = {"li": ["li"],
              "dt": ["dt", "dd"],
              "dd": ["dt", "dd"]}
    names_to_close = closes[token["name"]]
    for open_node in reversed(self.tree.openElements):
        if open_node.name in names_to_close:
            implied = impliedTagToken(open_node.name, "EndTag")
            self.parser.phase.processEndTag(implied)
            break
        if open_node.nameTuple not in specialElements:
            continue
        if open_node.name not in ("address", "div", "p"):
            break

    if self.tree.elementInScope("p", variant="button"):
        implied = impliedTagToken("p", "EndTag")
        self.parser.phase.processEndTag(implied)

    self.tree.insertElement(token)

1083 

def startTagPlaintext(self, token):
    """Insert a ``plaintext`` element and switch the tokenizer to its
    ``plaintextState``; a ``p`` in button scope is closed first."""
    p_is_open = self.tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    self.tree.insertElement(token)
    tokenizer = self.parser.tokenizer
    tokenizer.state = tokenizer.plaintextState

1089 

def startTagHeading(self, token):
    """Insert a heading element.

    Closes a ``p`` in button scope first.  If the current node is itself a
    heading, a parse error is reported and that node is popped.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    current_is_heading = tree.openElements[-1].name in headingElements
    if current_is_heading:
        self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
        tree.openElements.pop()
    tree.insertElement(token)

1097 

def startTagA(self, token):
    """Insert an ``a`` element as a formatting element.

    If an ``a`` is already among the active formatting elements, a parse
    error is reported, an implied ``</a>`` is run through
    ``endTagFormatting`` and the old element is removed from both the open
    elements and the active formatting elements if still present.
    """
    previous_anchor = self.tree.elementInActiveFormattingElements("a")
    if previous_anchor:
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "a", "endName": "a"})
        self.endTagFormatting(impliedTagToken("a"))
        open_elements = self.tree.openElements
        if previous_anchor in open_elements:
            open_elements.remove(previous_anchor)
        active = self.tree.activeFormattingElements
        if previous_anchor in active:
            active.remove(previous_anchor)
    self.tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)

1110 

def startTagFormatting(self, token):
    """Reconstruct the active formatting elements, then add the element
    for ``token`` as a formatting element."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)

1114 

def startTagNobr(self, token):
    """Insert a ``nobr`` formatting element.

    If a ``nobr`` is already in scope, a parse error is reported, an
    implied ``</nobr>`` is processed and the active formatting elements are
    reconstructed a second time.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    nobr_is_open = tree.elementInScope("nobr")
    if nobr_is_open:
        self.parser.parseError("unexpected-start-tag-implies-end-tag",
                               {"startName": "nobr", "endName": "nobr"})
        self.processEndTag(impliedTagToken("nobr"))
        # XXX Need tests that trigger the following
        tree.reconstructActiveFormattingElements()
    self.addFormattingElement(token)

1124 

def startTagButton(self, token):
    """Insert a ``button`` element.

    If a ``button`` is already in scope, a parse error is reported, an
    implied ``</button>`` is processed and the token is returned to the
    caller.  Otherwise the element is inserted and ``framesetOK`` cleared.
    """
    if not self.tree.elementInScope("button"):
        self.tree.reconstructActiveFormattingElements()
        self.tree.insertElement(token)
        self.parser.framesetOK = False
        return None
    self.parser.parseError("unexpected-start-tag-implies-end-tag",
                           {"startName": "button", "endName": "button"})
    self.processEndTag(impliedTagToken("button"))
    return token

1135 

def startTagAppletMarqueeObject(self, token):
    """Insert the element, append a ``Marker`` to the active formatting
    elements and clear ``framesetOK``."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.activeFormattingElements.append(Marker)
    self.parser.framesetOK = False

1141 

def startTagXmp(self, token):
    """Handle ``<xmp>``: close a ``p`` in button scope, clear ``framesetOK``
    and parse the element's content as RAWTEXT."""
    p_is_open = self.tree.elementInScope("p", variant="button")
    if p_is_open:
        self.endTagP(impliedTagToken("p"))
    self.tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.framesetOK = False
    parser.parseRCDataRawtext(token, "RAWTEXT")

1148 

def startTagTable(self, token):
    """Insert a ``table`` element and switch to the "inTable" phase.

    Unless ``compatMode`` is ``"quirks"``, a ``p`` in button scope is
    closed first.  ``framesetOK`` is cleared.
    """
    parser = self.parser
    if parser.compatMode != "quirks" and self.tree.elementInScope("p", variant="button"):
        self.processEndTag(impliedTagToken("p"))
    self.tree.insertElement(token)
    parser.framesetOK = False
    parser.phase = parser.phases["inTable"]

1156 

def startTagVoidFormatting(self, token):
    """Insert an element and pop it again straight away.

    Marks the token's self-closing flag as acknowledged and clears
    ``framesetOK``.
    """
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False

1163 

def startTagInput(self, token):
    """Insert an ``input`` element via ``startTagVoidFormatting``.

    When the ``type`` attribute lower-cases to ``"hidden"``, the previous
    ``framesetOK`` value is restored afterwards.
    """
    saved_frameset_ok = self.parser.framesetOK
    self.startTagVoidFormatting(token)
    if "type" not in token["data"]:
        return
    input_type = token["data"]["type"].translate(asciiUpper2Lower)
    if input_type == "hidden":
        # input type=hidden doesn't change framesetOK
        self.parser.framesetOK = saved_frameset_ok

1171 

def startTagParamSource(self, token):
    """Insert the element, pop it again and acknowledge the token's
    self-closing flag."""
    tree = self.tree
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

1176 

def startTagHr(self, token):
    """Insert an ``hr`` element and pop it again.

    Closes a ``p`` in button scope first, acknowledges the self-closing
    flag and clears ``framesetOK``.
    """
    tree = self.tree
    if tree.elementInScope("p", variant="button"):
        self.endTagP(impliedTagToken("p"))
    tree.insertElement(token)
    tree.openElements.pop()
    token["selfClosingAcknowledged"] = True
    self.parser.framesetOK = False

1184 

def startTagImage(self, token):
    """Treat ``<image>`` as ``<img>``: report a parse error, then process
    an implied ``img`` start tag with the same attributes and self-closing
    flag."""
    # No really...
    self.parser.parseError("unexpected-start-tag-treated-as",
                           {"originalName": "image", "newName": "img"})
    img_token = impliedTagToken("img", "StartTag",
                                attributes=token["data"],
                                selfClosing=token["selfClosing"])
    self.processStartTag(img_token)

1192 

def startTagIsIndex(self, token):
    """Expand ``<isindex>`` into form, hr, label, text, input, hr.

    Always reports ``deprecated-tag``.  Nothing else happens when a form
    pointer is already set.  ``action`` is moved onto the implied form,
    ``prompt`` becomes the label text, and the remaining attributes plus
    ``name="isindex"`` go onto the implied input.
    """
    self.parser.parseError("deprecated-tag", {"name": "isindex"})
    if self.tree.formPointer:
        return

    form_attrs = {}
    if "action" in token["data"]:
        form_attrs["action"] = token["data"]["action"]
    self.processStartTag(impliedTagToken("form", "StartTag",
                                         attributes=form_attrs))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processStartTag(impliedTagToken("label", "StartTag"))

    # XXX Localization ...
    prompt = "This is a searchable index. Enter search keywords: "
    if "prompt" in token["data"]:
        prompt = token["data"]["prompt"]
    self.processCharacters(
        {"type": tokenTypes["Characters"], "data": prompt})

    # Everything except action/prompt is carried over to the input.
    input_attrs = token["data"].copy()
    for consumed in ("action", "prompt"):
        if consumed in input_attrs:
            del input_attrs[consumed]
    input_attrs["name"] = "isindex"
    self.processStartTag(impliedTagToken("input", "StartTag",
                                         attributes=input_attrs,
                                         selfClosing=token["selfClosing"]))
    self.processEndTag(impliedTagToken("label"))
    self.processStartTag(impliedTagToken("hr", "StartTag"))
    self.processEndTag(impliedTagToken("form"))

1223 

def startTagTextarea(self, token):
    """Insert a ``textarea``, switch the tokenizer to ``rcdataState``,
    install the newline-dropping whitespace handler and clear
    ``framesetOK``."""
    self.tree.insertElement(token)
    parser = self.parser
    parser.tokenizer.state = parser.tokenizer.rcdataState
    self.processSpaceCharacters = self.processSpaceCharactersDropNewline
    parser.framesetOK = False

1229 

def startTagIFrame(self, token):
    """Clear ``framesetOK`` and handle the element via ``startTagRawtext``."""
    parser = self.parser
    parser.framesetOK = False
    self.startTagRawtext(token)

1233 

def startTagNoscript(self, token):
    """Route ``<noscript>`` to ``startTagRawtext`` when ``parser.scripting``
    is truthy, otherwise to ``startTagOther``."""
    handler = self.startTagRawtext if self.parser.scripting else self.startTagOther
    handler(token)

1239 

def startTagRawtext(self, token):
    """iframe, noembed noframes, noscript(if scripting enabled)

    Hands the token to ``parser.parseRCDataRawtext`` with content type
    ``"RAWTEXT"``.
    """
    content_type = "RAWTEXT"
    self.parser.parseRCDataRawtext(token, content_type)

1243 

def startTagOpt(self, token):
    """Insert an ``option``/``optgroup`` element, first closing the
    current node with an implied end tag if it is an ``option``."""
    current = self.tree.openElements[-1]
    if current.name == "option":
        self.parser.phase.processEndTag(impliedTagToken("option"))
    self.tree.reconstructActiveFormattingElements()
    self.parser.tree.insertElement(token)

1249 

def startTagSelect(self, token):
    """Insert a ``select`` element and pick the follow-up phase.

    If the parser's current phase is one of the table-related phases
    listed below, the next phase is "inSelectInTable", otherwise
    "inSelect".  ``framesetOK`` is cleared.
    """
    self.tree.reconstructActiveFormattingElements()
    self.tree.insertElement(token)
    parser = self.parser
    parser.framesetOK = False
    table_phases = tuple(parser.phases[key] for key in (
        "inTable", "inCaption", "inColumnGroup",
        "inTableBody", "inRow", "inCell"))
    next_key = "inSelectInTable" if parser.phase in table_phases else "inSelect"
    parser.phase = parser.phases[next_key]

1263 

def startTagRpRt(self, token):
    """Insert an ``rp``/``rt`` element.

    When a ``ruby`` is in scope, implied end tags are generated first and a
    parse error is reported if the current node is then not ``ruby``.
    """
    tree = self.tree
    if tree.elementInScope("ruby"):
        tree.generateImpliedEndTags()
        current = tree.openElements[-1]
        if current.name != "ruby":
            self.parser.parseError()
    tree.insertElement(token)

1270 

def startTagMath(self, token):
    """Insert a ``math`` element in the MathML namespace.

    The token's attributes are adjusted first; a self-closing token is
    popped again immediately and its flag acknowledged.
    """
    self.tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.adjustMathMLAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["mathml"]
    self.tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    self.tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

1282 

def startTagSvg(self, token):
    """Insert an ``svg`` element in the SVG namespace.

    The token's attributes are adjusted first; a self-closing token is
    popped again immediately and its flag acknowledged.
    """
    self.tree.reconstructActiveFormattingElements()
    parser = self.parser
    parser.adjustSVGAttributes(token)
    parser.adjustForeignAttributes(token)
    token["namespace"] = namespaces["svg"]
    self.tree.insertElement(token)
    # Need to get the parse error right for the case where the token
    # has a namespace not equal to the xmlns attribute
    if not token["selfClosing"]:
        return
    self.tree.openElements.pop()
    token["selfClosingAcknowledged"] = True

1294 

def startTagMisplaced(self, token):
    """Ignore a start tag that is only handled in another insertion mode.

    The start-tag dispatch table of this phase routes "caption", "col",
    "colgroup", "frame", "head", "tbody", "td", "tfoot", "th", "thead"
    and "tr" here.  The token is dropped after reporting
    ``unexpected-start-tag-ignored``.
    """
    details = {"name": token["name"]}
    self.parser.parseError("unexpected-start-tag-ignored", details)

1303 

def startTagOther(self, token):
    """Fallback start-tag handler: reconstruct the active formatting
    elements and insert the element."""
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    tree.insertElement(token)

1307 

def endTagP(self, token):
    """Close the nearest ``p`` element.

    With a ``p`` in button scope: generate implied end tags (except for
    ``p``), report a parse error if the current node is not ``p``, then pop
    up to and including the ``p``.  Without one: insert an implied ``<p>``,
    report a parse error and run this handler again.
    """
    if self.tree.elementInScope("p", variant="button"):
        self.tree.generateImpliedEndTags("p")
        if self.tree.openElements[-1].name != "p":
            self.parser.parseError("unexpected-end-tag", {"name": "p"})
        while self.tree.openElements.pop().name != "p":
            pass
        return
    self.startTagCloseP(impliedTagToken("p", "StartTag"))
    self.parser.parseError("unexpected-end-tag", {"name": "p"})
    self.endTagP(impliedTagToken("p", "EndTag"))

1320 

def endTagBody(self, token):
    """Handle ``</body>``.

    If no ``body`` is in scope, a parse error without an error code is
    reported and the token is ignored.  Otherwise, when the current node is
    not ``body``, the open elements from index 2 onward are scanned and one
    ``expected-one-end-tag-but-got-another`` error is reported for the
    first element not in the allowed set.  The parser then switches to the
    "afterBody" phase.
    """
    if not self.tree.elementInScope("body"):
        self.parser.parseError()
        return
    elif self.tree.openElements[-1].name != "body":
        # Built once up front rather than on every loop iteration.
        may_stay_open = frozenset(("dd", "dt", "li", "optgroup",
                                   "option", "p", "rp", "rt",
                                   "tbody", "td", "tfoot",
                                   "th", "thead", "tr", "body",
                                   "html"))
        for node in self.tree.openElements[2:]:
            if node.name not in may_stay_open:
                # Not sure this is the correct name for the parse error
                self.parser.parseError(
                    "expected-one-end-tag-but-got-another",
                    {"gotName": "body", "expectedName": node.name})
                break
    self.parser.phase = self.parser.phases["afterBody"]

1338 

def endTagHtml(self, token):
    """Handle ``</html>``: when a ``body`` is in scope, process an implied
    ``</body>`` and return the token; otherwise return ``None``."""
    # We repeat the test for the body end tag token being ignored here
    if not self.tree.elementInScope("body"):
        return None
    self.endTagBody(impliedTagToken("body"))
    return token

1344 

def endTagBlock(self, token):
    """Close a block-level element named by the end tag.

    Implied end tags are generated only if the element is in scope; a
    parse error is reported whenever the current node does not match; the
    stack is popped up to and including the element only if it was in
    scope.
    """
    tag_name = token["name"]
    # Put us back in the right whitespace handling mode
    if tag_name == "pre":
        self.processSpaceCharacters = self.processSpaceCharactersNonPre
    in_scope = self.tree.elementInScope(token["name"])
    if in_scope:
        self.tree.generateImpliedEndTags()
    if self.tree.openElements[-1].name != token["name"]:
        self.parser.parseError("end-tag-too-early", {"name": token["name"]})
    if not in_scope:
        return
    while self.tree.openElements.pop().name != token["name"]:
        pass

1358 

def endTagForm(self, token):
    """Handle ``</form>``.

    The form pointer is always cleared.  If it was unset or the form is not
    in scope, ``unexpected-end-tag`` is reported.  Otherwise implied end
    tags are generated, a parse error is reported if the form is not the
    current node, and the form is removed from the open elements.
    """
    tree = self.tree
    form = tree.formPointer
    tree.formPointer = None
    if form is None or not tree.elementInScope(form):
        self.parser.parseError("unexpected-end-tag",
                               {"name": "form"})
        return
    tree.generateImpliedEndTags()
    if tree.openElements[-1] != form:
        self.parser.parseError("end-tag-too-early-ignored",
                               {"name": "form"})
    tree.openElements.remove(form)

1371 

def endTagListItem(self, token):
    """Close an ``li``, ``dd`` or ``dt`` element.

    ``li`` is looked up in "list" scope, the others in the default scope.
    If the element is not in scope only a parse error is reported.
    """
    tag_name = token["name"]
    scope_variant = "list" if tag_name == "li" else None
    if not self.tree.elementInScope(token["name"], variant=scope_variant):
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
        return
    self.tree.generateImpliedEndTags(exclude=token["name"])
    if self.tree.openElements[-1].name != token["name"]:
        self.parser.parseError(
            "end-tag-too-early",
            {"name": token["name"]})
    # Pop up to and including the matching element.
    while self.tree.openElements.pop().name != token["name"]:
        pass

1388 

def endTagHeading(self, token):
    """Close a heading element.

    If any heading element is in scope, implied end tags are generated.  A
    parse error is reported when the current node does not match the
    token.  If any heading is in scope, the stack is then popped up to and
    including the first heading element found.
    """
    tree = self.tree
    if any(tree.elementInScope(heading) for heading in headingElements):
        tree.generateImpliedEndTags()
    if tree.openElements[-1].name != token["name"]:
        self.parser.parseError("end-tag-too-early", {"name": token["name"]})

    if any(tree.elementInScope(heading) for heading in headingElements):
        while tree.openElements.pop().name not in headingElements:
            pass

1403 

def endTagFormatting(self, token):
    """The much-feared adoption agency algorithm

    Handles the end tag of a formatting element.  The outer loop runs at
    most 8 times and the inner loop at most 3 times per outer iteration.
    Every ``return`` below ends the whole algorithm; the method never
    returns a value.
    """
    # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867
    # XXX Better parseError messages appreciated.

    # Step 1
    outerLoopCounter = 0

    # Step 2
    while outerLoopCounter < 8:

        # Step 3
        outerLoopCounter += 1

        # Step 4:

        # Let the formatting element be the last element in
        # the list of active formatting elements that:
        # - is between the end of the list and the last scope
        # marker in the list, if any, or the start of the list
        # otherwise, and
        # - has the same tag name as the token.
        formattingElement = self.tree.elementInActiveFormattingElements(
            token["name"])
        if (not formattingElement or
            (formattingElement in self.tree.openElements and
             not self.tree.elementInScope(formattingElement.name))):
            # If there is no such node, then abort these steps
            # and instead act as described in the "any other
            # end tag" entry below.
            self.endTagOther(token)
            return

        # Otherwise, if there is such a node, but that node is
        # not in the stack of open elements, then this is a
        # parse error; remove the element from the list, and
        # abort these steps.
        elif formattingElement not in self.tree.openElements:
            self.parser.parseError("adoption-agency-1.2", {"name": token["name"]})
            self.tree.activeFormattingElements.remove(formattingElement)
            return

        # Otherwise, if there is such a node, and that node is
        # also in the stack of open elements, but the element
        # is not in scope, then this is a parse error; ignore
        # the token, and abort these steps.
        # NOTE(review): the first condition above already catches
        # "in the open elements but not in scope", so this branch looks
        # unreachable as written.
        elif not self.tree.elementInScope(formattingElement.name):
            self.parser.parseError("adoption-agency-4.4", {"name": token["name"]})
            return

        # Otherwise, there is a formatting element and that
        # element is in the stack and is in scope. If the
        # element is not the current node, this is a parse
        # error. In any case, proceed with the algorithm as
        # written in the following steps.
        else:
            if formattingElement != self.tree.openElements[-1]:
                self.parser.parseError("adoption-agency-1.3", {"name": token["name"]})

        # Step 5:

        # Let the furthest block be the topmost node in the
        # stack of open elements that is lower in the stack
        # than the formatting element, and is an element in
        # the special category. There might not be one.
        afeIndex = self.tree.openElements.index(formattingElement)
        furthestBlock = None
        for element in self.tree.openElements[afeIndex:]:
            if element.nameTuple in specialElements:
                furthestBlock = element
                break

        # Step 6:

        # If there is no furthest block, then the UA must
        # first pop all the nodes from the bottom of the stack
        # of open elements, from the current node up to and
        # including the formatting element, then remove the
        # formatting element from the list of active
        # formatting elements, and finally abort these steps.
        if furthestBlock is None:
            element = self.tree.openElements.pop()
            while element != formattingElement:
                element = self.tree.openElements.pop()
            self.tree.activeFormattingElements.remove(element)
            return

        # Step 7
        # The element just before the formatting element on the stack.
        commonAncestor = self.tree.openElements[afeIndex - 1]

        # Step 8:
        # The bookmark is supposed to help us identify where to reinsert
        # nodes in step 15. We have to ensure that we reinsert nodes after
        # the node before the active formatting element. Note the bookmark
        # can move in step 9.7
        bookmark = self.tree.activeFormattingElements.index(formattingElement)

        # Step 9
        lastNode = node = furthestBlock
        innerLoopCounter = 0

        index = self.tree.openElements.index(node)
        while innerLoopCounter < 3:
            innerLoopCounter += 1
            # Node is element before node in open elements
            index -= 1
            node = self.tree.openElements[index]
            # Elements that are not active formatting elements are
            # dropped from the stack and skipped.
            if node not in self.tree.activeFormattingElements:
                self.tree.openElements.remove(node)
                continue
            # Step 9.6
            if node == formattingElement:
                break
            # Step 9.7
            if lastNode == furthestBlock:
                bookmark = self.tree.activeFormattingElements.index(node) + 1
            # Step 9.8
            clone = node.cloneNode()
            # Replace node with clone
            self.tree.activeFormattingElements[
                self.tree.activeFormattingElements.index(node)] = clone
            self.tree.openElements[
                self.tree.openElements.index(node)] = clone
            node = clone
            # Step 9.9
            # Remove lastNode from its parents, if any
            if lastNode.parent:
                lastNode.parent.removeChild(lastNode)
            node.appendChild(lastNode)
            # Step 9.10
            lastNode = node

        # Step 10
        # Foster parent lastNode if commonAncestor is a
        # table, tbody, tfoot, thead, or tr we need to foster
        # parent the lastNode
        if lastNode.parent:
            lastNode.parent.removeChild(lastNode)

        if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")):
            parent, insertBefore = self.tree.getTableMisnestedNodePosition()
            parent.insertBefore(lastNode, insertBefore)
        else:
            commonAncestor.appendChild(lastNode)

        # Step 11
        clone = formattingElement.cloneNode()

        # Step 12
        # Move the furthest block's children under the clone ...
        furthestBlock.reparentChildren(clone)

        # Step 13
        # ... and make the clone the furthest block's child.
        furthestBlock.appendChild(clone)

        # Step 14
        self.tree.activeFormattingElements.remove(formattingElement)
        self.tree.activeFormattingElements.insert(bookmark, clone)

        # Step 15
        self.tree.openElements.remove(formattingElement)
        self.tree.openElements.insert(
            self.tree.openElements.index(furthestBlock) + 1, clone)

1566 

def endTagAppletMarqueeObject(self, token):
    """Close an ``applet``, ``marquee`` or ``object`` element.

    A parse error is reported whenever the current node does not match the
    token.  If the element is in scope, the stack is popped up to and
    including it and the active formatting elements are cleared.
    """
    tag_name = token["name"]
    if self.tree.elementInScope(tag_name):
        self.tree.generateImpliedEndTags()
    if self.tree.openElements[-1].name != token["name"]:
        self.parser.parseError("end-tag-too-early", {"name": token["name"]})

    if not self.tree.elementInScope(token["name"]):
        return
    while self.tree.openElements.pop().name != token["name"]:
        pass
    self.tree.clearActiveFormattingElements()

1578 

def endTagBr(self, token):
    """Treat ``</br>`` as a ``br`` element: report a parse error, insert
    an implied ``br`` start tag and pop it again."""
    self.parser.parseError("unexpected-end-tag-treated-as",
                           {"originalName": "br", "newName": "br element"})
    tree = self.tree
    tree.reconstructActiveFormattingElements()
    implied_br = impliedTagToken("br", "StartTag")
    tree.insertElement(implied_br)
    tree.openElements.pop()

1585 

def endTagOther(self, token):
    """Fallback end-tag handler.

    Walks the open elements from the top.  On the first element whose name
    matches the token: generate implied end tags (except for that name),
    report a parse error if the current node does not match, and pop up to
    and including that element.  If a special element is met first, a
    parse error is reported and the token is ignored.
    """
    for candidate in reversed(self.tree.openElements):
        if candidate.name != token["name"]:
            if candidate.nameTuple in specialElements:
                self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
                break
            continue
        self.tree.generateImpliedEndTags(exclude=token["name"])
        if self.tree.openElements[-1].name != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
        while self.tree.openElements.pop() != candidate:
            pass
        break

1599 

# Start-tag dispatch for this phase: tag name(s) -> handler.  Any name not
# listed falls through to ``startTagOther`` via ``.default``.
startTagHandler = _utils.MethodDispatcher([
    ("html", Phase.startTagHtml),
    (("base", "basefont", "bgsound", "command", "link", "meta",
      "script", "style", "title"),
     startTagProcessInHead),
    ("body", startTagBody),
    ("frameset", startTagFrameset),
    (("address", "article", "aside", "blockquote", "center", "details",
      "dir", "div", "dl", "fieldset", "figcaption", "figure",
      "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p",
      "section", "summary", "ul"),
     startTagCloseP),
    (headingElements, startTagHeading),
    (("pre", "listing"), startTagPreListing),
    ("form", startTagForm),
    (("li", "dd", "dt"), startTagListItem),
    ("plaintext", startTagPlaintext),
    ("a", startTagA),
    (("b", "big", "code", "em", "font", "i", "s", "small", "strike",
      "strong", "tt", "u"), startTagFormatting),
    ("nobr", startTagNobr),
    ("button", startTagButton),
    (("applet", "marquee", "object"), startTagAppletMarqueeObject),
    ("xmp", startTagXmp),
    ("table", startTagTable),
    (("area", "br", "embed", "img", "keygen", "wbr"),
     startTagVoidFormatting),
    (("param", "source", "track"), startTagParamSource),
    ("input", startTagInput),
    ("hr", startTagHr),
    ("image", startTagImage),
    ("isindex", startTagIsIndex),
    ("textarea", startTagTextarea),
    ("iframe", startTagIFrame),
    ("noscript", startTagNoscript),
    (("noembed", "noframes"), startTagRawtext),
    ("select", startTagSelect),
    (("rp", "rt"), startTagRpRt),
    (("option", "optgroup"), startTagOpt),
    # ("math") and ("svg") are plain strings, not 1-tuples.
    (("math"), startTagMath),
    (("svg"), startTagSvg),
    (("caption", "col", "colgroup", "frame", "head",
      "tbody", "td", "tfoot", "th", "thead",
      "tr"), startTagMisplaced)
])
startTagHandler.default = startTagOther

# End-tag dispatch for this phase; unlisted names go to ``endTagOther``.
endTagHandler = _utils.MethodDispatcher([
    ("body", endTagBody),
    ("html", endTagHtml),
    (("address", "article", "aside", "blockquote", "button", "center",
      "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure",
      "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre",
      "section", "summary", "ul"), endTagBlock),
    ("form", endTagForm),
    ("p", endTagP),
    (("dd", "dt", "li"), endTagListItem),
    (headingElements, endTagHeading),
    (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small",
      "strike", "strong", "tt", "u"), endTagFormatting),
    (("applet", "marquee", "object"), endTagAppletMarqueeObject),
    ("br", endTagBr),
])
endTagHandler.default = endTagOther

1664 

class TextPhase(Phase):
    """Token handlers used while the text content of an element is being
    consumed; start tags are not expected in this phase."""
    __slots__ = ()

    def processCharacters(self, token):
        """Insert the token's text into the tree."""
        text = token["data"]
        self.tree.insertText(text)

    def processEOF(self):
        """Report the unclosed current node, pop it, restore the original
        phase and return ``True``."""
        unclosed = self.tree.openElements[-1]
        self.parser.parseError("expected-named-closing-tag-but-got-eof",
                               {"name": unclosed.name})
        self.tree.openElements.pop()
        self.parser.phase = self.parser.originalPhase
        return True

    def startTagOther(self, token):
        """Fail: a start tag must never reach this phase."""
        assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name']

    def endTagScript(self, token):
        """Pop the current node (asserted to be ``script``) and restore
        the original phase."""
        popped = self.tree.openElements.pop()
        assert popped.name == "script"
        self.parser.phase = self.parser.originalPhase
        # The rest of this method is all stuff that only happens if
        # document.write works

    def endTagOther(self, token):
        """Pop the current node and restore the original phase."""
        self.tree.openElements.pop()
        parser = self.parser
        parser.phase = parser.originalPhase

    startTagHandler = _utils.MethodDispatcher([])
    startTagHandler.default = startTagOther
    endTagHandler = _utils.MethodDispatcher([
        ("script", endTagScript),
    ])
    endTagHandler.default = endTagOther

1697 

1698 class InTablePhase(Phase): 

1699 # http://www.whatwg.org/specs/web-apps/current-work/#in-table 

1700 __slots__ = tuple() 

1701 

1702 # helper methods 

def clearStackToTableContext(self):
    """Pop open elements until the current node is ``table`` or ``html``."""
    # "clear the stack back to a table context"
    boundary = ("table", "html")
    while self.tree.openElements[-1].name not in boundary:
        self.tree.openElements.pop()
    # When the current node is <html> it's an innerHTML case

1710 

1711 # processing methods 

1712 def processEOF(self): 

1713 if self.tree.openElements[-1].name != "html": 

1714 self.parser.parseError("eof-in-table") 

1715 else: 

1716 assert self.parser.innerHTML 

1717 # Stop parsing 

1718 

1719 def processSpaceCharacters(self, token): 

1720 originalPhase = self.parser.phase 

1721 self.parser.phase = self.parser.phases["inTableText"] 

1722 self.parser.phase.originalPhase = originalPhase 

1723 self.parser.phase.processSpaceCharacters(token) 

1724 

1725 def processCharacters(self, token): 

1726 originalPhase = self.parser.phase 

1727 self.parser.phase = self.parser.phases["inTableText"] 

1728 self.parser.phase.originalPhase = originalPhase 

1729 self.parser.phase.processCharacters(token) 

1730 

1731 def insertText(self, token): 

1732 # If we get here there must be at least one non-whitespace character 

1733 # Do the table magic! 

1734 self.tree.insertFromTable = True 

1735 self.parser.phases["inBody"].processCharacters(token) 

1736 self.tree.insertFromTable = False 

1737 

1738 def startTagCaption(self, token): 

1739 self.clearStackToTableContext() 

1740 self.tree.activeFormattingElements.append(Marker) 

1741 self.tree.insertElement(token) 

1742 self.parser.phase = self.parser.phases["inCaption"] 

1743 

1744 def startTagColgroup(self, token): 

1745 self.clearStackToTableContext() 

1746 self.tree.insertElement(token) 

1747 self.parser.phase = self.parser.phases["inColumnGroup"] 

1748 

1749 def startTagCol(self, token): 

1750 self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) 

1751 return token 

1752 

1753 def startTagRowGroup(self, token): 

1754 self.clearStackToTableContext() 

1755 self.tree.insertElement(token) 

1756 self.parser.phase = self.parser.phases["inTableBody"] 

1757 

1758 def startTagImplyTbody(self, token): 

1759 self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) 

1760 return token 

1761 

1762 def startTagTable(self, token): 

1763 self.parser.parseError("unexpected-start-tag-implies-end-tag", 

1764 {"startName": "table", "endName": "table"}) 

1765 self.parser.phase.processEndTag(impliedTagToken("table")) 

1766 if not self.parser.innerHTML: 

1767 return token 

1768 

1769 def startTagStyleScript(self, token): 

1770 return self.parser.phases["inHead"].processStartTag(token) 

1771 

1772 def startTagInput(self, token): 

1773 if ("type" in token["data"] and 

1774 token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): 

1775 self.parser.parseError("unexpected-hidden-input-in-table") 

1776 self.tree.insertElement(token) 

1777 # XXX associate with form 

1778 self.tree.openElements.pop() 

1779 else: 

1780 self.startTagOther(token) 

1781 

1782 def startTagForm(self, token): 

1783 self.parser.parseError("unexpected-form-in-table") 

1784 if self.tree.formPointer is None: 

1785 self.tree.insertElement(token) 

1786 self.tree.formPointer = self.tree.openElements[-1] 

1787 self.tree.openElements.pop() 

1788 

1789 def startTagOther(self, token): 

1790 self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) 

1791 # Do the table magic! 

1792 self.tree.insertFromTable = True 

1793 self.parser.phases["inBody"].processStartTag(token) 

1794 self.tree.insertFromTable = False 

1795 

1796 def endTagTable(self, token): 

1797 if self.tree.elementInScope("table", variant="table"): 

1798 self.tree.generateImpliedEndTags() 

1799 if self.tree.openElements[-1].name != "table": 

1800 self.parser.parseError("end-tag-too-early-named", 

1801 {"gotName": "table", 

1802 "expectedName": self.tree.openElements[-1].name}) 

1803 while self.tree.openElements[-1].name != "table": 

1804 self.tree.openElements.pop() 

1805 self.tree.openElements.pop() 

1806 self.parser.resetInsertionMode() 

1807 else: 

1808 # innerHTML case 

1809 assert self.parser.innerHTML 

1810 self.parser.parseError() 

1811 

1812 def endTagIgnore(self, token): 

1813 self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) 

1814 

1815 def endTagOther(self, token): 

1816 self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) 

1817 # Do the table magic! 

1818 self.tree.insertFromTable = True 

1819 self.parser.phases["inBody"].processEndTag(token) 

1820 self.tree.insertFromTable = False 

1821 

1822 startTagHandler = _utils.MethodDispatcher([ 

1823 ("html", Phase.startTagHtml), 

1824 ("caption", startTagCaption), 

1825 ("colgroup", startTagColgroup), 

1826 ("col", startTagCol), 

1827 (("tbody", "tfoot", "thead"), startTagRowGroup), 

1828 (("td", "th", "tr"), startTagImplyTbody), 

1829 ("table", startTagTable), 

1830 (("style", "script"), startTagStyleScript), 

1831 ("input", startTagInput), 

1832 ("form", startTagForm) 

1833 ]) 

1834 startTagHandler.default = startTagOther 

1835 

1836 endTagHandler = _utils.MethodDispatcher([ 

1837 ("table", endTagTable), 

1838 (("body", "caption", "col", "colgroup", "html", "tbody", "td", 

1839 "tfoot", "th", "thead", "tr"), endTagIgnore) 

1840 ]) 

1841 endTagHandler.default = endTagOther 

1842 

class InTableTextPhase(Phase):
    """Phase that buffers character tokens seen inside table content.

    Tokens accumulate in ``characterTokens``; any non-character token flushes
    the buffer, restores ``originalPhase`` and is handed back to the caller.
    """
    __slots__ = ("originalPhase", "characterTokens")

    def __init__(self, *args, **kwargs):
        """Initialise the base phase with an empty buffer and no saved phase."""
        super(InTableTextPhase, self).__init__(*args, **kwargs)
        self.originalPhase = None
        self.characterTokens = []

    def flushCharacters(self):
        """Emit the buffered text and empty the buffer.

        Text containing any non-space character goes through the "inTable"
        phase's ``insertText``; non-empty whitespace-only text is inserted
        directly into the tree.
        """
        data = "".join(item["data"] for item in self.characterTokens)
        hasNonSpace = any(char not in spaceCharacters for char in data)
        if hasNonSpace:
            merged = {"type": tokenTypes["Characters"], "data": data}
            self.parser.phases["inTable"].insertText(merged)
        elif data:
            self.tree.insertText(data)
        self.characterTokens = []

    def processComment(self, token):
        """Flush buffered text, restore the saved phase and return the token."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

    def processEOF(self):
        """Flush buffered text, restore the saved phase and return True."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return True

    def processCharacters(self, token):
        """Buffer the token unless its data is exactly a single NUL character."""
        if token["data"] != "\u0000":
            self.characterTokens.append(token)

    def processSpaceCharacters(self, token):
        """Buffer a whitespace token alongside the other characters."""
        # pretty sure we should never reach here
        self.characterTokens.append(token)

    def processStartTag(self, token):
        """Flush buffered text, restore the saved phase and return the token."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

    def processEndTag(self, token):
        """Flush buffered text, restore the saved phase and return the token."""
        self.flushCharacters()
        self.parser.phase = self.originalPhase
        return token

1889 

class InCaptionPhase(Phase):
    """Phase for tokens seen inside a table <caption>."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-caption
    __slots__ = tuple()

    def ignoreEndTagCaption(self):
        """Return True when no <caption> is in table scope."""
        captionOpen = self.tree.elementInScope("caption", variant="table")
        return not captionOpen

    def processEOF(self):
        """Delegate EOF handling to the "inBody" phase."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character handling to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableElement(self, token):
        """Imply </caption> for a table-structure start tag.

        The token is returned only when the implied end tag was not ignored.
        """
        self.parser.parseError()
        # XXX Have to duplicate logic here to find out if the tag is ignored
        ignored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        if ignored:
            return None
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagCaption(self, token):
        """Close the caption and go back to the "inTable" phase."""
        if self.ignoreEndTagCaption():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        # AT this code is quite similar to endTagTable in "InTable"
        tree = self.tree
        tree.generateImpliedEndTags()
        stack = tree.openElements
        if stack[-1].name != "caption":
            self.parser.parseError("expected-one-end-tag-but-got-another",
                                   {"gotName": "caption",
                                    "expectedName": stack[-1].name})
        # Discard everything above the caption, then the caption itself.
        while stack[-1].name != "caption":
            stack.pop()
        stack.pop()
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Imply </caption> for </table>.

        The token is returned only when the implied end tag was not ignored.
        """
        self.parser.parseError()
        ignored = self.ignoreEndTagCaption()
        self.parser.phase.processEndTag(impliedTagToken("caption"))
        if ignored:
            return None
        return token

    def endTagIgnore(self, token):
        """Report and ignore an end tag that is not allowed here."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate any other end tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableElement),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("caption", endTagCaption),
        ("table", endTagTable),
        (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th",
          "thead", "tr"), endTagIgnore),
    ])
    endTagHandler.default = endTagOther

1959 

class InColumnGroupPhase(Phase):
    """Phase for tokens seen inside a <colgroup>."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-column
    __slots__ = tuple()

    def ignoreEndTagColgroup(self):
        """Return True when the current node is <html>."""
        current = self.tree.openElements[-1]
        return current.name == "html"

    def processEOF(self):
        """Close the column group on EOF.

        Returns None when the current node is <html> (innerHTML); otherwise
        implies </colgroup> and returns True if that end tag was not ignored.
        """
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        if not ignored:
            return True

    def processCharacters(self, token):
        """Imply </colgroup>; return the token unless that tag was ignored."""
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        if ignored:
            return None
        return token

    def startTagCol(self, token):
        """Insert a <col>, pop it immediately and acknowledge self-closing."""
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()
        token["selfClosingAcknowledged"] = True

    def startTagOther(self, token):
        """Imply </colgroup>; return the token unless that tag was ignored."""
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        if ignored:
            return None
        return token

    def endTagColgroup(self, token):
        """Pop the current node and return to "inTable", unless it is <html>."""
        if not self.ignoreEndTagColgroup():
            self.tree.openElements.pop()
            self.parser.phase = self.parser.phases["inTable"]
        else:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()

    def endTagCol(self, token):
        """Report a </col> end tag."""
        self.parser.parseError("no-end-tag", {"name": "col"})

    def endTagOther(self, token):
        """Imply </colgroup>; return the token unless that tag was ignored."""
        ignored = self.ignoreEndTagColgroup()
        self.endTagColgroup(impliedTagToken("colgroup"))
        if ignored:
            return None
        return token

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("col", startTagCol),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("colgroup", endTagColgroup),
        ("col", endTagCol),
    ])
    endTagHandler.default = endTagOther

2023 

class InTableBodyPhase(Phase):
    """Phase for tokens seen inside <tbody>, <thead> or <tfoot>."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-table0
    __slots__ = tuple()

    # helper methods
    def clearStackToTableBodyContext(self):
        """Pop open elements until a row group or <html> is the current node."""
        stack = self.tree.openElements
        while stack[-1].name not in ("tbody", "tfoot", "thead", "html"):
            stack.pop()
        if stack[-1].name == "html":
            assert self.parser.innerHTML

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the "inTable" phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace handling to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character handling to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTr(self, token):
        """Open a <tr> and enter the "inRow" phase."""
        self.clearStackToTableBodyContext()
        self.tree.insertElement(token)
        self.parser.phase = self.parser.phases["inRow"]

    def startTagTableCell(self, token):
        """Imply a <tr> for a stray cell start tag, then hand the token back."""
        self.parser.parseError("unexpected-cell-in-table-body",
                               {"name": token["name"]})
        impliedRow = impliedTagToken("tr", "StartTag")
        self.startTagTr(impliedRow)
        return token

    def startTagTableOther(self, token):
        """Close the open row group and return the token.

        If no row group is in table scope (innerHTML) only an error is
        reported and None is returned.
        """
        # XXX AT Any ideas on how to share this with endTagTable?
        rowGroupOpen = any(self.tree.elementInScope(name, variant="table")
                           for name in ("tbody", "thead", "tfoot"))
        if not rowGroupOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        currentName = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTableRowGroup(self, token):
        """Close the named row group if it is in table scope, else report."""
        name = token["name"]
        if not self.tree.elementInScope(name, variant="table"):
            self.parser.parseError("unexpected-end-tag-in-table-body",
                                   {"name": name})
            return
        self.clearStackToTableBodyContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTable"]

    def endTagTable(self, token):
        """Close the open row group for </table> and return the token.

        If no row group is in table scope (innerHTML) only an error is
        reported and None is returned.
        """
        rowGroupOpen = any(self.tree.elementInScope(name, variant="table")
                           for name in ("tbody", "thead", "tfoot"))
        if not rowGroupOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.clearStackToTableBodyContext()
        currentName = self.tree.openElements[-1].name
        self.endTagTableRowGroup(impliedTagToken(currentName))
        return token

    def endTagIgnore(self, token):
        """Report and ignore an end tag that is not allowed here."""
        self.parser.parseError("unexpected-end-tag-in-table-body",
                               {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate any other end tag to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("tr", startTagTr),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead"),
         startTagTableOther),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        ("table", endTagTable),
        (("body", "caption", "col", "colgroup", "html", "td", "th",
          "tr"), endTagIgnore),
    ])
    endTagHandler.default = endTagOther

2121 

class InRowPhase(Phase):
    """Phase for tokens seen inside a table row (<tr>)."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-row
    __slots__ = tuple()

    # helper methods (XXX unify this with other table helper methods)
    def clearStackToTableRowContext(self):
        """Pop (and report) open elements until <tr> or <html> is current."""
        stack = self.tree.openElements
        while stack[-1].name not in ("tr", "html"):
            self.parser.parseError("unexpected-implied-end-tag-in-table-row",
                                   {"name": stack[-1].name})
            stack.pop()

    def ignoreEndTagTr(self):
        """Return True when no <tr> is in table scope."""
        rowOpen = self.tree.elementInScope("tr", variant="table")
        return not rowOpen

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the "inTable" phase."""
        self.parser.phases["inTable"].processEOF()

    def processSpaceCharacters(self, token):
        """Delegate whitespace handling to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Delegate character handling to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processCharacters(token)

    def startTagTableCell(self, token):
        """Open a cell, enter "inCell" and push a formatting Marker."""
        tree = self.tree
        self.clearStackToTableRowContext()
        tree.insertElement(token)
        self.parser.phase = self.parser.phases["inCell"]
        tree.activeFormattingElements.append(Marker)

    def startTagTableOther(self, token):
        """Imply </tr>; return the token unless that end tag was ignored."""
        ignored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # XXX how are we sure it's always ignored in the innerHTML case?
        if ignored:
            return None
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processStartTag(token)

    def endTagTr(self, token):
        """Close the current row and go back to the "inTableBody" phase."""
        if self.ignoreEndTagTr():
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        self.clearStackToTableRowContext()
        self.tree.openElements.pop()
        self.parser.phase = self.parser.phases["inTableBody"]

    def endTagTable(self, token):
        """Imply </tr> for </table>; return the token unless it was ignored."""
        ignored = self.ignoreEndTagTr()
        self.endTagTr(impliedTagToken("tr"))
        # Reprocess the current tag if the tr end tag was not ignored
        # XXX how are we sure it's always ignored in the innerHTML case?
        if ignored:
            return None
        return token

    def endTagTableRowGroup(self, token):
        """Imply </tr> when the named row group is in table scope.

        Returns the token in that case; otherwise reports an error and
        returns None.
        """
        if not self.tree.elementInScope(token["name"], variant="table"):
            self.parser.parseError()
            return None
        self.endTagTr(impliedTagToken("tr"))
        return token

    def endTagIgnore(self, token):
        """Report and ignore an end tag that is not allowed here."""
        self.parser.parseError("unexpected-end-tag-in-table-row",
                               {"name": token["name"]})

    def endTagOther(self, token):
        """Delegate any other end tag to the "inTable" phase."""
        tablePhase = self.parser.phases["inTable"]
        return tablePhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("td", "th"), startTagTableCell),
        (("caption", "col", "colgroup", "tbody", "tfoot", "thead",
          "tr"), startTagTableOther),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("tr", endTagTr),
        ("table", endTagTable),
        (("tbody", "tfoot", "thead"), endTagTableRowGroup),
        (("body", "caption", "col", "colgroup", "html", "td", "th"),
         endTagIgnore),
    ])
    endTagHandler.default = endTagOther

2210 

class InCellPhase(Phase):
    """Phase for tokens seen inside a table cell (<td> or <th>)."""
    # http://www.whatwg.org/specs/web-apps/current-work/#in-cell
    __slots__ = tuple()

    # helper
    def closeCell(self):
        """Imply the end tag of whichever cell is in table scope (td first)."""
        tree = self.tree
        if tree.elementInScope("td", variant="table"):
            self.endTagTableCell(impliedTagToken("td"))
        elif tree.elementInScope("th", variant="table"):
            self.endTagTableCell(impliedTagToken("th"))

    # the rest
    def processEOF(self):
        """Delegate EOF handling to the "inBody" phase."""
        self.parser.phases["inBody"].processEOF()

    def processCharacters(self, token):
        """Delegate character handling to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processCharacters(token)

    def startTagTableOther(self, token):
        """Close the open cell for a table-structure start tag.

        Returns the token when a cell was in table scope; otherwise
        (innerHTML) reports an error and returns None.
        """
        tree = self.tree
        cellOpen = (tree.elementInScope("td", variant="table") or
                    tree.elementInScope("th", variant="table"))
        if not cellOpen:
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def endTagTableCell(self, token):
        """Close the named cell and go back to the "inRow" phase."""
        name = token["name"]
        tree = self.tree
        if not tree.elementInScope(name, variant="table"):
            self.parser.parseError("unexpected-end-tag", {"name": name})
            return
        tree.generateImpliedEndTags(name)
        stack = tree.openElements
        if stack[-1].name == name:
            stack.pop()
        else:
            self.parser.parseError("unexpected-cell-end-tag", {"name": name})
            # Pop up to and including the cell being closed.
            while stack.pop().name != name:
                pass
        tree.clearActiveFormattingElements()
        self.parser.phase = self.parser.phases["inRow"]

    def endTagIgnore(self, token):
        """Report and ignore an end tag that is not allowed here."""
        self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

    def endTagImply(self, token):
        """Close the cell for an enclosing table-structure end tag.

        Returns the token when the named element is in table scope;
        otherwise reports an error and returns None.
        """
        if not self.tree.elementInScope(token["name"], variant="table"):
            # sometimes innerHTML case
            self.parser.parseError()
            return None
        self.closeCell()
        return token

    def endTagOther(self, token):
        """Delegate any other end tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th",
          "thead", "tr"), startTagTableOther),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("td", "th"), endTagTableCell),
        (("body", "caption", "col", "colgroup", "html"), endTagIgnore),
        (("table", "tbody", "tfoot", "thead", "tr"), endTagImply),
    ])
    endTagHandler.default = endTagOther

2286 

class InSelectPhase(Phase):
    """Phase for tokens seen inside a <select> element."""
    __slots__ = tuple()

    # http://www.whatwg.org/specs/web-apps/current-work/#in-select
    def processEOF(self):
        """Report EOF inside a select unless only <html> is left (innerHTML)."""
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
        else:
            self.parser.parseError("eof-in-select")

    def processCharacters(self, token):
        """Insert the text unless the data is exactly a single NUL character."""
        text = token["data"]
        if text != "\u0000":
            self.tree.insertText(text)

    def startTagOption(self, token):
        """Insert an <option>, closing a currently open <option> first."""
        # We need to imply </option> if <option> is the current node.
        stack = self.tree.openElements
        if stack[-1].name == "option":
            stack.pop()
        self.tree.insertElement(token)

    def startTagOptgroup(self, token):
        """Insert an <optgroup>, closing an open <option> and/or <optgroup>."""
        stack = self.tree.openElements
        # Order matters: an open option is closed before its optgroup.
        if stack[-1].name == "option":
            stack.pop()
        if stack[-1].name == "optgroup":
            stack.pop()
        self.tree.insertElement(token)

    def startTagSelect(self, token):
        """Treat a nested <select> start tag as an implied </select>."""
        self.parser.parseError("unexpected-select-in-select")
        self.endTagSelect(impliedTagToken("select"))

    def startTagInput(self, token):
        """Close the select for <input>/<keygen>/<textarea>.

        Returns the token when a <select> was in select scope; otherwise
        (innerHTML) returns None.
        """
        self.parser.parseError("unexpected-input-in-select")
        if not self.tree.elementInScope("select", variant="select"):
            assert self.parser.innerHTML
            return None
        self.endTagSelect(impliedTagToken("select"))
        return token

    def startTagScript(self, token):
        """Delegate <script> to the "inHead" phase."""
        headPhase = self.parser.phases["inHead"]
        return headPhase.processStartTag(token)

    def startTagOther(self, token):
        """Report and ignore any other start tag."""
        self.parser.parseError("unexpected-start-tag-in-select",
                               {"name": token["name"]})

    def endTagOption(self, token):
        """Pop the current node if it is an <option>, else report an error."""
        stack = self.tree.openElements
        if stack[-1].name != "option":
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "option"})
            return
        stack.pop()

    def endTagOptgroup(self, token):
        """Close the current <optgroup> (and an <option> directly inside it)."""
        stack = self.tree.openElements
        # </optgroup> implicitly closes <option>
        if stack[-1].name == "option" and stack[-2].name == "optgroup":
            stack.pop()
        # It also closes </optgroup>
        if stack[-1].name == "optgroup":
            stack.pop()
        # But nothing else
        else:
            self.parser.parseError("unexpected-end-tag-in-select",
                                   {"name": "optgroup"})

    def endTagSelect(self, token):
        """Pop through the open <select> and reset the insertion mode."""
        if not self.tree.elementInScope("select", variant="select"):
            # innerHTML case
            assert self.parser.innerHTML
            self.parser.parseError()
            return
        stack = self.tree.openElements
        # Pop up to and including the select element.
        while stack.pop().name != "select":
            pass
        self.parser.resetInsertionMode()

    def endTagOther(self, token):
        """Report and ignore any other end tag."""
        self.parser.parseError("unexpected-end-tag-in-select",
                               {"name": token["name"]})

    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("option", startTagOption),
        ("optgroup", startTagOptgroup),
        ("select", startTagSelect),
        (("input", "keygen", "textarea"), startTagInput),
        ("script", startTagScript),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        ("option", endTagOption),
        ("optgroup", endTagOptgroup),
        ("select", endTagSelect),
    ])
    endTagHandler.default = endTagOther

2385 

class InSelectInTablePhase(Phase):
    """Phase for a <select> nested in table content.

    Everything is delegated to the "inSelect" phase except table-structure
    tags, which first imply </select>.
    """
    __slots__ = tuple()

    def processEOF(self):
        """Delegate EOF handling to the "inSelect" phase."""
        self.parser.phases["inSelect"].processEOF()

    def processCharacters(self, token):
        """Delegate character handling to the "inSelect" phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processCharacters(token)

    def startTagTable(self, token):
        """Imply </select> for a table-structure start tag; return the token."""
        self.parser.parseError(
            "unexpected-table-element-start-tag-in-select-in-table",
            {"name": token["name"]})
        self.endTagOther(impliedTagToken("select"))
        return token

    def startTagOther(self, token):
        """Delegate any other start tag to the "inSelect" phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processStartTag(token)

    def endTagTable(self, token):
        """Imply </select> for a table-structure end tag that is in table scope.

        Returns the token in that case, None otherwise.
        """
        self.parser.parseError(
            "unexpected-table-element-end-tag-in-select-in-table",
            {"name": token["name"]})
        if not self.tree.elementInScope(token["name"], variant="table"):
            return None
        self.endTagOther(impliedTagToken("select"))
        return token

    def endTagOther(self, token):
        """Delegate any other end tag to the "inSelect" phase."""
        selectPhase = self.parser.phases["inSelect"]
        return selectPhase.processEndTag(token)

    startTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         startTagTable),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([
        (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"),
         endTagTable),
    ])
    endTagHandler.default = endTagOther

2423 

class InForeignContentPhase(Phase):
    """Phase for tokens seen while the current node is in a foreign namespace.

    Start tags are inserted into the current node's namespace (with MathML /
    SVG adjustments) unless they are HTML "breakout" elements, in which case
    the foreign elements are popped and the token is handed back.
    """
    __slots__ = tuple()

    # HTML element names that break out of foreign content.
    breakoutElements = frozenset(["b", "big", "blockquote", "body", "br",
                                  "center", "code", "dd", "div", "dl", "dt",
                                  "em", "embed", "h1", "h2", "h3",
                                  "h4", "h5", "h6", "head", "hr", "i", "img",
                                  "li", "listing", "menu", "meta", "nobr",
                                  "ol", "p", "pre", "ruby", "s", "small",
                                  "span", "strong", "strike", "sub", "sup",
                                  "table", "tt", "u", "ul", "var"])

    # Lower-cased SVG tag name -> mixed-case spelling. Kept at class level so
    # the table is built once instead of on every adjustSVGTagNames() call.
    svgTagNameReplacements = {
        "altglyph": "altGlyph",
        "altglyphdef": "altGlyphDef",
        "altglyphitem": "altGlyphItem",
        "animatecolor": "animateColor",
        "animatemotion": "animateMotion",
        "animatetransform": "animateTransform",
        "clippath": "clipPath",
        "feblend": "feBlend",
        "fecolormatrix": "feColorMatrix",
        "fecomponenttransfer": "feComponentTransfer",
        "fecomposite": "feComposite",
        "feconvolvematrix": "feConvolveMatrix",
        "fediffuselighting": "feDiffuseLighting",
        "fedisplacementmap": "feDisplacementMap",
        "fedistantlight": "feDistantLight",
        "feflood": "feFlood",
        "fefunca": "feFuncA",
        "fefuncb": "feFuncB",
        "fefuncg": "feFuncG",
        "fefuncr": "feFuncR",
        "fegaussianblur": "feGaussianBlur",
        "feimage": "feImage",
        "femerge": "feMerge",
        "femergenode": "feMergeNode",
        "femorphology": "feMorphology",
        "feoffset": "feOffset",
        "fepointlight": "fePointLight",
        "fespecularlighting": "feSpecularLighting",
        "fespotlight": "feSpotLight",
        "fetile": "feTile",
        "feturbulence": "feTurbulence",
        "foreignobject": "foreignObject",
        "glyphref": "glyphRef",
        "lineargradient": "linearGradient",
        "radialgradient": "radialGradient",
        "textpath": "textPath",
    }

    def adjustSVGTagNames(self, token):
        """Rewrite ``token["name"]`` in place to its mixed-case SVG spelling.

        Names without an entry in ``svgTagNameReplacements`` are untouched.
        """
        name = token["name"]
        if name in self.svgTagNameReplacements:
            token["name"] = self.svgTagNameReplacements[name]

    def processCharacters(self, token):
        """Insert character data via the base Phase implementation.

        A token whose data is exactly NUL is replaced by U+FFFD; otherwise
        any non-space character clears ``parser.framesetOK``.
        """
        if token["data"] == "\u0000":
            token["data"] = "\uFFFD"
        elif (self.parser.framesetOK and
              any(char not in spaceCharacters for char in token["data"])):
            self.parser.framesetOK = False
        Phase.processCharacters(self, token)

    def processStartTag(self, token):
        """Insert a foreign element, or break out to HTML content.

        For a breakout element (or a <font> carrying color/face/size), the
        foreign elements are popped and the token is returned. Otherwise the
        element is inserted in the current node's namespace and None is
        returned.
        """
        currentNode = self.tree.openElements[-1]
        isBreakout = (
            token["name"] in self.breakoutElements or
            (token["name"] == "font" and
             set(token["data"].keys()) & {"color", "face", "size"}))
        if isBreakout:
            self.parser.parseError("unexpected-html-element-in-foreign-content",
                                   {"name": token["name"]})
            # Pop until the current node is in the default namespace or is
            # an HTML / MathML text integration point.
            while (self.tree.openElements[-1].namespace !=
                   self.tree.defaultNamespace and
                   not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and
                   not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])):
                self.tree.openElements.pop()
            return token

        if currentNode.namespace == namespaces["mathml"]:
            self.parser.adjustMathMLAttributes(token)
        elif currentNode.namespace == namespaces["svg"]:
            self.adjustSVGTagNames(token)
            self.parser.adjustSVGAttributes(token)
        self.parser.adjustForeignAttributes(token)
        token["namespace"] = currentNode.namespace
        self.tree.insertElement(token)
        if token["selfClosing"]:
            self.tree.openElements.pop()
            token["selfClosingAcknowledged"] = True

    def processEndTag(self, token):
        """Close the matching foreign element or defer to the current phase.

        Walks down the stack of open elements from the current node. On a
        case-insensitive name match the stack is popped through that node and
        None is returned. On reaching a node in the default namespace the
        token is processed by ``parser.phase`` and its result is returned.
        """
        nodeIndex = len(self.tree.openElements) - 1
        node = self.tree.openElements[-1]
        if node.name.translate(asciiUpper2Lower) != token["name"]:
            self.parser.parseError("unexpected-end-tag", {"name": token["name"]})

        while True:
            if node.name.translate(asciiUpper2Lower) == token["name"]:
                # XXX this isn't in the spec but it seems necessary
                if self.parser.phase == self.parser.phases["inTableText"]:
                    self.parser.phase.flushCharacters()
                    self.parser.phase = self.parser.phase.originalPhase
                # Pop up to and including the matched node.
                while self.tree.openElements.pop() != node:
                    assert self.tree.openElements
                return None
            nodeIndex -= 1

            node = self.tree.openElements[nodeIndex]
            if node.namespace == self.tree.defaultNamespace:
                # Reached HTML content: let the current phase handle it.
                return self.parser.phase.processEndTag(token)

2537 

class AfterBodyPhase(Phase):
    """Phase for tokens seen after the body has been closed.

    Anything other than comments, an <html> start tag or </html> switches the
    parser back to the "inBody" phase and hands the token back.
    """
    __slots__ = tuple()

    def processEOF(self):
        """Do nothing on EOF."""
        # Stop parsing
        pass

    def processComment(self, token):
        """Attach the comment to the first open element."""
        # This is needed because data is to be appended to the <html> element
        # here and not to whatever is currently open.
        root = self.tree.openElements[0]
        self.tree.insertComment(token, root)

    def processCharacters(self, token):
        """Report the stray characters, switch to "inBody", return the token."""
        parser = self.parser
        parser.parseError("unexpected-char-after-body")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Delegate an <html> start tag to the "inBody" phase."""
        bodyPhase = self.parser.phases["inBody"]
        return bodyPhase.processStartTag(token)

    def startTagOther(self, token):
        """Report the stray start tag, switch to "inBody", return the token."""
        parser = self.parser
        parser.parseError("unexpected-start-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    def endTagHtml(self, name):
        """Move to "afterAfterBody", or report an error in innerHTML mode."""
        parser = self.parser
        if not parser.innerHTML:
            parser.phase = parser.phases["afterAfterBody"]
        else:
            parser.parseError("unexpected-end-tag-after-body-innerhtml")

    def endTagOther(self, token):
        """Report the stray end tag, switch to "inBody", return the token."""
        parser = self.parser
        parser.parseError("unexpected-end-tag-after-body",
                          {"name": token["name"]})
        parser.phase = parser.phases["inBody"]
        return token

    startTagHandler = _utils.MethodDispatcher([("html", startTagHtml)])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
    endTagHandler.default = endTagOther

2583 

class InFramesetPhase(Phase):
    """Phase registered under the ``"inFrameset"`` key of the phase table.

    See http://www.whatwg.org/specs/web-apps/current-work/#in-frameset
    """
    __slots__ = ()

    def processEOF(self):
        """Report EOF unless the current node is the ``html`` element.

        When the current node *is* ``html`` the parser is expected to have
        a truthy ``innerHTML`` attribute, which is asserted.
        """
        if self.tree.openElements[-1].name == "html":
            assert self.parser.innerHTML
            return
        self.parser.parseError("eof-in-frameset")

    def processCharacters(self, token):
        """Character tokens are only reported as a parse error here."""
        self.parser.parseError("unexpected-char-in-frameset")

    def startTagFrameset(self, token):
        """Insert a nested ``frameset`` element."""
        self.tree.insertElement(token)

    def startTagFrame(self, token):
        """Insert a ``frame`` element and immediately pop it again."""
        tree = self.tree
        tree.insertElement(token)
        tree.openElements.pop()

    def startTagNoframes(self, token):
        """Let the ``inBody`` phase process a ``noframes`` start tag."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagOther(self, token):
        """Any other start tag is only reported as a parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-in-frameset", details)

    def endTagFrameset(self, token):
        """Close the current ``frameset`` element.

        If the current node is ``html`` (the innerHTML case) nothing is
        popped and an error is reported instead.  Afterwards, when the
        parser is not in innerHTML mode and the current node is no longer
        a ``frameset``, the parser switches to ``"afterFrameset"``.
        """
        parser = self.parser
        stack = self.tree.openElements
        if stack[-1].name == "html":
            # innerHTML case
            parser.parseError("unexpected-frameset-in-frameset-innerhtml")
        else:
            stack.pop()
        if not parser.innerHTML:
            if stack[-1].name != "frameset":
                # Not in innerHTML mode and the current node is not a
                # "frameset" element (anymore): switch phases.
                parser.phase = parser.phases["afterFrameset"]

    def endTagOther(self, token):
        """Any other end tag is only reported as a parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-in-frameset", details)

    # Dispatch tables: tag name -> handler, with a catch-all default.
    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("frameset", startTagFrameset),
        ("frame", startTagFrame),
        ("noframes", startTagNoframes),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([("frameset", endTagFrameset)])
    endTagHandler.default = endTagOther

2639 

class AfterFramesetPhase(Phase):
    """Phase registered under the ``"afterFrameset"`` key of the phase table.

    See http://www.whatwg.org/specs/web-apps/current-work/#after3
    """
    __slots__ = ()

    def processEOF(self):
        """End of input: nothing to do, parsing stops here."""
        return None

    def processCharacters(self, token):
        """Character tokens are only reported as a parse error here."""
        self.parser.parseError("unexpected-char-after-frameset")

    def startTagNoframes(self, token):
        """Let the ``inHead`` phase process a ``noframes`` start tag."""
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagOther(self, token):
        """Any other start tag is only reported as a parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-start-tag-after-frameset", details)

    def endTagHtml(self, token):
        """``</html>`` moves the parser to ``"afterAfterFrameset"``."""
        parser = self.parser
        parser.phase = parser.phases["afterAfterFrameset"]

    def endTagOther(self, token):
        """Any other end tag is only reported as a parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("unexpected-end-tag-after-frameset", details)

    # Dispatch tables: tag name -> handler, with a catch-all default.
    startTagHandler = _utils.MethodDispatcher([
        ("html", Phase.startTagHtml),
        ("noframes", startTagNoframes),
    ])
    startTagHandler.default = startTagOther

    endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)])
    endTagHandler.default = endTagOther

2675 

class AfterAfterBodyPhase(Phase):
    """Phase registered under the ``"afterAfterBody"`` key of the phase table.

    Comments are attached to the document, whitespace and ``<html>`` start
    tags are delegated to the ``"inBody"`` phase, and every other token is
    reported as an error before the parser falls back to ``"inBody"`` and
    the token is returned to the caller.
    """
    __slots__ = ()

    def processEOF(self):
        """End of input: nothing to do."""
        return None

    def processComment(self, token):
        """Insert the comment directly on the document node."""
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        """Let the ``inBody`` phase handle whitespace."""
        in_body = self.parser.phases["inBody"]
        return in_body.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Report the stray characters and fall back to ``inBody``."""
        parser = self.parser
        parser.parseError("expected-eof-but-got-char")
        parser.phase = parser.phases["inBody"]
        return token

    def startTagHtml(self, token):
        """Let the ``inBody`` phase process an ``<html>`` start tag."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagOther(self, token):
        """Report any other start tag and fall back to ``inBody``."""
        parser = self.parser
        details = {"name": token["name"]}
        parser.parseError("expected-eof-but-got-start-tag", details)
        parser.phase = parser.phases["inBody"]
        return token

    def processEndTag(self, token):
        """Report any end tag and fall back to ``inBody``.

        This phase defines no end-tag dispatch table of its own; the
        method handles every end tag the same way.
        """
        parser = self.parser
        details = {"name": token["name"]}
        parser.parseError("expected-eof-but-got-end-tag", details)
        parser.phase = parser.phases["inBody"]
        return token

    # Dispatch table: tag name -> handler, with a catch-all default.
    startTagHandler = _utils.MethodDispatcher([("html", startTagHtml)])
    startTagHandler.default = startTagOther

2712 

class AfterAfterFramesetPhase(Phase):
    """Phase registered under ``"afterAfterFrameset"`` in the phase table.

    Comments are attached to the document; whitespace and ``<html>`` start
    tags are delegated to ``"inBody"`` and ``noframes`` start tags to
    ``"inHead"``.  Everything else is only reported as a parse error; the
    parser phase is left unchanged.
    """
    __slots__ = ()

    def processEOF(self):
        """End of input: nothing to do."""
        return None

    def processComment(self, token):
        """Insert the comment directly on the document node."""
        document = self.tree.document
        self.tree.insertComment(token, document)

    def processSpaceCharacters(self, token):
        """Let the ``inBody`` phase handle whitespace."""
        in_body = self.parser.phases["inBody"]
        return in_body.processSpaceCharacters(token)

    def processCharacters(self, token):
        """Character tokens are only reported as a parse error here."""
        self.parser.parseError("expected-eof-but-got-char")

    def startTagHtml(self, token):
        """Let the ``inBody`` phase process an ``<html>`` start tag."""
        in_body = self.parser.phases["inBody"]
        return in_body.processStartTag(token)

    def startTagNoFrames(self, token):
        """Let the ``inHead`` phase process a ``noframes`` start tag."""
        in_head = self.parser.phases["inHead"]
        return in_head.processStartTag(token)

    def startTagOther(self, token):
        """Any other start tag is only reported as a parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-start-tag", details)

    def processEndTag(self, token):
        """Every end tag is only reported as a parse error."""
        details = {"name": token["name"]}
        self.parser.parseError("expected-eof-but-got-end-tag", details)

    # Dispatch table: tag name -> handler, with a catch-all default.
    startTagHandler = _utils.MethodDispatcher([
        ("html", startTagHtml),
        ("noframes", startTagNoFrames),
    ])
    startTagHandler.default = startTagOther

2747 

2748 # pylint:enable=unused-argument 

2749 

2750 return { 

2751 "initial": InitialPhase, 

2752 "beforeHtml": BeforeHtmlPhase, 

2753 "beforeHead": BeforeHeadPhase, 

2754 "inHead": InHeadPhase, 

2755 "inHeadNoscript": InHeadNoscriptPhase, 

2756 "afterHead": AfterHeadPhase, 

2757 "inBody": InBodyPhase, 

2758 "text": TextPhase, 

2759 "inTable": InTablePhase, 

2760 "inTableText": InTableTextPhase, 

2761 "inCaption": InCaptionPhase, 

2762 "inColumnGroup": InColumnGroupPhase, 

2763 "inTableBody": InTableBodyPhase, 

2764 "inRow": InRowPhase, 

2765 "inCell": InCellPhase, 

2766 "inSelect": InSelectPhase, 

2767 "inSelectInTable": InSelectInTablePhase, 

2768 "inForeignContent": InForeignContentPhase, 

2769 "afterBody": AfterBodyPhase, 

2770 "inFrameset": InFramesetPhase, 

2771 "afterFrameset": AfterFramesetPhase, 

2772 "afterAfterBody": AfterAfterBodyPhase, 

2773 "afterAfterFrameset": AfterAfterFramesetPhase, 

2774 # XXX after after frameset 

2775 } 

2776 

2777 

def adjust_attributes(token, replacements):
    """Rename attributes of ``token`` in place according to ``replacements``.

    :arg token: mapping whose ``'data'`` entry is the attribute mapping

    :arg replacements: mapping from an attribute key to its replacement key

    ``token['data']`` is rebuilt only when at least one of its keys appears
    in ``replacements``; the new mapping has the same type as the old one
    and keeps keys without a replacement unchanged.  Nothing is returned.
    """
    attributes = token['data']
    if not viewkeys(attributes) & viewkeys(replacements):
        # No attribute needs renaming: leave the token untouched.
        return
    renamed = ((replacements.get(key, key), value)
               for key, value in attributes.items())
    token['data'] = type(attributes)(renamed)

2783 

2784 

def impliedTagToken(name, type="EndTag", attributes=None,
                    selfClosing=False):
    """Build a tag token dictionary for a tag named ``name``.

    :arg name: value stored under the ``"name"`` key

    :arg type: key looked up in ``tokenTypes``; the result is stored under
        ``"type"`` (defaults to ``"EndTag"``)

    :arg attributes: value stored under ``"data"``; a fresh empty dict is
        used when this is ``None``

    :arg selfClosing: value stored under ``"selfClosing"``

    :returns: dict with the keys ``"type"``, ``"name"``, ``"data"`` and
        ``"selfClosing"``
    """
    token = {
        "type": tokenTypes[type],
        "name": name,
        "data": {} if attributes is None else attributes,
        "selfClosing": selfClosing,
    }
    return token

2791 

2792 

class ParseError(Exception):
    """Exception type representing an error in the parsed document."""