from __future__ import absolute_import, division, unicode_literals

from six import unichr as chr

from collections import deque, OrderedDict
from sys import version_info

from .constants import spaceCharacters
from .constants import entities
from .constants import asciiLetters, asciiUpper2Lower
from .constants import digits, hexDigits, EOF
from .constants import tokenTypes, tagTokenTypes
from .constants import replacementCharacters

from ._inputstream import HTMLInputStream

from ._trie import Trie

entitiesTrie = Trie(entities)

if version_info >= (3, 7):
    attributeMap = dict
else:
    attributeMap = OrderedDict
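
# Illustrative note (not part of upstream html5lib): entitiesTrie supports
# prefix queries over the HTML named-entity table, which consumeEntity()
# below uses to find the longest matching entity name. For example, assuming
# the standard entities table:
#
#     entitiesTrie.longest_prefix("notin;")  # -> "notin;"  (&notin; -> U+2209)
#     entitiesTrie.longest_prefix("notit")   # -> "not"     (&not is a valid
#                                            #    semicolon-less entity)
#
# attributeMap is a plain dict on Python >= 3.7 because insertion order is
# guaranteed there; OrderedDict is only needed on older versions.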



class HTMLTokenizer(object):
    """ This class takes care of tokenizing HTML.

    * self.currentToken
      Holds the token that is currently being processed.

    * self.state
      Holds a reference to the method to be invoked... XXX

    * self.stream
      Points to the HTMLInputStream object.
    """

    def __init__(self, stream, parser=None, **kwargs):

        self.stream = HTMLInputStream(stream, **kwargs)
        self.parser = parser

        # Set up the initial tokenizer state
        self.escapeFlag = False
        self.lastFourChars = []
        self.state = self.dataState
        self.escape = False

        # The current token being created
        self.currentToken = None
        super(HTMLTokenizer, self).__init__()


    def __iter__(self):
        """ This is where the magic happens.

        We do our usual processing through the states and when we have a token
        to return we yield the token, which pauses processing until the next
        token is requested.
        """
        self.tokenQueue = deque([])
        # Start processing. When EOF is reached self.state will return False
        # instead of True and the loop will terminate.
        while self.state():
            while self.stream.errors:
                yield {"type": tokenTypes["ParseError"], "data": self.stream.errors.pop(0)}
            while self.tokenQueue:
                yield self.tokenQueue.popleft()

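    # Illustrative usage sketch (not part of upstream html5lib): the tokenizer
    # is consumed by iterating it; each item is a token dict keyed by "type".
    #
    #     for token in HTMLTokenizer('<p class="x">hi</p>'):
    #         print(token["type"], token.get("name", token.get("data")))
    #
    # would yield a StartTag token for "p", a Characters token for "hi", and
    # an EndTag token for "p" (plus ParseError tokens for any errors).
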

    def consumeNumberEntity(self, isHex):
        """This function returns either U+FFFD or the character based on the
        decimal or hexadecimal representation. It also discards ";" if present.
        If it is not present, a "numeric-entity-without-semicolon" ParseError
        token is queued.
        """

        allowed = digits
        radix = 10
        if isHex:
            allowed = hexDigits
            radix = 16

        charStack = []

        # Consume all the characters that are in range while making sure we
        # don't hit an EOF.
        c = self.stream.char()
        while c in allowed and c is not EOF:
            charStack.append(c)
            c = self.stream.char()

        # Convert the set of characters consumed to an int.
        charAsInt = int("".join(charStack), radix)

        # Certain characters get replaced with others
        if charAsInt in replacementCharacters:
            char = replacementCharacters[charAsInt]
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        elif ((0xD800 <= charAsInt <= 0xDFFF) or
              (charAsInt > 0x10FFFF)):
            char = "\uFFFD"
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "illegal-codepoint-for-numeric-entity",
                                    "datavars": {"charAsInt": charAsInt}})
        else:
            # Should speed up this check somehow (e.g. move the set to a constant)
            if ((0x0001 <= charAsInt <= 0x0008) or
                (0x000E <= charAsInt <= 0x001F) or
                (0x007F <= charAsInt <= 0x009F) or
                (0xFDD0 <= charAsInt <= 0xFDEF) or
                    charAsInt in frozenset([0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
                                            0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
                                            0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
                                            0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
                                            0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE,
                                            0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE,
                                            0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
                                            0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE,
                                            0xFFFFF, 0x10FFFE, 0x10FFFF])):
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data":
                                        "illegal-codepoint-for-numeric-entity",
                                        "datavars": {"charAsInt": charAsInt}})
            try:
                # Try/except needed as UCS-2 Python builds' unichar only works
                # within the BMP.
                char = chr(charAsInt)
            except ValueError:
                v = charAsInt - 0x10000
                char = chr(0xD800 | (v >> 10)) + chr(0xDC00 | (v & 0x3FF))

        # Discard the ; if present. Otherwise, put it back on the queue and
        # invoke parseError on parser.
        if c != ";":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "numeric-entity-without-semicolon"})
            self.stream.unget(c)

        return char

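    # Illustrative examples (not part of upstream html5lib) of what
    # consumeNumberEntity produces for a few stream positions (the caller has
    # already consumed "&#" or "&#x"):
    #
    #     consumeNumberEntity(False) on stream "65;"  -> "A"  (";" consumed)
    #     consumeNumberEntity(True)  on stream "48;"  -> "H"
    #     consumeNumberEntity(False) on stream "128;" -> "\u20ac" plus an
    #         "illegal-codepoint-for-numeric-entity" ParseError, because 0x80
    #         is remapped through replacementCharacters (the windows-1252 table)
    #     consumeNumberEntity(False) on stream "65x"  -> "A" plus a
    #         "numeric-entity-without-semicolon" ParseError; "x" is ungot
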

    def consumeEntity(self, allowedChar=None, fromAttribute=False):
        # Initialise to the default output for when no entity is matched
        output = "&"

        charStack = [self.stream.char()]
        if (charStack[0] in spaceCharacters or charStack[0] in (EOF, "<", "&") or
                (allowedChar is not None and allowedChar == charStack[0])):
            self.stream.unget(charStack[0])

        elif charStack[0] == "#":
            # Read the next character to see if it's hex or decimal
            hex = False
            charStack.append(self.stream.char())
            if charStack[-1] in ("x", "X"):
                hex = True
                charStack.append(self.stream.char())

            # charStack[-1] should be the first digit
            if (hex and charStack[-1] in hexDigits) \
                    or (not hex and charStack[-1] in digits):
                # At least one digit found, so consume the whole number
                self.stream.unget(charStack[-1])
                output = self.consumeNumberEntity(hex)
            else:
                # No digits found
                self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                        "data": "expected-numeric-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        else:
            # At this point we might have a named entity. Entities are stored
            # in the global variable "entities".
            #
            # Consume characters and compare them to a substring of the
            # entity names in the list until the substring no longer matches.
            while (charStack[-1] is not EOF):
                if not entitiesTrie.has_keys_with_prefix("".join(charStack)):
                    break
                charStack.append(self.stream.char())

            # At this point we have a string that starts with some characters
            # that may match an entity.
            # Try to find the longest entity the string will match to take care
            # of &noti for instance.
            try:
                entityName = entitiesTrie.longest_prefix("".join(charStack[:-1]))
                entityLength = len(entityName)
            except KeyError:
                entityName = None

            if entityName is not None:
                if entityName[-1] != ";":
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "named-entity-without-semicolon"})
                if (entityName[-1] != ";" and fromAttribute and
                    (charStack[entityLength] in asciiLetters or
                     charStack[entityLength] in digits or
                     charStack[entityLength] == "=")):
                    self.stream.unget(charStack.pop())
                    output = "&" + "".join(charStack)
                else:
                    output = entities[entityName]
                    self.stream.unget(charStack.pop())
                    output += "".join(charStack[entityLength:])
            else:
                self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                        "expected-named-entity"})
                self.stream.unget(charStack.pop())
                output = "&" + "".join(charStack)

        if fromAttribute:
            self.currentToken["data"][-1][1] += output
        else:
            if output in spaceCharacters:
                tokenType = "SpaceCharacters"
            else:
                tokenType = "Characters"
            self.tokenQueue.append({"type": tokenTypes[tokenType], "data": output})

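    # Illustrative examples (not part of upstream html5lib): for the input
    # "&notin;" the trie lookup matches "notin;" and "\u2209" is emitted; for
    # "&notit;" the longest match is the semicolon-less entity "not", so
    # "\u00ac" is emitted followed by the literal "it;" text, plus a
    # "named-entity-without-semicolon" ParseError. In attribute values
    # (fromAttribute=True) a semicolon-less match followed by an ASCII letter,
    # digit, or "=" is deliberately not expanded, per the HTML spec.
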

    def processEntityInAttribute(self, allowedChar):
        """This method replaces the need for "entityInAttributeValueState".
        """
        self.consumeEntity(allowedChar=allowedChar, fromAttribute=True)

    def emitCurrentToken(self):
        """This method is a generic handler for emitting the tags. It also sets
        the state to "data" because that's what's needed after a token has been
        emitted.
        """
        token = self.currentToken
        # Add token to the queue to be yielded
        if (token["type"] in tagTokenTypes):
            token["name"] = token["name"].translate(asciiUpper2Lower)
            if token["type"] == tokenTypes["StartTag"]:
                raw = token["data"]
                data = attributeMap(raw)
                if len(raw) > len(data):
                    # we had some duplicated attribute, fix so first wins
                    data.update(raw[::-1])
                token["data"] = data

            if token["type"] == tokenTypes["EndTag"]:
                if token["data"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "attributes-in-end-tag"})
                if token["selfClosing"]:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                            "data": "self-closing-flag-on-end-tag"})
        self.tokenQueue.append(token)
        self.state = self.dataState

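    # Illustrative example (not part of upstream html5lib) of the first-wins
    # duplicate-attribute fixup above: for <a href="x" href="y"> the raw
    # attribute list is [["href", "x"], ["href", "y"]]; attributeMap(raw)
    # collapses it to one entry (keeping the last value), so the length check
    # detects the duplicate and data.update(raw[::-1]) re-applies the pairs in
    # reverse, leaving {"href": "x"}: the first occurrence wins.
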

    # Below are the various tokenizer states worked out.
    def dataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.entityDataState
        elif data == "<":
            self.state = self.tagOpenState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\u0000"})
        elif data is EOF:
            # Tokenization ends.
            return False
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

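    # Illustrative note (not part of upstream html5lib): charsUntil batches
    # plain text so the state machine is not re-entered per character. For the
    # input "abc<p>" the data state emits one Characters token "abc", then
    # switches to tagOpenState on "<".
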

    def entityDataState(self):
        self.consumeEntity()
        self.state = self.dataState
        return True

    def rcdataState(self):
        data = self.stream.char()
        if data == "&":
            self.state = self.characterReferenceInRcdata
        elif data == "<":
            self.state = self.rcdataLessThanSignState
        elif data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data in spaceCharacters:
            # Directly after emitting a token you switch back to the "data
            # state". At that point spaceCharacters are important so they are
            # emitted separately.
            self.tokenQueue.append({"type": tokenTypes["SpaceCharacters"], "data":
                                    data + self.stream.charsUntil(spaceCharacters, True)})
            # No need to update lastFourChars here, since the first space will
            # have already been appended to lastFourChars and will have broken
            # any <!-- or --> sequences
        else:
            chars = self.stream.charsUntil(("&", "<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def characterReferenceInRcdata(self):
        self.consumeEntity()
        self.state = self.rcdataState
        return True


    def rawtextState(self):
        data = self.stream.char()
        if data == "<":
            self.state = self.rawtextLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def scriptDataState(self):
        data = self.stream.char()
        if data == "<":
            self.state = self.scriptDataLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            # Tokenization ends.
            return False
        else:
            chars = self.stream.charsUntil(("<", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True

    def plaintextState(self):
        data = self.stream.char()
        if data == EOF:
            # Tokenization ends.
            return False
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + self.stream.charsUntil("\u0000")})
        return True


    def tagOpenState(self):
        data = self.stream.char()
        if data == "!":
            self.state = self.markupDeclarationOpenState
        elif data == "/":
            self.state = self.closeTagOpenState
        elif data in asciiLetters:
            self.currentToken = {"type": tokenTypes["StartTag"],
                                 "name": data, "data": [],
                                 "selfClosing": False,
                                 "selfClosingAcknowledged": False}
            self.state = self.tagNameState
        elif data == ">":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-right-bracket"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<>"})
            self.state = self.dataState
        elif data == "?":
            # XXX In theory it could be something besides a tag name. But
            # do we really care?
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name-but-got-question-mark"})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-tag-name"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.dataState
        return True

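    # Illustrative recovery behaviour (not part of upstream html5lib): "<>"
    # is emitted as the literal characters "<>" with a ParseError, while
    # "<?xml ...?>" falls through to bogusCommentState and becomes a Comment
    # token; a lone "<" followed by other text is emitted as a literal "<".
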

    def closeTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.currentToken = {"type": tokenTypes["EndTag"], "name": data,
                                 "data": [], "selfClosing": False}
            self.state = self.tagNameState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-right-bracket"})
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-eof"})
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.state = self.dataState
        else:
            # XXX data can be _'_...
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-closing-tag-but-got-char",
                                    "datavars": {"data": data}})
            self.stream.unget(data)
            self.state = self.bogusCommentState
        return True


    def tagNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-tag-name"})
            self.state = self.dataState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
        else:
            self.currentToken["name"] += data
            # (Don't use charsUntil here, because tag names are
            # very short and it's faster to not do anything fancy)
        return True


    def rcdataLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.rcdataEndTagOpenState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rcdataEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.rcdataEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

    def rcdataEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rcdataState
        return True

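    # Illustrative note (not part of upstream html5lib): "appropriate" above
    # implements the spec's "appropriate end tag token" check. Inside
    # <title>a </b> b</title>, the "</b>" is not appropriate (the open tag is
    # "title"), so it is emitted as literal text; only "</title>" produces an
    # EndTag token.
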

    def rawtextLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.rawtextEndTagOpenState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def rawtextEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.rawtextEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True

    def rawtextEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.rawtextState
        return True


    def scriptDataLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.scriptDataEndTagOpenState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<!"})
            self.state = self.scriptDataEscapeStartState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer += data
            self.state = self.scriptDataEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True


    def scriptDataEscapeStartState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapeStartDashState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapeStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataState
        return True

    def scriptDataEscapedState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.state = self.dataState
        else:
            chars = self.stream.charsUntil(("<", "-", "\u0000"))
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data":
                                    data + chars})
        return True


    def scriptDataEscapedDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataEscapedDashDashState
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedDashDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.state = self.scriptDataEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataEscapedState
        elif data == EOF:
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataEscapedState
        return True


    def scriptDataEscapedLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.temporaryBuffer = ""
            self.state = self.scriptDataEscapedEndTagOpenState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<" + data})
            self.temporaryBuffer = data
            self.state = self.scriptDataDoubleEscapeStartState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedEndTagOpenState(self):
        data = self.stream.char()
        if data in asciiLetters:
            self.temporaryBuffer = data
            self.state = self.scriptDataEscapedEndTagNameState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "</"})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

    def scriptDataEscapedEndTagNameState(self):
        appropriate = self.currentToken and self.currentToken["name"].lower() == self.temporaryBuffer.lower()
        data = self.stream.char()
        if data in spaceCharacters and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.beforeAttributeNameState
        elif data == "/" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.state = self.selfClosingStartTagState
        elif data == ">" and appropriate:
            self.currentToken = {"type": tokenTypes["EndTag"],
                                 "name": self.temporaryBuffer,
                                 "data": [], "selfClosing": False}
            self.emitCurrentToken()
            self.state = self.dataState
        elif data in asciiLetters:
            self.temporaryBuffer += data
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "</" + self.temporaryBuffer})
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True


    def scriptDataDoubleEscapeStartState(self):
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataDoubleEscapedState
            else:
                self.state = self.scriptDataEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataEscapedState
        return True

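    # Illustrative note (not part of upstream html5lib): the double-escape
    # states handle script content such as
    #
    #     <script><!--<script>var a = "</script>";--></script>
    #
    # After "<!--<script>" the tokenizer is double-escaped, so the inner
    # "</script>" is emitted as character data instead of ending the script;
    # "-->" drops back to plain script data.
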

    def scriptDataDoubleEscapedState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
        return True

    def scriptDataDoubleEscapedDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
            self.state = self.scriptDataDoubleEscapedDashDashState
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapedDashDashState(self):
        data = self.stream.char()
        if data == "-":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "-"})
        elif data == "<":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "<"})
            self.state = self.scriptDataDoubleEscapedLessThanSignState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": ">"})
            self.state = self.scriptDataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.tokenQueue.append({"type": tokenTypes["Characters"],
                                    "data": "\uFFFD"})
            self.state = self.scriptDataDoubleEscapedState
        elif data == EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-script-in-script"})
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.state = self.scriptDataDoubleEscapedState
        return True


    def scriptDataDoubleEscapedLessThanSignState(self):
        data = self.stream.char()
        if data == "/":
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": "/"})
            self.temporaryBuffer = ""
            self.state = self.scriptDataDoubleEscapeEndState
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True

    def scriptDataDoubleEscapeEndState(self):
        data = self.stream.char()
        if data in (spaceCharacters | frozenset(("/", ">"))):
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            if self.temporaryBuffer.lower() == "script":
                self.state = self.scriptDataEscapedState
            else:
                self.state = self.scriptDataDoubleEscapedState
        elif data in asciiLetters:
            self.tokenQueue.append({"type": tokenTypes["Characters"], "data": data})
            self.temporaryBuffer += data
        else:
            self.stream.unget(data)
            self.state = self.scriptDataDoubleEscapedState
        return True


    def beforeAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data in ("'", '"', "=", "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-name-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True


    def attributeNameState(self):
        data = self.stream.char()
        leavingThisState = True
        emitToken = False
        if data == "=":
            self.state = self.beforeAttributeValueState
        elif data in asciiLetters:
            self.currentToken["data"][-1][0] += data +\
                self.stream.charsUntil(asciiLetters, True)
            leavingThisState = False
        elif data == ">":
            # XXX If we emit here the attributes are converted to a dict
            # without being checked and when the code below runs we error
            # because data is a dict not a list
            emitToken = True
        elif data in spaceCharacters:
            self.state = self.afterAttributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][0] += "\uFFFD"
            leavingThisState = False
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "invalid-character-in-attribute-name"})
            self.currentToken["data"][-1][0] += data
            leavingThisState = False
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-attribute-name"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][0] += data
            leavingThisState = False

        if leavingThisState:
            # Attributes are not dropped at this stage. That happens when the
            # start tag token is emitted so values can still be safely appended
            # to attributes, but we do want to report the parse error in time.
            self.currentToken["data"][-1][0] = (
                self.currentToken["data"][-1][0].translate(asciiUpper2Lower))
            for name, _ in self.currentToken["data"][:-1]:
                if self.currentToken["data"][-1][0] == name:
                    self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                            "duplicate-attribute"})
                    break
            # XXX Fix for above XXX
            if emitToken:
                self.emitCurrentToken()
        return True


    def afterAttributeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "=":
            self.state = self.beforeAttributeValueState
        elif data == ">":
            self.emitCurrentToken()
        elif data in asciiLetters:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"].append(["\uFFFD", ""])
            self.state = self.attributeNameState
        elif data in ("'", '"', "<"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "invalid-character-after-attribute-name"})
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-end-of-tag-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"].append([data, ""])
            self.state = self.attributeNameState
        return True


    def beforeAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.stream.charsUntil(spaceCharacters, True)
        elif data == "\"":
            self.state = self.attributeValueDoubleQuotedState
        elif data == "&":
            self.state = self.attributeValueUnQuotedState
            self.stream.unget(data)
        elif data == "'":
            self.state = self.attributeValueSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-right-bracket"})
            self.emitCurrentToken()
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
            self.state = self.attributeValueUnQuotedState
        elif data in ("=", "<", "`"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "equals-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-attribute-value-but-got-eof"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data
            self.state = self.attributeValueUnQuotedState
        return True


    def attributeValueDoubleQuotedState(self):
        data = self.stream.char()
        if data == "\"":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute('"')
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-double-quote"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("\"", "&", "\u0000"))
        return True

    def attributeValueSingleQuotedState(self):
        data = self.stream.char()
        if data == "'":
            self.state = self.afterAttributeValueState
        elif data == "&":
            self.processEntityInAttribute("'")
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-single-quote"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data +\
                self.stream.charsUntil(("'", "&", "\u0000"))
        return True


    def attributeValueUnQuotedState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == "&":
            self.processEntityInAttribute(">")
        elif data == ">":
            self.emitCurrentToken()
        elif data in ('"', "'", "=", "<", "`"):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-in-unquoted-attribute-value"})
            self.currentToken["data"][-1][1] += data
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"][-1][1] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-attribute-value-no-quotes"})
            self.state = self.dataState
        else:
            self.currentToken["data"][-1][1] += data + self.stream.charsUntil(
                frozenset(("&", ">", '"', "'", "=", "<", "`", "\u0000")) | spaceCharacters)
        return True


    def afterAttributeValueState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeAttributeNameState
        elif data == ">":
            self.emitCurrentToken()
        elif data == "/":
            self.state = self.selfClosingStartTagState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-EOF-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-after-attribute-value"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True


    def selfClosingStartTagState(self):
        data = self.stream.char()
        if data == ">":
            self.currentToken["selfClosing"] = True
            self.emitCurrentToken()
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data":
                                    "unexpected-EOF-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-character-after-solidus-in-tag"})
            self.stream.unget(data)
            self.state = self.beforeAttributeNameState
        return True


    def bogusCommentState(self):
        # Make a new comment token and give it as value all the characters
        # until the first > or EOF (charsUntil checks for EOF automatically)
        # and emit it.
        data = self.stream.charsUntil(">")
        data = data.replace("\u0000", "\uFFFD")
        self.tokenQueue.append(
            {"type": tokenTypes["Comment"], "data": data})

        # Eat the character directly after the bogus comment which is either a
        # ">" or an EOF.
        self.stream.char()
        self.state = self.dataState
        return True

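    # Illustrative example (not part of upstream html5lib): an XML
    # declaration such as "<?xml version='1.0'?>" reaches this state via
    # tagOpenState and is tokenized as a Comment token whose data is
    # "?xml version='1.0'?"; the trailing ">" is consumed and discarded.
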

    def markupDeclarationOpenState(self):
        charStack = [self.stream.char()]
        if charStack[-1] == "-":
            charStack.append(self.stream.char())
            if charStack[-1] == "-":
                self.currentToken = {"type": tokenTypes["Comment"], "data": ""}
                self.state = self.commentStartState
                return True
        elif charStack[-1] in ('d', 'D'):
            matched = True
            for expected in (('o', 'O'), ('c', 'C'), ('t', 'T'),
                             ('y', 'Y'), ('p', 'P'), ('e', 'E')):
                charStack.append(self.stream.char())
                if charStack[-1] not in expected:
                    matched = False
                    break
            if matched:
                self.currentToken = {"type": tokenTypes["Doctype"],
                                     "name": "",
                                     "publicId": None, "systemId": None,
                                     "correct": True}
                self.state = self.doctypeState
                return True
        elif (charStack[-1] == "[" and
              self.parser is not None and
              self.parser.tree.openElements and
              self.parser.tree.openElements[-1].namespace != self.parser.tree.defaultNamespace):
            matched = True
            for expected in ["C", "D", "A", "T", "A", "["]:
                charStack.append(self.stream.char())
                if charStack[-1] != expected:
                    matched = False
                    break
            if matched:
                self.state = self.cdataSectionState
                return True

        self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                "expected-dashes-or-doctype"})

        while charStack:
            self.stream.unget(charStack.pop())
        self.state = self.bogusCommentState
        return True

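    # Illustrative dispatch summary (not part of upstream html5lib):
    #     "<!--"        -> comment states
    #     "<!doctype"   -> doctype states (matched case-insensitively)
    #     "<![CDATA["   -> cdataSectionState, but only inside foreign content
    #                      (SVG/MathML), i.e. when the current node's namespace
    #                      differs from the default HTML namespace
    #     anything else -> ParseError and bogusCommentState
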

    def commentStartState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentStartDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data
            self.state = self.commentState
        return True

    def commentStartDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "incorrect-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True


    def commentState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "\uFFFD"
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "eof-in-comment"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += data + \
                self.stream.charsUntil(("-", "\u0000"))
        return True

    def commentEndDashState(self):
        data = self.stream.char()
        if data == "-":
            self.state = self.commentEndState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "-\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "-" + data
            self.state = self.commentState
        return True


    def commentEndState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--\uFFFD"
            self.state = self.commentState
        elif data == "!":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-bang-after-double-dash-in-comment"})
            self.state = self.commentEndBangState
        elif data == "-":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-dash-after-double-dash-in-comment"})
            self.currentToken["data"] += data
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-double-dash"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            # XXX
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-comment"})
            self.currentToken["data"] += "--" + data
            self.state = self.commentState
        return True

    def commentEndBangState(self):
        data = self.stream.char()
        if data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "-":
            self.currentToken["data"] += "--!"
            self.state = self.commentEndDashState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["data"] += "--!\uFFFD"
            self.state = self.commentState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-comment-end-bang-state"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["data"] += "--!" + data
            self.state = self.commentState
        return True


    def doctypeState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "need-space-after-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypeNameState
        return True

    def beforeDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-right-bracket"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] = "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-doctype-name-but-got-eof"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] = data
            self.state = self.doctypeNameState
        return True


    def doctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.state = self.afterDoctypeNameState
        elif data == ">":
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data == "\u0000":
            self.tokenQueue.append({"type": tokenTypes["ParseError"],
                                    "data": "invalid-codepoint"})
            self.currentToken["name"] += "\uFFFD"
            self.state = self.doctypeNameState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype-name"})
            self.currentToken["correct"] = False
            self.currentToken["name"] = self.currentToken["name"].translate(asciiUpper2Lower)
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.currentToken["name"] += data
        return True


    def afterDoctypeNameState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == ">":
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.currentToken["correct"] = False
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            if data in ("p", "P"):
                matched = True
                for expected in (("u", "U"), ("b", "B"), ("l", "L"),
                                 ("i", "I"), ("c", "C")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypePublicKeywordState
                    return True
            elif data in ("s", "S"):
                matched = True
                for expected in (("y", "Y"), ("s", "S"), ("t", "T"),
                                 ("e", "E"), ("m", "M")):
                    data = self.stream.char()
                    if data not in expected:
                        matched = False
                        break
                if matched:
                    self.state = self.afterDoctypeSystemKeywordState
                    return True

            # All the characters read before the current 'data' will be
            # [a-zA-Z], so they're garbage in the bogus doctype and can be
            # discarded; only the latest character might be '>' or EOF
            # and needs to be ungetted
            self.stream.unget(data)
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "expected-space-or-right-bracket-in-doctype", "datavars":
                                    {"data": data}})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState

        return True

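    # Illustrative note (not part of upstream html5lib): in
    # "<!DOCTYPE html PUBLIC ..." the letters of PUBLIC (or SYSTEM) are
    # matched case-insensitively one character at a time; on a partial match
    # such as "PUBLIX" only the mismatching character is ungot and the rest
    # is treated as a bogus doctype.
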

    def afterDoctypePublicKeywordState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            self.state = self.beforeDoctypePublicIdentifierState
        elif data in ("'", '"'):
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.stream.unget(data)
            self.state = self.beforeDoctypePublicIdentifierState
        return True

    def beforeDoctypePublicIdentifierState(self):
        data = self.stream.char()
        if data in spaceCharacters:
            pass
        elif data == "\"":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierDoubleQuotedState
        elif data == "'":
            self.currentToken["publicId"] = ""
            self.state = self.doctypePublicIdentifierSingleQuotedState
        elif data == ">":
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-end-of-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        elif data is EOF:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "eof-in-doctype"})
            self.currentToken["correct"] = False
            self.tokenQueue.append(self.currentToken)
            self.state = self.dataState
        else:
            self.tokenQueue.append({"type": tokenTypes["ParseError"], "data":
                                    "unexpected-char-in-doctype"})
            self.currentToken["correct"] = False
            self.state = self.bogusDoctypeState
        return True


1473 def doctypePublicIdentifierDoubleQuotedState(self): 

1474 data = self.stream.char() 

1475 if data == "\"": 

1476 self.state = self.afterDoctypePublicIdentifierState 

1477 elif data == "\u0000": 

1478 self.tokenQueue.append({"type": tokenTypes["ParseError"], 

1479 "data": "invalid-codepoint"}) 

1480 self.currentToken["publicId"] += "\uFFFD" 

1481 elif data == ">": 

1482 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1483 "unexpected-end-of-doctype"}) 

1484 self.currentToken["correct"] = False 

1485 self.tokenQueue.append(self.currentToken) 

1486 self.state = self.dataState 

1487 elif data is EOF: 

1488 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1489 "eof-in-doctype"}) 

1490 self.currentToken["correct"] = False 

1491 self.tokenQueue.append(self.currentToken) 

1492 self.state = self.dataState 

1493 else: 

1494 self.currentToken["publicId"] += data 

1495 return True 

1496 

1497 def doctypePublicIdentifierSingleQuotedState(self): 

1498 data = self.stream.char() 

1499 if data == "'": 

1500 self.state = self.afterDoctypePublicIdentifierState 

1501 elif data == "\u0000": 

1502 self.tokenQueue.append({"type": tokenTypes["ParseError"], 

1503 "data": "invalid-codepoint"}) 

1504 self.currentToken["publicId"] += "\uFFFD" 

1505 elif data == ">": 

1506 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1507 "unexpected-end-of-doctype"}) 

1508 self.currentToken["correct"] = False 

1509 self.tokenQueue.append(self.currentToken) 

1510 self.state = self.dataState 

1511 elif data is EOF: 

1512 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1513 "eof-in-doctype"}) 

1514 self.currentToken["correct"] = False 

1515 self.tokenQueue.append(self.currentToken) 

1516 self.state = self.dataState 

1517 else: 

1518 self.currentToken["publicId"] += data 

1519 return True 

1520 
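
# --- Editorial sketch (not part of the vendored source) ---
# A hedged illustration of the public-identifier states above: iterating
# HTMLTokenizer drives the state machine, and the closing quote hands
# control to afterDoctypePublicIdentifierState, which emits the token on ">".

from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib.constants import tokenTypes

doc = '<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01//EN">'
for token in HTMLTokenizer(doc):
    if token["type"] == tokenTypes["Doctype"]:
        # Expected: name "html", the quoted publicId, correct True.
        print(token["name"], token["publicId"], token["correct"])
# --- End editorial sketch ---
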

1521 def afterDoctypePublicIdentifierState(self): 

1522 data = self.stream.char() 

1523 if data in spaceCharacters: 

1524 self.state = self.betweenDoctypePublicAndSystemIdentifiersState 

1525 elif data == ">": 

1526 self.tokenQueue.append(self.currentToken) 

1527 self.state = self.dataState 

1528 elif data == '"': 

1529 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1530 "unexpected-char-in-doctype"}) 

1531 self.currentToken["systemId"] = "" 

1532 self.state = self.doctypeSystemIdentifierDoubleQuotedState 

1533 elif data == "'": 

1534 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1535 "unexpected-char-in-doctype"}) 

1536 self.currentToken["systemId"] = "" 

1537 self.state = self.doctypeSystemIdentifierSingleQuotedState 

1538 elif data is EOF: 

1539 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1540 "eof-in-doctype"}) 

1541 self.currentToken["correct"] = False 

1542 self.tokenQueue.append(self.currentToken) 

1543 self.state = self.dataState 

1544 else: 

1545 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1546 "unexpected-char-in-doctype"}) 

1547 self.currentToken["correct"] = False 

1548 self.state = self.bogusDoctypeState 

1549 return True 

1550 

1551 def betweenDoctypePublicAndSystemIdentifiersState(self): 

1552 data = self.stream.char() 

1553 if data in spaceCharacters: 

1554 pass 

1555 elif data == ">": 

1556 self.tokenQueue.append(self.currentToken) 

1557 self.state = self.dataState 

1558 elif data == '"': 

1559 self.currentToken["systemId"] = "" 

1560 self.state = self.doctypeSystemIdentifierDoubleQuotedState 

1561 elif data == "'": 

1562 self.currentToken["systemId"] = "" 

1563 self.state = self.doctypeSystemIdentifierSingleQuotedState 

1564 elif data is EOF: 

1565 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1566 "eof-in-doctype"}) 

1567 self.currentToken["correct"] = False 

1568 self.tokenQueue.append(self.currentToken) 

1569 self.state = self.dataState 

1570 else: 

1571 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1572 "unexpected-char-in-doctype"}) 

1573 self.currentToken["correct"] = False 

1574 self.state = self.bogusDoctypeState 

1575 return True 

1576 
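
# --- Editorial sketch (not part of the vendored source) ---
# When whitespace follows the public identifier, control passes through
# betweenDoctypePublicAndSystemIdentifiersState before the system identifier
# is read. A doctype carrying both identifiers exercises that path:

from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib.constants import tokenTypes

doc = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" '
       '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">')
for token in HTMLTokenizer(doc):
    if token["type"] == tokenTypes["Doctype"]:
        print(token["publicId"])  # -//W3C//DTD XHTML 1.0 Strict//EN
        print(token["systemId"])  # http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd
# --- End editorial sketch ---
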

1577 def afterDoctypeSystemKeywordState(self): 

1578 data = self.stream.char() 

1579 if data in spaceCharacters: 

1580 self.state = self.beforeDoctypeSystemIdentifierState 

1581 elif data in ("'", '"'): 

1582 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1583 "unexpected-char-in-doctype"}) 

1584 self.stream.unget(data) 

1585 self.state = self.beforeDoctypeSystemIdentifierState 

1586 elif data is EOF: 

1587 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1588 "eof-in-doctype"}) 

1589 self.currentToken["correct"] = False 

1590 self.tokenQueue.append(self.currentToken) 

1591 self.state = self.dataState 

1592 else: 

1593 self.stream.unget(data) 

1594 self.state = self.beforeDoctypeSystemIdentifierState 

1595 return True 

1596 

1597 def beforeDoctypeSystemIdentifierState(self): 

1598 data = self.stream.char() 

1599 if data in spaceCharacters: 

1600 pass 

1601 elif data == "\"": 

1602 self.currentToken["systemId"] = "" 

1603 self.state = self.doctypeSystemIdentifierDoubleQuotedState 

1604 elif data == "'": 

1605 self.currentToken["systemId"] = "" 

1606 self.state = self.doctypeSystemIdentifierSingleQuotedState 

1607 elif data == ">": 

1608 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1609 "unexpected-char-in-doctype"}) 

1610 self.currentToken["correct"] = False 

1611 self.tokenQueue.append(self.currentToken) 

1612 self.state = self.dataState 

1613 elif data is EOF: 

1614 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1615 "eof-in-doctype"}) 

1616 self.currentToken["correct"] = False 

1617 self.tokenQueue.append(self.currentToken) 

1618 self.state = self.dataState 

1619 else: 

1620 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1621 "unexpected-char-in-doctype"}) 

1622 self.currentToken["correct"] = False 

1623 self.state = self.bogusDoctypeState 

1624 return True 

1625 

1626 def doctypeSystemIdentifierDoubleQuotedState(self): 

1627 data = self.stream.char() 

1628 if data == "\"": 

1629 self.state = self.afterDoctypeSystemIdentifierState 

1630 elif data == "\u0000": 

1631 self.tokenQueue.append({"type": tokenTypes["ParseError"], 

1632 "data": "invalid-codepoint"}) 

1633 self.currentToken["systemId"] += "\uFFFD" 

1634 elif data == ">": 

1635 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1636 "unexpected-end-of-doctype"}) 

1637 self.currentToken["correct"] = False 

1638 self.tokenQueue.append(self.currentToken) 

1639 self.state = self.dataState 

1640 elif data is EOF: 

1641 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1642 "eof-in-doctype"}) 

1643 self.currentToken["correct"] = False 

1644 self.tokenQueue.append(self.currentToken) 

1645 self.state = self.dataState 

1646 else: 

1647 self.currentToken["systemId"] += data 

1648 return True 

1649 

1650 def doctypeSystemIdentifierSingleQuotedState(self): 

1651 data = self.stream.char() 

1652 if data == "'": 

1653 self.state = self.afterDoctypeSystemIdentifierState 

1654 elif data == "\u0000": 

1655 self.tokenQueue.append({"type": tokenTypes["ParseError"], 

1656 "data": "invalid-codepoint"}) 

1657 self.currentToken["systemId"] += "\uFFFD" 

1658 elif data == ">": 

1659 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1660 "unexpected-end-of-doctype"}) 

1661 self.currentToken["correct"] = False 

1662 self.tokenQueue.append(self.currentToken) 

1663 self.state = self.dataState 

1664 elif data is EOF: 

1665 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1666 "eof-in-doctype"}) 

1667 self.currentToken["correct"] = False 

1668 self.tokenQueue.append(self.currentToken) 

1669 self.state = self.dataState 

1670 else: 

1671 self.currentToken["systemId"] += data 

1672 return True 

1673 
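
# --- Editorial sketch (not part of the vendored source) ---
# The SYSTEM-only path: the "SYSTEM" keyword leads, via
# afterDoctypeSystemKeywordState and beforeDoctypeSystemIdentifierState, into
# the quoted system-identifier states above. Single quotes take the
# single-quoted branch:

from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib.constants import tokenTypes

doc = "<!DOCTYPE html SYSTEM 'about:legacy-compat'>"
for token in HTMLTokenizer(doc):
    if token["type"] == tokenTypes["Doctype"]:
        print(token["systemId"], token["correct"])  # about:legacy-compat True
# --- End editorial sketch ---
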

1674 def afterDoctypeSystemIdentifierState(self): 

1675 data = self.stream.char() 

1676 if data in spaceCharacters: 

1677 pass 

1678 elif data == ">": 

1679 self.tokenQueue.append(self.currentToken) 

1680 self.state = self.dataState 

1681 elif data is EOF: 

1682 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1683 "eof-in-doctype"}) 

1684 self.currentToken["correct"] = False 

1685 self.tokenQueue.append(self.currentToken) 

1686 self.state = self.dataState 

1687 else: 

1688 self.tokenQueue.append({"type": tokenTypes["ParseError"], "data": 

1689 "unexpected-char-in-doctype"}) 

1690 self.state = self.bogusDoctypeState 

1691 return True 

1692 

1693 def bogusDoctypeState(self): 

1694 data = self.stream.char() 

1695 if data == ">": 

1696 self.tokenQueue.append(self.currentToken) 

1697 self.state = self.dataState 

1698 elif data is EOF: 

1699 # XXX EMIT: emit the doctype token even at EOF; dataState then handles end-of-stream 

1700 self.stream.unget(data) 

1701 self.tokenQueue.append(self.currentToken) 

1702 self.state = self.dataState 

1703 else: 

1704 pass 

1705 return True 

1706 
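
# --- Editorial sketch (not part of the vendored source) ---
# An unquoted identifier forces the quirks path: a ParseError token is
# queued, correct flips to False, and bogusDoctypeState above swallows
# everything up to ">" before emitting the doctype token.

from bleach._vendor.html5lib._tokenizer import HTMLTokenizer
from bleach._vendor.html5lib.constants import tokenTypes

for token in HTMLTokenizer('<!DOCTYPE html PUBLIC whoops>'):
    if token["type"] == tokenTypes["ParseError"]:
        print("error:", token["data"])       # unexpected-char-in-doctype
    elif token["type"] == tokenTypes["Doctype"]:
        print("correct?", token["correct"])  # False
# --- End editorial sketch ---
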

1707 def cdataSectionState(self): 

1708 data = [] 

1709 while True: 

1710 data.append(self.stream.charsUntil("]")) 

1711 data.append(self.stream.charsUntil(">")) 

1712 char = self.stream.char() 

1713 if char is EOF: 

1714 break 

1715 else: 

1716 assert char == ">" 

1717 if data[-1][-2:] == "]]": 

1718 data[-1] = data[-1][:-2] 

1719 break 

1720 else: 

1721 data.append(char) 

1722 

1723 data = "".join(data) # pylint:disable=redefined-variable-type 

1724 # Deal with null here rather than in the parser 

1725 nullCount = data.count("\u0000") 

1726 if nullCount > 0: 

1727 for _ in range(nullCount): 

1728 self.tokenQueue.append({"type": tokenTypes["ParseError"], 

1729 "data": "invalid-codepoint"}) 

1730 data = data.replace("\u0000", "\uFFFD") 

1731 if data: 

1732 self.tokenQueue.append({"type": tokenTypes["Characters"], 

1733 "data": data}) 

1734 self.state = self.dataState 

1735 return True
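
# --- Editorial sketch (not part of the vendored source) ---
# cdataSectionState is only entered from markupDeclarationOpenState when the
# tokenizer's parser reports a foreign (SVG/MathML) current node, so a bare
# tokenizer never reaches it on its own. A hedged sketch that drives the
# state directly, assuming the "<![CDATA[" prefix has already been consumed:

from collections import deque
from bleach._vendor.html5lib._tokenizer import HTMLTokenizer

tok = HTMLTokenizer("x < y ]]>rest")
tok.tokenQueue = deque()         # normally created by __iter__
tok.cdataSectionState()
print(tok.tokenQueue.popleft())  # a Characters token with data "x < y "
# --- End editorial sketch ---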