Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/future/backports/html/parser.py: 85%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

336 statements  

1"""A parser for HTML and XHTML. 

2 

3Backported for python-future from Python 3.3. 

4""" 

5 

6# This file is based on sgmllib.py, but the API is slightly different. 

7 

8# XXX There should be a way to distinguish between PCDATA (parsed 

9# character data -- the normal case), RCDATA (replaceable character 

10# data -- only char and entity references and end tags are special) 

11# and CDATA (character data -- only end tags are special). 

12 

13from __future__ import (absolute_import, division, 

14 print_function, unicode_literals) 

15from future.builtins import * 

16from future.backports import _markupbase 

17import re 

18import warnings 

19 

# Regular expressions used for parsing
#
# Every pattern is written as a raw string so that regex escapes such as
# ``\s`` reach the ``re`` module untouched instead of being (mis)read as
# string-literal escapes, which newer Python versions warn about.

# Characters that interrupt plain text outside of CDATA content.
interesting_normal = re.compile(r'[&<]')
# Possible start of an entity/character reference that is not yet complete.
incomplete = re.compile(r'&[a-zA-Z#]')

# Both patterns also consume ONE character after the reference (normally
# the ';'); callers check whether that character really was a ';'.
entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]')
charref = re.compile(r'&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]')

starttagopen = re.compile(r'<[a-zA-Z]')
piclose = re.compile(r'>')
commentclose = re.compile(r'--\s*>')
tagfind = re.compile(r'([a-zA-Z][-.a-zA-Z0-9:_]*)(?:\s|/(?!>))*')
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
# (\t, \n, \r, \f and \x00 are interpreted by the regex engine here and
# denote the same characters the string-literal escapes would.)
tagfind_tolerant = re.compile(r'[a-zA-Z][^\t\n\r\f />\x00]*')
# Note:
#  1) the strict attrfind isn't really strict, but we can't make it
#     correctly strict without breaking backward compatibility;
#  2) if you change attrfind remember to update locatestarttagend too;
#  3) if you change attrfind and/or locatestarttagend the parser will
#     explode, so don't do it.
attrfind = re.compile(
    r'\s*([a-zA-Z_][-.:a-zA-Z_0-9]*)(\s*=\s*'
    r'(\'[^\']*\'|"[^"]*"|[^\s"\'=<>`]*))?')
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*')
locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
         (?:\s*,)*                   # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""", re.VERBOSE)
endendtag = re.compile(r'>')
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

80 

81 

class HTMLParseError(Exception):
    """Exception raised for all parse errors.

    Attributes:
        msg: the (non-empty) description of the problem.
        lineno: first element of *position*, or None when unknown.
        offset: second element of *position* (0-based column), or None
            when unknown.
    """

    def __init__(self, msg, position=(None, None)):
        """Store the message and the ``(lineno, offset)`` position pair."""
        assert msg
        self.msg = msg
        self.lineno, self.offset = position[0], position[1]

    def __str__(self):
        """Return the message followed by whatever position data is known."""
        if self.lineno is None:
            line_part = ""
        else:
            line_part = ", at line %d" % self.lineno
        if self.offset is None:
            column_part = ""
        else:
            # The stored offset is 0-based; columns are reported 1-based.
            column_part = ", column %d" % (self.offset + 1)
        return self.msg + line_part + column_part

98 

99 

class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  Entity references are
    passed by calling self.handle_entityref() with the entity
    reference as the argument.  Numeric character references are
    passed to self.handle_charref() with the string containing the
    reference as the argument.

    Comments, ``<!doctype ...>`` declarations and processing instructions
    are reported through self.handle_comment(), self.handle_decl() and
    self.handle_pi() respectively.
    """

    # Elements after whose start tag the parser switches to CDATA mode:
    # everything up to the matching end tag is reported as plain data.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, strict=False):
        """Initialize and reset this instance.

        If strict is set to False (the default) the parser will parse invalid
        markup, otherwise it will raise an error.  Note that the strict mode
        is deprecated.
        """
        if strict:
            warnings.warn("The strict mode is deprecated.",
                          DeprecationWarning, stacklevel=2)
        self.strict = strict
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def error(self, message):
        """Raise HTMLParseError for *message* at the position from getpos()."""
        raise HTMLParseError(message, self.getpos())

    # Source text of the start tag most recently located by parse_starttag();
    # None until a complete start tag has been seen.
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode: only the end tag of *elem* is interesting."""
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and scan for '<' and '&' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Consume as much of self.rawdata as possible, firing handlers.

        Whatever cannot be handled yet (an incomplete construct at the end
        of the buffer) is left in self.rawdata for the next call.  When
        *end* is true the remaining text is flushed: in strict mode an
        unfinished construct raises HTMLParseError, otherwise it is
        reported as data.
        """
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        # rawdata never changes inside the loop, so bind the method once.
        startswith = rawdata.startswith
        while i < n:
            match = self.interesting.search(rawdata, i)  # < or &
            if match:
                j = match.start()
            else:
                if self.cdata_elem:
                    # In CDATA mode keep buffering until the end tag shows up.
                    break
                j = n
            if i < j:
                self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    if self.strict:
                        k = self.parse_declaration(i)
                    else:
                        k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    # A '<' that starts no known construct is plain text.
                    self.handle_data("<")
                    k = i + 1
                else:
                    # Lone '<' at the end of the buffer: wait for more data.
                    break
                if k < 0:
                    # The construct is not terminated within the buffer.
                    if not end:
                        break
                    if self.strict:
                        self.error("EOF in middle of construct")
                    # Tolerant mode: report the text up to and including the
                    # next '>', else up to the next '<', else just this '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    # charref also matched one trailing character; strip it
                    # together with the leading '&#'.
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        # The trailing character was not ';': leave it in
                        # the stream.
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if rawdata.find(';', i) >= 0:  # bail by consuming &#
                        # Emit the two characters at the CURRENT position and
                        # step over them (the original used buffer offsets
                        # 0..2 here, duplicating data and never advancing).
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        if self.strict:
                            self.error("EOF in middle of entity or char ref")
                        else:
                            # Step over the '&'; the rest of the buffer is
                            # flushed as data after the loop.  (The original
                            # compared an unassigned local 'k' here, raising
                            # UnboundLocalError.)
                            # NOTE(review): the '&' itself is not reported
                            # as data -- same as before; confirm intent.
                            i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Parse the '<!...' construct at *i* (tolerant mode).

        Returns the index just past the construct, or -1 when it is not
        terminated in the buffer.  Doctypes go to handle_decl(); anything
        that is not a comment, marked section or doctype is treated as a
        bogus comment.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Treat '<!...>' or '</...>' at *i* as a comment.

        Returns the index just past the closing '>', or -1 when there is
        none.  handle_comment() is called with the text between the
        two-character opener and the '>' unless *report* is false.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the '<?...>' at *i* and pass its content to handle_pi()."""
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at *i* and dispatch it to the handlers.

        Returns the index just past the tag, or a negative value when the
        tag is not complete yet.  The tag name and the attribute names are
        lower-cased.  Attribute values have surrounding quotes removed and
        references unescaped; an attribute without '=' gets the value None.
        In tolerant mode a tag with trailing junk is reported as data.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        attr_pattern = attrfind if self.strict else attrfind_tolerant
        while k < endpos:
            m = attr_pattern.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Bare attribute (no '=value' part).
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                  attrvalue[:1] == '"' == attrvalue[-1:]):
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            # Tolerant mode: hand the malformed tag over as plain text.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index just past the start tag at *i*, or -1.

        -1 means the buffer ends before the tag is complete.  In strict
        mode a malformed tag raises HTMLParseError; in tolerant mode the
        position reached (at least i + 1) is returned.
        """
        rawdata = self.rawdata
        if self.strict:
            m = locatestarttagend.match(rawdata, i)
        else:
            m = locatestarttagend_tolerant.match(rawdata, i)
        if m:
            j = m.end()
            nextchar = rawdata[j:j+1]
            if nextchar == ">":
                return j + 1
            if nextchar == "/":
                if rawdata.startswith("/>", j):
                    return j + 2
                # A '/' not followed by '>' is treated as a buffer boundary.
                # (The original had further "bogus input" handling after a
                # check that is always true here; it could never run.)
                return -1
            if nextchar == "":
                # end of input
                return -1
            if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                            "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
                # end of input in or before attribute value, or we have the
                # '/' from a '/>' ending
                return -1
            if self.strict:
                self.updatepos(i, j)
                self.error("malformed start tag")
            if j > i:
                return j
            else:
                return i + 1
        raise AssertionError("we should not get here!")

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at *i* and call handle_endtag().

        Returns the index just past the tag, or -1 when no '>' follows.
        In CDATA mode any end tag other than the one that closes the
        current CDATA element is reported as data.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            if self.strict:
                self.error("bad end tag: %r" % (rawdata[i:gtpos],))
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group().lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag .../>'; by default a start tag then an end tag."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Handle a start tag; *attrs* is a list of (name, value) pairs."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Handle an end tag; *tag* is the lower-cased tag name."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Handle '&#NNN;' / '&#xHHH;'; *name* is the text after '&#'."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Handle '&name;'; *name* is the entity name without '&' or ';'."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Handle a chunk of text data."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Handle the content of a comment or bogus comment."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Handle a doctype declaration (text between '<!' and '>')."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Handle a processing instruction (text between '<?' and '>')."""
        pass

    def unknown_decl(self, data):
        """Report an unrecognized declaration; an error in strict mode only.

        Not called from this class itself -- presumably a hook used by the
        inherited declaration-parsing code (verify against _markupbase).
        """
        if self.strict:
            self.error("unknown declaration: %r" % (data,))

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Return *s* with character and entity references replaced.

        Numeric references become the corresponding character.  Named
        references are looked up in the html5 entity table, falling back
        to the longest-first prefix search the loop below performs.
        References that cannot be resolved are left in the text.
        """
        if '&' not in s:
            return s

        def replace_entities(match):
            ref = match.group(1)
            if ref[0] == "#":
                digits = ref[1:]
                try:
                    if digits[0] in ('x', 'X'):
                        codepoint = int(digits[1:].rstrip(';'), 16)
                    else:
                        codepoint = int(digits.rstrip(';'))
                    return chr(codepoint)
                except (ValueError, OverflowError):
                    # ValueError: not a number, or outside the valid code
                    # point range.  OverflowError: chr() got a number too
                    # large for a C int.  Either way keep the text as is.
                    return '&#' + digits
            from future.backports.html.entities import html5
            if ref in html5:
                return html5[ref]
            elif ref.endswith(';'):
                return '&' + ref
            # No exact match and no ';': try known entity names that are a
            # proper prefix of the text (shortest first).
            for x in range(2, len(ref)):
                if ref[:x] in html5:
                    return html5[ref[:x]] + ref[x:]
            return '&' + ref

        return re.sub(r"&(#?[xX]?(?:[0-9a-fA-F]+;|\w{1,32};?))",
                      replace_entities, s)