Coverage for /pythoncovmergedfiles/medio/medio/usr/lib/python3.9/html/parser.py: 15%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

284 statements  

1"""A parser for HTML and XHTML.""" 

2 

3# This file is based on sgmllib.py, but the API is slightly different. 

4 

5# XXX There should be a way to distinguish between PCDATA (parsed 

6# character data -- the normal case), RCDATA (replaceable character 

7# data -- only char and entity references and end tags are special) 

8# and CDATA (character data -- only end tags are special). 

9 

10 

11import re 

12import _markupbase 

13 

14from html import unescape 

15 

16 

17__all__ = ['HTMLParser'] 

18 

19# Regular expressions used for parsing 

20 

21interesting_normal = re.compile('[&<]') 

22incomplete = re.compile('&[a-zA-Z#]') 

23 

24entityref = re.compile('&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]') 

25charref = re.compile('&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]') 

26 

27starttagopen = re.compile('<[a-zA-Z]') 

28piclose = re.compile('>') 

29commentclose = re.compile(r'--\s*>') 

30# Note: 

31# 1) if you change tagfind/attrfind remember to update locatestarttagend too; 

32# 2) if you change tagfind/attrfind and/or locatestarttagend the parser will 

33# explode, so don't do it. 

34# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state 

35# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state 

36tagfind_tolerant = re.compile(r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*') 

37attrfind_tolerant = re.compile( 

38 r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*' 

39 r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*') 

40locatestarttagend_tolerant = re.compile(r""" 

41 <[a-zA-Z][^\t\n\r\f />\x00]* # tag name 

42 (?:[\s/]* # optional whitespace before attribute name 

43 (?:(?<=['"\s/])[^\s/>][^\s/=>]* # attribute name 

44 (?:\s*=+\s* # value indicator 

45 (?:'[^']*' # LITA-enclosed value 

46 |"[^"]*" # LIT-enclosed value 

47 |(?!['"])[^>\s]* # bare value 

48 ) 

49 \s* # possibly followed by a space 

50 )?(?:\s|/(?!>))* 

51 )* 

52 )? 

53 \s* # trailing whitespace 

54""", re.VERBOSE) 

55endendtag = re.compile('>') 

56# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between 

57# </ and the tag name, so maybe this should be fixed 

58endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>') 

59 

60 

61 

class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is passed through as raw text until the
    # matching end tag is seen.
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    # Source text of the most recently parsed start tag (None until one
    # has been parsed, or while a start tag is still incomplete).
    __starttag_text = None

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Switch to raw-text mode until the end tag of *elem* is found.

        While in this mode, the only "interesting" construct is the closing
        tag of *elem* (matched case-insensitively).
        """
        self.cdata_elem = elem.lower()
        # Escape the name so that it is always matched literally, even if a
        # subclass registers an element name containing regex metacharacters.
        self.interesting = re.compile(
            r'</\s*%s\s*>' % re.escape(self.cdata_elem), re.I)

    def clear_cdata_mode(self):
        """Leave raw-text mode and resume normal markup scanning."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        rawdata = self.rawdata
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                            not re.compile(r'[\s;]').search(rawdata, amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            startswith = rawdata.startswith
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated yet.
                    if not end:
                        break
                    # At EOF: emit the unterminated construct as text, up to
                    # and including the next '>', else up to the next '<',
                    # else just the single '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern also consumed the terminating character;
                    # give it back unless it was the ';'.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if ";" in rawdata[i:]:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    # Same terminator handling as for charrefs above.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # Incomplete reference: stop here without consuming
                    # anything.  If more data is fed later the reference may
                    # complete; at EOF the flush below emits the whole tail,
                    # including the '&', as text.  (Advancing past the '&'
                    # here would silently drop it from the output.)
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                assert 0, "interesting.search() lied"
        # end while
        if end and i < n and not self.cdata_elem:
            # EOF: flush whatever is left as plain text.
            if self.convert_charrefs:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            # Report everything between '<!' and '>'.
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        # Report the text between '<?' and the closing '>'.
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute given without '=': report its value as None.
                attrvalue = None
            elif (attrvalue[:1] == '\'' == attrvalue[-1:] or
                  attrvalue[:1] == '"' == attrvalue[-1:]):
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # Junk between the last attribute and the end of the tag:
            # report the whole construct as text instead of as a tag.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        nextchar = rawdata[j:j+1]
        if nextchar == ">":
            return j + 1
        if nextchar == "/":
            if rawdata.startswith("/>", j):
                return j + 2
            # A '/' not (yet) followed by '>': buffer boundary.
            return -1
        if nextchar == "":
            # end of input
            return -1
        if nextchar in ("abcdefghijklmnopqrstuvwxyz=/"
                        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        # Anything else: stop at the offending character, always making
        # progress past position i.
        if j > i:
            return j
        return i + 1

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                # Inside raw-text content a malformed end tag is just text.
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                # An end tag for some other element inside raw-text
                # content is reported as text.
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    # 'tag' is the lowercased name; 'attrs' is a list of (name, value)
    # pairs where value is None for attributes given without '='.
    def handle_starttag(self, tag, attrs):
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        pass

    # Overridable -- not called from this class; presumably invoked by
    # _markupbase.ParserBase for declarations it does not recognize
    # (confirm against _markupbase).
    def unknown_decl(self, data):
        pass