Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/html/parser.py: 17%

288 statements  

« prev     ^ index     » next       coverage.py v7.0.5, created at 2023-01-17 06:13 +0000

1"""A parser for HTML and XHTML.""" 

2 

3# This file is based on sgmllib.py, but the API is slightly different. 

4 

5# XXX There should be a way to distinguish between PCDATA (parsed 

6# character data -- the normal case), RCDATA (replaceable character 

7# data -- only char and entity references and end tags are special) 

8# and CDATA (character data -- only end tags are special). 

9 

10 

11import re 

12import warnings 

13import _markupbase 

14 

15from html import unescape 

16 

17 

18__all__ = ['HTMLParser'] 

19 

# Compiled patterns shared by the parser below.

# Plain-text scanning: the next character that may start markup or a
# reference, and the start of a reference that has not been terminated yet.
interesting_normal = re.compile(r"[&<]")
incomplete = re.compile(r"&[a-zA-Z#]")

# Named and numeric character references, each including the one character
# that terminates the reference.
entityref = re.compile(r"&([a-zA-Z][-.a-zA-Z0-9]*)[^a-zA-Z0-9]")
charref = re.compile(r"&#(?:[0-9]+|[xX][0-9a-fA-F]+)[^0-9a-fA-F]")

# Openers and closers of the individual markup constructs.
starttagopen = re.compile(r"<[a-zA-Z]")
piclose = re.compile(r">")
commentclose = re.compile(r'--\s*>')

# Note:
#   1) if you change tagfind/attrfind remember to update
#      locatestarttagend too;
#   2) if you change tagfind/attrfind and/or locatestarttagend the parser
#      will explode, so don't do it.
# see http://www.w3.org/TR/html5/tokenization.html#tag-open-state
# and http://www.w3.org/TR/html5/tokenization.html#tag-name-state
tagfind_tolerant = re.compile(
    r'([a-zA-Z][^\t\n\r\f />\x00]*)(?:\s|/(?!>))*'
)
attrfind_tolerant = re.compile(
    r'((?<=[\'"\s/])[^\s/>][^\s/=>]*)(\s*=+\s*'
    r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?(?:\s|/(?!>))*'
)
locatestarttagend_tolerant = re.compile(
    r"""
  <[a-zA-Z][^\t\n\r\f />\x00]*       # tag name
  (?:[\s/]*                          # optional whitespace before attribute name
    (?:(?<=['"\s/])[^\s/>][^\s/=>]*  # attribute name
      (?:\s*=+\s*                    # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |"[^"]*"                   # LIT-enclosed value
          |(?!['"])[^>\s]*           # bare value
         )
        (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                # trailing whitespace
""",
    flags=re.X,
)

endendtag = re.compile(r">")
# the HTML 5 spec, section 8.1.2.2, doesn't allow spaces between
# </ and the tag name, so maybe this should be fixed
endtagfind = re.compile(r'</\s*([a-zA-Z][-.a-zA-Z0-9:_]*)\s*>')

60 

61 

62 

class HTMLParser(_markupbase.ParserBase):
    """Find tags and other markup and call handler functions.

    Usage:
        p = HTMLParser()
        p.feed(data)
        ...
        p.close()

    Start tags are handled by calling self.handle_starttag() or
    self.handle_startendtag(); end tags by self.handle_endtag().  The
    data between tags is passed from the parser to the derived class
    by calling self.handle_data() with the data as argument (the data
    may be split up in arbitrary chunks).  If convert_charrefs is
    True the character references are converted automatically to the
    corresponding Unicode character (and self.handle_data() is no
    longer split in chunks), otherwise they are passed by calling
    self.handle_entityref() or self.handle_charref() with the string
    containing respectively the named or numeric reference as the
    argument.
    """

    # Elements whose content is scanned in "CDATA mode": after such a start
    # tag only the matching end tag is looked for (see set_cdata_mode()).
    CDATA_CONTENT_ELEMENTS = ("script", "style")

    # Characters that show a trailing '&...' reference is already complete.
    # Compiled once here instead of on every pass through goahead()'s loop.
    __charref_terminator = re.compile(r'[\s;]')

    # Source text of the most recently parsed start tag, or None.
    __starttag_text = None

    def __init__(self, *, convert_charrefs=True):
        """Initialize and reset this instance.

        If convert_charrefs is True (the default), all character references
        are automatically converted to the corresponding Unicode characters.
        """
        self.convert_charrefs = convert_charrefs
        self.reset()

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.rawdata = ''
        self.lasttag = '???'
        self.interesting = interesting_normal
        self.cdata_elem = None
        _markupbase.ParserBase.reset(self)

    def feed(self, data):
        r"""Feed data to the parser.

        Call this as often as you want, with as little or as much text
        as you want (may include '\n').
        """
        self.rawdata = self.rawdata + data
        self.goahead(0)

    def close(self):
        """Handle any buffered data."""
        self.goahead(1)

    def get_starttag_text(self):
        """Return full source of start tag: '<...>'."""
        return self.__starttag_text

    def set_cdata_mode(self, elem):
        """Enter CDATA mode for element *elem*.

        The lower-cased name is stored in self.cdata_elem and
        self.interesting is replaced by a case-insensitive pattern that only
        matches the end tag of that element.
        """
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    def clear_cdata_mode(self):
        """Leave CDATA mode and scan for '&' and '<' again."""
        self.interesting = interesting_normal
        self.cdata_elem = None

    # Internal -- handle data as far as reasonable.  May leave state
    # and data to be processed by a subsequent call.  If 'end' is
    # true, force handling all data as if followed by EOF marker.
    def goahead(self, end):
        """Process self.rawdata and dispatch to the handle_*() methods.

        Whatever could not be processed yet is stored back in self.rawdata.
        When *end* is true, incomplete constructs are reported as data
        instead of being kept for a later call.
        """
        rawdata = self.rawdata
        # rawdata is not rebound below, so the bound method can be taken once.
        startswith = rawdata.startswith
        i = 0
        n = len(rawdata)
        while i < n:
            if self.convert_charrefs and not self.cdata_elem:
                j = rawdata.find('<', i)
                if j < 0:
                    # if we can't find the next <, either we are at the end
                    # or there's more text incoming.  If the latter is True,
                    # we can't pass the text to handle_data in case we have
                    # a charref cut in half at end.  Try to determine if
                    # this is the case before proceeding by looking for an
                    # & near the end and see if it's followed by a space or ;.
                    # NOTE(review): 34 is presumably an upper bound on the
                    # length of a character reference -- confirm.
                    amppos = rawdata.rfind('&', max(i, n-34))
                    if (amppos >= 0 and
                            not self.__charref_terminator.search(rawdata,
                                                                 amppos)):
                        break  # wait till we get all the text
                    j = n
            else:
                match = self.interesting.search(rawdata, i)  # < or &
                if match:
                    j = match.start()
                else:
                    if self.cdata_elem:
                        # end tag of the CDATA element not seen yet
                        break
                    j = n
            if i < j:
                if self.convert_charrefs and not self.cdata_elem:
                    self.handle_data(unescape(rawdata[i:j]))
                else:
                    self.handle_data(rawdata[i:j])
            i = self.updatepos(i, j)
            if i == n:
                break
            if startswith('<', i):
                if starttagopen.match(rawdata, i):  # < + letter
                    k = self.parse_starttag(i)
                elif startswith("</", i):
                    k = self.parse_endtag(i)
                elif startswith("<!--", i):
                    k = self.parse_comment(i)
                elif startswith("<?", i):
                    k = self.parse_pi(i)
                elif startswith("<!", i):
                    k = self.parse_html_declaration(i)
                elif (i + 1) < n:
                    self.handle_data("<")
                    k = i + 1
                else:
                    break
                if k < 0:
                    # The construct is not terminated in the buffer.
                    if not end:
                        break
                    # At EOF: report the text up to and including the next
                    # '>', else up to the next '<', else the single '<'.
                    k = rawdata.find('>', i + 1)
                    if k < 0:
                        k = rawdata.find('<', i + 1)
                        if k < 0:
                            k = i + 1
                    else:
                        k += 1
                    if self.convert_charrefs and not self.cdata_elem:
                        self.handle_data(unescape(rawdata[i:k]))
                    else:
                        self.handle_data(rawdata[i:k])
                i = self.updatepos(i, k)
            elif startswith("&#", i):
                match = charref.match(rawdata, i)
                if match:
                    name = match.group()[2:-1]
                    self.handle_charref(name)
                    k = match.end()
                    # The pattern consumes one terminating character; only
                    # a ';' belongs to the reference itself.
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                else:
                    if rawdata.find(';', i) >= 0:  # bail by consuming &#
                        self.handle_data(rawdata[i:i+2])
                        i = self.updatepos(i, i+2)
                    break
            elif startswith('&', i):
                match = entityref.match(rawdata, i)
                if match:
                    name = match.group(1)
                    self.handle_entityref(name)
                    k = match.end()
                    if not startswith(';', k-1):
                        k = k - 1
                    i = self.updatepos(i, k)
                    continue
                match = incomplete.match(rawdata, i)
                if match:
                    # match.group() will contain at least 2 chars
                    if end and match.group() == rawdata[i:]:
                        # NOTE(review): this steps over the '&' without
                        # reporting it; only the text after it reaches the
                        # final flush below -- confirm this is intended.
                        i = self.updatepos(i, i + 1)
                    # incomplete
                    break
                elif (i + 1) < n:
                    # not the end of the buffer, and can't be confused
                    # with some other construct
                    self.handle_data("&")
                    i = self.updatepos(i, i + 1)
                else:
                    break
            else:
                # Raised explicitly so the guard survives "python -O".
                raise AssertionError("interesting.search() lied")
        # end while
        if end and i < n and not self.cdata_elem:
            if self.convert_charrefs:
                self.handle_data(unescape(rawdata[i:n]))
            else:
                self.handle_data(rawdata[i:n])
            i = self.updatepos(i, n)
        self.rawdata = rawdata[i:]

    # Internal -- parse html declarations, return length or -1 if not terminated
    # See w3.org/TR/html5/tokenization.html#markup-declaration-open-state
    # See also parse_declaration in _markupbase
    def parse_html_declaration(self, i):
        """Parse the '<!...' construct starting at index *i* of self.rawdata.

        Dispatches to parse_comment(), parse_marked_section(), a doctype
        branch that calls handle_decl(), or parse_bogus_comment().  Returns
        the index after the construct, or -1 if it is not terminated.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<!', ('unexpected call to '
                                        'parse_html_declaration()')
        if rawdata[i:i+4] == '<!--':
            # this case is actually already handled in goahead()
            return self.parse_comment(i)
        elif rawdata[i:i+3] == '<![':
            return self.parse_marked_section(i)
        elif rawdata[i:i+9].lower() == '<!doctype':
            # find the closing >
            gtpos = rawdata.find('>', i+9)
            if gtpos == -1:
                return -1
            self.handle_decl(rawdata[i+2:gtpos])
            return gtpos+1
        else:
            return self.parse_bogus_comment(i)

    # Internal -- parse bogus comment, return length or -1 if not terminated
    # see http://www.w3.org/TR/html5/tokenization.html#bogus-comment-state
    def parse_bogus_comment(self, i, report=1):
        """Parse a '<!...>' or '</...>' construct as a comment.

        The text between the two-character opener and the next '>' is passed
        to handle_comment() unless *report* is false.  Returns the index
        after the '>', or -1 if there is no '>'.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] in ('<!', '</'), ('unexpected call to '
                                                'parse_bogus_comment()')
        pos = rawdata.find('>', i+2)
        if pos == -1:
            return -1
        if report:
            self.handle_comment(rawdata[i+2:pos])
        return pos + 1

    # Internal -- parse processing instr, return end or -1 if not terminated
    def parse_pi(self, i):
        """Parse the processing instruction starting at index *i*.

        The text between '<?' and the next '>' is passed to handle_pi().
        Returns the index after the '>', or -1 if there is no '>'.
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == '<?', 'unexpected call to parse_pi()'
        match = piclose.search(rawdata, i+2)  # >
        if not match:
            return -1
        self.handle_pi(rawdata[i+2:match.start()])
        return match.end()

    # Internal -- handle starttag, return end or -1 if not terminated
    def parse_starttag(self, i):
        """Parse the start tag at index *i* and call the matching handler.

        Returns the index after the tag, or a negative value if the tag is
        not complete yet.  Text that does not end in '>' or '/>' is passed
        to handle_data() unchanged.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and endpos into a tag and attrs
        attrs = []
        match = tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # attribute without '=value'
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                    attrvalue[:1] == '"' == attrvalue[-1:]:
                # strip the surrounding pair of quotes
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        tail = rawdata[k:endpos].strip()
        if tail not in (">", "/>"):
            # junk before the end of the tag: report the whole text as data
            self.handle_data(rawdata[i:endpos])
            return endpos
        if tail == "/>":
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    # Internal -- check to see if we have a complete starttag; return end
    # or -1 if incomplete.
    def check_for_whole_start_tag(self, i):
        """Return the index after the start tag at *i*, or -1 if incomplete."""
        rawdata = self.rawdata
        m = locatestarttagend_tolerant.match(rawdata, i)
        if not m:
            raise AssertionError("we should not get here!")
        j = m.end()
        following = rawdata[j:j+1]
        if following == ">":
            return j + 1
        if following == "/":
            # either a complete '/>' or a '/' at the buffer boundary
            return j + 2 if rawdata.startswith("/>", j) else -1
        if following == "":
            # end of input
            return -1
        if following in ("abcdefghijklmnopqrstuvwxyz=/"
                         "ABCDEFGHIJKLMNOPQRSTUVWXYZ"):
            # end of input in or before attribute value, or we have the
            # '/' from a '/>' ending
            return -1
        # bogus input: resume after the matched text, consuming at least
        # one character
        return j if j > i else i + 1

    # Internal -- parse endtag, return end or -1 if incomplete
    def parse_endtag(self, i):
        """Parse the end tag at index *i* and call handle_endtag().

        Returns the index after the tag, or -1 if no '>' has been seen yet.
        In CDATA mode, end tags other than the one of the current element
        are passed to handle_data().
        """
        rawdata = self.rawdata
        assert rawdata[i:i+2] == "</", "unexpected call to parse_endtag"
        match = endendtag.search(rawdata, i+1)  # >
        if not match:
            return -1
        gtpos = match.end()
        match = endtagfind.match(rawdata, i)  # </ + tag + >
        if not match:
            if self.cdata_elem is not None:
                self.handle_data(rawdata[i:gtpos])
                return gtpos
            # find the name: w3.org/TR/html5/tokenization.html#tag-name-state
            namematch = tagfind_tolerant.match(rawdata, i+2)
            if not namematch:
                # w3.org/TR/html5/tokenization.html#end-tag-open-state
                if rawdata[i:i+3] == '</>':
                    return i+3
                else:
                    return self.parse_bogus_comment(i)
            tagname = namematch.group(1).lower()
            # consume and ignore other stuff between the name and the >
            # Note: this is not 100% correct, since we might have things like
            # </tag attr=">">, but looking for > after the name should cover
            # most of the cases and is much simpler
            gtpos = rawdata.find('>', namematch.end())
            self.handle_endtag(tagname)
            return gtpos+1

        elem = match.group(1).lower()  # script or style
        if self.cdata_elem is not None:
            if elem != self.cdata_elem:
                self.handle_data(rawdata[i:gtpos])
                return gtpos

        self.handle_endtag(elem)
        self.clear_cdata_mode()
        return gtpos

    # Overridable -- finish processing of start+end tag: <tag.../>
    def handle_startendtag(self, tag, attrs):
        """Handle '<tag.../>' by calling handle_starttag() then handle_endtag()."""
        self.handle_starttag(tag, attrs)
        self.handle_endtag(tag)

    # Overridable -- handle start tag
    def handle_starttag(self, tag, attrs):
        """Called with the lower-cased tag name and a list of (name, value)."""
        pass

    # Overridable -- handle end tag
    def handle_endtag(self, tag):
        """Called with the lower-cased name of an end tag."""
        pass

    # Overridable -- handle character reference
    def handle_charref(self, name):
        """Called with the text between '&#' and the terminator."""
        pass

    # Overridable -- handle entity reference
    def handle_entityref(self, name):
        """Called with the name of a '&name' reference."""
        pass

    # Overridable -- handle data
    def handle_data(self, data):
        """Called with a chunk of text."""
        pass

    # Overridable -- handle comment
    def handle_comment(self, data):
        """Called with the content of a comment."""
        pass

    # Overridable -- handle declaration
    def handle_decl(self, decl):
        """Called with the text of a declaration, without '<!' and '>'."""
        pass

    # Overridable -- handle processing instruction
    def handle_pi(self, data):
        """Called with the text of a processing instruction, without '<?' and '>'."""
        pass

    def unknown_decl(self, data):
        """Overridable -- does nothing by default."""
        pass

    # Internal -- helper to remove special character quoting
    def unescape(self, s):
        """Deprecated: warn, then return html.unescape(s)."""
        warnings.warn('The unescape method is deprecated and will be removed '
                      'in 3.5, use html.unescape() instead.',
                      DeprecationWarning, stacklevel=2)
        return unescape(s)