Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/bs4/builder/_htmlparser.py: 25%


# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from html.parser import HTMLParser

try:
    from html.parser import HTMLParseError
except ImportError as e:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder.
    class HTMLParseError(Exception):
        pass

import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4


from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    DetectsXMLParsedAsHTML,
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'

class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.
        """
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

        self._initialize_xml_detector()
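
    # A minimal sketch of the callable duplicate-attribute strategy
    # described above (an illustration, not part of the original module;
    # the 'accumulate' name is hypothetical). A callable receives the
    # attributes processed so far, the duplicate attribute's name, and
    # the newest value:
    #
    #   def accumulate(attr_dict, key, value):
    #       # Collect every value seen for this attribute into a list.
    #       values = attr_dict[key]
    #       if not isinstance(values, list):
    #           values = [values]
    #       values.append(value)
    #       attr_dict[key] = values
    #
    #   parser = BeautifulSoupHTMLParser(on_duplicate_attribute=accumulate)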

    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        """
        # Passing handle_empty_element=False tells handle_starttag not
        # to close the tag just because its name matches a known
        # empty-element tag. We know that this is an empty-element tag
        # and we want to call handle_endtag ourselves.
        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value
        #print("START", name)
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

        if self._root_tag is None:
            self._root_tag_encountered(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
            be the closing portion of an empty-element tag,
            e.g. '<tag></tag>'.
        """
        #print("END", name)
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            #print("ALREADY CLOSED", name)
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError as e:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError) as e:
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)
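
    # Worked example (illustrative): for markup containing '&#147;',
    # real_name is 147. chr(147) is an unprintable C1 control character,
    # but byte 147 in Windows-1252 is LEFT DOUBLE QUOTATION MARK, so the
    # loop above decodes bytearray([147]) with 'windows-1252' and emits
    # u'\u201c' instead of the control character.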

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self._document_might_be_xml(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # html.parser knows which line number and position in the
    # original file each element comes from.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        parser_kwargs = parser_kwargs or {}
        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
            (markup, encoding, declared encoding,
             has undergone character replacement)

            Each 4-tuple represents a strategy for converting the
            document to Unicode and parsing it. Each strategy will be tried
            in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]

        dammit = UnicodeDammit(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=True,
            exclude_encodings=exclude_encodings
        )
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)
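
    # For example (an illustrative sketch, not from the original
    # module): given a UTF-8 bytestring such as b'<p>caf\xc3\xa9</p>'
    # with no declared encoding, this generator would typically yield
    # ('<p>caf\xe9</p>', 'utf-8', None, False), and the BeautifulSoup
    # constructor tries each yielded strategy in turn until one parses.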

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError as e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
        parser.already_closed_empty_element = []
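
# A minimal usage sketch (illustration only, not part of the original
# module). This builder is normally selected by passing the 'html.parser'
# feature name to the BeautifulSoup constructor; extra keyword arguments
# such as on_duplicate_attribute are forwarded to the parser:
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(
#       "<p class='a' class='b'>text</p>", 'html.parser',
#       on_duplicate_attribute='ignore')
#   # Under 'ignore', the earliest value wins: soup.p['class'] == ['a']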

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
                if attrvalue:
                    attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    CONSTRUCTOR_TAKES_STRICT = True