Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.10/site-packages/bs4/builder/_htmlparser.py: 25%


# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from html.parser import HTMLParser

try:
    from html.parser import HTMLParseError
except ImportError as e:
    # HTMLParseError is removed in Python 3.5. Since it can never be
    # thrown in 3.5, we can just define our own class as a placeholder.
    class HTMLParseError(Exception):
        pass

import sys
import warnings

# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
# argument, which we'd like to set to False. Unfortunately,
# http://bugs.python.org/issue13273 makes strict=True a better bet
# before Python 3.2.3.
#
# At the end of this file, we monkeypatch HTMLParser so that
# strict=True works well on Python 3.2.2.
major, minor, release = sys.version_info[:3]
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4


from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    DetectsXMLParsedAsHTML,
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )


HTMLPARSER = 'html.parser'

class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.
        """
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

        self._initialize_xml_detector()
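
    # A minimal sketch of the callable duplicate-attribute strategy
    # described above (an illustration, not part of the original module;
    # the 'accumulate' name is hypothetical). A callable receives the
    # attributes processed so far, the duplicate attribute's name, and
    # the newest value:
    #
    #   def accumulate(attr_dict, key, value):
    #       # Collect every value seen for this attribute into a list.
    #       values = attr_dict[key]
    #       if not isinstance(values, list):
    #           values = [values]
    #       values.append(value)
    #       attr_dict[key] = values
    #
    #   parser = BeautifulSoupHTMLParser(on_duplicate_attribute=accumulate)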

    def error(self, msg):
        """In Python 3, HTMLParser subclasses must implement error(), although
        this requirement doesn't appear to be documented.

        In Python 2, HTMLParser implements error() by raising an exception,
        which we don't want to do.

        In any event, this method is called only on very strange
        markup and our best strategy is to pretend it didn't happen
        and keep going.
        """
        warnings.warn(msg)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        """
        # Passing handle_empty_element=False tells handle_starttag not
        # to close the tag just because its name matches a known
        # empty-element tag. We know that this is an empty-element tag
        # and we want to call handle_endtag ourselves.
        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: Dictionary of the tag's attributes.
        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value
        #print("START", name)
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

        if self._root_tag is None:
            self._root_tag_encountered(name)

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
            be the closing portion of an empty-element tag,
            e.g. '<tag></tag>'.
        """
        #print("END", name)
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            #print("ALREADY CLOSED", name)
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        """
        # XXX workaround for a bug in HTMLParser. Remove this once
        # it's fixed in all supported versions.
        # http://bugs.python.org/issue13633
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#8220; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError as e:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError) as e:
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)
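
    # Worked example (illustrative): for markup containing '&#147;',
    # real_name is 147. chr(147) is an unprintable C1 control character,
    # but byte 147 in Windows-1252 is LEFT DOUBLE QUOTATION MARK, so the
    # loop above decodes bytearray([147]) with 'windows-1252' and emits
    # u'\u201c' instead of the control character.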

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self._document_might_be_xml(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # html.parser knows which line number and position in the
    # original file each element comes from.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        parser_kwargs = parser_kwargs or {}
        parser_kwargs.update(extra_parser_kwargs)
        if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
            parser_kwargs['strict'] = False
        if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
            parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
            (markup, encoding, declared encoding,
             has undergone character replacement)

            Each 4-tuple represents a strategy for converting the
            document to Unicode and parsing it. Each strategy will be tried
            in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]

        dammit = UnicodeDammit(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=True,
            exclude_encodings=exclude_encodings
        )
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)
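
    # For example (an illustrative sketch, not from the original
    # module): given a UTF-8 bytestring such as b'<p>caf\xc3\xa9</p>'
    # with no declared encoding, this generator would typically yield
    # ('<p>caf\xe9</p>', 'utf-8', None, False), and the BeautifulSoup
    # constructor tries each yielded strategy in turn until one parses.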

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
            parser.close()
        except HTMLParseError as e:
            warnings.warn(RuntimeWarning(
                "Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
            raise e
        parser.already_closed_empty_element = []
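
# A minimal usage sketch (illustration only, not part of the original
# module). This builder is normally selected by passing the 'html.parser'
# feature name to the BeautifulSoup constructor; extra keyword arguments
# such as on_duplicate_attribute are forwarded to the parser:
#
#   from bs4 import BeautifulSoup
#   soup = BeautifulSoup(
#       "<p class='a' class='b'>text</p>", 'html.parser',
#       on_duplicate_attribute='ignore')
#   # Under 'ignore', the earliest value wins: soup.p['class'] == ['a']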

# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
# string.
#
# XXX This code can be removed once most Python 3 users are on 3.2.3.
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
    import re
    attrfind_tolerant = re.compile(
        r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
        r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
    HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant

    locatestarttagend = re.compile(r"""
  <[a-zA-Z][-.a-zA-Z0-9:_]*          # tag name
  (?:\s+                             # whitespace before attribute name
    (?:[a-zA-Z_][-.:a-zA-Z0-9_]*     # attribute name
      (?:\s*=\s*                     # value indicator
        (?:'[^']*'                   # LITA-enclosed value
          |\"[^\"]*\"                # LIT-enclosed value
          |[^'\">\s]+                # bare value
         )
       )?
     )
   )*
  \s*                                # trailing whitespace
""", re.VERBOSE)
    BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend

    from html.parser import tagfind, attrfind

    def parse_starttag(self, i):
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between i+1 and j into a tag and attrs
        attrs = []
        match = tagfind.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = rawdata[i+1:k].lower()
        while k < endpos:
            if self.strict:
                m = attrfind.match(rawdata, k)
            else:
                m = attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:
                attrvalue = attrvalue[1:-1]
                if attrvalue:
                    attrvalue = self.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            lineno, offset = self.getpos()
            if "\n" in self.__starttag_text:
                lineno = lineno + self.__starttag_text.count("\n")
                offset = len(self.__starttag_text) \
                         - self.__starttag_text.rfind("\n")
            else:
                offset = offset + len(self.__starttag_text)
            if self.strict:
                self.error("junk characters in start tag: %r"
                           % (rawdata[k:endpos][:20],))
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: <span attr="value" />
            self.handle_startendtag(tag, attrs)
        else:
            self.handle_starttag(tag, attrs)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
        return endpos

    def set_cdata_mode(self, elem):
        self.cdata_elem = elem.lower()
        self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)

    BeautifulSoupHTMLParser.parse_starttag = parse_starttag
    BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode

    CONSTRUCTOR_TAKES_STRICT = True