Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/builder/_htmlparser.py: 25% of 137 statements (coverage.py v7.2.7, created at 2023-07-01 06:54 +0000)

# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by the MIT license.
__license__ = "MIT"

__all__ = [
    'HTMLParserTreeBuilder',
    ]

from html.parser import HTMLParser

import sys
import warnings

from bs4.element import (
    CData,
    Comment,
    Declaration,
    Doctype,
    ProcessingInstruction,
    )
from bs4.dammit import EntitySubstitution, UnicodeDammit

from bs4.builder import (
    DetectsXMLParsedAsHTML,
    ParserRejectedMarkup,
    HTML,
    HTMLTreeBuilder,
    STRICT,
    )

HTMLPARSER = 'html.parser'


class BeautifulSoupHTMLParser(HTMLParser, DetectsXMLParsedAsHTML):
    """A subclass of the Python standard library's HTMLParser class, which
    listens for HTMLParser events and translates them into calls
    to Beautiful Soup's tree construction API.
    """

    # Strategies for handling duplicate attributes
    IGNORE = 'ignore'
    REPLACE = 'replace'

    def __init__(self, *args, **kwargs):
        """Constructor.

        :param on_duplicate_attribute: A strategy for what to do if a
            tag includes the same attribute more than once. Accepted
            values are: REPLACE (replace earlier values with later
            ones, the default), IGNORE (keep the earliest value
            encountered), or a callable. A callable must take three
            arguments: the dictionary of attributes already processed,
            the name of the duplicate attribute, and the most recent value
            encountered.
        """
        self.on_duplicate_attribute = kwargs.pop(
            'on_duplicate_attribute', self.REPLACE
        )
        HTMLParser.__init__(self, *args, **kwargs)

        # Keep a list of empty-element tags that were encountered
        # without an explicit closing tag. If we encounter a closing tag
        # of this type, we'll associate it with one of those entries.
        #
        # This isn't a stack because we don't care about the
        # order. It's a list of closing tags we've already handled and
        # will ignore, assuming they ever show up.
        self.already_closed_empty_element = []

        self._initialize_xml_detector()
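
    # Usage sketch (an assumption about typical use, not part of the original
    # module): the duplicate-attribute strategy is normally passed through
    # BeautifulSoup rather than by instantiating this class directly, e.g.
    #
    #   from bs4 import BeautifulSoup
    #   markup = '<a href="first.html" href="second.html">link</a>'
    #   soup = BeautifulSoup(markup, 'html.parser',
    #                        on_duplicate_attribute='ignore')
    #   soup.a['href']  # keeps the first value, 'first.html'
    #
    # A callable strategy receives (attr_dict, key, value) and may mutate
    # attr_dict in place, for example collecting every value into a list.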

    def error(self, message):
        # NOTE: This method is required so long as Python 3.9 is
        # supported. The corresponding code is removed from HTMLParser
        # in 3.5, but not removed from ParserBase until 3.10.
        # https://github.com/python/cpython/issues/76025
        #
        # The original implementation turned the error into a warning,
        # but in every case I discovered, this made HTMLParser
        # immediately crash with an error message that was less
        # helpful than the warning. The new implementation makes it
        # more clear that html.parser just can't parse this
        # markup. The 3.10 implementation does the same, though it
        # raises AssertionError rather than calling a method. (We
        # catch this error and wrap it in a ParserRejectedMarkup.)
        raise ParserRejectedMarkup(message)

    def handle_startendtag(self, name, attrs):
        """Handle an incoming empty-element tag.

        This is only called when the markup looks like <tag/>.

        :param name: Name of the tag.
        :param attrs: A list of (name, value) pairs for the tag's attributes.
        """
        # is_startend() tells handle_starttag not to close the tag
        # just because its name matches a known empty-element tag. We
        # know that this is an empty-element tag and we want to call
        # handle_endtag ourselves.
        tag = self.handle_starttag(name, attrs, handle_empty_element=False)
        self.handle_endtag(name)

    def handle_starttag(self, name, attrs, handle_empty_element=True):
        """Handle an opening tag, e.g. '<tag>'

        :param name: Name of the tag.
        :param attrs: A list of (name, value) pairs for the tag's attributes.
        :param handle_empty_element: True if this tag is known to be
            an empty-element tag (i.e. there is not expected to be any
            closing tag).
        """
        # XXX namespace
        attr_dict = {}
        for key, value in attrs:
            # Change None attribute values to the empty string
            # for consistency with the other tree builders.
            if value is None:
                value = ''
            if key in attr_dict:
                # A single attribute shows up multiple times in this
                # tag. How to handle it depends on the
                # on_duplicate_attribute setting.
                on_dupe = self.on_duplicate_attribute
                if on_dupe == self.IGNORE:
                    pass
                elif on_dupe in (None, self.REPLACE):
                    attr_dict[key] = value
                else:
                    on_dupe(attr_dict, key, value)
            else:
                attr_dict[key] = value
                attrvalue = '""'
        #print("START", name)
        sourceline, sourcepos = self.getpos()
        tag = self.soup.handle_starttag(
            name, None, None, attr_dict, sourceline=sourceline,
            sourcepos=sourcepos
        )
        if tag and tag.is_empty_element and handle_empty_element:
            # Unlike other parsers, html.parser doesn't send separate end tag
            # events for empty-element tags. (It's handled in
            # handle_startendtag, but only if the original markup looked like
            # <tag/>.)
            #
            # So we need to call handle_endtag() ourselves. Since we
            # know the start event is identical to the end event, we
            # don't want handle_endtag() to cross off any previous end
            # events for tags of this name.
            self.handle_endtag(name, check_already_closed=False)

            # But we might encounter an explicit closing tag for this tag
            # later on. If so, we want to ignore it.
            self.already_closed_empty_element.append(name)

        if self._root_tag is None:
            self._root_tag_encountered(name)
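
    # Illustrative note (an assumption added for clarity, not part of the
    # original module): html.parser reports plain '<br>' only as a start-tag
    # event, so the branch above synthesizes the end tag and records the name
    # in already_closed_empty_element, letting a stray '</br>' later be
    # silently checked off. XHTML-style '<br/>' arrives via
    # handle_startendtag instead, which closes the tag itself.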

    def handle_endtag(self, name, check_already_closed=True):
        """Handle a closing tag, e.g. '</tag>'

        :param name: A tag name.
        :param check_already_closed: True if this tag is expected to
            be the closing portion of an empty-element tag,
            e.g. '<tag></tag>'.
        """
        #print("END", name)
        if check_already_closed and name in self.already_closed_empty_element:
            # This is a redundant end tag for an empty-element tag.
            # We've already called handle_endtag() for it, so just
            # check it off the list.
            #print("ALREADY CLOSED", name)
            self.already_closed_empty_element.remove(name)
        else:
            self.soup.handle_endtag(name)

    def handle_data(self, data):
        """Handle some textual data that shows up between tags."""
        self.soup.handle_data(data)

    def handle_charref(self, name):
        """Handle a numeric character reference by converting it to the
        corresponding Unicode character and treating it as textual
        data.

        :param name: Character number, possibly in hexadecimal.
        """
        # TODO: This was originally a workaround for a bug in
        # HTMLParser. (http://bugs.python.org/issue13633) The bug has
        # been fixed, but removing this code still makes some
        # Beautiful Soup tests fail. This needs investigation.
        if name.startswith('x'):
            real_name = int(name.lstrip('x'), 16)
        elif name.startswith('X'):
            real_name = int(name.lstrip('X'), 16)
        else:
            real_name = int(name)

        data = None
        if real_name < 256:
            # HTML numeric entities are supposed to reference Unicode
            # code points, but sometimes they reference code points in
            # some other encoding (ahem, Windows-1252). E.g. &#147;
            # instead of &#x201C; for LEFT DOUBLE QUOTATION MARK. This
            # code tries to detect this situation and compensate.
            for encoding in (self.soup.original_encoding, 'windows-1252'):
                if not encoding:
                    continue
                try:
                    data = bytearray([real_name]).decode(encoding)
                except UnicodeDecodeError as e:
                    pass
        if not data:
            try:
                data = chr(real_name)
            except (ValueError, OverflowError) as e:
                pass
        data = data or "\N{REPLACEMENT CHARACTER}"
        self.handle_data(data)
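
    # Example of the conversion above (illustrative, not part of the original
    # module): '&#233;' and '&#xE9;' reach handle_charref as '233' and 'xE9'
    # and both become the single character 'é'; a reference like '&#147;'
    # falls in the < 256 range and is reinterpreted as Windows-1252, yielding
    # a left double quotation mark instead of an unprintable control code.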

    def handle_entityref(self, name):
        """Handle a named entity reference by converting it to the
        corresponding Unicode character(s) and treating it as textual
        data.

        :param name: Name of the entity reference.
        """
        character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
        if character is not None:
            data = character
        else:
            # If this were XML, it would be ambiguous whether "&foo"
            # was a character entity reference with a missing
            # semicolon or the literal string "&foo". Since this is
            # HTML, we have a complete list of all character entity references,
            # and this one wasn't found, so assume it's the literal string "&foo".
            data = "&%s" % name
        self.handle_data(data)
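
    # Example (illustrative, not part of the original module): '&amp;' maps
    # to '&' through HTML_ENTITY_TO_CHARACTER, while an unrecognized
    # reference such as '&foo' is not in the table and is passed through as
    # the literal text '&foo'.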

    def handle_comment(self, data):
        """Handle an HTML comment.

        :param data: The text of the comment.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(Comment)

    def handle_decl(self, data):
        """Handle a DOCTYPE declaration.

        :param data: The text of the declaration.
        """
        self.soup.endData()
        data = data[len("DOCTYPE "):]
        self.soup.handle_data(data)
        self.soup.endData(Doctype)

    def unknown_decl(self, data):
        """Handle a declaration of unknown type -- probably a CDATA block.

        :param data: The text of the declaration.
        """
        if data.upper().startswith('CDATA['):
            cls = CData
            data = data[len('CDATA['):]
        else:
            cls = Declaration
        self.soup.endData()
        self.soup.handle_data(data)
        self.soup.endData(cls)
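
    # Example (illustrative, not part of the original module): for markup
    # like '<![CDATA[some text]]>', html.parser passes 'CDATA[some text' to
    # this method, the prefix is stripped, and 'some text' ends up wrapped in
    # a CData node; anything else becomes a generic Declaration.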

    def handle_pi(self, data):
        """Handle a processing instruction.

        :param data: The text of the instruction.
        """
        self.soup.endData()
        self.soup.handle_data(data)
        self._document_might_be_xml(data)
        self.soup.endData(ProcessingInstruction)


class HTMLParserTreeBuilder(HTMLTreeBuilder):
    """A Beautiful Soup `TreeBuilder` that uses the `HTMLParser` parser,
    found in the Python standard library.
    """
    is_xml = False
    picklable = True
    NAME = HTMLPARSER
    features = [NAME, HTML, STRICT]

    # The html.parser knows which line number and position in the
    # original file is the source of an element.
    TRACKS_LINE_NUMBERS = True

    def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
        """Constructor.

        :param parser_args: Positional arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param parser_kwargs: Keyword arguments to pass into
            the BeautifulSoupHTMLParser constructor, once it's
            invoked.
        :param kwargs: Keyword arguments for the superclass constructor.
        """
        # Some keyword arguments will be pulled out of kwargs and placed
        # into parser_kwargs.
        extra_parser_kwargs = dict()
        for arg in ('on_duplicate_attribute',):
            if arg in kwargs:
                value = kwargs.pop(arg)
                extra_parser_kwargs[arg] = value
        super(HTMLParserTreeBuilder, self).__init__(**kwargs)
        parser_args = parser_args or []
        parser_kwargs = parser_kwargs or {}
        parser_kwargs.update(extra_parser_kwargs)
        parser_kwargs['convert_charrefs'] = False
        self.parser_args = (parser_args, parser_kwargs)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       document_declared_encoding=None, exclude_encodings=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        :param markup: Some markup -- probably a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
            (markup, encoding, declared encoding,
             has undergone character replacement)

            Each 4-tuple represents a strategy for converting the
            document to Unicode and parsing it. Each strategy will be tried
            in turn.
        """
        if isinstance(markup, str):
            # Parse Unicode as-is.
            yield (markup, None, None, False)
            return

        # Ask UnicodeDammit to sniff the most likely encoding.

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]

        try_encodings = [user_specified_encoding, document_declared_encoding]
        dammit = UnicodeDammit(
            markup,
            known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings,
            is_html=True,
            exclude_encodings=exclude_encodings
        )
        yield (dammit.markup, dammit.original_encoding,
               dammit.declared_html_encoding,
               dammit.contains_replacement_characters)
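
    # Illustrative sketch of what the generator yields (an assumption added
    # for clarity, not part of the original module):
    #
    #   builder = HTMLParserTreeBuilder()
    #   list(builder.prepare_markup('<p>déjà</p>'))
    #   # -> [('<p>déjà</p>', None, None, False)]; str input is passed through.
    #
    #   # A bytestring instead goes through UnicodeDammit, and the yielded
    #   # tuple carries the decoded markup plus the sniffed and declared
    #   # encodings.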

    def feed(self, markup):
        """Run some incoming markup through some parsing process,
        populating the `BeautifulSoup` object in self.soup.
        """
        args, kwargs = self.parser_args
        parser = BeautifulSoupHTMLParser(*args, **kwargs)
        parser.soup = self.soup
        try:
            parser.feed(markup)
        except AssertionError as e:
            # html.parser raises AssertionError in rare cases to
            # indicate a fatal problem with the markup, especially
            # when there's an error in the doctype declaration.
            raise ParserRejectedMarkup(e)
        parser.close()
        parser.already_closed_empty_element = []
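

# A small, self-contained usage sketch (an assumption about typical use, not
# part of the original module): the builder is normally selected through
# BeautifulSoup's feature string rather than instantiated directly.
if __name__ == '__main__':
    from bs4 import BeautifulSoup

    soup = BeautifulSoup(
        '<p id="a" id="b">Hello<br>world</p>', 'html.parser',
        on_duplicate_attribute='ignore'
    )
    # TRACKS_LINE_NUMBERS means each tag remembers where it came from.
    print(soup.p.sourceline, soup.p.sourcepos)  # 1 0
    print(soup.p['id'])                         # 'a': the first value was kept
    print(soup.br.is_empty_element)             # True: no '</br>' was needed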