Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/__init__.py

1"""Beautiful Soup Elixir and Tonic - "The Screen-Scraper's Friend".

3http://www.crummy.com/software/BeautifulSoup/

5Beautiful Soup uses a pluggable XML or HTML parser to parse a

6(possibly invalid) document into a tree representation. Beautiful Soup

7provides methods and Pythonic idioms that make it easy to navigate,

8search, and modify the parse tree.

10Beautiful Soup works with Python 3.5 and up. It works better if lxml

11and/or html5lib is installed.

13For more than you ever wanted to know about Beautiful Soup, see the

14documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/

15"""

17__author__ = "Leonard Richardson (leonardr@segfault.org)"

18__version__ = "4.11.1"

20# Use of this source code is governed by the MIT license.

21__license__ = "MIT"

23__all__ = ['BeautifulSoup']

25from collections import Counter

26import os

27import re

28import sys

29import traceback

30import warnings

# Fail fast, with an actionable message, when this Python 3-only code is
# imported under Python 2. This runs before the package's relative
# imports so that the user sees this error first.
if sys.version_info.major < 3:
    raise ImportError('You are trying to use a Python 3-specific version of Beautiful Soup under Python 2. This will not work. The final version of Beautiful Soup to support Python 2 was 4.9.3.')

37from .builder import (

38 builder_registry,

39 ParserRejectedMarkup,

40 XMLParsedAsHTMLWarning,

41)

42from .dammit import UnicodeDammit

43from .element import (

44 CData,

45 Comment,

46 DEFAULT_OUTPUT_ENCODING,

47 Declaration,

48 Doctype,

49 NavigableString,

50 PageElement,

51 ProcessingInstruction,

52 PYTHON_SPECIFIC_ENCODINGS,

53 ResultSet,

54 Script,

55 Stylesheet,

56 SoupStrainer,

57 Tag,

58 TemplateString,

59 )

61# Define some custom warnings.

class GuessedAtParserWarning(UserWarning):
    """Warning issued when BeautifulSoup has to guess which parser to
    use -- probably because no parser was specified in the constructor.
    """

class MarkupResemblesLocatorWarning(UserWarning):
    """Warning issued when BeautifulSoup is given 'markup' that actually
    looks like a resource locator -- a URL or a path to a file on disk.
    """

74class BeautifulSoup(Tag):

75 """A data structure representing a parsed HTML or XML document.

77 Most of the methods you'll call on a BeautifulSoup object are inherited from

78 PageElement or Tag.

80 Internally, this class defines the basic interface called by the

81 tree builders when converting an HTML/XML document into a data

82 structure. The interface abstracts away the differences between

83 parsers. To write a new tree builder, you'll need to understand

84 these methods as a whole.

86 These methods will be called by the BeautifulSoup constructor:

87 * reset()

88 * feed(markup)

90 The tree builder may call these methods from its feed() implementation:

91 * handle_starttag(name, attrs) # See note about return value

92 * handle_endtag(name)

93 * handle_data(data) # Appends to the current data node

94 * endData(containerClass) # Ends the current data node

96 No matter how complicated the underlying parser is, you should be

97 able to build a tree using 'start tag' events, 'end tag' events,

98 'data' events, and "done with data" events.

100 If you encounter an empty-element tag (aka a self-closing tag,

101 like HTML's <br> tag), call handle_starttag and then

102 handle_endtag.

103 """

104

105 # Since BeautifulSoup subclasses Tag, it's possible to treat it as

106 # a Tag with a .name. This name makes it clear the BeautifulSoup

107 # object isn't a real markup tag.

108 ROOT_TAG_NAME = '[document]'

109

110 # If the end-user gives no indication which tree builder they

111 # want, look for one with these features.

112 DEFAULT_BUILDER_FEATURES = ['html', 'fast']

113

114 # A string containing all ASCII whitespace characters, used in

115 # endData() to detect data chunks that seem 'empty'.

116 ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

117

118 NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

119

120 def __init__(self, markup="", features=None, builder=None,

121 parse_only=None, from_encoding=None, exclude_encodings=None,

122 element_classes=None, **kwargs):

123 """Constructor.

124

125 :param markup: A string or a file-like object representing

126 markup to be parsed.

127

128 :param features: Desirable features of the parser to be

129 used. This may be the name of a specific parser ("lxml",

130 "lxml-xml", "html.parser", or "html5lib") or it may be the

131 type of markup to be used ("html", "html5", "xml"). It's

132 recommended that you name a specific parser, so that

133 Beautiful Soup gives you the same results across platforms

134 and virtual environments.

135

136 :param builder: A TreeBuilder subclass to instantiate (or

137 instance to use) instead of looking one up based on

138 `features`. You only need to use this if you've implemented a

139 custom TreeBuilder.

140

141 :param parse_only: A SoupStrainer. Only parts of the document

142 matching the SoupStrainer will be considered. This is useful

143 when parsing part of a document that would otherwise be too

144 large to fit into memory.

145

146 :param from_encoding: A string indicating the encoding of the

147 document to be parsed. Pass this in if Beautiful Soup is

148 guessing wrongly about the document's encoding.

149

150 :param exclude_encodings: A list of strings indicating

151 encodings known to be wrong. Pass this in if you don't know

152 the document's encoding but you know Beautiful Soup's guess is

153 wrong.

154

155 :param element_classes: A dictionary mapping BeautifulSoup

156 classes like Tag and NavigableString, to other classes you'd

157 like to be instantiated instead as the parse tree is

158 built. This is useful for subclassing Tag or NavigableString

159 to modify default behavior.

160

161 :param kwargs: For backwards compatibility purposes, the

162 constructor accepts certain keyword arguments used in

163 Beautiful Soup 3. None of these arguments do anything in

164 Beautiful Soup 4; they will result in a warning and then be

165 ignored.

166

167 Apart from this, any keyword arguments passed into the

168 BeautifulSoup constructor are propagated to the TreeBuilder

169 constructor. This makes it possible to configure a

170 TreeBuilder by passing in arguments, not just by saying which

171 one to use.

172 """

173 if 'convertEntities' in kwargs:

174 del kwargs['convertEntities']

175 warnings.warn(

176 "BS4 does not respect the convertEntities argument to the "

177 "BeautifulSoup constructor. Entities are always converted "

178 "to Unicode characters.")

179

180 if 'markupMassage' in kwargs:

181 del kwargs['markupMassage']

182 warnings.warn(

183 "BS4 does not respect the markupMassage argument to the "

184 "BeautifulSoup constructor. The tree builder is responsible "

185 "for any necessary markup massage.")

186

187 if 'smartQuotesTo' in kwargs:

188 del kwargs['smartQuotesTo']

189 warnings.warn(

190 "BS4 does not respect the smartQuotesTo argument to the "

191 "BeautifulSoup constructor. Smart quotes are always converted "

192 "to Unicode characters.")

193

194 if 'selfClosingTags' in kwargs:

195 del kwargs['selfClosingTags']

196 warnings.warn(

197 "BS4 does not respect the selfClosingTags argument to the "

198 "BeautifulSoup constructor. The tree builder is responsible "

199 "for understanding self-closing tags.")

200

201 if 'isHTML' in kwargs:

202 del kwargs['isHTML']

203 warnings.warn(

204 "BS4 does not respect the isHTML argument to the "

205 "BeautifulSoup constructor. Suggest you use "

206 "features='lxml' for HTML and features='lxml-xml' for "

207 "XML.")

208

209 def deprecated_argument(old_name, new_name):

210 if old_name in kwargs:

211 warnings.warn(

212 'The "%s" argument to the BeautifulSoup constructor '

213 'has been renamed to "%s."' % (old_name, new_name),

214 DeprecationWarning

215 )

216 return kwargs.pop(old_name)

217 return None

218

219 parse_only = parse_only or deprecated_argument(

220 "parseOnlyThese", "parse_only")

221

222 from_encoding = from_encoding or deprecated_argument(

223 "fromEncoding", "from_encoding")

224

225 if from_encoding and isinstance(markup, str):

226 warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")

227 from_encoding = None

228

229 self.element_classes = element_classes or dict()

230

231 # We need this information to track whether or not the builder

232 # was specified well enough that we can omit the 'you need to

233 # specify a parser' warning.

234 original_builder = builder

235 original_features = features

236

237 if isinstance(builder, type):

238 # A builder class was passed in; it needs to be instantiated.

239 builder_class = builder

240 builder = None

241 elif builder is None:

242 if isinstance(features, str):

243 features = [features]

244 if features is None or len(features) == 0:

245 features = self.DEFAULT_BUILDER_FEATURES

246 builder_class = builder_registry.lookup(*features)

247 if builder_class is None:

248 raise FeatureNotFound(

249 "Couldn't find a tree builder with the features you "

250 "requested: %s. Do you need to install a parser library?"

251 % ",".join(features))

252

253 # At this point either we have a TreeBuilder instance in

254 # builder, or we have a builder_class that we can instantiate

255 # with the remaining **kwargs.

256 if builder is None:

257 builder = builder_class(**kwargs)

258 if not original_builder and not (

259 original_features == builder.NAME or

260 original_features in builder.ALTERNATE_NAMES

261 ) and markup:

262 # The user did not tell us which TreeBuilder to use,

263 # and we had to guess. Issue a warning.

264 if builder.is_xml:

265 markup_type = "XML"

266 else:

267 markup_type = "HTML"

268

269 # This code adapted from warnings.py so that we get the same line

270 # of code as our warnings.warn() call gets, even if the answer is wrong

271 # (as it may be in a multithreading situation).

272 caller = None

273 try:

274 caller = sys._getframe(1)

275 except ValueError:

276 pass

277 if caller:

278 globals = caller.f_globals

279 line_number = caller.f_lineno

280 else:

281 globals = sys.__dict__

282 line_number= 1

283 filename = globals.get('__file__')

284 if filename:

285 fnl = filename.lower()

286 if fnl.endswith((".pyc", ".pyo")):

287 filename = filename[:-1]

288 if filename:

289 # If there is no filename at all, the user is most likely in a REPL,

290 # and the warning is not necessary.

291 values = dict(

292 filename=filename,

293 line_number=line_number,

294 parser=builder.NAME,

295 markup_type=markup_type

296 )

297 warnings.warn(

298 self.NO_PARSER_SPECIFIED_WARNING % values,

299 GuessedAtParserWarning, stacklevel=2

300 )

301 else:

302 if kwargs:

303 warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")

304

305 self.builder = builder

306 self.is_xml = builder.is_xml

307 self.known_xml = self.is_xml

308 self._namespaces = dict()

309 self.parse_only = parse_only

310

311 if hasattr(markup, 'read'): # It's a file-type object.

312 markup = markup.read()

313 elif len(markup) <= 256 and (

314 (isinstance(markup, bytes) and not b'<' in markup)

315 or (isinstance(markup, str) and not '<' in markup)

316 ):

317 # Issue warnings for a couple beginner problems

318 # involving passing non-markup to Beautiful Soup.

319 # Beautiful Soup will still parse the input as markup,

320 # since that is sometimes the intended behavior.

321 if not self._markup_is_url(markup):

322 self._markup_resembles_filename(markup)

323

324 rejections = []

325 success = False

326 for (self.markup, self.original_encoding, self.declared_html_encoding,

327 self.contains_replacement_characters) in (

328 self.builder.prepare_markup(

329 markup, from_encoding, exclude_encodings=exclude_encodings)):

330 self.reset()

331 self.builder.initialize_soup(self)

332 try:

333 self._feed()

334 success = True

335 break

336 except ParserRejectedMarkup as e:

337 rejections.append(e)

338 pass

339

340 if not success:

341 other_exceptions = [str(e) for e in rejections]

342 raise ParserRejectedMarkup(

343 "The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.\n\nOriginal exception(s) from parser:\n " + "\n ".join(other_exceptions)

344 )

345

346 # Clear out the markup and remove the builder's circular

347 # reference to this object.

348 self.markup = None

349 self.builder.soup = None

350

351 def __copy__(self):

352 """Copy a BeautifulSoup object by converting the document to a string and parsing it again."""

353 copy = type(self)(

354 self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'