Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/builder/_lxml.py: 4%

174 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1# Use of this source code is governed by the MIT license. 

2__license__ = "MIT" 

3 

4__all__ = [ 

5 'LXMLTreeBuilderForXML', 

6 'LXMLTreeBuilder', 

7 ] 

8 

9try: 

10 from collections.abc import Callable # Python 3.6 

11except ImportError as e: 

12 from collections import Callable 

13 

14from io import BytesIO 

15from io import StringIO 

16from lxml import etree 

17from bs4.element import ( 

18 Comment, 

19 Doctype, 

20 NamespacedAttribute, 

21 ProcessingInstruction, 

22 XMLProcessingInstruction, 

23) 

24from bs4.builder import ( 

25 DetectsXMLParsedAsHTML, 

26 FAST, 

27 HTML, 

28 HTMLTreeBuilder, 

29 PERMISSIVE, 

30 ParserRejectedMarkup, 

31 TreeBuilder, 

32 XML) 

33from bs4.dammit import EncodingDetector 

34 

35LXML = 'lxml' 

36 

37def _invert(d): 

38 "Invert a dictionary." 

39 return dict((v,k) for k, v in list(d.items())) 

40 

class LXMLTreeBuilderForXML(TreeBuilder):
    """A TreeBuilder that parses XML using lxml.

    This class acts as the *target* of an ``etree.XMLParser`` (it is
    constructed with ``target=self`` in :meth:`default_parser`): lxml
    invokes the event-handler methods below (``start``, ``end``,
    ``data``, ``pi``, ``doctype``, ``comment``, ``close``) as it
    parses, and they forward the events to the `BeautifulSoup` object.
    """

    DEFAULT_PARSER_CLASS = etree.XMLParser

    is_xml = True
    processing_instruction_class = XMLProcessingInstruction

    NAME = "lxml-xml"
    ALTERNATE_NAMES = ["xml"]

    # Well, it's permissive by XML parser standards.
    features = [NAME, LXML, XML, FAST, PERMISSIVE]

    # Number of bytes/characters read per call to markup.read() when
    # feed() streams the document into the parser.
    CHUNK_SIZE = 512

    # This namespace mapping is specified in the XML Namespace
    # standard.
    DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')

    # The same mapping, keyed by URI rather than by prefix.
    DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)

    # NOTE: If we parsed Element objects and looked at .sourceline,
    # we'd be able to see the line numbers from the original document.
    # But instead we build an XMLParser or HTMLParser object to serve
    # as the target of parse messages, and those messages don't include
    # line numbers.
    # See: https://bugs.launchpad.net/lxml/+bug/1846906

    def initialize_soup(self, soup):
        """Let the BeautifulSoup object know about the standard namespace
        mapping.

        :param soup: A `BeautifulSoup`.
        """
        super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
        self._register_namespaces(self.DEFAULT_NSMAPS)

    def _register_namespaces(self, mapping):
        """Let the BeautifulSoup object know about namespaces encountered
        while parsing the document.

        This might be useful later on when creating CSS selectors.

        This will track (almost) all namespaces, even ones that were
        only in scope for part of the document. If two namespaces have
        the same prefix, only the first one encountered will be
        tracked. Un-prefixed namespaces are not tracked.

        :param mapping: A dictionary mapping namespace prefixes to URIs.
        """
        for key, value in list(mapping.items()):
            # This is 'if key' and not 'if key is not None' because we
            # don't track un-prefixed namespaces. Soupselect will
            # treat an un-prefixed namespace as the default, which
            # causes confusion in some cases.
            if key and key not in self.soup._namespaces:
                # Let the BeautifulSoup object know about a new namespace.
                # If there are multiple namespaces defined with the same
                # prefix, the first one in the document takes precedence.
                self.soup._namespaces[key] = value

    def default_parser(self, encoding):
        """Find the default parser for the given encoding.

        :param encoding: A string.
        :return: Either a parser object or a class, which
          will be instantiated with default arguments.
        """
        if self._default_parser is not None:
            return self._default_parser
        return etree.XMLParser(
            target=self, strip_cdata=False, recover=True, encoding=encoding)

    def parser_for(self, encoding):
        """Instantiate an appropriate parser for the given encoding.

        :param encoding: A string.
        :return: A parser object such as an `etree.XMLParser`.
        """
        # Use the default parser.
        parser = self.default_parser(encoding)

        if isinstance(parser, Callable):
            # Instantiate the parser with default arguments
            parser = parser(
                target=self, strip_cdata=False, recover=True, encoding=encoding
            )
        return parser

    def __init__(self, parser=None, empty_element_tags=None, **kwargs):
        """Constructor.

        :param parser: An lxml parser object (used as-is) or a callable
            that produces one per encoding; if None, default_parser()
            builds one on demand.
        :param empty_element_tags: Optional iterable of tag names to be
            treated as empty-element tags.
        """
        # TODO: Issue a warning if parser is present but not a
        # callable, since that means there's no way to create new
        # parsers for different encodings.
        self._default_parser = parser
        if empty_element_tags is not None:
            self.empty_element_tags = set(empty_element_tags)
        self.soup = None
        # Stack of inverted (URI -> prefix) namespace mappings, pushed
        # in start() and popped in end(). A None entry marks a tag that
        # introduced no new namespaces.
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
        # Stack of (prefix -> URI) mappings currently in scope; the top
        # is attached to each Tag created while those prefixes apply.
        self.active_namespace_prefixes = [dict(self.DEFAULT_NSMAPS)]
        super(LXMLTreeBuilderForXML, self).__init__(**kwargs)

    def _getNsTag(self, tag):
        # Split the namespace URL out of a fully-qualified lxml tag
        # name. Copied from lxml's src/lxml/sax.py.
        if tag[0] == '{':
            return tuple(tag[1:].split('}', 1))
        else:
            return (None, tag)

    def prepare_markup(self, markup, user_specified_encoding=None,
                       exclude_encodings=None,
                       document_declared_encoding=None):
        """Run any preliminary steps necessary to make incoming markup
        acceptable to the parser.

        lxml really wants to get a bytestring and convert it to
        Unicode itself. So instead of using UnicodeDammit to convert
        the bytestring to Unicode using different encodings, this
        implementation uses EncodingDetector to iterate over the
        encodings, and tell lxml to try to parse the document as each
        one in turn.

        :param markup: Some markup -- hopefully a bytestring.
        :param user_specified_encoding: The user asked to try this encoding.
        :param document_declared_encoding: The markup itself claims to be
            in this encoding.
        :param exclude_encodings: The user asked _not_ to try any of
            these encodings.

        :yield: A series of 4-tuples:
         (markup, encoding, declared encoding,
          has undergone character replacement)

         Each 4-tuple represents a strategy for converting the
         document to Unicode and parsing it. Each strategy will be tried
         in turn.
        """
        is_html = not self.is_xml
        if is_html:
            self.processing_instruction_class = ProcessingInstruction
            # We're in HTML mode, so if we're given XML, that's worth
            # noting.
            DetectsXMLParsedAsHTML.warn_if_markup_looks_like_xml(markup)
        else:
            self.processing_instruction_class = XMLProcessingInstruction

        if isinstance(markup, str):
            # We were given Unicode. Maybe lxml can parse Unicode on
            # this system?

            # TODO: This is a workaround for
            # https://bugs.launchpad.net/lxml/+bug/1948551.
            # We can remove it once the upstream issue is fixed.
            if len(markup) > 0 and markup[0] == u'\N{BYTE ORDER MARK}':
                markup = markup[1:]
            yield markup, None, document_declared_encoding, False

        if isinstance(markup, str):
            # No, apparently not. Convert the Unicode to UTF-8 and
            # tell lxml to parse it as UTF-8.
            yield (markup.encode("utf8"), "utf8",
                   document_declared_encoding, False)

        # This was provided by the end-user; treat it as a known
        # definite encoding per the algorithm laid out in the HTML5
        # spec. (See the EncodingDetector class for details.)
        known_definite_encodings = [user_specified_encoding]

        # This was found in the document; treat it as a slightly lower-priority
        # user encoding.
        user_encodings = [document_declared_encoding]
        detector = EncodingDetector(
            markup, known_definite_encodings=known_definite_encodings,
            user_encodings=user_encodings, is_html=is_html,
            exclude_encodings=exclude_encodings
        )
        for encoding in detector.encodings:
            yield (detector.markup, encoding, document_declared_encoding, False)

    def feed(self, markup):
        """Stream `markup` into the lxml parser, CHUNK_SIZE at a time.

        :param markup: A bytestring, string, or open filehandle.
        :raise ParserRejectedMarkup: If lxml raises while parsing,
            or an encoding lookup/decode fails.
        """
        if isinstance(markup, bytes):
            markup = BytesIO(markup)
        elif isinstance(markup, str):
            markup = StringIO(markup)

        # Call feed() at least once, even if the markup is empty,
        # or the parser won't be initialized.
        data = markup.read(self.CHUNK_SIZE)
        try:
            self.parser = self.parser_for(self.soup.original_encoding)
            self.parser.feed(data)
            while len(data) != 0:
                # Now call feed() on the rest of the data, chunk by chunk.
                data = markup.read(self.CHUNK_SIZE)
                if len(data) != 0:
                    self.parser.feed(data)
            self.parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
            raise ParserRejectedMarkup(e)

    def close(self):
        """Parser-target event handler: parsing is finished.

        Reset the namespace-mapping stack to its initial state.
        """
        self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]

    def start(self, name, attrs, nsmap={}):
        """Parser-target event handler: the start of a tag.

        :param name: The tag name, possibly qualified as '{uri}local'.
        :param attrs: The tag's attributes (may be an immutable proxy).
        :param nsmap: Namespace prefixes newly declared on this tag.
            NOTE(review): the mutable-default pitfall doesn't apply
            here because nsmap is only ever read, never mutated.
        """
        # Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
        attrs = dict(attrs)
        nsprefix = None
        # Invert each namespace map as it comes in.
        if len(nsmap) == 0 and len(self.nsmaps) > 1:
            # There are no new namespaces for this tag, but
            # non-default namespaces are in play, so we need a
            # separate tag stack to know when they end.
            self.nsmaps.append(None)
        elif len(nsmap) > 0:
            # A new namespace mapping has come into play.

            # First, Let the BeautifulSoup object know about it.
            self._register_namespaces(nsmap)

            # Then, add it to our running list of inverted namespace
            # mappings.
            self.nsmaps.append(_invert(nsmap))

            # The currently active namespace prefixes have
            # changed. Calculate the new mapping so it can be stored
            # with all Tag objects created while these prefixes are in
            # scope.
            current_mapping = dict(self.active_namespace_prefixes[-1])
            current_mapping.update(nsmap)

            # We should not track un-prefixed namespaces as we can only hold one
            # and it will be recognized as the default namespace by soupsieve,
            # which may be confusing in some situations.
            if '' in current_mapping:
                del current_mapping['']
            self.active_namespace_prefixes.append(current_mapping)

            # Also treat the namespace mapping as a set of attributes on the
            # tag, so we can recreate it later.
            attrs = attrs.copy()
            for prefix, namespace in list(nsmap.items()):
                attribute = NamespacedAttribute(
                    "xmlns", prefix, "http://www.w3.org/2000/xmlns/")
                attrs[attribute] = namespace

        # Namespaces are in play. Find any attributes that came in
        # from lxml with namespaces attached to their names, and
        # turn then into NamespacedAttribute objects.
        new_attrs = {}
        for attr, value in list(attrs.items()):
            namespace, attr = self._getNsTag(attr)
            if namespace is None:
                new_attrs[attr] = value
            else:
                nsprefix = self._prefix_for_namespace(namespace)
                attr = NamespacedAttribute(nsprefix, attr, namespace)
                new_attrs[attr] = value
        attrs = new_attrs

        namespace, name = self._getNsTag(name)
        nsprefix = self._prefix_for_namespace(namespace)
        self.soup.handle_starttag(
            name, namespace, nsprefix, attrs,
            namespaces=self.active_namespace_prefixes[-1]
        )

    def _prefix_for_namespace(self, namespace):
        """Find the currently active prefix for the given namespace."""
        if namespace is None:
            return None
        for inverted_nsmap in reversed(self.nsmaps):
            if inverted_nsmap is not None and namespace in inverted_nsmap:
                return inverted_nsmap[namespace]
        return None

    def end(self, name):
        """Parser-target event handler: the end of a tag.

        Tells the BeautifulSoup object the tag is over, and pops any
        namespace mapping that this tag pushed in start().
        """
        self.soup.endData()
        completed_tag = self.soup.tagStack[-1]
        namespace, name = self._getNsTag(name)
        nsprefix = None
        if namespace is not None:
            # Look up the innermost prefix currently bound to this URI.
            for inverted_nsmap in reversed(self.nsmaps):
                if inverted_nsmap is not None and namespace in inverted_nsmap:
                    nsprefix = inverted_nsmap[namespace]
                    break
        self.soup.handle_endtag(name, nsprefix)
        if len(self.nsmaps) > 1:
            # This tag, or one of its parents, introduced a namespace
            # mapping, so pop it off the stack.
            out_of_scope_nsmap = self.nsmaps.pop()

            if out_of_scope_nsmap is not None:
                # This tag introduced a namespace mapping which is no
                # longer in scope. Recalculate the currently active
                # namespace prefixes.
                self.active_namespace_prefixes.pop()

    def pi(self, target, data):
        """Parser-target event handler: a processing instruction.

        The target and data are joined and stored as a single
        processing-instruction object.
        """
        self.soup.endData()
        data = target + ' ' + data
        self.soup.handle_data(data)
        self.soup.endData(self.processing_instruction_class)

    def data(self, content):
        """Parser-target event handler: textual data inside a tag."""
        self.soup.handle_data(content)

    def doctype(self, name, pubid, system):
        """Parser-target event handler: a DOCTYPE declaration."""
        self.soup.endData()
        doctype = Doctype.for_name_and_ids(name, pubid, system)
        self.soup.object_was_parsed(doctype)

    def comment(self, content):
        "Handle comments as Comment objects."
        self.soup.endData()
        self.soup.handle_data(content)
        self.soup.endData(Comment)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment

360 

361 

class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
    """The lxml-based tree builder for HTML documents.

    Inherits the parser-target event handlers from
    `LXMLTreeBuilderForXML`, but drives an ``etree.HTMLParser`` and
    registers itself under HTML-oriented feature names.
    """

    NAME = LXML
    ALTERNATE_NAMES = ["lxml-html"]

    features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
    is_xml = False
    processing_instruction_class = ProcessingInstruction

    def default_parser(self, encoding):
        """Return the parser to use for HTML.

        The class itself (not an instance) is returned; parser_for()
        sees that it is callable and instantiates it with the
        requested encoding.
        """
        return etree.HTMLParser

    def feed(self, markup):
        """Hand the entire markup to lxml's HTML parser in one shot.

        :param markup: The markup to parse.
        :raise ParserRejectedMarkup: If parsing fails with a decode,
            encoding-lookup, or lxml parser error.
        """
        encoding = self.soup.original_encoding
        try:
            parser = self.parser_for(encoding)
            self.parser = parser
            parser.feed(markup)
            parser.close()
        except (UnicodeDecodeError, LookupError, etree.ParserError) as cause:
            raise ParserRejectedMarkup(cause)

    def test_fragment_to_document(self, fragment):
        """See `TreeBuilder`."""
        return '<html><body>%s</body></html>' % fragment