Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/

1from __future__ import absolute_import, division, unicode_literals

2from six import text_type

4import re

6from codecs import register_error, xmlcharrefreplace_errors

8from .constants import voidElements, booleanAttributes, spaceCharacters

9from .constants import rcdataElements, entities, xmlEntities

10from . import treewalkers, _utils

11from xml.sax.saxutils import escape

# Characters whose presence forces an attribute value to be quoted when
# ``quote_attr_values == "spec"``: every character in ``spaceCharacters``
# plus ``"``, ``'``, ``=``, ``<``, ``>`` and the backtick.
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[" + _quoteAttributeSpecChars + "]")

# Wider character class used when ``quote_attr_values == "legacy"``: the
# "spec" set above plus the C0 control characters (U+0000-U+001F), space,
# "/", backtick, U+00A0 and the further Unicode code points listed below.
_quoteAttributeLegacy = re.compile("[" + _quoteAttributeSpecChars +
                                   "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
                                   "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
                                   "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
                                   "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
                                   "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
                                   "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
                                   "\u3000]")

# Maps a code point (int) to the name of the entity that should be used when
# that code point cannot be represented in the output encoding.
_encode_entity_map = {}

# True when a non-BMP character is a single code unit in this interpreter's
# string type; False when it is stored as a two-unit surrogate pair.
_is_ucs4 = len("\U0010FFFF") == 1

for k, v in entities.items():
    # skip multi-character entities
    if ((_is_ucs4 and len(v) > 1) or
            (not _is_ucs4 and len(v) > 2)):
        continue
    if v != "&":
        if len(v) == 2:
            # Two code units: combine the pair into one code point.
            codepoint = _utils.surrogatePairToCodepoint(v)
        else:
            codepoint = ord(v)
        if codepoint not in _encode_entity_map or k.islower():
            # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
            _encode_entity_map[codepoint] = k

def htmlentityreplace_errors(exc):
    """Codec error handler that substitutes HTML character references.

    For encode/translate errors, every code point in the failing span
    ``exc.object[exc.start:exc.end]`` is replaced by a named reference taken
    from ``_encode_entity_map`` when one exists, otherwise by a hexadecimal
    numeric reference (``&#x...;``).

    Any other exception type is delegated to
    :func:`codecs.xmlcharrefreplace_errors`.

    :arg exc: the Unicode error raised by the codec

    :returns: a ``(replacement, resume_position)`` tuple, where
        ``resume_position`` is ``exc.end``

    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    res = []
    skip = False
    for i, c in enumerate(exc.object[exc.start:exc.end]):
        if skip:
            # Second half of a surrogate pair that was already consumed.
            skip = False
            continue
        index = i + exc.start
        if _utils.isSurrogatePair(exc.object[index:min(exc.end, index + 2)]):
            codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
            skip = True
        else:
            codepoint = ord(c)

        entity_name = _encode_entity_map.get(codepoint)
        if entity_name:
            res.append("&")
            res.append(entity_name)
            # Only append the terminator when the stored name lacks one.
            if not entity_name.endswith(";"):
                res.append(";")
        else:
            res.append("&#x%x;" % codepoint)
    return ("".join(res), exc.end)

# Register the handler under the name that HTMLSerializer.encode() passes as
# the ``errors`` argument to ``str.encode``.
register_error("htmlentityreplace", htmlentityreplace_errors)

def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Serializes the input token stream using the specified treewalker

    :arg input: the token stream to serialize

    :arg tree: the treewalker to use

    :arg encoding: the encoding to use

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: the tree serialized as a string (as bytes when ``encoding``
        is given)

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    walker = treewalkers.getTreeWalker(tree)
    s = HTMLSerializer(**serializer_opts)
    return s.render(walker(input), encoding)

102

103

class HTMLSerializer(object):
    """Turns a treewalker token stream into HTML text (or encoded bytes).

    Behaviour is controlled by the class-level option defaults below, any of
    which can be overridden per instance through keyword arguments to
    ``__init__``. Problems found while serializing are appended to
    ``self.errors``; when ``self.strict`` is true they raise
    ``SerializeError`` instead of being merely recorded.
    """

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    # Names accepted as keyword arguments by __init__.
    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.

            Defaults to ``"`` which will use double quotes unless attribute
            value contains a double quote, in which case single quotes are
            used.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute value, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        :raises TypeError: if a keyword argument is not one of ``options``

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if unexpected_args:
            # min() keeps the reported name deterministic when several
            # unknown arguments are passed at once.
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % min(unexpected_args))
        # An explicit quote_char disables automatic quote selection unless
        # the caller also passes use_best_quote_char explicitly (the loop
        # below lets an explicit kwarg win).
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` with ``self.encoding``, replacing unencodable
        characters via the ``"htmlentityreplace"`` error handler.

        Returns ``string`` unchanged when no encoding is set.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        else:
            return string

    def encodeStrict(self, string):
        """Encode ``string`` with ``self.encoding`` using ``"strict"`` error
        handling (unencodable characters raise).

        Returns ``string`` unchanged when no encoding is set.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def _escape_attr_value(self, value, quote_attr):
        """Escape an attribute value and choose its quote character.

        :arg value: the raw attribute value

        :arg quote_attr: whether the value is going to be wrapped in quotes

        :returns: ``(quote_char, escaped_value)``; ``quote_char`` is ``None``
            when ``quote_attr`` is false
        """
        # "&" must be escaped first so the references added below are not
        # themselves re-escaped.
        value = value.replace("&", "&amp;")
        if self.escape_lt_in_attrs:
            value = value.replace("<", "&lt;")
        if not quote_attr:
            return None, value

        quote_char = self.quote_char
        if self.use_best_quote_char:
            # Pick the quote character that does not occur in the value, if
            # exactly one of the two kinds is present.
            if "'" in value and '"' not in value:
                quote_char = '"'
            elif '"' in value and "'" not in value:
                quote_char = "'"
        if quote_char == "'":
            value = value.replace("'", "&#39;")
        else:
            value = value.replace('"', "&quot;")
        return quote_char, value

    def serialize(self, treewalker, encoding=None):
        """Generate the serialized output piece by piece.

        :arg treewalker: iterable of token dicts to serialize

        :arg encoding: output encoding; when set, the yielded pieces are
            encoded, otherwise they are text

        :returns: a generator of output fragments

        :raises ValueError: if ``quote_attr_values`` is not one of
            ``"always"``, ``"spec"`` or ``"legacy"``
        """
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    if token["systemId"].find('"') >= 0:
                        if token["systemId"].find("'") >= 0:
                            self.serializeError("System identifier contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, token["systemId"], quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Emitted verbatim (no markup escaping).
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    yield self.encodeStrict(' ')
                    yield self.encodeStrict(attr_name)

                    # Minimized boolean attributes are written as the bare
                    # name, with no "=value" part.
                    if self.minimize_boolean_attributes and (
                            attr_name in booleanAttributes.get(name, tuple()) or
                            attr_name in booleanAttributes.get("", tuple())):
                        continue

                    yield self.encodeStrict("=")
                    if self.quote_attr_values == "always" or len(attr_value) == 0:
                        quote_attr = True
                    elif self.quote_attr_values == "spec":
                        quote_attr = _quoteAttributeSpec.search(attr_value) is not None
                    elif self.quote_attr_values == "legacy":
                        quote_attr = _quoteAttributeLegacy.search(attr_value) is not None
                    else:
                        raise ValueError("quote_attr_values must be one of: "
                                         "'always', 'spec', or 'legacy'")

                    quote_char, escaped = self._escape_attr_value(attr_value, quote_attr)
                    if quote_attr:
                        yield self.encodeStrict(quote_char)
                        yield self.encode(escaped)
                        yield self.encodeStrict(quote_char)
                    else:
                        yield self.encode(escaped)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % data)

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    self.serializeError("Entity %s not recognized" % name)
                # An unrecognized entity cannot be resolved; in non-strict
                # mode it is written back out as a reference instead of
                # failing with KeyError on the lookup.
                if self.resolve_entities and known and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree (bytes when ``encoding`` is given,
            text otherwise)

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        else:
            return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization problem; raise it when ``self.strict`` is set.

        :arg data: description of the problem

        :raises SerializeError: if ``self.strict`` is true
        """
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError(data)

405

406

class SerializeError(Exception):
    """Error in serialized tree

    Raised by ``HTMLSerializer.serializeError`` when the serializer's
    ``strict`` flag is set.
    """

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/_vendor/html5lib/serializer.py: 23%

205 statements