Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/_vendor/html5lib/serializer.py: 23%

205 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1from __future__ import absolute_import, division, unicode_literals 

2from six import text_type 

3 

4import re 

5 

6from codecs import register_error, xmlcharrefreplace_errors 

7 

8from .constants import voidElements, booleanAttributes, spaceCharacters 

9from .constants import rcdataElements, entities, xmlEntities 

10from . import treewalkers, _utils 

11from xml.sax.saxutils import escape 

12 

# Characters whose presence forces an attribute value to be quoted under the
# "spec" policy: everything in ``spaceCharacters`` plus the two quote
# characters, "=", "<", ">" and the backtick.
_quoteAttributeSpecChars = "".join(spaceCharacters) + "\"'=<>`"
_quoteAttributeSpec = re.compile("[%s]" % _quoteAttributeSpecChars)
# The "legacy" policy quotes on everything the "spec" policy does and, in
# addition, on the C0 control characters, space, "/", backtick, NBSP and the
# Unicode space/separator characters listed below.
_quoteAttributeLegacy = re.compile("[%s%s]" % (
    _quoteAttributeSpecChars,
    "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n"
    "\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
    "\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f"
    "\x20\x2f\x60\xa0\u1680\u180e\u180f\u2000"
    "\u2001\u2002\u2003\u2004\u2005\u2006\u2007"
    "\u2008\u2009\u200a\u2028\u2029\u202f\u205f"
    "\u3000",
))

23 

24 

# Reverse lookup used by htmlentityreplace_errors: code point -> entity name.
_encode_entity_map = {}
# True when a non-BMP character is a single item of a text string (wide
# build / Python 3); False when it is stored as a two-unit surrogate pair.
_is_ucs4 = len("\U0010FFFF") == 1
for k, v in entities.items():
    # skip multi-character entities (a lone surrogate pair still counts as
    # one character on narrow builds)
    if len(v) > (1 if _is_ucs4 else 2):
        continue
    # "&" is deliberately left out of the map
    if v == "&":
        continue
    v = _utils.surrogatePairToCodepoint(v) if len(v) == 2 else ord(v)
    # prefer &lt; over &LT; and similarly for &amp;, &gt;, etc.
    if k.islower() or v not in _encode_entity_map:
        _encode_entity_map[v] = k

40 

41 

def htmlentityreplace_errors(exc):
    """Codec error handler that substitutes HTML character references.

    For encode/translate errors, every character in the failing range
    ``exc.object[exc.start:exc.end]`` is replaced by a named entity when
    ``_encode_entity_map`` has one for its code point, and by a hexadecimal
    numeric reference (``&#x...;``) otherwise.  A surrogate pair inside the
    range is treated as a single code point.

    Any other kind of error is handed to ``xmlcharrefreplace_errors``.

    :arg exc: the Unicode error raised by the codec

    :returns: ``(replacement_text, position_to_resume_at)``

    """
    if not isinstance(exc, (UnicodeEncodeError, UnicodeTranslateError)):
        return xmlcharrefreplace_errors(exc)

    pieces = []
    chars = enumerate(exc.object[exc.start:exc.end], exc.start)
    for index, char in chars:
        window = exc.object[index:min([exc.end, index + 2])]
        if _utils.isSurrogatePair(window):
            codepoint = _utils.surrogatePairToCodepoint(exc.object[index:index + 2])
            # the second half of the pair has been consumed with the first
            next(chars, None)
        else:
            codepoint = ord(char)

        entity = _encode_entity_map.get(codepoint)
        if not entity:
            pieces.append("&#x%s;" % (hex(codepoint)[2:]))
        elif entity.endswith(";"):
            pieces.append("&" + entity)
        else:
            pieces.append("&" + entity + ";")
    return ("".join(pieces), exc.end)


# Make the handler available as ``errors="htmlentityreplace"`` to str.encode().
register_error("htmlentityreplace", htmlentityreplace_errors)

73 

74 

def serialize(input, tree="etree", encoding=None, **serializer_opts):
    """Render a tree to HTML in one call

    Looks up the treewalker for ``tree``, builds an
    :py:class:`html5lib.serializer.HTMLSerializer` from ``serializer_opts``
    and renders the walked ``input`` with it.

    :arg input: the token stream to serialize

    :arg tree: the treewalker to use

    :arg encoding: the encoding to use

    :arg serializer_opts: any options to pass to the
        :py:class:`html5lib.serializer.HTMLSerializer` that gets created

    :returns: the tree serialized as a string

    Example:

    >>> from html5lib.html5parser import parse
    >>> from html5lib.serializer import serialize
    >>> token_stream = parse('<html><body><p>Hi!</p></body></html>')
    >>> serialize(token_stream, omit_optional_tags=False)
    '<html><head></head><body><p>Hi!</p></body></html>'

    """
    # XXX: Should we cache this?
    make_walker = treewalkers.getTreeWalker(tree)
    serializer = HTMLSerializer(**serializer_opts)
    token_stream = make_walker(input)
    return serializer.render(token_stream, encoding)

102 

103 

class HTMLSerializer(object):
    """Serializes a stream of treewalker tokens to HTML markup.

    Behaviour is controlled by the class-level option attributes below, each
    of which may be overridden per instance through a keyword argument to
    ``__init__``.  Problems found while serializing are collected in
    ``self.errors``; when ``self.strict`` is true they are raised as
    ``SerializeError`` instead of only being recorded.
    """

    # attribute quoting options
    quote_attr_values = "legacy"  # be secure by default
    quote_char = '"'
    use_best_quote_char = True

    # tag syntax options
    omit_optional_tags = True
    minimize_boolean_attributes = True
    use_trailing_solidus = False
    space_before_trailing_solidus = True

    # escaping options
    escape_lt_in_attrs = False
    escape_rcdata = False
    resolve_entities = True

    # miscellaneous options
    alphabetical_attributes = False
    inject_meta_charset = True
    strip_whitespace = False
    sanitize = False

    options = ("quote_attr_values", "quote_char", "use_best_quote_char",
               "omit_optional_tags", "minimize_boolean_attributes",
               "use_trailing_solidus", "space_before_trailing_solidus",
               "escape_lt_in_attrs", "escape_rcdata", "resolve_entities",
               "alphabetical_attributes", "inject_meta_charset",
               "strip_whitespace", "sanitize")

    # Output encoding; overwritten by serialize().  The class-level default
    # keeps encode()/encodeStrict() usable before serialize() has run.
    encoding = None

    def __init__(self, **kwargs):
        """Initialize HTMLSerializer

        :arg inject_meta_charset: Whether or not to inject the meta charset.

            Defaults to ``True``.

        :arg quote_attr_values: Whether to quote attribute values that don't
            require quoting per legacy browser behavior (``"legacy"``), when
            required by the standard (``"spec"``), or always (``"always"``).

            Defaults to ``"legacy"``.

        :arg quote_char: Use given quote character for attribute quoting.

            Defaults to ``"`` which will use double quotes unless attribute
            value contains a double quote, in which case single quotes are
            used.

        :arg escape_lt_in_attrs: Whether or not to escape ``<`` in attribute
            values.

            Defaults to ``False``.

        :arg escape_rcdata: Whether to escape characters that need to be
            escaped within normal elements within rcdata elements such as
            style.

            Defaults to ``False``.

        :arg resolve_entities: Whether to resolve named character entities that
            appear in the source tree. The XML predefined entities &lt; &gt;
            &amp; &quot; &apos; are unaffected by this setting.

            Defaults to ``True``.

        :arg strip_whitespace: Whether to remove semantically meaningless
            whitespace. (This compresses all whitespace to a single space
            except within ``pre``.)

            Defaults to ``False``.

        :arg minimize_boolean_attributes: Shortens boolean attributes to give
            just the attribute value, for example::

              <input disabled="disabled">

            becomes::

              <input disabled>

            Defaults to ``True``.

        :arg use_trailing_solidus: Includes a close-tag slash at the end of the
            start tag of void elements (empty elements whose end tag is
            forbidden). E.g. ``<hr/>``.

            Defaults to ``False``.

        :arg space_before_trailing_solidus: Places a space immediately before
            the closing slash in a tag using a trailing solidus. E.g.
            ``<hr />``. Requires ``use_trailing_solidus=True``.

            Defaults to ``True``.

        :arg sanitize: Strip all unsafe or unknown constructs from output.
            See :py:class:`html5lib.filters.sanitizer.Filter`.

            Defaults to ``False``.

        :arg omit_optional_tags: Omit start/end tags that are optional.

            Defaults to ``True``.

        :arg alphabetical_attributes: Reorder attributes to be in alphabetical order.

            Defaults to ``False``.

        :raises TypeError: if a keyword argument is not one of ``options``

        """
        unexpected_args = frozenset(kwargs) - frozenset(self.options)
        if unexpected_args:
            raise TypeError("__init__() got an unexpected keyword argument '%s'" % next(iter(unexpected_args)))
        # An explicitly chosen quote character switches off automatic
        # selection, unless use_best_quote_char is also passed explicitly
        # (the loop below lets the keyword argument win).
        if 'quote_char' in kwargs:
            self.use_best_quote_char = False
        for attr in self.options:
            setattr(self, attr, kwargs.get(attr, getattr(self, attr)))
        self.errors = []
        self.strict = False

    def encode(self, string):
        """Encode ``string`` for output, replacing unencodable characters.

        Characters the target encoding cannot represent are handled by the
        ``"htmlentityreplace"`` error handler.  When no encoding is set the
        text is returned unchanged.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "htmlentityreplace")
        else:
            return string

    def encodeStrict(self, string):
        """Encode ``string`` for output, failing on unencodable characters.

        Uses the ``"strict"`` error handler, so the codec raises if the
        target encoding cannot represent a character.  When no encoding is
        set the text is returned unchanged.
        """
        assert isinstance(string, text_type)
        if self.encoding:
            return string.encode(self.encoding, "strict")
        else:
            return string

    def serialize(self, treewalker, encoding=None):
        """Generate the serialized output piece by piece.

        :arg treewalker: iterable of token dicts; each must have a ``"type"``
            key

        :arg encoding: if given, every yielded piece is encoded to bytes
            with this encoding; otherwise text is yielded

        :returns: a generator of output fragments

        :raises SerializeError: on a serialization problem when
            ``self.strict`` is true; otherwise the problem is appended to
            ``self.errors``

        :raises ValueError: if ``quote_attr_values`` is not one of
            ``"always"``, ``"spec"`` or ``"legacy"``
        """
        # pylint:disable=too-many-nested-blocks
        self.encoding = encoding
        in_cdata = False
        self.errors = []

        if encoding and self.inject_meta_charset:
            from .filters.inject_meta_charset import Filter
            treewalker = Filter(treewalker, encoding)
        # Alphabetical attributes is here under the assumption that none of
        # the later filters add or change order of attributes; it needs to be
        # before the sanitizer so escaped elements come out correctly
        if self.alphabetical_attributes:
            from .filters.alphabeticalattributes import Filter
            treewalker = Filter(treewalker)
        # WhitespaceFilter should be used before OptionalTagFilter
        # for maximum efficiency of this latter filter
        if self.strip_whitespace:
            from .filters.whitespace import Filter
            treewalker = Filter(treewalker)
        if self.sanitize:
            from .filters.sanitizer import Filter
            treewalker = Filter(treewalker)
        if self.omit_optional_tags:
            from .filters.optionaltags import Filter
            treewalker = Filter(treewalker)

        for token in treewalker:
            token_type = token["type"]
            if token_type == "Doctype":
                doctype = "<!DOCTYPE %s" % token["name"]

                if token["publicId"]:
                    doctype += ' PUBLIC "%s"' % token["publicId"]
                elif token["systemId"]:
                    doctype += " SYSTEM"
                if token["systemId"]:
                    system_id = token["systemId"]
                    # Quote with whichever quote character the identifier
                    # does not itself contain.
                    if '"' in system_id:
                        if "'" in system_id:
                            self.serializeError("System identifier contains both single and double quote characters")
                        quote_char = "'"
                    else:
                        quote_char = '"'
                    doctype += " %s%s%s" % (quote_char, system_id, quote_char)

                doctype += ">"
                yield self.encodeStrict(doctype)

            elif token_type in ("Characters", "SpaceCharacters"):
                if token_type == "SpaceCharacters" or in_cdata:
                    # Whitespace and rcdata content are emitted unescaped.
                    if in_cdata and token["data"].find("</") >= 0:
                        self.serializeError("Unexpected </ in CDATA")
                    yield self.encode(token["data"])
                else:
                    yield self.encode(escape(token["data"]))

            elif token_type in ("StartTag", "EmptyTag"):
                name = token["name"]
                yield self.encodeStrict("<%s" % name)
                if name in rcdataElements and not self.escape_rcdata:
                    in_cdata = True
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                for (_, attr_name), attr_value in token["data"].items():
                    # TODO: Add namespace support here
                    yield self.encodeStrict(' ')
                    yield self.encodeStrict(attr_name)

                    # A minimized boolean attribute is written as its bare
                    # name, with no "=value" part.
                    if self.minimize_boolean_attributes and (
                            attr_name in booleanAttributes.get(name, tuple()) or
                            attr_name in booleanAttributes.get("", tuple())):
                        continue

                    yield self.encodeStrict("=")
                    value = attr_value
                    # The quoting decision is made on the raw value, before
                    # any escaping is applied.
                    if self.quote_attr_values == "always" or len(value) == 0:
                        quote_attr = True
                    elif self.quote_attr_values == "spec":
                        quote_attr = _quoteAttributeSpec.search(value) is not None
                    elif self.quote_attr_values == "legacy":
                        quote_attr = _quoteAttributeLegacy.search(value) is not None
                    else:
                        raise ValueError("quote_attr_values must be one of: "
                                         "'always', 'spec', or 'legacy'")
                    value = value.replace("&", "&amp;")
                    if self.escape_lt_in_attrs:
                        value = value.replace("<", "&lt;")
                    if quote_attr:
                        quote_char = self.quote_char
                        if self.use_best_quote_char:
                            # Pick the quote character that avoids escaping
                            # when the value contains only one kind of quote.
                            if "'" in value and '"' not in value:
                                quote_char = '"'
                            elif '"' in value and "'" not in value:
                                quote_char = "'"
                        if quote_char == "'":
                            value = value.replace("'", "&#39;")
                        else:
                            value = value.replace('"', "&quot;")
                        yield self.encodeStrict(quote_char)
                        yield self.encode(value)
                        yield self.encodeStrict(quote_char)
                    else:
                        yield self.encode(value)
                if name in voidElements and self.use_trailing_solidus:
                    if self.space_before_trailing_solidus:
                        yield self.encodeStrict(" /")
                    else:
                        yield self.encodeStrict("/")
                yield self.encode(">")

            elif token_type == "EndTag":
                name = token["name"]
                if name in rcdataElements:
                    in_cdata = False
                elif in_cdata:
                    self.serializeError("Unexpected child element of a CDATA element")
                yield self.encodeStrict("</%s>" % name)

            elif token_type == "Comment":
                data = token["data"]
                if data.find("--") >= 0:
                    self.serializeError("Comment contains --")
                yield self.encodeStrict("<!--%s-->" % token["data"])

            elif token_type == "Entity":
                name = token["name"]
                key = name + ";"
                known = key in entities
                if not known:
                    self.serializeError("Entity %s not recognized" % name)
                # Only resolve entities we actually know.  In non-strict mode
                # an unrecognized entity has just been recorded as an error
                # and is emitted verbatim instead of failing on the lookup.
                if known and self.resolve_entities and key not in xmlEntities:
                    data = entities[key]
                else:
                    data = "&%s;" % name
                yield self.encodeStrict(data)

            else:
                # Unknown token type: its payload is reported as the error.
                self.serializeError(token["data"])

    def render(self, treewalker, encoding=None):
        """Serializes the stream from the treewalker into a string

        :arg treewalker: the treewalker to serialize

        :arg encoding: the string encoding to use

        :returns: the serialized tree (bytes when ``encoding`` is given,
            text otherwise)

        Example:

        >>> from html5lib import parse, getTreeWalker
        >>> from html5lib.serializer import HTMLSerializer
        >>> token_stream = parse('<html><body>Hi!</body></html>')
        >>> walker = getTreeWalker('etree')
        >>> serializer = HTMLSerializer(omit_optional_tags=False)
        >>> serializer.render(walker(token_stream))
        '<html><head></head><body>Hi!</body></html>'

        """
        if encoding:
            return b"".join(self.serialize(treewalker, encoding))
        else:
            return "".join(self.serialize(treewalker))

    def serializeError(self, data="XXX ERROR MESSAGE NEEDED"):
        """Record a serialization problem and, in strict mode, raise it.

        :arg data: description of the problem; appended to ``self.errors``

        :raises SerializeError: carrying ``data``, when ``self.strict`` is
            true
        """
        # XXX The idea is to make data mandatory.
        self.errors.append(data)
        if self.strict:
            raise SerializeError(data)

405 

406 

class SerializeError(Exception):
    """Raised when the tree being serialized contains an error."""