Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/formatter.py: 74%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

70 statements  

1from __future__ import annotations 

2from typing import Callable, Dict, Iterable, Optional, Set, Tuple, TYPE_CHECKING, Union 

3from typing_extensions import TypeAlias 

4from bs4.dammit import EntitySubstitution 

5 

6if TYPE_CHECKING: 

7 from bs4._typing import _AttributeValue 

8 

9 

class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.

    Formatters are passed in as the `formatter` argument to methods
    like `bs4.element.Tag.encode`. Most people won't need to
    think about formatters, and most people who need to think about
    them can pass in one of these predefined strings as `formatter`
    rather than making a new Formatter object:

    For HTML documents:
     * 'html' - HTML entity substitution for generic HTML documents. (default)
     * 'html5' - HTML entity substitution for HTML5 documents, as
                 well as some optimizations in the way tags are rendered.
     * 'html5-4.12' - The version of the 'html5' formatter used prior to
                      Beautiful Soup 4.13.0.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid HTML.
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    For XML documents:
     * 'html' - Entity substitution for XHTML documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid XML. (default)
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    """

    #: Constant name denoting HTML markup
    HTML: str = "html"

    #: Constant name denoting XML markup
    XML: str = "xml"

    #: Default values for the various constructor options when the
    #: markup language is HTML.
    HTML_DEFAULTS: Dict[str, Set[str]] = {
        "cdata_containing_tags": {"script", "style"},
    }

    language: Optional[str]  #: :meta private:
    entity_substitution: Optional[_EntitySubstitutionFunction]  #: :meta private:
    void_element_close_prefix: str  #: :meta private:
    cdata_containing_tags: Set[str]  #: :meta private:
    indent: str  #: :meta private:

    #: If this is set to true by the constructor, then attributes whose
    #: values are set to the empty string will be treated as HTML
    #: boolean attributes. (Attributes whose value is None are always
    #: rendered this way.)
    empty_attributes_are_booleans: bool

    def _default(
        self, language: str, value: Optional[Set[str]], kwarg: str
    ) -> Set[str]:
        """Pick the effective value for a set-valued constructor option.

        :param language: The markup language in use (compared against
            `Formatter.XML`).
        :param value: The value the caller supplied, or None if the
            caller did not supply one.
        :param kwarg: The name of the option, used as the key into
            `HTML_DEFAULTS`.
        :return: `value` itself if it is not None; an empty set when the
            language is XML; otherwise a copy of the HTML default.
        :raise KeyError: If the HTML default is needed and `kwarg` is
            not a key of `HTML_DEFAULTS`.
        """
        if value is not None:
            return value
        if language == self.XML:
            # When XML is the markup language in use, all of the
            # defaults are the empty set.
            return set()

        # Otherwise, it depends on what's in HTML_DEFAULTS. Hand back a
        # copy so that mutating one formatter's set cannot change the
        # class-level defaults shared by every other formatter.
        return set(self.HTML_DEFAULTS[kwarg])

    def __init__(
        self,
        language: Optional[str] = None,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Optional[Union[int, str]] = 1,
    ):
        r"""Constructor.

        :param language: This should be `Formatter.XML` if you are formatting
           XML markup and `Formatter.HTML` if you are formatting HTML markup.
           A false value (None or the empty string) means `Formatter.HTML`.

        :param entity_substitution: A function to call to replace special
           characters with XML/HTML entities. For examples, see
           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
        :param void_element_close_prefix: By default, void elements
           are represented as <tag/> (XML rules) rather than <tag>
           (HTML rules). To get <tag>, pass in the empty string.
        :param cdata_containing_tags: The set of tags that are defined
           as containing CDATA in this dialect. For example, in HTML,
           <script> and <style> tags are defined as containing CDATA,
           and their contents should not be formatted.
        :param empty_attributes_are_booleans: If this is set to true,
          then attributes whose values are set to the empty string
          will be treated as `HTML boolean
          attributes <https://dev.w3.org/html5/spec-LC/common-microsyntaxes.html#boolean-attributes>`_. (Attributes
          whose value is None are always rendered this way.)
        :param indent: If indent is a non-negative integer or string,
            then the contents of elements will be indented
            appropriately when pretty-printing. An indent level of 0,
            negative, or "" will only insert newlines. Using a
            positive integer indent indents that many spaces per
            level. If indent is a string (such as "\t"), that string
            is used to indent each level. None is treated like 0, and
            a value of any other type falls back to a single space.
            The default behavior is to indent one space per level.

        """
        self.language = language or self.HTML
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            self.language, cdata_containing_tags, "cdata_containing_tags"
        )
        self.empty_attributes_are_booleans = empty_attributes_are_booleans

        # Normalize `indent` to the literal string emitted per level.
        if indent is None:
            indent = 0
        indent_str: str
        if isinstance(indent, int):
            # A negative count is clamped to zero (no indentation).
            indent_str = " " * max(indent, 0)
        elif isinstance(indent, str):
            indent_str = indent
        else:
            # Unsupported type: fall back to one space per level.
            indent_str = " "
        self.indent = indent_str

    def substitute(self, ns: str) -> str:
        """Process a string that needs to undergo entity substitution.
        This may be a string encountered in an attribute value or as
        text.

        :param ns: A string.
        :return: The same string but with certain characters replaced by named
           or numeric entities. The string is returned unchanged when no
           substitution function is configured, or when it is a
           NavigableString whose parent tag is listed in
           `cdata_containing_tags`.
        """
        if not self.entity_substitution:
            return ns
        # NOTE(review): imported locally, presumably to avoid a circular
        # import with bs4.element -- confirm before moving to module level.
        from .element import NavigableString

        if (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags
        ):
            # Do nothing.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value: str) -> str:
        """Process the value of an attribute.

        :param value: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        return self.substitute(value)

    def attributes(
        self, tag: bs4.element.Tag
    ) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
        """Reorder a tag's attributes however you want.

        By default, attributes are sorted alphabetically. This makes
        behavior consistent between Python 2 and Python 3, and preserves
        backwards compatibility with older versions of Beautiful Soup.

        If `empty_attributes_are_booleans` is True, then
        attributes whose values are set to the empty string will be
        treated as boolean attributes.

        :param tag: The tag whose `attrs` mapping should be rendered.
        :return: A list of (name, value) pairs sorted by name; a value of
           None marks an attribute to be treated as boolean. The list is
           empty when `tag.attrs` is None.
        """
        if tag.attrs is None:
            return []

        booleans = self.empty_attributes_are_booleans
        return sorted(
            (name, (None if booleans and value == "" else value))
            for name, value in tag.attrs.items()
        )

191 

192 

class HTMLFormatter(Formatter):
    """A generic Formatter for HTML."""

    #: Formatter instances keyed by the name (or None) used to look
    #: them up.
    REGISTRY: Dict[Optional[str], HTMLFormatter] = {}

    def __init__(
        self,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int, str] = 1,
    ):
        """Build a Formatter whose language is fixed to `Formatter.HTML`.

        Every argument is forwarded unchanged to `Formatter.__init__`.
        """
        super().__init__(
            language=self.HTML,
            entity_substitution=entity_substitution,
            void_element_close_prefix=void_element_close_prefix,
            cdata_containing_tags=cdata_containing_tags,
            empty_attributes_are_booleans=empty_attributes_are_booleans,
            indent=indent,
        )

214 

215 

class XMLFormatter(Formatter):
    """A generic Formatter for XML."""

    #: Formatter instances keyed by the name (or None) used to look
    #: them up.
    REGISTRY: Dict[Optional[str], XMLFormatter] = {}

    def __init__(
        self,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int, str] = 1,
    ):
        """Build a Formatter whose language is fixed to `Formatter.XML`.

        Every argument is forwarded unchanged to `Formatter.__init__`.
        """
        super().__init__(
            language=self.XML,
            entity_substitution=entity_substitution,
            void_element_close_prefix=void_element_close_prefix,
            cdata_containing_tags=cdata_containing_tags,
            empty_attributes_are_booleans=empty_attributes_are_booleans,
            indent=indent,
        )

237 

238 

# Set up aliases for the default formatters. Each registry maps a
# formatter name (or None) to a ready-made formatter instance.
HTMLFormatter.REGISTRY.update(
    {
        "html": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html
        ),
        "html5": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html5,
            void_element_close_prefix="",
            empty_attributes_are_booleans=True,
        ),
        # Same rendering options as "html5", but with the generic HTML
        # substitution function.
        "html5-4.12": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html,
            void_element_close_prefix="",
            empty_attributes_are_booleans=True,
        ),
        "minimal": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_xml
        ),
        None: HTMLFormatter(entity_substitution=None),
    }
)

XMLFormatter.REGISTRY.update(
    {
        "html": XMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html
        ),
        "minimal": XMLFormatter(
            entity_substitution=EntitySubstitution.substitute_xml
        ),
        None: XMLFormatter(entity_substitution=None),
    }
)

266 

# Define type aliases to improve readability.

#: A function to call to replace special characters with XML or HTML
#: entities.
_EntitySubstitutionFunction: TypeAlias = Callable[[str], str]

#: Many of the output-centered methods take an argument that can either
#: be a Formatter object or the name of a Formatter to be looked up.
_FormatterOrName: TypeAlias = Union[Formatter, str]