Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bs4/formatter.py: 74%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1from __future__ import annotations
2from typing import Callable, Dict, Iterable, Optional, Set, Tuple, TYPE_CHECKING, Union
3from typing_extensions import TypeAlias
4from bs4.dammit import EntitySubstitution
6if TYPE_CHECKING:
7 from bs4._typing import _AttributeValue
class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.

    Formatters are passed in as the `formatter` argument to methods
    like `bs4.element.Tag.encode`. Most people won't need to
    think about formatters, and most people who need to think about
    them can pass in one of these predefined strings as `formatter`
    rather than making a new Formatter object:

    For HTML documents:
     * 'html' - HTML entity substitution for generic HTML documents. (default)
     * 'html5' - HTML entity substitution for HTML5 documents, as
                 well as some optimizations in the way tags are rendered.
     * 'html5-4.12' - The version of the 'html5' formatter used prior to
                      Beautiful Soup 4.13.0.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid HTML.
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    For XML documents:
     * 'html' - Entity substitution for XHTML documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid XML. (default)
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    """

    #: Constant name denoting HTML markup
    HTML: str = "html"

    #: Constant name denoting XML markup
    XML: str = "xml"

    #: Default values for the various constructor options when the
    #: markup language is HTML.
    HTML_DEFAULTS: Dict[str, Set[str]] = dict(
        cdata_containing_tags={"script", "style"},
    )

    language: Optional[str]  #: :meta private:
    entity_substitution: Optional[_EntitySubstitutionFunction]  #: :meta private:
    void_element_close_prefix: str  #: :meta private:
    cdata_containing_tags: Set[str]  #: :meta private:
    indent: str  #: :meta private:

    #: If this is set to true by the constructor, then attributes whose
    #: values are set to the empty string will be treated as HTML
    #: boolean attributes. (Attributes whose value is None are always
    #: rendered this way.)
    empty_attributes_are_booleans: bool

    def _default(
        self, language: str, value: Optional[Set[str]], kwarg: str
    ) -> Set[str]:
        """Resolve the effective value of a set-valued constructor option.

        :param language: The markup language in use; compared against
            `Formatter.XML`.
        :param value: The value supplied by the caller, or None to
            request the default.
        :param kwarg: The name of the constructor option, used as the
            key into `HTML_DEFAULTS`.
        :return: `value` itself if it is not None; an empty set when the
            language is XML; otherwise a copy of the HTML default.
        :raises KeyError: If a default is needed for HTML and `kwarg`
            has no entry in `HTML_DEFAULTS`.
        """
        if value is not None:
            return value
        if language == self.XML:
            # When XML is the markup language in use, all of the
            # defaults are the empty set.
            return set()

        # Otherwise, it depends on what's in HTML_DEFAULTS. Hand back a
        # copy so that mutating one formatter's set cannot alter the
        # class-level default shared by every other formatter.
        return set(self.HTML_DEFAULTS[kwarg])

    def __init__(
        self,
        language: Optional[str] = None,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int, str] = 1,
    ):
        r"""Constructor.

        :param language: This should be `Formatter.XML` if you are formatting
           XML markup and `Formatter.HTML` if you are formatting HTML markup.
           A false value (such as None) is treated as `Formatter.HTML`.

        :param entity_substitution: A function to call to replace special
           characters with XML/HTML entities. For examples, see
           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
        :param void_element_close_prefix: By default, void elements
           are represented as <tag/> (XML rules) rather than <tag>
           (HTML rules). To get <tag>, pass in the empty string.
        :param cdata_containing_tags: The set of tags that are defined
           as containing CDATA in this dialect. For example, in HTML,
           <script> and <style> tags are defined as containing CDATA,
           and their contents should not be formatted.
        :param empty_attributes_are_booleans: If this is set to true,
          then attributes whose values are set to the empty string
          will be treated as `HTML boolean
          attributes <https://dev.w3.org/html5/spec-LC/common-microsyntaxes.html#boolean-attributes>`_.
          (Attributes whose value is None are always rendered this way.)
        :param indent: If indent is a non-negative integer or string,
            then the contents of elements will be indented
            appropriately when pretty-printing. An indent level of 0,
            negative, or "" will only insert newlines. Using a
            positive integer indent indents that many spaces per
            level. If indent is a string (such as "\t"), that string
            is used to indent each level. The default behavior is to
            indent one space per level.

        """
        self.language = language or self.HTML
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            self.language, cdata_containing_tags, "cdata_containing_tags"
        )
        self.empty_attributes_are_booleans = empty_attributes_are_booleans

        # None is not part of the annotated type, but it is tolerated
        # and treated the same as an indent of 0.
        if indent is None:
            indent = 0
        indent_str: str
        if isinstance(indent, int):
            # Negative values are clamped to zero: newlines only.
            indent_str = " " * max(indent, 0)
        elif isinstance(indent, str):
            indent_str = indent
        else:
            # Anything else falls back to a single space per level.
            indent_str = " "
        self.indent = indent_str

    def substitute(self, ns: str) -> str:
        """Process a string that needs to undergo entity substitution.
        This may be a string encountered in an attribute value or as
        text.

        :param ns: A string.
        :return: The same string but with certain characters replaced by named
           or numeric entities. The string is returned unchanged if this
           formatter has no entity substitution function, or if it is a
           `NavigableString` whose parent tag is one of
           `cdata_containing_tags`.
        """
        if not self.entity_substitution:
            return ns
        # Imported here rather than at module level.
        from .element import NavigableString

        if (
            isinstance(ns, NavigableString)
            and ns.parent is not None
            and ns.parent.name in self.cdata_containing_tags
        ):
            # Do nothing: the contents of CDATA-containing tags are
            # output as-is.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value: str) -> str:
        """Process the value of an attribute.

        :param value: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        return self.substitute(value)

    def attributes(
        self, tag: bs4.element.Tag
    ) -> Iterable[Tuple[str, Optional[_AttributeValue]]]:
        """Reorder a tag's attributes however you want.

        By default, attributes are sorted alphabetically. This makes
        behavior consistent between Python 2 and Python 3, and preserves
        backwards compatibility with older versions of Beautiful Soup.

        If `empty_attributes_are_booleans` is True, then
        attributes whose values are set to the empty string will be
        treated as boolean attributes.

        :param tag: The tag whose attributes are to be output.
        :return: A list of (name, value) pairs sorted by name. The value
           is None for an attribute that should be rendered as a
           boolean attribute. The list is empty if `tag.attrs` is None.
        """
        if tag.attrs is None:
            return []

        booleans = self.empty_attributes_are_booleans
        return sorted(
            (name, None if booleans and value == "" else value)
            for name, value in tag.attrs.items()
        )
class HTMLFormatter(Formatter):
    """A generic Formatter for HTML.

    Accepts the same options as `Formatter`, except that the markup
    language is always `Formatter.HTML`.
    """

    #: Ready-made HTML formatters, keyed by name (or None).
    REGISTRY: Dict[Optional[str], HTMLFormatter] = {}

    def __init__(
        self,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int, str] = 1,
    ):
        """Constructor.

        Every argument is forwarded unchanged to `Formatter.__init__`,
        with the language fixed to `Formatter.HTML`.
        """
        super().__init__(
            language=self.HTML,
            entity_substitution=entity_substitution,
            void_element_close_prefix=void_element_close_prefix,
            cdata_containing_tags=cdata_containing_tags,
            empty_attributes_are_booleans=empty_attributes_are_booleans,
            indent=indent,
        )
class XMLFormatter(Formatter):
    """A generic Formatter for XML.

    Accepts the same options as `Formatter`, except that the markup
    language is always `Formatter.XML`.
    """

    #: Ready-made XML formatters, keyed by name (or None).
    REGISTRY: Dict[Optional[str], XMLFormatter] = {}

    def __init__(
        self,
        entity_substitution: Optional[_EntitySubstitutionFunction] = None,
        void_element_close_prefix: str = "/",
        cdata_containing_tags: Optional[Set[str]] = None,
        empty_attributes_are_booleans: bool = False,
        indent: Union[int, str] = 1,
    ):
        """Constructor.

        Every argument is forwarded unchanged to `Formatter.__init__`,
        with the language fixed to `Formatter.XML`.
        """
        super().__init__(
            language=self.XML,
            entity_substitution=entity_substitution,
            void_element_close_prefix=void_element_close_prefix,
            cdata_containing_tags=cdata_containing_tags,
            empty_attributes_are_booleans=empty_attributes_are_booleans,
            indent=indent,
        )
# Populate the registries with the predefined formatters, so that each
# one can be requested by name instead of by constructing an object.

# HTML output.
HTMLFormatter.REGISTRY.update(
    {
        "html": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html,
        ),
        "html5": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html5,
            void_element_close_prefix="",
            empty_attributes_are_booleans=True,
        ),
        # Same tag rendering as "html5", but with the generic HTML
        # entity substitution function.
        "html5-4.12": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html,
            void_element_close_prefix="",
            empty_attributes_are_booleans=True,
        ),
        "minimal": HTMLFormatter(
            entity_substitution=EntitySubstitution.substitute_xml,
        ),
        # No entity substitution at all.
        None: HTMLFormatter(entity_substitution=None),
    }
)

# XML output.
XMLFormatter.REGISTRY.update(
    {
        "html": XMLFormatter(
            entity_substitution=EntitySubstitution.substitute_html,
        ),
        "minimal": XMLFormatter(
            entity_substitution=EntitySubstitution.substitute_xml,
        ),
        # No entity substitution at all.
        None: XMLFormatter(entity_substitution=None),
    }
)
# Define type aliases to improve readability.
#
# These aliases are defined at the bottom of the module but are used in
# annotations further up. That works because the module starts with
# `from __future__ import annotations`, so annotations are not
# evaluated when the classes and functions are defined.

#: A function to call to replace special characters with XML or HTML
#: entities. It takes a string and returns a string.
_EntitySubstitutionFunction: TypeAlias = Callable[[str], str]

# Many of the output-centered methods take an argument that can either
# be a Formatter object or the name of a Formatter to be looked up.
_FormatterOrName = Union[Formatter, str]