Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bs4/formatter.py: 72%

57 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1from bs4.dammit import EntitySubstitution 

2 

class Formatter(EntitySubstitution):
    """Describes a strategy to use when outputting a parse tree to a string.

    Some parts of this strategy come from the distinction between
    HTML4, HTML5, and XML. Others are configurable by the user.

    Formatters are passed in as the `formatter` argument to methods
    like `PageElement.encode`. Most people won't need to think about
    formatters, and most people who need to think about them can pass
    in one of these predefined strings as `formatter` rather than
    making a new Formatter object:

    For HTML documents:
     * 'html' - HTML entity substitution for generic HTML documents. (default)
     * 'html5' - HTML entity substitution for HTML5 documents, as
                 well as some optimizations in the way tags are rendered.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid HTML.
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.

    For XML documents:
     * 'html' - Entity substitution for XHTML documents.
     * 'minimal' - Only make the substitutions necessary to guarantee
                   valid XML. (default)
     * None - Do not perform any substitution. This will be faster
              but may result in invalid markup.
    """

    # Registries of XML and HTML formatters.
    XML_FORMATTERS = {}
    HTML_FORMATTERS = {}

    # Values accepted for the `language` constructor argument.
    HTML = 'html'
    XML = 'xml'

    # Fallback values for constructor arguments when formatting HTML,
    # keyed by the name of the keyword argument.
    HTML_DEFAULTS = {
        'cdata_containing_tags': {"script", "style"},
    }

    def _default(self, language, value, kwarg):
        """Pick the effective value for a constructor argument.

        :param language: The language the formatter is being built for.
        :param value: The value supplied by the caller, or None.
        :param kwarg: The name of the argument; used as the key into
            HTML_DEFAULTS.
        :return: `value` if it is not None; otherwise an empty set for
            XML, or the HTML_DEFAULTS entry for anything else. Note
            that the HTML default is the shared class-level object,
            not a copy.
        """
        if value is not None:
            return value
        if language == self.XML:
            return set()
        return self.HTML_DEFAULTS[kwarg]

    def __init__(
            self, language=None, entity_substitution=None,
            void_element_close_prefix='/', cdata_containing_tags=None,
            empty_attributes_are_booleans=False, indent=1,
    ):
        """Constructor.

        :param language: This should be Formatter.XML if you are formatting
           XML markup and Formatter.HTML if you are formatting HTML markup.

        :param entity_substitution: A function to call to replace special
           characters with XML/HTML entities. For examples, see
           bs4.dammit.EntitySubstitution.substitute_html and substitute_xml.
        :param void_element_close_prefix: By default, void elements
           are represented as <tag/> (XML rules) rather than <tag>
           (HTML rules). To get <tag>, pass in the empty string.
        :param cdata_containing_tags: The list of tags that are defined
           as containing CDATA in this dialect. For example, in HTML,
           <script> and <style> tags are defined as containing CDATA,
           and their contents should not be formatted.
        :param empty_attributes_are_booleans: Render attributes whose value
            is the empty string as HTML-style boolean attributes.
            (Attributes whose value is None are always rendered this way.)

        :param indent: If indent is a non-negative integer or string,
            then the contents of elements will be indented
            appropriately when pretty-printing. An indent level of 0,
            negative, or "" will only insert newlines. Using a
            positive integer indent indents that many spaces per
            level. If indent is a string (such as "\\t"), that string
            is used to indent each level. The default behavior is to
            indent one space per level.
        """
        self.language = language
        self.entity_substitution = entity_substitution
        self.void_element_close_prefix = void_element_close_prefix
        self.cdata_containing_tags = self._default(
            language, cdata_containing_tags, 'cdata_containing_tags'
        )
        self.empty_attributes_are_booleans = empty_attributes_are_booleans

        # Normalize `indent` to the literal string used for one level.
        if indent is None:
            indent = 0
        if isinstance(indent, int):
            # Negative counts are clamped to zero (newlines only).
            indent = ' ' * max(indent, 0)
        elif not isinstance(indent, str):
            # Anything that is neither an int nor a string falls back
            # to a single space per level.
            indent = ' '
        self.indent = indent

    def substitute(self, ns):
        """Process a string that needs to undergo entity substitution.
        This may be a string encountered in an attribute value or as
        text.

        :param ns: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        if not self.entity_substitution:
            # No substitution function configured: pass through as-is.
            return ns
        # Imported here rather than at module level.
        from .element import NavigableString
        if (isinstance(ns, NavigableString)
                and ns.parent is not None
                and ns.parent.name in self.cdata_containing_tags):
            # Text whose parent tag is listed in cdata_containing_tags
            # is returned untouched.
            return ns
        # Substitute.
        return self.entity_substitution(ns)

    def attribute_value(self, value):
        """Process the value of an attribute.

        :param value: A string.
        :return: A string with certain characters replaced by named
           or numeric entities.
        """
        return self.substitute(value)

    def attributes(self, tag):
        """Reorder a tag's attributes however you want.

        By default, attributes are sorted alphabetically. This makes
        behavior consistent between Python 2 and Python 3, and preserves
        backwards compatibility with older versions of Beautiful Soup.

        If `empty_attributes_are_booleans` is True, then attributes whose
        values are set to the empty string will be treated as boolean
        attributes.

        :param tag: An object whose `attrs` attribute is either None or
            a mapping of attribute names to values.
        :return: A sorted list of (name, value) 2-tuples; the value is
            None for attributes being treated as booleans. An empty
            list if `tag.attrs` is None.
        """
        if tag.attrs is None:
            return []
        return sorted(
            (k, (None if self.empty_attributes_are_booleans and v == '' else v))
            for k, v in tag.attrs.items()
        )

147 

class HTMLFormatter(Formatter):
    """A generic Formatter for HTML.

    Accepts the same arguments as `Formatter`, except that the
    language is always supplied as `Formatter.HTML` and must not be
    passed by the caller.
    """

    # HTML formatters that can be looked up by key.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a formatter whose language is fixed to HTML.

        All positional and keyword arguments are forwarded to
        `Formatter.__init__` after the language.
        """
        super().__init__(self.HTML, *args, **kwargs)

153 

154 

class XMLFormatter(Formatter):
    """A generic Formatter for XML.

    Accepts the same arguments as `Formatter`, except that the
    language is always supplied as `Formatter.XML` and must not be
    passed by the caller.
    """

    # XML formatters that can be looked up by key.
    REGISTRY = {}

    def __init__(self, *args, **kwargs):
        """Build a formatter whose language is fixed to XML.

        All positional and keyword arguments are forwarded to
        `Formatter.__init__` after the language.
        """
        super().__init__(self.XML, *args, **kwargs)

160 

161 

# Set up aliases for the default formatters.

# HTML: full HTML entity substitution.
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
# HTML5: HTML entity substitution, no closing-slash prefix on void
# elements, and empty-string attributes rendered as booleans.
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html,
    void_element_close_prefix=None,
    empty_attributes_are_booleans=True,
)
# HTML, minimal: uses the XML substitution function.
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
# HTML, no substitution at all.
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
    entity_substitution=None
)

# XML (XHTML): HTML entity substitution.
XMLFormatter.REGISTRY["html"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_html
)
# XML, minimal: uses the XML substitution function.
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
    entity_substitution=EntitySubstitution.substitute_xml
)
# XML, no substitution at all. Built with XMLFormatter, like the other
# XML entries, so that its `language` is Formatter.XML and its
# `cdata_containing_tags` is the (empty) XML default.
XMLFormatter.REGISTRY[None] = XMLFormatter(
    entity_substitution=None
)