Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/_vendor/html5lib/treewalkers/__init__.py: 10%

78 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-07-01 06:54 +0000

1"""A collection of modules for iterating through different kinds of 

2tree, generating tokens identical to those produced by the tokenizer 

3module. 

4 

5To create a tree walker for a new type of tree, you need to 

6implement a tree walker object (called TreeWalker by convention) that 

7implements a 'serialize' method which takes a tree as sole argument and 

8returns an iterator which generates tokens. 

9""" 

10 

11from __future__ import absolute_import, division, unicode_literals 

12 

13from .. import constants 

14from .._utils import default_etree 

15 

# Names exported by ``from <package> import *``.
__all__ = ["getTreeWalker", "pprint"]

# Maps a lower-cased tree type name to its TreeWalker class. Entries are
# added lazily by getTreeWalker; the "etree" type is deliberately never
# stored here because the etree submodule does its own caching.
treeWalkerCache = {}

19 

20 

def getTreeWalker(treeType, implementation=None, **kwargs):
    """Look up the TreeWalker class for one of the built-in tree types

    :arg str treeType: name of the tree type wanted; matched
        case-insensitively. Recognised values:

        * "dom": the xml.dom.minidom DOM implementation
        * "etree": generic walker for trees exposing an elementtree-like
          interface (known to work with ElementTree, cElementTree and
          lxml.etree)
        * "lxml": optimized walker for lxml.etree
        * "genshi": a Genshi stream

    :arg implementation: module implementing the tree type, e.g.
        xml.etree.ElementTree or cElementTree. Only consulted for the
        "etree" tree type; when omitted, ``default_etree`` is used.

    :arg kwargs: keyword arguments forwarded to the etree walker factory;
        ignored for every other tree type

    :returns: a TreeWalker class, or ``None`` when ``treeType`` is not one
        of the recognised names

    """
    treeType = treeType.lower()

    # Fast path: the walker for this type has already been resolved.
    if treeType in treeWalkerCache:
        return treeWalkerCache[treeType]

    if treeType == "etree":
        from . import etree
        if implementation is None:
            implementation = default_etree
        # XXX: NEVER cache here, caching is done in the etree submodule
        return etree.getETreeModule(implementation, **kwargs).TreeWalker

    # The remaining walkers are imported on first use and then memoised.
    if treeType == "dom":
        from . import dom
        walkerClass = dom.TreeWalker
    elif treeType == "genshi":
        from . import genshi
        walkerClass = genshi.TreeWalker
    elif treeType == "lxml":
        from . import etree_lxml
        walkerClass = etree_lxml.TreeWalker
    else:
        # Unrecognised tree type: nothing to cache, nothing to return.
        return None

    treeWalkerCache[treeType] = walkerClass
    return walkerClass

63 

64 

def concatenateCharacterTokens(tokens):
    """Merge runs of adjacent character tokens into single tokens

    Consecutive "Characters" and "SpaceCharacters" tokens are collapsed
    into one "Characters" token whose data is the concatenation of the
    run. Every other token is passed through unchanged, in order.

    :arg tokens: an iterable of token dicts, each with a "type" key

    :returns: a generator of token dicts

    """
    buffered = []
    for tok in tokens:
        if tok["type"] in ("Characters", "SpaceCharacters"):
            # Part of a text run: hold it back until the run ends.
            buffered.append(tok["data"])
            continue
        if buffered:
            # A non-text token ends the run; flush the merged text first.
            yield {"type": "Characters", "data": "".join(buffered)}
            buffered = []
        yield tok
    # Flush a text run that reaches the end of the stream.
    if buffered:
        yield {"type": "Characters", "data": "".join(buffered)}

78 

79 

def pprint(walker):
    """Pretty printer for tree walkers

    Takes a TreeWalker instance and pretty prints the output of walking the tree.

    Adjacent character tokens are merged before printing. Each start or
    empty tag is written as ``<name>`` on its own line, followed by one
    line per attribute at a deeper indent; attributes are emitted in a
    deterministic order (by namespace, then local name). Nesting is shown
    with two spaces of indentation per level.

    :arg walker: a TreeWalker instance

    :returns: the rendered lines joined with newline characters

    :raises ValueError: if a token has an unrecognised "type"

    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        token_type = token["type"]
        if token_type in ("StartTag", "EmptyTag"):
            # tag name; elements outside the HTML namespace get a prefix
            if token["namespace"] and token["namespace"] != constants.namespaces["html"]:
                if token["namespace"] in constants.prefixes:
                    ns = constants.prefixes[token["namespace"]]
                else:
                    ns = token["namespace"]
                name = "%s %s" % (ns, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering)
            # Keys are (namespace, localname) pairs and the namespace may be
            # None. Python 3 cannot order None against str, so sorting the
            # raw items fails when namespaced and plain attributes are
            # mixed; a missing namespace is mapped to "" so it sorts first.
            attrs = token["data"]
            ordered = sorted(attrs.items(),
                             key=lambda item: (item[0][0] or "", item[0][1]))
            for (namespace, localname), value in ordered:
                if namespace:
                    if namespace in constants.prefixes:
                        ns = constants.prefixes[namespace]
                    else:
                        ns = namespace
                    name = "%s %s" % (ns, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing: no EndTag will follow, so undo the indent now
            if token_type == "EmptyTag":
                indent -= 2

        elif token_type == "EndTag":
            indent -= 2

        elif token_type == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif token_type == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] if token["systemId"] else ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif token_type == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif token_type == "SpaceCharacters":
            # Internal invariant: the merge step above never yields these.
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % token_type)

    return "\n".join(output)