Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/bleach/_vendor/html5lib/treewalkers/base.py: 82%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

102 statements  

1from __future__ import absolute_import, division, unicode_literals 

2 

3from xml.dom import Node 

4from ..constants import namespaces, voidElements, spaceCharacters 

5 

# Names exported by ``from <this module> import *``.
__all__ = [
    "DOCUMENT",
    "DOCTYPE",
    "TEXT",
    "ELEMENT",
    "COMMENT",
    "ENTITY",
    "UNKNOWN",
    "TreeWalker",
    "NonRecursiveTreeWalker",
]

# Node-type tags compared against the first item returned by
# ``getNodeDetails``. They reuse the ``xml.dom.Node`` constants.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE

# Tag for node types that match none of the constants above.
UNKNOWN = "<#UNKNOWN#>"

# Join the imported whitespace characters into a single string so it can be
# handed straight to ``str.lstrip`` / ``str.rstrip`` in ``TreeWalker.text``.
spaceCharacters = "".join(spaceCharacters)

18 

19 

class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    This base class only provides the token-building helpers; subclasses must
    implement ``__iter__`` to actually traverse ``self.tree``.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        """Iterates over the tokens of the tree

        :raises NotImplementedError: always; subclasses must override this

        """
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token, optionally followed by an error token

        This is a generator: iterate over the result to get the tokens.

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializeError token
            because this tag shouldn't have children

        :returns: generator yielding the EmptyTag token and, when
            ``hasChildren`` is true, a SerializeError token after it

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates zero or more
        ``SpaceCharacters`` and ``Characters`` tokens, in document order:

        * ``''`` yields nothing
        * ``' '`` yields one ``SpaceCharacters`` token with data ``' '``
        * ``' abc '`` yields ``SpaceCharacters`` (``' '``), then
          ``Characters`` (``'abc'``), then ``SpaceCharacters`` (``' '``)

        Whitespace inside the text (between non-space characters) stays in
        the ``Characters`` token; only the leading and trailing runs are
        split off.

        :arg data: the text data

        :returns: generator of ``SpaceCharacters`` and ``Characters`` tokens

        """
        # Split off the leading whitespace run; its length is whatever
        # lstrip() removed.
        withoutLeading = data.lstrip(spaceCharacters)
        leading = data[:len(data) - len(withoutLeading)]
        if leading:
            yield {"type": "SpaceCharacters", "data": leading}

        # Whatever rstrip() removes from the remainder is the trailing run.
        # For whitespace-only input the remainder is empty, so nothing more
        # is yielded.
        middle = withoutLeading.rstrip(spaceCharacters)
        trailing = withoutLeading[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if trailing:
            yield {"type": "SpaceCharacters", "data": trailing}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name: stored under the token's ``name`` key

        :arg publicId: stored under the token's ``publicId`` key

        :arg systemId: stored under the token's ``systemId`` key

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types

        :arg nodeType: the unrecognized node type; any value is accepted

        :returns: SerializeError token naming the node type

        """
        # Use %-formatting rather than ``+`` so that a non-string node type
        # (e.g. an integer) produces an error token instead of a TypeError.
        # The 1-tuple keeps tuple-valued node types from being unpacked.
        return self.error("Unknown node type: %s" % (nodeType,))

178 

179 

class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses the tree with loops instead of recursion

    Subclasses supply the four navigation hooks below; ``__iter__`` uses them
    to visit nodes depth-first and yield the corresponding tokens.

    """

    def getNodeDetails(self, node):
        """Describes ``node``; must be implemented by subclasses

        ``__iter__`` expects a sequence whose first item is one of the
        node-type constants and whose remaining items are the arguments for
        that type (for ``ELEMENT``: namespace, name, attributes, hasChildren).

        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Returns the first child of ``node`` or ``None``; subclass hook"""
        raise NotImplementedError

    def getNextSibling(self, node):
        """Returns the next sibling of ``node`` or ``None``; subclass hook"""
        raise NotImplementedError

    def getParentNode(self, node):
        """Returns the parent of ``node``; subclass hook"""
        raise NotImplementedError

    def __iter__(self):
        """Yields the tokens for ``self.tree`` in document order"""
        node = self.tree
        while node is not None:
            info = self.getNodeDetails(node)
            kind, rest = info[0], info[1:]
            descend = False

            # Emit the opening token(s) for the node itself.
            if kind == DOCTYPE:
                yield self.doctype(*rest)

            elif kind == TEXT:
                for token in self.text(*rest):
                    yield token

            elif kind == ELEMENT:
                namespace, name, attributes, descend = rest
                isVoid = ((not namespace or namespace == namespaces["html"]) and
                          name in voidElements)
                if isVoid:
                    for token in self.emptyTag(namespace, name, attributes,
                                               descend):
                        yield token
                    # Never walk into a void element, even if it has children.
                    descend = False
                else:
                    yield self.startTag(namespace, name, attributes)

            elif kind == COMMENT:
                yield self.comment(rest[0])

            elif kind == ENTITY:
                yield self.entity(rest[0])

            elif kind == DOCUMENT:
                descend = True

            else:
                yield self.unknown(rest[0])

            # Go down into the first child when there is one.
            child = self.getFirstChild(node) if descend else None
            if child is not None:
                node = child
                continue

            # Otherwise climb: close each finished element, then move to the
            # next sibling, or to the parent when there is none. Stop once
            # the root of the walk has been closed.
            while node is not None:
                info = self.getNodeDetails(node)
                kind, rest = info[0], info[1:]
                if kind == ELEMENT:
                    namespace, name, _attributes, _hasChildren = rest
                    # Void HTML elements were emitted as EmptyTag and get no
                    # EndTag.
                    if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                        yield self.endTag(namespace, name)
                if self.tree is node:
                    node = None
                    break
                sibling = self.getNextSibling(node)
                if sibling is not None:
                    node = sibling
                    break
                node = self.getParentNode(node)