# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/_vendor/html5lib/treewalkers/base.py: 82%
# 102 statements
# Report generated by coverage.py v7.3.1, created at 2023-09-25 06:27 +0000
1from __future__ import absolute_import, division, unicode_literals
3from xml.dom import Node
4from ..constants import namespaces, voidElements, spaceCharacters
# Names exported by ``from <this module> import *``.
__all__ = [
    "DOCUMENT",
    "DOCTYPE",
    "TEXT",
    "ELEMENT",
    "COMMENT",
    "ENTITY",
    "UNKNOWN",
    "TreeWalker",
    "NonRecursiveTreeWalker",
]

# Node-type identifiers, reusing the constants defined on ``xml.dom.Node``.
DOCUMENT = Node.DOCUMENT_NODE
DOCTYPE = Node.DOCUMENT_TYPE_NODE
TEXT = Node.TEXT_NODE
ELEMENT = Node.ELEMENT_NODE
COMMENT = Node.COMMENT_NODE
ENTITY = Node.ENTITY_NODE
# Sentinel for node types that match none of the constants above.
UNKNOWN = "<#UNKNOWN#>"

# Join the imported collection of whitespace characters into a single string
# so it can be handed directly to ``lstrip`` / ``rstrip``.
spaceCharacters = "".join(spaceCharacters)
class TreeWalker(object):
    """Walks a tree yielding tokens

    Tokens are dicts that all have a ``type`` field specifying the type of the
    token.

    """
    def __init__(self, tree):
        """Creates a TreeWalker

        :arg tree: the tree to walk

        """
        self.tree = tree

    def __iter__(self):
        """Iterates over the tokens of the tree

        Must be provided by subclasses; the base class always raises.

        :raises NotImplementedError: always

        """
        raise NotImplementedError

    def error(self, msg):
        """Generates an error token with the given message

        :arg msg: the error message

        :returns: SerializeError token

        """
        return {"type": "SerializeError", "data": msg}

    def emptyTag(self, namespace, name, attrs, hasChildren=False):
        """Generates an EmptyTag token

        This is a generator: it yields the EmptyTag token and, when
        ``hasChildren`` is true, a SerializeError token after it.

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :arg hasChildren: whether or not to yield a SerializeError because
            this tag shouldn't have children

        :returns: generator of an EmptyTag token optionally followed by a
            SerializeError token

        """
        yield {"type": "EmptyTag", "name": name,
               "namespace": namespace,
               "data": attrs}
        if hasChildren:
            yield self.error("Void element has children")

    def startTag(self, namespace, name, attrs):
        """Generates a StartTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :arg attrs: the attributes of the element as a dict

        :returns: StartTag token

        """
        return {"type": "StartTag",
                "name": name,
                "namespace": namespace,
                "data": attrs}

    def endTag(self, namespace, name):
        """Generates an EndTag token

        :arg namespace: the namespace of the token--can be ``None``

        :arg name: the name of the element

        :returns: EndTag token

        """
        return {"type": "EndTag",
                "name": name,
                "namespace": namespace}

    def text(self, data):
        """Generates SpaceCharacters and Characters tokens

        Depending on what's in the data, this generates one or more
        ``SpaceCharacters`` and ``Characters`` tokens.

        For example:

            >>> from html5lib.treewalkers.base import TreeWalker
            >>> # Give it an empty tree just so it instantiates
            >>> walker = TreeWalker([])
            >>> list(walker.text(''))
            []
            >>> list(walker.text(' '))
            [{u'data': ' ', u'type': u'SpaceCharacters'}]
            >>> list(walker.text(' abc '))  # doctest: +NORMALIZE_WHITESPACE
            [{u'data': ' ', u'type': u'SpaceCharacters'},
            {u'data': u'abc', u'type': u'Characters'},
            {u'data': u' ', u'type': u'SpaceCharacters'}]

        :arg data: the text data

        :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens

        """
        # Split off the leading run of space characters first.
        middle = data.lstrip(spaceCharacters)
        left = data[:len(data) - len(middle)]
        if left:
            yield {"type": "SpaceCharacters", "data": left}
        # Then separate the trailing run of space characters from the rest.
        data = middle
        middle = data.rstrip(spaceCharacters)
        right = data[len(middle):]
        if middle:
            yield {"type": "Characters", "data": middle}
        if right:
            yield {"type": "SpaceCharacters", "data": right}

    def comment(self, data):
        """Generates a Comment token

        :arg data: the comment

        :returns: Comment token

        """
        return {"type": "Comment", "data": data}

    def doctype(self, name, publicId=None, systemId=None):
        """Generates a Doctype token

        :arg name: stored under the ``name`` key of the token

        :arg publicId: stored under the ``publicId`` key of the token

        :arg systemId: stored under the ``systemId`` key of the token

        :returns: the Doctype token

        """
        return {"type": "Doctype",
                "name": name,
                "publicId": publicId,
                "systemId": systemId}

    def entity(self, name):
        """Generates an Entity token

        :arg name: the entity name

        :returns: an Entity token

        """
        return {"type": "Entity", "name": name}

    def unknown(self, nodeType):
        """Handles unknown node types

        :arg nodeType: the unrecognised node type; it does not have to be a
            string (e.g. it may be an integer node-type value)

        :returns: SerializeError token naming the node type

        """
        # Format with ``%s`` instead of ``+`` so that a non-string node type
        # produces an error token rather than raising TypeError.
        return self.error("Unknown node type: %s" % (nodeType,))
class NonRecursiveTreeWalker(TreeWalker):
    """Tree walker that traverses the tree with a loop instead of recursion

    Subclasses implement the four navigation hooks below; ``__iter__`` uses
    them to visit the tree depth-first (first child, then next sibling, then
    back up through the parent) and yields the corresponding tokens.

    """
    def getNodeDetails(self, node):
        """Describes ``node``

        As consumed by ``__iter__``, the result is a sequence whose first item
        is the node type and whose remaining items depend on that type:
        arguments for ``doctype``/``text``, ``(namespace, name, attributes,
        hasChildren)`` for elements, and a single value for comments,
        entities and unknown nodes.

        :arg node: the node to describe

        :raises NotImplementedError: always, in this base class

        """
        raise NotImplementedError

    def getFirstChild(self, node):
        """Returns the first child of ``node``, or ``None`` if there is none

        :arg node: the node whose first child is wanted

        :raises NotImplementedError: always, in this base class

        """
        raise NotImplementedError

    def getNextSibling(self, node):
        """Returns the next sibling of ``node``, or ``None`` if there is none

        :arg node: the node whose next sibling is wanted

        :raises NotImplementedError: always, in this base class

        """
        raise NotImplementedError

    def getParentNode(self, node):
        """Returns the parent of ``node``

        A ``None`` result ends the walk.

        :arg node: the node whose parent is wanted

        :raises NotImplementedError: always, in this base class

        """
        raise NotImplementedError

    def __iter__(self):
        """Yields the tokens for ``self.tree`` in document order"""
        node = self.tree
        while node is not None:
            info = self.getNodeDetails(node)
            kind, info = info[0], info[1:]
            descend = False

            # Emit the token(s) that open or represent the current node.
            if kind == DOCTYPE:
                yield self.doctype(*info)
            elif kind == TEXT:
                for token in self.text(*info):
                    yield token
            elif kind == ELEMENT:
                namespace, name, attributes, descend = info
                if (not namespace or namespace == namespaces["html"]) and name in voidElements:
                    for token in self.emptyTag(namespace, name, attributes, descend):
                        yield token
                    # Children of a void element are reported as an error by
                    # emptyTag and are not walked.
                    descend = False
                else:
                    yield self.startTag(namespace, name, attributes)
            elif kind == COMMENT:
                yield self.comment(info[0])
            elif kind == ENTITY:
                yield self.entity(info[0])
            elif kind == DOCUMENT:
                descend = True
            else:
                yield self.unknown(info[0])

            child = self.getFirstChild(node) if descend else None
            if child is not None:
                node = child
                continue

            # Nothing to descend into: close the current node, then move to
            # its next sibling, climbing through (and closing) ancestors
            # until a sibling is found or the root of the walk is reached.
            while node is not None:
                info = self.getNodeDetails(node)
                kind, info = info[0], info[1:]
                if kind == ELEMENT:
                    namespace, name, _attributes, _children = info
                    if (namespace and namespace != namespaces["html"]) or name not in voidElements:
                        yield self.endTag(namespace, name)
                if self.tree is node:
                    # Back at the starting node: the walk is complete.
                    node = None
                    break
                sibling = self.getNextSibling(node)
                if sibling is not None:
                    node = sibling
                    break
                node = self.getParentNode(node)