Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/bleach/_vendor/html5lib/treewalkers/__init__.py: 10%
78 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-07-01 06:54 +0000
1"""A collection of modules for iterating through different kinds of
2tree, generating tokens identical to those produced by the tokenizer
3module.
5To create a tree walker for a new type of tree, you need to
6implement a tree walker object (called TreeWalker by convention) that
7implements a 'serialize' method which takes a tree as sole argument and
8returns an iterator which generates tokens.
9"""
11from __future__ import absolute_import, division, unicode_literals
13from .. import constants
14from .._utils import default_etree
16__all__ = ["getTreeWalker", "pprint"]
# Maps a lower-cased tree type name to its TreeWalker class.  "etree" walkers
# are never stored here: caching for those is done in the etree submodule.
treeWalkerCache = {}


def getTreeWalker(treeType, implementation=None, **kwargs):
    """Look up the TreeWalker class for one of the built-in tree types

    :arg str treeType: name of the tree type (matched case-insensitively).
        Built-in values are:

        * "dom": The xml.dom.minidom DOM implementation
        * "etree": A generic walker for tree implementations exposing an
          elementtree-like interface (known to work with ElementTree,
          cElementTree and lxml.etree).
        * "lxml": Optimized walker for lxml.etree
        * "genshi": a Genshi stream

    :arg implementation: module implementing the tree type, e.g.
        xml.etree.ElementTree or cElementTree.  Only consulted for the
        "etree" tree type; defaults to ``default_etree`` when omitted.

    :arg kwargs: keyword arguments forwarded to the etree walker factory;
        ignored for every other tree type

    :returns: a TreeWalker class, or ``None`` when the tree type is unknown

    """
    kind = treeType.lower()

    # Anything already registered wins, whatever arguments were passed.
    if kind in treeWalkerCache:
        return treeWalkerCache[kind]

    if kind == "etree":
        from . import etree
        etreeModule = default_etree if implementation is None else implementation
        # XXX: NEVER cache here, caching is done in the etree submodule
        return etree.getETreeModule(etreeModule, **kwargs).TreeWalker

    # Walker submodules are imported lazily so that optional dependencies
    # are only needed when the corresponding tree type is requested.
    if kind == "dom":
        from . import dom as walkerModule
    elif kind == "genshi":
        from . import genshi as walkerModule
    elif kind == "lxml":
        from . import etree_lxml as walkerModule
    else:
        return None

    walkerClass = walkerModule.TreeWalker
    treeWalkerCache[kind] = walkerClass
    return walkerClass
def concatenateCharacterTokens(tokens):
    """Merge runs of adjacent character tokens into single tokens

    Consecutive "Characters" and "SpaceCharacters" tokens are replaced by one
    "Characters" token whose data is the concatenation of the run.  Every
    other token is passed through untouched.

    :arg tokens: an iterable of token dicts, each with a "type" key (and a
        "data" key for character tokens)

    :returns: a generator of token dicts

    """
    buffered = []
    for token in tokens:
        if token["type"] in ("Characters", "SpaceCharacters"):
            buffered.append(token["data"])
            continue
        # A non-character token ends the current run: flush it first.
        if buffered:
            yield {"type": "Characters", "data": "".join(buffered)}
            buffered = []
        yield token
    # Flush a run that reaches the end of the stream.
    if buffered:
        yield {"type": "Characters", "data": "".join(buffered)}
def _qualifiedName(namespace, localname):
    """Return "<namespace> <localname>" for a namespaced tag or attribute

    The namespace is replaced by its entry in ``constants.prefixes`` when
    there is one; otherwise it is used verbatim.
    """
    if namespace in constants.prefixes:
        namespace = constants.prefixes[namespace]
    return "%s %s" % (namespace, localname)


def _attributeSortKey(item):
    """Sort key for ``((namespace, localname), value)`` attribute items

    A ``None`` namespace is mapped to the empty string so that namespaced and
    un-namespaced attributes of one element can be ordered together: on
    Python 3, comparing ``None`` with a string raises ``TypeError``.
    """
    (namespace, localname), _ = item
    return (namespace or "", localname)


def pprint(walker):
    """Pretty printer for tree walkers

    Takes a TreeWalker instance and pretty prints the output of walking the tree.

    :arg walker: a TreeWalker instance

    :returns: the pretty-printed tree, one entry per line, joined with
        newlines

    :raises ValueError: if the walker yields a token of an unknown type

    """
    output = []
    indent = 0
    for token in concatenateCharacterTokens(walker):
        tokenType = token["type"]
        if tokenType in ("StartTag", "EmptyTag"):
            # tag name: the namespace is only shown for non-HTML elements
            namespace = token["namespace"]
            if namespace and namespace != constants.namespaces["html"]:
                name = _qualifiedName(namespace, token["name"])
            else:
                name = token["name"]
            output.append("%s<%s>" % (" " * indent, name))
            indent += 2
            # attributes (sorted for consistent ordering)
            attrs = token["data"]
            for (namespace, localname), value in sorted(attrs.items(),
                                                        key=_attributeSortKey):
                if namespace:
                    name = _qualifiedName(namespace, localname)
                else:
                    name = localname
                output.append("%s%s=\"%s\"" % (" " * indent, name, value))
            # self-closing: an empty tag has no matching EndTag token, so
            # undo the indent straight away
            if tokenType == "EmptyTag":
                indent -= 2

        elif tokenType == "EndTag":
            indent -= 2

        elif tokenType == "Comment":
            output.append("%s<!-- %s -->" % (" " * indent, token["data"]))

        elif tokenType == "Doctype":
            if token["name"]:
                if token["publicId"]:
                    output.append("""%s<!DOCTYPE %s "%s" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["publicId"],
                                   token["systemId"] if token["systemId"] else ""))
                elif token["systemId"]:
                    output.append("""%s<!DOCTYPE %s "" "%s">""" %
                                  (" " * indent,
                                   token["name"],
                                   token["systemId"]))
                else:
                    output.append("%s<!DOCTYPE %s>" % (" " * indent,
                                                       token["name"]))
            else:
                output.append("%s<!DOCTYPE >" % (" " * indent,))

        elif tokenType == "Characters":
            output.append("%s\"%s\"" % (" " * indent, token["data"]))

        elif tokenType == "SpaceCharacters":
            # Internal invariant, not input validation: the token stream has
            # already been through concatenateCharacterTokens.
            assert False, "concatenateCharacterTokens should have got rid of all Space tokens"

        else:
            raise ValueError("Unknown token type, %s" % tokenType)

    return "\n".join(output)