1# Copyright 2012-2013 Amazon.com, Inc. or its affiliates. All Rights Reserved.
2#
3# Licensed under the Apache License, Version 2.0 (the "License"). You
4# may not use this file except in compliance with the License. A copy of
5# the License is located at
6#
7# http://aws.amazon.com/apache2.0/
8#
9# or in the "license" file accompanying this file. This file is
10# distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF
11# ANY KIND, either express or implied. See the License for the specific
12# language governing permissions and limitations under the License.
13from html.parser import HTMLParser
14from itertools import zip_longest
15
# Parent tags that take priority over some nested tags: when a TagNode's
# parent is one of these and the node's own tag is in OMIT_NESTED_TAGS, the
# node's start/end handlers are skipped and only its children are written.
PRIORITY_PARENT_TAGS = ('code', 'a')
# Tags whose own handlers are skipped when they sit directly inside a
# PRIORITY_PARENT_TAGS parent.
OMIT_NESTED_TAGS = ('span', 'i', 'code', 'a')
# Tags whose own handlers are skipped when they contain at least one nested
# TagNode; only their children are written in that case.
OMIT_SELF_TAGS = ('i', 'b')
# Tags treated as block-display HTML: TagNode.collapse_whitespace strips all
# leading and trailing whitespace inside them.
HTML_BLOCK_DISPLAY_TAGS = ('p', 'note', 'ul', 'li')
20
21
class DocStringParser(HTMLParser):
    """
    A simple HTML parser for the documentation strings of the JSON models.

    Parser events are recorded in an ``HTMLTree``. After every ``feed`` and
    ``close`` the tree is written to ``doc`` and replaced by a fresh, empty
    tree.
    """

    def __init__(self, doc):
        # ``HTMLParser.__init__`` calls ``reset``, which reads ``self.doc``,
        # so the attributes must exist before delegating to the base class.
        self.tree = None
        self.doc = doc
        super().__init__()

    def reset(self):
        """Reset the parser state and start over with an empty tree."""
        HTMLParser.reset(self)
        self.tree = HTMLTree(self.doc)

    def feed(self, data):
        """Parse ``data`` and immediately write the resulting tree."""
        super().feed(data)
        self._flush_tree()

    def close(self):
        """Finish parsing and write whatever is still buffered in the tree."""
        super().close()
        self._flush_tree()

    def _flush_tree(self):
        # Write the current tree to the document, then begin a new one.
        self.tree.write()
        self.tree = HTMLTree(self.doc)

    def handle_starttag(self, tag, attrs):
        self.tree.add_tag(tag, attrs=attrs, is_start=True)

    def handle_endtag(self, tag):
        self.tree.add_tag(tag, attrs=None, is_start=False)

    def handle_data(self, data):
        self.tree.add_data(data)
57
58
class HTMLTree:
    """
    A tree which handles HTML nodes. Designed to work with a python HTML
    parser, meaning that the current_node will be the most recently opened
    tag. When a tag is closed, the current_node moves up to the parent node.

    Tags for which the document style has no matching ``start_<tag>`` /
    ``end_<tag>`` handler are not added to the tree; their names are
    collected in ``unhandled_tags`` instead.
    """

    def __init__(self, doc):
        self.doc = doc
        self.head = StemNode()
        self.current_node = self.head
        self.unhandled_tags = []

    def add_tag(self, tag, attrs=None, is_start=True):
        """Record an opening or closing tag.

        :param tag: The tag name reported by the parser.
        :param attrs: Attributes of an opening tag; unused for closing tags.
        :param is_start: ``True`` for an opening tag, ``False`` for a
            closing tag.
        """
        if not self._doc_has_handler(tag, is_start):
            self.unhandled_tags.append(tag)
            return

        if is_start:
            node = TagNode(tag, attrs)
            self.current_node.add_child(node)
            self.current_node = node
        elif self.current_node.parent is not None:
            self.current_node = self.current_node.parent
        # Otherwise this is a closing tag with no open tag left to close
        # (malformed HTML such as a leading ``</p>``). Climbing above the
        # root would leave ``current_node`` as ``None`` and make every later
        # ``add_tag``/``add_data`` call fail, so the stray tag is ignored.

    def _doc_has_handler(self, tag, is_start):
        # Handlers are looked up by name on the document's style object.
        if is_start:
            handler_name = f'start_{tag}'
        else:
            handler_name = f'end_{tag}'

        return hasattr(self.doc.style, handler_name)

    def add_data(self, data):
        """Append ``data`` as a ``DataNode`` child of the current node."""
        self.current_node.add_child(DataNode(data))

    def write(self):
        """Write the whole tree, starting at the root, to the document."""
        self.head.write(self.doc)
97
98
class Node:
    """
    Common base class of the nodes stored in an ``HTMLTree``. It only keeps
    a reference to the parent node.
    """

    def __init__(self, parent=None):
        self.parent = parent

    def write(self, doc):
        """Write this node to ``doc``. Subclasses must override this."""
        raise NotImplementedError()
105
106
class StemNode(Node):
    """
    A node that holds an ordered list of child nodes.

    Used directly as the root of an ``HTMLTree`` and as the base class of
    ``TagNode``.
    """

    def __init__(self, parent=None):
        super().__init__(parent)
        self.children = []

    def add_child(self, child):
        """Append ``child`` and make this node its parent."""
        child.parent = self
        self.children.append(child)

    def write(self, doc):
        """Collapse whitespace in the subtree, then write all children."""
        self.collapse_whitespace()
        self._write_children(doc)

    def _write_children(self, doc):
        # Tag nodes additionally receive their following sibling (when there
        # is one) so they can look ahead while writing their end.
        for child, next_child in zip_longest(self.children, self.children[1:]):
            if isinstance(child, TagNode) and next_child is not None:
                child.write(doc, next_child)
            else:
                child.write(doc)

    def is_whitespace(self):
        """Return True if every child is whitespace (or there are none)."""
        return all(child.is_whitespace() for child in self.children)

    def startswith_whitespace(self):
        """Return True if the first child starts with whitespace."""
        # ``bool`` keeps the predicate from leaking the (empty) children
        # list to callers when there are no children.
        return bool(
            self.children and self.children[0].startswith_whitespace()
        )

    def endswith_whitespace(self):
        """Return True if the last child ends with whitespace."""
        return bool(
            self.children and self.children[-1].endswith_whitespace()
        )

    def lstrip(self):
        """Drop leading whitespace children, then lstrip the new first child."""
        total = len(self.children)
        drop = 0
        while drop < total and self.children[drop].is_whitespace():
            drop += 1
        if drop:
            # Slice once instead of once per dropped child.
            self.children = self.children[drop:]
        if self.children:
            self.children[0].lstrip()

    def rstrip(self):
        """Drop trailing whitespace children, then rstrip the new last child."""
        keep = len(self.children)
        while keep and self.children[keep - 1].is_whitespace():
            keep -= 1
        if keep != len(self.children):
            self.children = self.children[:keep]
        if self.children:
            self.children[-1].rstrip()

    def collapse_whitespace(self):
        """Remove collapsible white-space from HTML.

        HTML in docstrings often contains extraneous white-space around tags,
        for readability. Browsers would collapse this white-space before
        rendering. If not removed before conversion to RST where white-space is
        part of the syntax, for example for indentation, it can result in
        incorrect output.
        """
        self.lstrip()
        self.rstrip()
        for child in self.children:
            child.collapse_whitespace()
161
162
class TagNode(StemNode):
    """
    A generic node for a single HTML tag. Style handlers are looked up on the
    document by name and are only invoked when they exist.
    """

    def __init__(self, tag, attrs=None, parent=None):
        super().__init__(parent)
        self.tag = tag
        self.attrs = attrs

    def _has_nested_tags(self):
        # True as soon as one direct child is itself a TagNode.
        for child in self.children:
            if isinstance(child, TagNode):
                return True
        return False

    def write(self, doc, next_child=None):
        """Write this tag and its children to ``doc``.

        The tag's own start/end handlers are skipped, and only the children
        are written, when either the tag is in ``OMIT_SELF_TAGS`` and has
        nested tags, or the tag is in ``OMIT_NESTED_TAGS`` and its parent is
        a tag listed in ``PRIORITY_PARENT_TAGS``.

        :param next_child: The sibling following this node, if any. It is
            forwarded to the end handler of ``a`` tags.
        """
        defer_to_children = (
            self.tag in OMIT_SELF_TAGS and self._has_nested_tags()
        )
        defer_to_parent = (
            self.tag in OMIT_NESTED_TAGS
            and isinstance(self.parent, TagNode)
            and self.parent.tag in PRIORITY_PARENT_TAGS
        )
        omit_own_markup = defer_to_children or defer_to_parent

        if not omit_own_markup:
            self._write_start(doc)
        self._write_children(doc)
        if not omit_own_markup:
            self._write_end(doc, next_child)

    def collapse_whitespace(self):
        """Remove collapsible white-space.

        All tags collapse internal whitespace. Block-display HTML tags also
        strip all leading and trailing whitespace.

        Approximately follows the specification used in browsers:
        https://www.w3.org/TR/css-text-3/#white-space-rules
        https://developer.mozilla.org/en-US/docs/Web/API/Document_Object_Model/Whitespace
        """
        if self.tag in HTML_BLOCK_DISPLAY_TAGS:
            self.lstrip()
            self.rstrip()

        # The list of children itself is not modified by the two passes
        # below, so the neighbouring pairs can be computed once.
        neighbours = list(zip(self.children, self.children[1:]))

        # First pass: when a data node ends in whitespace and its right
        # neighbour starts with whitespace, strip the neighbour's leading
        # whitespace.
        for left, right in neighbours:
            if (
                isinstance(left, DataNode)
                and left.endswith_whitespace()
                and right.startswith_whitespace()
            ):
                right.lstrip()

        # Second pass, mirrored: when a data node starts with whitespace and
        # its left neighbour ends with whitespace, strip the neighbour's
        # trailing whitespace (e.g. ``<b>bar </b> <i>``).
        for left, right in neighbours:
            if (
                isinstance(right, DataNode)
                and left.endswith_whitespace()
                and right.startswith_whitespace()
            ):
                left.rstrip()

        # Finally descend into the children.
        for child in self.children:
            child.collapse_whitespace()

    def _write_start(self, doc):
        handler_name = f'start_{self.tag}'
        style = doc.style
        if not hasattr(style, handler_name):
            return
        getattr(style, handler_name)(self.attrs)

    def _write_end(self, doc, next_child):
        handler_name = f'end_{self.tag}'
        style = doc.style
        if not hasattr(style, handler_name):
            return
        handler = getattr(style, handler_name)
        # The link end handler receives the following sibling as lookahead
        # so it can decide whether a space is needed after the link.
        handler_args = (next_child,) if handler_name == 'end_a' else ()
        handler(*handler_args)
241
242
class DataNode(Node):
    """
    A Node that contains only string data.

    The text is stored as three parts: leading whitespace, the stripped
    text and trailing whitespace. Text consisting only of whitespace is
    stored entirely as trailing whitespace.
    """

    def __init__(self, data, parent=None):
        """
        :param data: The text of the node.
        :param parent: Optional parent node.
        :raises ValueError: If ``data`` is not a ``str``.
        """
        super().__init__(parent)
        if not isinstance(data, str):
            raise ValueError(f"Expecting string type, {type(data)} given.")
        self._leading_whitespace = ''
        self._trailing_whitespace = ''
        self._stripped_data = ''
        if data == '':
            return
        if data.isspace():
            self._trailing_whitespace = data
            return
        first_non_space = next(
            idx for idx, ch in enumerate(data) if not ch.isspace()
        )
        last_non_space = len(data) - next(
            idx for idx, ch in enumerate(reversed(data)) if not ch.isspace()
        )
        self._leading_whitespace = data[:first_non_space]
        self._trailing_whitespace = data[last_non_space:]
        self._stripped_data = data[first_non_space:last_non_space]

    @property
    def data(self):
        """The current text of the node, including remaining whitespace."""
        return (
            f'{self._leading_whitespace}{self._stripped_data}'
            f'{self._trailing_whitespace}'
        )

    def is_whitespace(self):
        """Return True if the node is non-empty and holds only whitespace."""
        return self._stripped_data == '' and (
            self._leading_whitespace != '' or self._trailing_whitespace != ''
        )

    def startswith_whitespace(self):
        return self._leading_whitespace != '' or (
            self._stripped_data == '' and self._trailing_whitespace != ''
        )

    def endswith_whitespace(self):
        return self._trailing_whitespace != '' or (
            self._stripped_data == '' and self._leading_whitespace != ''
        )

    def lstrip(self):
        """Remove the whitespace at the start of the node, if any."""
        if self._leading_whitespace != '':
            self._leading_whitespace = ''
        elif self._stripped_data == '':
            # With no text in between, the trailing whitespace is also the
            # start of the node. Clear it directly: delegating to ``rstrip``
            # (which delegates back here) would never terminate for a node
            # that has no whitespace left.
            self._trailing_whitespace = ''

    def rstrip(self):
        """Remove the whitespace at the end of the node, if any."""
        if self._trailing_whitespace != '':
            self._trailing_whitespace = ''
        elif self._stripped_data == '':
            # Mirror of ``lstrip``: clear directly instead of delegating.
            self._leading_whitespace = ''

    def collapse_whitespace(self):
        """Noop, ``DataNode.write`` always collapses whitespace"""
        return

    def write(self, doc):
        """Write the node's text to ``doc``.

        The stripped text is split into words, passed through
        ``doc.translate_words`` and re-joined with single spaces. Remaining
        leading/trailing whitespace is kept as is. Nothing is written when
        the result is empty.
        """
        words = doc.translate_words(self._stripped_data.split())
        str_data = (
            f'{self._leading_whitespace}{" ".join(words)}'
            f'{self._trailing_whitespace}'
        )
        if str_data != '':
            doc.handle_data(str_data)