Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdownify/__init__.py: 96%
251 statements
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:04 +0000
« prev ^ index » next coverage.py v7.3.1, created at 2023-09-25 06:04 +0000
1from bs4 import BeautifulSoup, NavigableString, Comment, Doctype
2from textwrap import fill
3import re
4import six
# Pre-compiled patterns shared by the conversion helpers below.
convert_heading_re = re.compile(r'convert_h(\d+)')   # 'convert_h<N>' attribute names
line_beginning_re = re.compile(r'^', re.MULTILINE)   # start of every line
whitespace_re = re.compile(r'[\t ]+')                # runs of tabs/spaces
all_whitespace_re = re.compile(r'[\s]+')             # runs of any whitespace
html_heading_re = re.compile(r'h[1-6]')              # h1 .. h6 tag names


# Heading styles (SETEXT is an alias of UNDERLINED)
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
UNDERLINED = SETEXT = 'underlined'

# Newline style
SPACES = 'spaces'
BACKSLASH = 'backslash'

# Strong and emphasis style
ASTERISK = '*'
UNDERSCORE = '_'
def chomp(text):
    """
    Separate the outer spaces of an inline tag's text from its content.

    Returns a ``(prefix, suffix, text)`` tuple where ``text`` is the stripped
    input, ``prefix`` is ``' '`` if the input started with a space and
    ``suffix`` is ``' '`` if it ended with one (each is ``''`` otherwise).
    Callers wrap the stripped text in markup and keep the spaces outside of
    it, which prevents conversions like
    <b> foo</b> => ** foo**
    """
    if text:
        leading = ' ' if text[0] == ' ' else ''
        trailing = ' ' if text[-1] == ' ' else ''
    else:
        leading = trailing = ''
    return (leading, trailing, text.strip())
def abstract_inline_conversion(markup_fn):
    """
    Build a ``convert_<tag>`` method for a simple inline tag (b, em, del, ...).

    ``markup_fn`` receives the converter instance and returns the delimiter
    string; taking a function rather than a string lets the delimiter depend
    on instance options such as ``strong_em_symbol``. The returned method
    chomps the text and wraps it in a pair of that delimiter, keeping the
    chomped spaces outside. Empty text converts to ``''``.
    """
    def implementation(self, el, text, convert_as_inline):
        # The delimiter is looked up first, before the text is inspected.
        delimiter = markup_fn(self)
        leading, trailing, inner = chomp(text)
        if inner:
            return '%s%s%s%s%s' % (leading, delimiter, inner, delimiter, trailing)
        return ''
    return implementation
58def _todict(obj):
59 return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))
class MarkdownConverter(object):
    """
    Convert HTML (parsed with BeautifulSoup) into Markdown text.

    ``process_tag`` walks the tree depth-first: the children of a node are
    converted to text first, then that text is handed to the matching
    ``convert_<tag>`` method, if one exists and the tag is not filtered out
    by the ``strip``/``convert`` options. Heading converters
    (``convert_h<N>``) are created on demand by ``__getattr__``.

    Options are collected from ``DefaultOptions``, then ``Options`` (which
    subclasses may override), then the keyword arguments of ``__init__``.
    """

    class DefaultOptions:
        # Emit '<url>' when a link's text equals its href and it has no title.
        autolinks = True
        # Bullet characters for <ul> items, cycled by nesting depth.
        bullets = '*+-'  # An iterable of bullet types.
        # Language written after the opening code fence of a <pre>.
        code_language = ''
        # Optional callable taking the <pre> element; a truthy result
        # replaces code_language for that block.
        code_language_callback = None
        # If not None, only the tags listed here are converted.
        convert = None
        # Use the href as link title when the link has no title.
        default_title = False
        escape_asterisks = True
        escape_underscores = True
        # One of ATX, ATX_CLOSED, UNDERLINED (compared case-insensitively).
        heading_style = UNDERLINED
        # Parent tag names in which images stay images even when the
        # content is being converted inline.
        keep_inline_images_in = []
        # BACKSLASH emits '\\' + newline for <br>; anything else two spaces.
        newline_style = SPACES
        # If not None, the tags listed here are not converted.
        strip = None
        # Delimiter for <em>/<i>; doubled for <b>/<strong>.
        strong_em_symbol = ASTERISK
        # Delimiters wrapped around <sub> / <sup> content.
        sub_symbol = ''
        sup_symbol = ''
        # Re-wrap paragraphs with textwrap.fill at wrap_width columns.
        wrap = False
        wrap_width = 80

    class Options(DefaultOptions):
        pass

    def __init__(self, **options):
        """
        Build the effective options dictionary.

        Raises ValueError when both ``strip`` and ``convert`` are given,
        since they are mutually exclusive tag filters.
        """
        # Create an options dictionary. Use DefaultOptions as a base so that
        # it doesn't have to be extended.
        self.options = _todict(self.DefaultOptions)
        self.options.update(_todict(self.Options))
        self.options.update(options)
        if self.options['strip'] is not None and self.options['convert'] is not None:
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

    def convert(self, html):
        """Parse ``html`` with the 'html.parser' backend and return Markdown."""
        soup = BeautifulSoup(html, 'html.parser')
        return self.convert_soup(soup)

    def convert_soup(self, soup):
        """Convert an already parsed tree; only the children of ``soup`` are converted."""
        return self.process_tag(soup, convert_as_inline=False, children_only=True)

    def process_tag(self, node, convert_as_inline, children_only=False):
        """
        Convert ``node`` and everything below it to Markdown text.

        ``convert_as_inline`` is forwarded to the converter of ``node``;
        children of headings and table cells are always converted inline.
        With ``children_only`` the node's own ``convert_<tag>`` method is
        skipped and only the concatenated children text is returned.
        """
        text = ''

        # markdown headings or cells can't include
        # block elements (elements w/newlines)
        isHeading = html_heading_re.match(node.name) is not None
        isCell = node.name in ['td', 'th']
        convert_children_as_inline = convert_as_inline

        if not children_only and (isHeading or isCell):
            convert_children_as_inline = True

        # Remove whitespace-only textnodes in purely nested nodes
        def is_nested_node(el):
            return el and el.name in ['ol', 'ul', 'li',
                                      'table', 'thead', 'tbody', 'tfoot',
                                      'tr', 'td', 'th']

        if is_nested_node(node):
            # Iterate over a snapshot of the children: extract() removes the
            # element from its parent, and removing items from a sequence
            # while iterating it directly can skip the following sibling.
            for el in list(node.children):
                # Only extract (remove) whitespace-only text node if any of the
                # conditions is true:
                # - el is the first element in its parent
                # - el is the last element in its parent
                # - el is adjacent to an nested node
                can_extract = (not el.previous_sibling
                               or not el.next_sibling
                               or is_nested_node(el.previous_sibling)
                               or is_nested_node(el.next_sibling))
                if (isinstance(el, NavigableString)
                        and six.text_type(el).strip() == ''
                        and can_extract):
                    el.extract()

        # Convert the children first
        for el in node.children:
            if isinstance(el, Comment) or isinstance(el, Doctype):
                continue
            elif isinstance(el, NavigableString):
                text += self.process_text(el)
            else:
                text += self.process_tag(el, convert_children_as_inline)

        if not children_only:
            convert_fn = getattr(self, 'convert_%s' % node.name, None)
            if convert_fn and self.should_convert_tag(node.name):
                text = convert_fn(node, text, convert_as_inline)

        return text

    def process_text(self, el):
        """Return the Markdown text for a single text node."""
        text = six.text_type(el) or ''

        # dont remove any whitespace when handling pre or code in pre
        if not (el.parent.name == 'pre'
                or (el.parent.name == 'code'
                    and el.parent.parent.name == 'pre')):
            text = whitespace_re.sub(' ', text)

        # Text directly inside <code>/<pre> is emitted without escaping.
        if el.parent.name != 'code' and el.parent.name != 'pre':
            text = self.escape(text)

        # remove trailing whitespaces if any of the following condition is true:
        # - current text node is the last node in li
        # - current text node is followed by an embedded list
        if (el.parent.name == 'li'
                and (not el.next_sibling
                     or el.next_sibling.name in ['ul', 'ol'])):
            text = text.rstrip()

        return text

    def __getattr__(self, attr):
        """
        Create ``convert_h<N>`` methods on first use.

        The generated function delegates to ``convert_hn`` with the heading
        level bound, and is stored on the instance so later lookups bypass
        ``__getattr__``. Any other missing attribute raises AttributeError.
        """
        # Handle headings
        m = convert_heading_re.match(attr)
        if m:
            n = int(m.group(1))

            def convert_tag(el, text, convert_as_inline):
                return self.convert_hn(n, el, text, convert_as_inline)

            convert_tag.__name__ = 'convert_h%s' % n
            setattr(self, convert_tag.__name__, convert_tag)
            return convert_tag

        raise AttributeError(attr)

    def should_convert_tag(self, tag):
        """Apply the ``strip`` / ``convert`` filters to a (lower-cased) tag name."""
        tag = tag.lower()
        strip = self.options['strip']
        convert = self.options['convert']
        if strip is not None:
            return tag not in strip
        elif convert is not None:
            return tag in convert
        else:
            return True

    def escape(self, text):
        """Backslash-escape '*' and '_' according to the escape options."""
        if not text:
            return ''
        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text

    def indent(self, text, level):
        """Prefix every line of ``text`` with ``level`` tab characters."""
        return line_beginning_re.sub('\t' * level, text) if text else ''

    def underline(self, text, pad_char):
        """Return ``text`` followed by a line of ``pad_char`` of the same length."""
        text = (text or '').rstrip()
        return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

    def convert_a(self, el, text, convert_as_inline):
        """Convert a link; without an href only the link text is returned."""
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')
        # For the replacement see #29: text nodes underscores are escaped
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax
            return '<%s>' % href
        if self.options['default_title'] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

    def convert_blockquote(self, el, text, convert_as_inline):
        """Prefix every line with '> '; inline conversion returns the text as is."""

        if convert_as_inline:
            return text

        return '\n' + (line_beginning_re.sub('> ', text) + '\n\n') if text else ''

    def convert_br(self, el, text, convert_as_inline):
        """Emit a hard line break in the configured ``newline_style``."""
        if convert_as_inline:
            return ""

        if self.options['newline_style'].lower() == BACKSLASH:
            return '\\\n'
        else:
            return '  \n'

    def convert_code(self, el, text, convert_as_inline):
        """Wrap inline code in backticks; code directly inside <pre> is left untouched."""
        if el.parent.name == 'pre':
            return text
        converter = abstract_inline_conversion(lambda self: '`')
        return converter(self, el, text, convert_as_inline)

    convert_del = abstract_inline_conversion(lambda self: '~~')

    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

    convert_kbd = convert_code

    def convert_hn(self, n, el, text, convert_as_inline):
        """
        Convert a heading of level ``n``.

        With the UNDERLINED style, levels 1 and 2 are underlined with '=' or
        '-'; every other case uses ``n`` leading '#' characters, repeated
        after the text for ATX_CLOSED.
        """
        if convert_as_inline:
            return text

        style = self.options['heading_style'].lower()
        text = text.rstrip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
        hashes = '#' * n
        if style == ATX_CLOSED:
            return '%s %s %s\n\n' % (hashes, text, hashes)
        return '%s %s\n\n' % (hashes, text)

    def convert_hr(self, el, text, convert_as_inline):
        """Emit a thematic break surrounded by blank lines."""
        return '\n\n---\n\n'

    convert_i = convert_em

    def convert_img(self, el, text, convert_as_inline):
        """
        Convert an image to ``![alt](src "title")``.

        When converting inline, only the alt text is returned unless the
        parent tag is listed in ``keep_inline_images_in``.
        """
        alt = el.attrs.get('alt', None) or ''
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        if (convert_as_inline
                and el.parent.name not in self.options['keep_inline_images_in']):
            return alt

        # The template needs one placeholder per argument; an empty template
        # would raise TypeError from the % operator.
        return '![%s](%s%s)' % (alt, src, title_part)

    def convert_list(self, el, text, convert_as_inline):
        """Finish a <ul>/<ol>: indent it when nested inside an <li>."""

        # Converting a list to inline is undefined.
        # Ignoring convert_to_inline for list.

        nested = False
        before_paragraph = False
        if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
            before_paragraph = True
        # Walk up the ancestors looking for an enclosing list item.
        while el:
            if el.name == 'li':
                nested = True
                break
            el = el.parent
        if nested:
            # remove trailing newline if nested
            return '\n' + self.indent(text, 1).rstrip()
        return text + ('\n' if before_paragraph else '')

    convert_ul = convert_list
    convert_ol = convert_list

    def convert_li(self, el, text, convert_as_inline):
        """
        Convert a list item, choosing its bullet.

        Items of an <ol> are numbered from the list's ``start`` attribute
        (1 when it is missing or not an integer) plus the item's index in
        its parent. Other items use the ``bullets`` option, cycled by the
        number of enclosing <ul> elements.
        """
        parent = el.parent
        if parent is not None and parent.name == 'ol':
            start_attr = parent.get("start")
            try:
                start = int(start_attr) if start_attr else 1
            except (TypeError, ValueError):
                # A malformed start attribute must not abort the conversion.
                start = 1
            bullet = '%s.' % (start + parent.index(el))
        else:
            depth = -1
            while el:
                if el.name == 'ul':
                    depth += 1
                el = el.parent
            bullets = self.options['bullets']
            bullet = bullets[depth % len(bullets)]
        return '%s %s\n' % (bullet, (text or '').strip())

    def convert_p(self, el, text, convert_as_inline):
        """Convert a paragraph, optionally re-wrapping it at ``wrap_width``."""
        if convert_as_inline:
            return text
        if self.options['wrap']:
            text = fill(text,
                        width=self.options['wrap_width'],
                        break_long_words=False,
                        break_on_hyphens=False)
        return '%s\n\n' % text if text else ''

    def convert_pre(self, el, text, convert_as_inline):
        """Convert preformatted text to a fenced code block."""
        if not text:
            return ''
        code_language = self.options['code_language']

        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language

        return '\n```%s\n%s\n```\n' % (code_language, text)

    convert_s = convert_del

    convert_strong = convert_b

    convert_samp = convert_code

    convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])

    convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

    def convert_table(self, el, text, convert_as_inline):
        """Surround the converted rows with blank lines."""
        return '\n\n' + text + '\n'

    def convert_td(self, el, text, convert_as_inline):
        """Emit one cell; the row's leading '|' is added by convert_tr."""
        return ' ' + text + ' |'

    def convert_th(self, el, text, convert_as_inline):
        """Emit one header cell; the row's leading '|' is added by convert_tr."""
        return ' ' + text + ' |'

    def convert_tr(self, el, text, convert_as_inline):
        """
        Convert a table row, adding the header separator where needed.

        A first row made only of <th> cells is followed by a '---' separator
        line. A first row that is not a header row, at the start of the
        table, is preceded by an empty header row and the separator.
        """
        cells = el.find_all(['td', 'th'])
        is_headrow = all(cell.name == 'th' for cell in cells)
        overline = ''
        underline = ''
        if is_headrow and not el.previous_sibling:
            # first row and is headline: print headline underline
            underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        elif (not el.previous_sibling
              and (el.parent.name == 'table'
                   or (el.parent.name == 'tbody'
                       and not el.parent.previous_sibling))):
            # first row, not headline, and:
            # - the parent is table or
            # - the parent is tbody at the beginning of a table.
            # print empty headline above this row
            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
            overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        return overline + '|' + text + '\n' + underline
def markdownify(html, **options):
    """Convert ``html`` to Markdown using a MarkdownConverter built from ``options``."""
    converter = MarkdownConverter(**options)
    return converter.convert(html)