1from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
2from textwrap import fill
3import re
4import six
5
6
# General-purpose regex patterns
re_convert_heading = re.compile(r'convert_h(\d+)')
re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
re_whitespace = re.compile(r'[\t ]+')
re_all_whitespace = re.compile(r'[\t \r\n]+')
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
re_html_heading = re.compile(r'h(\d+)')

# Patterns for trimming the content of <pre> blocks.
# The "1" variants remove at most one line break at each end; the others
# remove every leading/trailing blank line.
re_pre_lstrip1 = re.compile(r'^ *\n')
# NOTE: \Z (not $) is required here. '$' also matches just before a final
# newline, so on text ending in '\n\n' the pattern would match twice and
# .sub() would remove two newlines instead of one.
re_pre_rstrip1 = re.compile(r'\n *\Z')
re_pre_lstrip = re.compile(r'^[ \n]*\n')
re_pre_rstrip = re.compile(r'[ \n]*$')

# Pattern for creating convert_<tag> function names from tag names
re_make_convert_fn_name = re.compile(r'[\[\]:-]')

# Extract (leading_nl, content, trailing_nl) from a string
# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)

# Escape miscellaneous special Markdown characters
re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])')

# Escape sequence of one or more consecutive '-', preceded
# and followed by whitespace or start/end of fragment, as it
# might be confused with an underline of a header, or with a
# list marker
re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))')

# Escape sequence of up to six consecutive '#', preceded
# and followed by whitespace or start/end of fragment, as
# it might be confused with an ATX heading
re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')

# Escape '.' or ')' preceded by up to nine digits, as it might be
# confused with a list item
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')

# Find consecutive backtick sequences in a string
re_backtick_runs = re.compile(r'`+')

# Heading styles
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
UNDERLINED = 'underlined'
SETEXT = UNDERLINED  # alias for UNDERLINED

# Newline style
SPACES = 'spaces'
BACKSLASH = 'backslash'

# Strong and emphasis style
ASTERISK = '*'
UNDERSCORE = '_'

# Document/pre strip styles
LSTRIP = 'lstrip'
RSTRIP = 'rstrip'
STRIP = 'strip'
STRIP_ONE = 'strip_one'
66
67
def strip1_pre(text):
    """
    Trim a <pre> string with the "strip one" patterns: first
    re_pre_lstrip1 is removed from the start, then re_pre_rstrip1
    from the end.
    """
    without_leading = re_pre_lstrip1.sub('', text)
    return re_pre_rstrip1.sub('', without_leading)
73
74
def strip_pre(text):
    """
    Trim a <pre> string with the "strip all" patterns: first
    re_pre_lstrip is removed from the start, then re_pre_rstrip
    from the end.
    """
    without_leading = re_pre_lstrip.sub('', text)
    return re_pre_rstrip.sub('', without_leading)
80
81
def chomp(text):
    """
    Split the text of an inline tag (b, a, em, ...) into
    (prefix, suffix, stripped_text).

    prefix/suffix are a single space when the text starts/ends with a space
    and '' otherwise, so callers can move that space outside the markup and
    avoid conversions like
    <b> foo</b> => ** foo**
    """
    lead = ' ' if text.startswith(' ') else ''
    trail = ' ' if text.endswith(' ') else ''
    return (lead, trail, text.strip())
93
94
def abstract_inline_conversion(markup_fn):
    """
    Build the converter method for a simple inline tag (b, em, del, ...).

    markup_fn receives the converter instance and returns the opening markup
    string; it is a callable so that it can read instance options such as
    strong_em_symbol. The closing markup is the same string, except that
    markup which looks like an HTML tag ('<x>') is closed with '</x>'.

    The returned method wraps the chomped text in that markup, returns the
    text untouched inside '_noformat' parents, and returns '' when the text
    is empty after chomping.
    """
    def implementation(self, el, text, parent_tags):
        opening = markup_fn(self)
        looks_like_html_tag = opening.startswith('<') and opening.endswith('>')
        closing = '</' + opening[1:] if looks_like_html_tag else opening

        if '_noformat' in parent_tags:
            return text

        lead, trail, body = chomp(text)
        if not body:
            return ''
        return ''.join((lead, opening, body, closing, trail))

    return implementation
116
117
def _todict(obj):
    """Return a dict of every attribute of obj whose name does not start with '_'."""
    return {
        name: getattr(obj, name)
        for name in dir(obj)
        if not name.startswith('_')
    }
120
121
def should_remove_whitespace_inside(el):
    """
    Return whether whitespace immediately inside el should be removed.

    True for headings (names matching re_html_heading) and for the
    block-level tags listed below; False for a missing element or one
    without a name.
    """
    if not el:
        return False
    tag_name = el.name
    if not tag_name:
        return False
    if re_html_heading.match(tag_name):
        return True
    return tag_name in (
        'p', 'blockquote',
        'article', 'div', 'section',
        'ol', 'ul', 'li',
        'dl', 'dt', 'dd',
        'table', 'thead', 'tbody', 'tfoot',
        'tr', 'td', 'th',
    )
134
135
def should_remove_whitespace_outside(el):
    """
    Return a truthy value when whitespace immediately outside el should be
    removed: for every element accepted by should_remove_whitespace_inside(),
    and additionally for <pre>.
    """
    if should_remove_whitespace_inside(el):
        return True
    # Falls through to el itself when el is falsy (e.g. None), which callers
    # only ever use in a boolean context.
    return el and el.name == 'pre'
139
140
def _is_block_content_element(el):
    """
    Classify a node as content or non-content in a block context.

    - True for tags and for text containing non-whitespace characters
    - False for whitespace-only text, comments, doctypes and anything else
      (including None)
    """
    if isinstance(el, Tag):
        return True
    # Comment and Doctype are subclasses of NavigableString, so they have to
    # be ruled out before the generic text check.
    if isinstance(el, (Comment, Doctype)):
        return False
    if not isinstance(el, NavigableString):
        return False
    return el.strip() != ''
156
157
def _prev_block_content_sibling(el):
    """Walk backwards through el's siblings and return the nearest content element, or None."""
    candidate = el
    while candidate is not None:
        candidate = candidate.previous_sibling
        if _is_block_content_element(candidate):
            break
    # Either the content sibling we stopped on, or None once siblings ran out.
    return candidate
165
166
def _next_block_content_sibling(el):
    """Walk forwards through el's siblings and return the nearest content element, or None."""
    candidate = el
    while candidate is not None:
        candidate = candidate.next_sibling
        if _is_block_content_element(candidate):
            break
    # Either the content sibling we stopped on, or None once siblings ran out.
    return candidate
174
175
class MarkdownConverter(object):
    """
    Convert HTML, parsed with BeautifulSoup, into Markdown text.

    Every tag is rendered by a ``convert_<tag>`` method with the signature
    ``(el, text, parent_tags)``: ``el`` is the tag, ``text`` is the already
    converted content of its children, and ``parent_tags`` is the set of
    ancestor tag names plus the pseudo-tags ``'_inline'`` (inside a heading
    or table cell) and ``'_noformat'`` (inside pre/code/kbd/samp).

    Options are taken, in increasing priority, from ``DefaultOptions``, the
    nested ``Options`` class (which subclasses may override) and the keyword
    arguments passed to the constructor.
    """

    class DefaultOptions:
        autolinks = True
        bs4_options = 'html.parser'
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
        convert = None
        default_title = False
        escape_asterisks = True
        escape_underscores = True
        escape_misc = False
        heading_style = UNDERLINED
        keep_inline_images_in = []
        newline_style = SPACES
        strip = None
        strip_document = STRIP
        strip_pre = STRIP
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
        table_infer_header = False
        wrap = False
        wrap_width = 80

    class Options(DefaultOptions):
        pass

    def __init__(self, **options):
        """
        Build the effective options dictionary.

        Raises ValueError if both 'strip' and 'convert' are given.
        """
        # Create an options dictionary. Use DefaultOptions as a base so that
        # it doesn't have to be extended.
        self.options = _todict(self.DefaultOptions)
        self.options.update(_todict(self.Options))
        self.options.update(options)
        if self.options['strip'] is not None and self.options['convert'] is not None:
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

        # If a string or list is passed to bs4_options, assume it is a 'features' specification
        if not isinstance(self.options['bs4_options'], dict):
            self.options['bs4_options'] = {'features': self.options['bs4_options']}

        # Initialize the conversion function cache
        self.convert_fn_cache = {}

    def convert(self, html):
        """Parse html with BeautifulSoup (using bs4_options) and return its Markdown."""
        soup = BeautifulSoup(html, **self.options['bs4_options'])
        return self.convert_soup(soup)

    def convert_soup(self, soup):
        """Convert an already parsed BeautifulSoup tree to Markdown."""
        return self.process_tag(soup, parent_tags=set())

    def process_element(self, node, parent_tags=None):
        """Dispatch a node to process_text() or process_tag() depending on its type."""
        if isinstance(node, NavigableString):
            return self.process_text(node, parent_tags=parent_tags)
        else:
            return self.process_tag(node, parent_tags=parent_tags)

    def process_tag(self, node, parent_tags=None):
        """
        Convert a tag: convert its children, join the results (collapsing
        newlines at child boundaries outside <pre>), then apply the tag's own
        conversion function, if any.
        """
        # For the top-level element, initialize the parent context with an empty set.
        if parent_tags is None:
            parent_tags = set()

        # Collect child elements to process, ignoring whitespace-only text elements
        # adjacent to the inner/outer boundaries of block elements.
        should_remove_inside = should_remove_whitespace_inside(node)

        def _can_ignore(el):
            if isinstance(el, Tag):
                # Tags are always processed.
                return False
            elif isinstance(el, (Comment, Doctype)):
                # Comment and Doctype elements are always ignored.
                # (subclasses of NavigableString, must test first)
                return True
            elif isinstance(el, NavigableString):
                if six.text_type(el).strip() != '':
                    # Non-whitespace text nodes are always processed.
                    return False
                elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):
                    # Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
                    return True
                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
                    # Outside block elements (including <pre>), ignore adjacent whitespace elements.
                    return True
                else:
                    return False
            elif el is None:
                return True
            else:
                raise ValueError('Unexpected element type: %s' % type(el))

        children_to_convert = [el for el in node.children if not _can_ignore(el)]

        # Create a copy of this tag's parent context, then update it to include this tag
        # to propagate down into the children.
        parent_tags_for_children = set(parent_tags)
        parent_tags_for_children.add(node.name)

        # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
        if (
            re_html_heading.match(node.name) is not None  # headings
            or node.name in {'td', 'th'}  # table cells
        ):
            parent_tags_for_children.add('_inline')

        # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
        if node.name in {'pre', 'code', 'kbd', 'samp'}:
            parent_tags_for_children.add('_noformat')

        # Convert the children elements into a list of result strings.
        child_strings = [
            self.process_element(el, parent_tags=parent_tags_for_children)
            for el in children_to_convert
        ]

        # Remove empty string values.
        child_strings = [s for s in child_strings if s]

        # Collapse newlines at child element boundaries, if needed.
        if node.name == 'pre' or node.find_parent('pre'):
            # Inside <pre> blocks, do not collapse newlines.
            pass
        else:
            # Collapse newlines at child element boundaries.
            updated_child_strings = ['']  # so the first lookback works
            for child_string in child_strings:
                # Separate the leading/trailing newlines from the content.
                leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()

                # If the last child had trailing newlines and this child has leading newlines,
                # use the larger newline count, limited to 2.
                if updated_child_strings[-1] and leading_nl:
                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
                    num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
                    leading_nl = '\n' * num_newlines

                # Add the results to the updated child string list.
                updated_child_strings.extend([leading_nl, content, trailing_nl])

            child_strings = updated_child_strings

        # Join all child text strings into a single string.
        text = ''.join(child_strings)

        # apply this tag's final conversion function
        convert_fn = self.get_conv_fn_cached(node.name)
        if convert_fn is not None:
            text = convert_fn(node, text, parent_tags=parent_tags)

        return text

    def convert__document_(self, el, text, parent_tags):
        """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
        if self.options['strip_document'] == LSTRIP:
            text = text.lstrip('\n')  # remove leading separation newlines
        elif self.options['strip_document'] == RSTRIP:
            text = text.rstrip('\n')  # remove trailing separation newlines
        elif self.options['strip_document'] == STRIP:
            text = text.strip('\n')  # remove leading and trailing separation newlines
        elif self.options['strip_document'] is None:
            pass  # leave leading and trailing separation newlines as-is
        else:
            raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])

        return text

    def process_text(self, el, parent_tags=None):
        """
        Convert a text node: normalize whitespace (outside <pre>), escape
        Markdown characters (outside '_noformat' parents) and trim whitespace
        that touches a block-level boundary.
        """
        # For the top-level element, initialize the parent context with an empty set.
        if parent_tags is None:
            parent_tags = set()

        text = six.text_type(el) or ''

        # normalize whitespace if we're not inside a preformatted element
        if 'pre' not in parent_tags:
            if self.options['wrap']:
                text = re_all_whitespace.sub(' ', text)
            else:
                text = re_newline_whitespace.sub('\n', text)
                text = re_whitespace.sub(' ', text)

        # escape special characters if we're not inside a preformatted or code element
        if '_noformat' not in parent_tags:
            text = self.escape(text, parent_tags)

        # remove leading whitespace at the start or just after a
        # block-level element; remove trailing whitespace at the end
        # or just before a block-level element.
        if (should_remove_whitespace_outside(el.previous_sibling)
                or (should_remove_whitespace_inside(el.parent)
                    and not el.previous_sibling)):
            text = text.lstrip(' \t\r\n')
        if (should_remove_whitespace_outside(el.next_sibling)
                or (should_remove_whitespace_inside(el.parent)
                    and not el.next_sibling)):
            text = text.rstrip()

        return text

    def get_conv_fn_cached(self, tag_name):
        """Given a tag name, return the conversion function using the cache."""
        # If conversion function is not in cache, add it
        if tag_name not in self.convert_fn_cache:
            self.convert_fn_cache[tag_name] = self.get_conv_fn(tag_name)

        # Return the cached entry
        return self.convert_fn_cache[tag_name]

    def get_conv_fn(self, tag_name):
        """
        Given a tag name, find and return the conversion function.

        Returns None when the tag is excluded by the strip/convert options or
        when no conversion function exists for it.
        """
        tag_name = tag_name.lower()

        # Handle strip/convert exclusion options
        if not self.should_convert_tag(tag_name):
            return None

        # Look for an explicitly defined conversion function by tag name first
        convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
        convert_fn = getattr(self, convert_fn_name, None)
        if convert_fn:
            return convert_fn

        # If tag is any heading, handle with convert_hN() function
        match = re_html_heading.match(tag_name)
        if match:
            n = int(match.group(1))  # get value of N from <hN>
            return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)

        # No conversion function was found
        return None

    def should_convert_tag(self, tag):
        """Given a tag name, return whether to convert based on strip/convert options."""
        strip = self.options['strip']
        convert = self.options['convert']
        if strip is not None:
            return tag not in strip
        elif convert is not None:
            return tag in convert
        else:
            return True

    def escape(self, text, parent_tags):
        """Escape Markdown special characters in text according to the escape_* options."""
        if not text:
            return ''
        if self.options['escape_misc']:
            text = re_escape_misc_chars.sub(r'\\\1', text)
            text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)
            text = re_escape_misc_hashes.sub(r'\1\\\2', text)
            text = re_escape_misc_list_items.sub(r'\1\\\2', text)

        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text

    def underline(self, text, pad_char):
        """Return text followed by a line of pad_char of the same length, or '' for empty text."""
        text = (text or '').rstrip()
        return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

    def convert_a(self, el, text, parent_tags):
        """Render a link as [text](href "title"), or as <href> when autolinks applies."""
        if '_noformat' in parent_tags:
            return text
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')
        # For the replacement see #29: text nodes underscores are escaped
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax
            return '<%s>' % href
        if self.options['default_title'] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

    def convert_blockquote(self, el, text, parent_tags):
        """Prefix every line of the quote with '> ' ('>' for empty lines)."""
        # handle some early-exit scenarios
        text = (text or '').strip(' \t\r\n')
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return "\n"

        # indent lines with blockquote marker
        def _indent_for_blockquote(match):
            line_content = match.group(1)
            return '> ' + line_content if line_content else '>'
        text = re_line_with_content.sub(_indent_for_blockquote, text)

        return '\n' + text + '\n\n'

    def convert_br(self, el, text, parent_tags):
        """Render a line break in the configured newline_style (a space in inline context)."""
        if '_inline' in parent_tags:
            return ' '

        if self.options['newline_style'].lower() == BACKSLASH:
            return '\\\n'
        else:
            return '  \n'

    def convert_code(self, el, text, parent_tags):
        """Render an inline code span, using a delimiter longer than any backtick run in text."""
        if '_noformat' in parent_tags:
            return text

        prefix, suffix, text = chomp(text)
        if not text:
            return ''

        # Find the maximum number of consecutive backticks in the text, then
        # delimit the code span with one more backtick than that
        max_backticks = max((len(run) for run in re_backtick_runs.findall(text)), default=0)
        markup_delimiter = '`' * (max_backticks + 1)

        # If the maximum number of backticks is greater than zero, add a space
        # to avoid interpretation of inside backticks as literals
        if max_backticks > 0:
            text = " " + text + " "

        return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)

    convert_del = abstract_inline_conversion(lambda self: '~~')

    def convert_div(self, el, text, parent_tags):
        """Render a generic block container as its content surrounded by blank lines."""
        if '_inline' in parent_tags:
            return ' ' + text.strip() + ' '
        text = text.strip()
        return '\n\n%s\n\n' % text if text else ''

    convert_article = convert_div

    convert_section = convert_div

    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

    convert_kbd = convert_code

    def convert_dd(self, el, text, parent_tags):
        """Render a definition: content indented by four spaces with ':' as the first character."""
        text = (text or '').strip()
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return '\n'

        # indent definition content lines by four spaces
        def _indent_for_dd(match):
            line_content = match.group(1)
            return '    ' + line_content if line_content else ''
        text = re_line_with_content.sub(_indent_for_dd, text)

        # insert definition marker into first-line indent whitespace
        text = ':' + text[1:]

        return '%s\n' % text

    # definition lists are formatted as follows:
    #   https://pandoc.org/MANUAL.html#definition-lists
    #   https://michelf.ca/projects/php-markdown/extra/#def-list
    convert_dl = convert_div

    def convert_dt(self, el, text, parent_tags):
        """Render a definition term on its own line with all whitespace runs collapsed."""
        # remove newlines from term text
        text = (text or '').strip()
        text = re_all_whitespace.sub(' ', text)
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return '\n'

        # TODO - format consecutive <dt> elements as directly adjacent lines):
        #   https://michelf.ca/projects/php-markdown/extra/#def-list

        return '\n\n%s\n' % text

    def convert_hN(self, n, el, text, parent_tags):
        """Render a heading of level n in the configured heading_style."""
        # convert_hN() converts <hN> tags, where N is any integer
        if '_inline' in parent_tags:
            return text

        # Markdown does not support heading depths of n > 6
        n = max(1, min(6, n))

        style = self.options['heading_style'].lower()
        text = text.strip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
        text = re_all_whitespace.sub(' ', text)
        hashes = '#' * n
        if style == ATX_CLOSED:
            return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
        return '\n\n%s %s\n\n' % (hashes, text)

    def convert_hr(self, el, text, parent_tags):
        """Render a horizontal rule."""
        return '\n\n---\n\n'

    convert_i = convert_em

    def convert_img(self, el, text, parent_tags):
        """
        Render an image as ![alt](src "title").

        In inline context (headings, table cells) only the alt text is
        returned, unless the parent tag is listed in keep_inline_images_in.
        """
        alt = el.attrs.get('alt', None) or ''
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        if ('_inline' in parent_tags
                and el.parent.name not in self.options['keep_inline_images_in']):
            return alt

        return '![%s](%s%s)' % (alt, src, title_part)

    def convert_video(self, el, text, parent_tags):
        """
        Render a video as a link to its source, using the poster image as the
        link content when one is available.

        The source is the 'src' attribute or, failing that, the first nested
        <source src=...>. With neither a source nor a poster the text is
        returned unchanged.
        """
        if ('_inline' in parent_tags
                and el.parent.name not in self.options['keep_inline_images_in']):
            return text
        src = el.attrs.get('src', None) or ''
        if not src:
            sources = el.find_all('source', attrs={'src': True})
            if sources:
                src = sources[0].attrs.get('src', None) or ''
        poster = el.attrs.get('poster', None) or ''
        if src and poster:
            # poster image that links to the video
            return '[![%s](%s)](%s)' % (text, poster, src)
        if src:
            return '[%s](%s)' % (text, src)
        if poster:
            return '![%s](%s)' % (text, poster)
        return text

    def convert_list(self, el, text, parent_tags):
        """Render a <ul>/<ol>; nested lists are attached to their parent item without a blank line."""

        # Converting a list to inline is undefined.
        # Ignoring inline conversion parents for list.

        before_paragraph = False
        next_sibling = _next_block_content_sibling(el)
        if next_sibling and next_sibling.name not in ['ul', 'ol']:
            before_paragraph = True
        if 'li' in parent_tags:
            # remove trailing newline if we're in a nested list
            return '\n' + text.rstrip()
        return '\n\n' + text + ('\n' if before_paragraph else '')

    convert_ul = convert_list
    convert_ol = convert_list

    def convert_li(self, el, text, parent_tags):
        """
        Render a list item: a number (honouring the parent's 'start'
        attribute) inside <ol>, otherwise a bullet chosen by <ul> nesting
        depth, with continuation lines indented to the bullet width.
        """
        # handle some early-exit scenarios
        text = (text or '').strip()
        if not text:
            return "\n"

        # determine list item bullet character to use
        parent = el.parent
        if parent is not None and parent.name == 'ol':
            start = 1
            start_attr = parent.get("start")
            if start_attr and str(start_attr).isnumeric():
                try:
                    start = int(start_attr)
                except ValueError:
                    # isnumeric() also accepts characters such as '½' or '²'
                    # that int() cannot parse; keep the default numbering.
                    start = 1
            bullet = '%s.' % (start + len(el.find_previous_siblings('li')))
        else:
            # count the enclosing <ul> elements to pick the bullet for this depth
            depth = -1
            ancestor = el
            while ancestor:
                if ancestor.name == 'ul':
                    depth += 1
                ancestor = ancestor.parent
            bullets = self.options['bullets']
            bullet = bullets[depth % len(bullets)]
        bullet = bullet + ' '
        bullet_width = len(bullet)
        bullet_indent = ' ' * bullet_width

        # indent content lines by bullet width
        def _indent_for_li(match):
            line_content = match.group(1)
            return bullet_indent + line_content if line_content else ''
        text = re_line_with_content.sub(_indent_for_li, text)

        # insert bullet into first-line indent whitespace
        text = bullet + text[bullet_width:]

        return '%s\n' % text

    def convert_p(self, el, text, parent_tags):
        """Render a paragraph surrounded by blank lines, re-wrapped when the wrap option is set."""
        if '_inline' in parent_tags:
            return ' ' + text.strip(' \t\r\n') + ' '
        text = text.strip(' \t\r\n')
        if self.options['wrap']:
            # Preserve newlines (and preceding whitespace) resulting
            # from <br> tags.  Newlines in the input have already been
            # replaced by spaces.
            if self.options['wrap_width'] is not None:
                lines = text.split('\n')
                new_lines = []
                for line in lines:
                    line = line.lstrip(' \t\r\n')
                    line_no_trailing = line.rstrip()
                    # fill() drops trailing whitespace, so re-attach it afterwards
                    trailing = line[len(line_no_trailing):]
                    line = fill(line,
                                width=self.options['wrap_width'],
                                break_long_words=False,
                                break_on_hyphens=False)
                    new_lines.append(line + trailing)
                text = '\n'.join(new_lines)
        return '\n\n%s\n\n' % text if text else ''

    def convert_pre(self, el, text, parent_tags):
        """
        Render a fenced code block.

        The language comes from code_language_callback(el) when it returns a
        truthy value, else from code_language. Raises ValueError for an
        unknown strip_pre value.
        """
        if not text:
            return ''
        code_language = self.options['code_language']

        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language

        if self.options['strip_pre'] == STRIP:
            text = strip_pre(text)  # remove all leading/trailing newlines
        elif self.options['strip_pre'] == STRIP_ONE:
            text = strip1_pre(text)  # remove one leading/trailing newline
        elif self.options['strip_pre'] is None:
            pass  # leave leading and trailing newlines as-is
        else:
            raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])

        return '\n\n```%s\n%s\n```\n\n' % (code_language, text)

    def convert_q(self, el, text, parent_tags):
        """Render an inline quotation wrapped in double quotes."""
        return '"' + text + '"'

    def convert_script(self, el, text, parent_tags):
        """Drop <script> content entirely."""
        return ''

    def convert_style(self, el, text, parent_tags):
        """Drop <style> content entirely."""
        return ''

    convert_s = convert_del

    convert_strong = convert_b

    convert_samp = convert_code

    convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])

    convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

    def convert_table(self, el, text, parent_tags):
        """Render a table as its converted rows surrounded by blank lines."""
        return '\n\n' + text.strip() + '\n\n'

    def convert_caption(self, el, text, parent_tags):
        """Render a table caption followed by a blank line."""
        return text.strip() + '\n\n'

    def convert_figcaption(self, el, text, parent_tags):
        """Render a figure caption surrounded by blank lines."""
        return '\n\n' + text.strip() + '\n\n'

    @staticmethod
    def _cell_colspan(cell):
        """
        Return the column span of a table cell, clamped to the range 1..1000.

        A missing, non-digit or unparsable 'colspan' attribute counts as 1.
        """
        raw = cell.attrs.get('colspan')
        if raw is None or not raw.isdigit():
            return 1
        try:
            return max(1, min(1000, int(raw)))
        except ValueError:
            # isdigit() accepts characters such as '²' that int() rejects
            return 1

    def convert_td(self, el, text, parent_tags):
        """Render a table cell on one line, followed by one ' |' per spanned column."""
        colspan = self._cell_colspan(el)
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

    def convert_th(self, el, text, parent_tags):
        """Render a header cell on one line, followed by one ' |' per spanned column."""
        colspan = self._cell_colspan(el)
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

    def convert_tr(self, el, text, parent_tags):
        """
        Render a table row, adding the header separator line below a header
        row, or an empty header plus separator above the first row when the
        table has no header row.
        """
        cells = el.find_all(['td', 'th'])
        is_first_row = el.find_previous_sibling() is None
        is_headrow = (
            all(cell.name == 'th' for cell in cells)
            or (el.parent.name == 'thead'
                # avoid multiple tr in thead
                and len(el.parent.find_all('tr')) == 1)
        )
        is_head_row_missing = (
            (is_first_row and not el.parent.name == 'tbody')
            or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
        )
        overline = ''
        underline = ''
        # total number of columns covered by this row, honouring colspan
        full_colspan = sum(self._cell_colspan(cell) for cell in cells)
        if ((is_headrow
             or (is_head_row_missing
                 and self.options['table_infer_header']))
                and is_first_row):
            # first row and:
            # - is headline or
            # - headline is missing and header inference is enabled
            # print headline underline
            underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
        elif ((is_head_row_missing
               and not self.options['table_infer_header'])
              or (is_first_row
                  and (el.parent.name == 'table'
                       or (el.parent.name == 'tbody'
                           and not el.parent.find_previous_sibling())))):
            # headline is missing and header inference is disabled or:
            # first row, not headline, and:
            #  - the parent is table or
            #  - the parent is tbody at the beginning of a table.
            # print empty headline above this row
            overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n'
            overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
        return overline + '|' + text + '\n' + underline
791
792
def markdownify(html, **options):
    """Convert html to Markdown with a MarkdownConverter configured by the given options."""
    converter = MarkdownConverter(**options)
    return converter.convert(html)