# Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdownify/__init__.py: 89%
# Shortcuts on this page (coverage report viewer):
#   r m x    toggle line displays
#   j k      next/prev highlighted chunk
#   0 (zero) top of page
#   1 (one)  first highlighted chunk
import re
from textwrap import fill

import six
from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
# General-purpose regex patterns
re_convert_heading = re.compile(r'convert_h(\d+)')
re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
re_whitespace = re.compile(r'[\t ]+')
re_all_whitespace = re.compile(r'[\t \r\n]+')
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
re_html_heading = re.compile(r'h(\d+)')
re_pre_lstrip1 = re.compile(r'^ *\n')
re_pre_rstrip1 = re.compile(r'\n *$')
re_pre_lstrip = re.compile(r'^[ \n]*\n')
re_pre_rstrip = re.compile(r'[ \n]*$')

# Pattern for creating convert_<tag> function names from tag names
re_make_convert_fn_name = re.compile(r'[\[\]:-]')

# Extract (leading_nl, content, trailing_nl) from a string
# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)

# Escape miscellaneous special Markdown characters
re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])')

# Escape sequence of one or more consecutive '-', preceded
# and followed by whitespace or start/end of fragment, as it
# might be confused with an underline of a header, or with a
# list marker
re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))')

# Escape sequence of up to six consecutive '#', preceded
# and followed by whitespace or start/end of fragment, as
# it might be confused with an ATX heading
re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')

# Escape '.' or ')' preceded by up to nine digits, as it might be
# confused with a list item
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')

# Find consecutive backtick sequences in a string
re_backtick_runs = re.compile(r'`+')

# Heading styles
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
UNDERLINED = 'underlined'
SETEXT = UNDERLINED

# Newline style
SPACES = 'spaces'
BACKSLASH = 'backslash'

# Strong and emphasis style
ASTERISK = '*'
UNDERSCORE = '_'

# Document/pre strip styles
LSTRIP = 'lstrip'
RSTRIP = 'rstrip'
STRIP = 'strip'
STRIP_ONE = 'strip_one'
def strip1_pre(text):
    """Strip one leading and one trailing newline from a <pre> string.

    Spaces before the leading newline and after the trailing newline are
    removed together with it (see re_pre_lstrip1 / re_pre_rstrip1).
    """
    text = re_pre_lstrip1.sub('', text)
    text = re_pre_rstrip1.sub('', text)
    return text
def strip_pre(text):
    """Strip all leading and trailing newlines from a <pre> string.

    Leading blank lines (spaces/newlines up to and including the last newline
    of the leading run) and all trailing spaces/newlines are removed.
    """
    text = re_pre_lstrip.sub('', text)
    text = re_pre_rstrip.sub('', text)
    return text
def chomp(text):
    """
    If the text in an inline tag like b, a, or em contains a leading or trailing
    space, strip the string and return a space as prefix or suffix, if needed.
    This function is used to prevent conversions like
        <b> foo</b> => ** foo**

    Returns a (prefix, suffix, stripped_text) tuple.
    """
    prefix = ' ' if text.startswith(' ') else ''
    suffix = ' ' if text.endswith(' ') else ''
    return (prefix, suffix, text.strip())
def abstract_inline_conversion(markup_fn):
    """
    This abstracts all simple inline tags like b, em, del, ...
    Returns a function that wraps the chomped text in a pair of the string
    that is returned by markup_fn, with '/' inserted in the string used after
    the text if it looks like an HTML tag. markup_fn is necessary to allow for
    references to self.strong_em_symbol etc.
    """
    def implementation(self, el, text, parent_tags):
        # Inside preformatted/code content no inline markup is emitted.
        if '_noformat' in parent_tags:
            return text

        markup_prefix = markup_fn(self)
        if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
            # HTML-style markup such as '<sub>' closes with '</sub>'
            markup_suffix = '</' + markup_prefix[1:]
        else:
            markup_suffix = markup_prefix

        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        return '%s%s%s%s%s' % (prefix, markup_prefix, text, markup_suffix, suffix)
    return implementation
118def _todict(obj):
119 return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))
def should_remove_whitespace_inside(el):
    """Return whether to remove whitespace immediately inside a block-level element."""
    if not el or not el.name:
        return False
    if re_html_heading.match(el.name) is not None:
        return True
    return el.name in ('p', 'blockquote',
                       'article', 'div', 'section',
                       'ol', 'ul', 'li',
                       'dl', 'dt', 'dd',
                       'table', 'thead', 'tbody', 'tfoot',
                       'tr', 'td', 'th')
def should_remove_whitespace_outside(el):
    """Return whether to remove whitespace immediately outside a block-level element.

    Same set of elements as should_remove_whitespace_inside(), plus <pre>.
    Always returns a bool (never None or the element itself).
    """
    return should_remove_whitespace_inside(el) or bool(el and el.name == 'pre')
def _is_block_content_element(el):
    """
    In a block context, returns:

    - True for content elements (tags and non-whitespace text)
    - False for non-content elements (whitespace text, comments, doctypes)
    """
    if isinstance(el, Tag):
        return True
    elif isinstance(el, (Comment, Doctype)):
        return False  # (subclasses of NavigableString, must test first)
    elif isinstance(el, NavigableString):
        return el.strip() != ''
    else:
        # None (no sibling) or any other object is not content
        return False
def _prev_block_content_sibling(el):
    """Returns the first previous sibling that is a content element, else None."""
    while el is not None:
        el = el.previous_sibling
        if _is_block_content_element(el):
            return el
    return None
def _next_block_content_sibling(el):
    """Returns the first next sibling that is a content element, else None."""
    while el is not None:
        el = el.next_sibling
        if _is_block_content_element(el):
            return el
    return None
class MarkdownConverter(object):
    """Convert HTML (parsed with BeautifulSoup) into Markdown text.

    Options are taken from DefaultOptions, overridden by the (subclassable)
    Options class, overridden by keyword arguments passed to __init__.
    Each tag is converted by a ``convert_<tag>(el, text, parent_tags)`` method,
    where ``text`` is the already-converted content of the tag's children and
    ``parent_tags`` is the set of enclosing tag names plus the pseudo-tags
    '_inline' (inside heading/table cell) and '_noformat' (inside pre/code/...).
    """

    class DefaultOptions:
        autolinks = True
        bs4_options = 'html.parser'
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
        convert = None
        default_title = False
        escape_asterisks = True
        escape_underscores = True
        escape_misc = False
        heading_style = UNDERLINED
        keep_inline_images_in = []
        newline_style = SPACES
        strip = None
        strip_document = STRIP
        strip_pre = STRIP
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
        table_infer_header = False
        wrap = False
        wrap_width = 80

    class Options(DefaultOptions):
        pass

    def __init__(self, **options):
        """Build the effective options dict.

        Raises ValueError if both 'strip' and 'convert' are given.
        """
        # Create an options dictionary. Use DefaultOptions as a base so that
        # it doesn't have to be extended.
        self.options = _todict(self.DefaultOptions)
        self.options.update(_todict(self.Options))
        self.options.update(options)
        if self.options['strip'] is not None and self.options['convert'] is not None:
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

        # If a string or list is passed to bs4_options, assume it is a 'features' specification
        if not isinstance(self.options['bs4_options'], dict):
            self.options['bs4_options'] = {'features': self.options['bs4_options']}

        # Initialize the conversion function cache
        self.convert_fn_cache = {}

    def convert(self, html):
        """Parse an HTML string with BeautifulSoup and return its Markdown."""
        soup = BeautifulSoup(html, **self.options['bs4_options'])
        return self.convert_soup(soup)

    def convert_soup(self, soup):
        """Convert an already-parsed BeautifulSoup object to Markdown."""
        return self.process_tag(soup, parent_tags=set())

    def process_element(self, node, parent_tags=None):
        """Dispatch a node to process_text() or process_tag() by its type."""
        if isinstance(node, NavigableString):
            return self.process_text(node, parent_tags=parent_tags)
        else:
            return self.process_tag(node, parent_tags=parent_tags)

    def process_tag(self, node, parent_tags=None):
        """Convert a tag: convert its children, join them, then apply convert_<tag>."""
        # For the top-level element, initialize the parent context with an empty set.
        if parent_tags is None:
            parent_tags = set()

        # Collect child elements to process, ignoring whitespace-only text elements
        # adjacent to the inner/outer boundaries of block elements.
        should_remove_inside = should_remove_whitespace_inside(node)

        def _can_ignore(el):
            if isinstance(el, Tag):
                # Tags are always processed.
                return False
            elif isinstance(el, (Comment, Doctype)):
                # Comment and Doctype elements are always ignored.
                # (subclasses of NavigableString, must test first)
                return True
            elif isinstance(el, NavigableString):
                if six.text_type(el).strip() != '':
                    # Non-whitespace text nodes are always processed.
                    return False
                elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):
                    # Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
                    return True
                elif (should_remove_whitespace_outside(el.previous_sibling)
                        or should_remove_whitespace_outside(el.next_sibling)):
                    # Outside block elements (including <pre>), ignore adjacent whitespace elements.
                    return True
                else:
                    return False
            elif el is None:
                return True
            else:
                raise ValueError('Unexpected element type: %s' % type(el))

        children_to_convert = [el for el in node.children if not _can_ignore(el)]

        # Create a copy of this tag's parent context, then update it to include this tag
        # to propagate down into the children.
        parent_tags_for_children = set(parent_tags)
        parent_tags_for_children.add(node.name)

        # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
        if (
            re_html_heading.match(node.name) is not None  # headings
            or node.name in {'td', 'th'}  # table cells
        ):
            parent_tags_for_children.add('_inline')

        # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
        if node.name in {'pre', 'code', 'kbd', 'samp'}:
            parent_tags_for_children.add('_noformat')

        # Convert the children elements into a list of result strings.
        child_strings = [
            self.process_element(el, parent_tags=parent_tags_for_children)
            for el in children_to_convert
        ]

        # Remove empty string values.
        child_strings = [s for s in child_strings if s]

        # Collapse newlines at child element boundaries, if needed.
        if node.name == 'pre' or node.find_parent('pre'):
            # Inside <pre> blocks, do not collapse newlines.
            pass
        else:
            # Collapse newlines at child element boundaries.
            updated_child_strings = ['']  # so the first lookback works
            for child_string in child_strings:
                # Separate the leading/trailing newlines from the content.
                leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()

                # If the last child had trailing newlines and this child has leading newlines,
                # use the larger newline count, limited to 2.
                if updated_child_strings[-1] and leading_nl:
                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
                    num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
                    leading_nl = '\n' * num_newlines

                # Add the results to the updated child string list.
                updated_child_strings.extend([leading_nl, content, trailing_nl])

            child_strings = updated_child_strings

        # Join all child text strings into a single string.
        text = ''.join(child_strings)

        # apply this tag's final conversion function
        convert_fn = self.get_conv_fn_cached(node.name)
        if convert_fn is not None:
            text = convert_fn(node, text, parent_tags=parent_tags)

        return text

    def convert__document_(self, el, text, parent_tags):
        """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
        if self.options['strip_document'] == LSTRIP:
            text = text.lstrip('\n')  # remove leading separation newlines
        elif self.options['strip_document'] == RSTRIP:
            text = text.rstrip('\n')  # remove trailing separation newlines
        elif self.options['strip_document'] == STRIP:
            text = text.strip('\n')  # remove leading and trailing separation newlines
        elif self.options['strip_document'] is None:
            pass  # leave leading and trailing separation newlines as-is
        else:
            raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])

        return text

    def process_text(self, el, parent_tags=None):
        """Convert a text node: normalize whitespace, escape, trim at block edges."""
        # For the top-level element, initialize the parent context with an empty set.
        if parent_tags is None:
            parent_tags = set()

        text = six.text_type(el) or ''

        # normalize whitespace if we're not inside a preformatted element
        if 'pre' not in parent_tags:
            if self.options['wrap']:
                text = re_all_whitespace.sub(' ', text)
            else:
                text = re_newline_whitespace.sub('\n', text)
                text = re_whitespace.sub(' ', text)

        # escape special characters if we're not inside a preformatted or code element
        if '_noformat' not in parent_tags:
            text = self.escape(text, parent_tags)

        # remove leading whitespace at the start or just after a
        # block-level element; remove trailing whitespace at the end
        # or just before a block-level element.
        if (should_remove_whitespace_outside(el.previous_sibling)
                or (should_remove_whitespace_inside(el.parent)
                    and not el.previous_sibling)):
            text = text.lstrip(' \t\r\n')
        if (should_remove_whitespace_outside(el.next_sibling)
                or (should_remove_whitespace_inside(el.parent)
                    and not el.next_sibling)):
            text = text.rstrip()

        return text

    def get_conv_fn_cached(self, tag_name):
        """Given a tag name, return the conversion function using the cache."""
        # If conversion function is not in cache, add it
        if tag_name not in self.convert_fn_cache:
            self.convert_fn_cache[tag_name] = self.get_conv_fn(tag_name)

        # Return the cached entry
        return self.convert_fn_cache[tag_name]

    def get_conv_fn(self, tag_name):
        """Given a tag name, find and return the conversion function (or None)."""
        tag_name = tag_name.lower()

        # Handle strip/convert exclusion options
        if not self.should_convert_tag(tag_name):
            return None

        # Look for an explicitly defined conversion function by tag name first
        convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
        convert_fn = getattr(self, convert_fn_name, None)
        if convert_fn:
            return convert_fn

        # If tag is any heading, handle with convert_hN() function
        match = re_html_heading.match(tag_name)
        if match:
            n = int(match.group(1))  # get value of N from <hN>
            return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)

        # No conversion function was found
        return None

    def should_convert_tag(self, tag):
        """Given a tag name, return whether to convert based on strip/convert options."""
        strip = self.options['strip']
        convert = self.options['convert']
        if strip is not None:
            return tag not in strip
        elif convert is not None:
            return tag in convert
        else:
            return True

    def escape(self, text, parent_tags):
        """Backslash-escape Markdown-special characters according to the escape_* options."""
        if not text:
            return ''
        if self.options['escape_misc']:
            text = re_escape_misc_chars.sub(r'\\\1', text)
            text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)
            text = re_escape_misc_hashes.sub(r'\1\\\2', text)
            text = re_escape_misc_list_items.sub(r'\1\\\2', text)

        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text

    def underline(self, text, pad_char):
        """Return text as a setext heading underlined with pad_char ('' if text is empty)."""
        text = (text or '').rstrip()
        return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

    @staticmethod
    def _colspan(cell):
        """Return the cell's colspan clamped to 1..1000, or 1 if absent/invalid."""
        value = cell.attrs.get('colspan')
        # isdecimal() (unlike isdigit()) only accepts strings that int() can
        # parse, so inputs such as a superscript '2' cannot raise ValueError.
        if isinstance(value, str) and value.isdecimal():
            return max(1, min(1000, int(value)))
        return 1

    def convert_a(self, el, text, parent_tags):
        if '_noformat' in parent_tags:
            return text
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')
        # For the replacement see #29: text nodes underscores are escaped
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax
            return '<%s>' % href
        if self.options['default_title'] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

    def convert_blockquote(self, el, text, parent_tags):
        # handle some early-exit scenarios
        text = (text or '').strip(' \t\r\n')
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return "\n"

        # indent lines with blockquote marker
        def _indent_for_blockquote(match):
            line_content = match.group(1)
            return '> ' + line_content if line_content else '>'
        text = re_line_with_content.sub(_indent_for_blockquote, text)

        return '\n' + text + '\n\n'

    def convert_br(self, el, text, parent_tags):
        if '_inline' in parent_tags:
            return ' '

        if self.options['newline_style'].lower() == BACKSLASH:
            return '\\\n'
        else:
            # SPACES style: a Markdown hard line break is two trailing spaces
            return '  \n'

    def convert_code(self, el, text, parent_tags):
        if '_noformat' in parent_tags:
            return text

        prefix, suffix, text = chomp(text)
        if not text:
            return ''

        # Find the maximum number of consecutive backticks in the text, then
        # delimit the code span with one more backtick than that
        max_backticks = max((len(run) for run in re_backtick_runs.findall(text)), default=0)
        markup_delimiter = '`' * (max_backticks + 1)

        # If the maximum number of backticks is greater than zero, add a space
        # to avoid interpretation of inside backticks as literals
        if max_backticks > 0:
            text = " " + text + " "

        return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)

    convert_del = abstract_inline_conversion(lambda self: '~~')

    def convert_div(self, el, text, parent_tags):
        if '_inline' in parent_tags:
            return ' ' + text.strip() + ' '
        text = text.strip()
        return '\n\n%s\n\n' % text if text else ''

    convert_article = convert_div

    convert_section = convert_div

    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

    convert_kbd = convert_code

    def convert_dd(self, el, text, parent_tags):
        text = (text or '').strip()
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return '\n'

        # indent definition content lines by four spaces
        def _indent_for_dd(match):
            line_content = match.group(1)
            return '    ' + line_content if line_content else ''
        text = re_line_with_content.sub(_indent_for_dd, text)

        # insert definition marker into first-line indent whitespace
        text = ':' + text[1:]

        return '%s\n' % text

    # definition lists are formatted as follows:
    #   https://pandoc.org/MANUAL.html#definition-lists
    #   https://michelf.ca/projects/php-markdown/extra/#def-list
    convert_dl = convert_div

    def convert_dt(self, el, text, parent_tags):
        # remove newlines from term text
        text = (text or '').strip()
        text = re_all_whitespace.sub(' ', text)
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return '\n'

        # TODO - format consecutive <dt> elements as directly adjacent lines):
        #   https://michelf.ca/projects/php-markdown/extra/#def-list

        return '\n\n%s\n' % text

    def convert_hN(self, n, el, text, parent_tags):
        # convert_hN() converts <hN> tags, where N is any integer
        if '_inline' in parent_tags:
            return text

        # Markdown does not support heading depths of n > 6
        n = max(1, min(6, n))

        style = self.options['heading_style'].lower()
        text = text.strip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
        text = re_all_whitespace.sub(' ', text)
        hashes = '#' * n
        if style == ATX_CLOSED:
            return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
        return '\n\n%s %s\n\n' % (hashes, text)

    def convert_hr(self, el, text, parent_tags):
        return '\n\n---\n\n'

    convert_i = convert_em

    def convert_img(self, el, text, parent_tags):
        alt = el.attrs.get('alt', None) or ''
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        if ('_inline' in parent_tags
                and el.parent.name not in self.options['keep_inline_images_in']):
            # images are not kept in this inline context: fall back to alt text
            return alt

        return '![%s](%s%s)' % (alt, src, title_part)

    def convert_video(self, el, text, parent_tags):
        if ('_inline' in parent_tags
                and el.parent.name not in self.options['keep_inline_images_in']):
            return text
        src = el.attrs.get('src', None) or ''
        if not src:
            # fall back to the first <source src=...> child
            sources = el.find_all('source', attrs={'src': True})
            if sources:
                src = sources[0].attrs.get('src', None) or ''
        poster = el.attrs.get('poster', None) or ''
        if src and poster:
            # poster image that links to the video
            return '[![%s](%s)](%s)' % (text, poster, src)
        if src:
            return '[%s](%s)' % (text, src)
        if poster:
            return '![%s](%s)' % (text, poster)
        return text

    def convert_list(self, el, text, parent_tags):

        # Converting a list to inline is undefined.
        # Ignoring inline conversion parents for list.

        before_paragraph = False
        next_sibling = _next_block_content_sibling(el)
        if next_sibling and next_sibling.name not in ['ul', 'ol']:
            before_paragraph = True
        if 'li' in parent_tags:
            # remove trailing newline if we're in a nested list
            return '\n' + text.rstrip()
        return '\n\n' + text + ('\n' if before_paragraph else '')

    convert_ul = convert_list
    convert_ol = convert_list

    def convert_li(self, el, text, parent_tags):
        # handle some early-exit scenarios
        text = (text or '').strip()
        if not text:
            return "\n"

        # determine list item bullet character to use
        parent = el.parent
        if parent is not None and parent.name == 'ol':
            start_attr = parent.get("start")
            # isdecimal() (unlike isnumeric()) guarantees int() can parse the value
            if start_attr and str(start_attr).isdecimal():
                start = int(start_attr)
            else:
                start = 1
            bullet = '%s.' % (start + len(el.find_previous_siblings('li')))
        else:
            # nesting depth = number of enclosing <ul> elements, zero-based
            depth = -1
            ancestor = el
            while ancestor:
                if ancestor.name == 'ul':
                    depth += 1
                ancestor = ancestor.parent
            bullets = self.options['bullets']
            bullet = bullets[depth % len(bullets)]
        bullet = bullet + ' '
        bullet_width = len(bullet)
        bullet_indent = ' ' * bullet_width

        # indent content lines by bullet width
        def _indent_for_li(match):
            line_content = match.group(1)
            return bullet_indent + line_content if line_content else ''
        text = re_line_with_content.sub(_indent_for_li, text)

        # insert bullet into first-line indent whitespace
        text = bullet + text[bullet_width:]

        return '%s\n' % text

    def convert_p(self, el, text, parent_tags):
        if '_inline' in parent_tags:
            return ' ' + text.strip(' \t\r\n') + ' '
        text = text.strip(' \t\r\n')
        if self.options['wrap']:
            # Preserve newlines (and preceding whitespace) resulting
            # from <br> tags. Newlines in the input have already been
            # replaced by spaces.
            if self.options['wrap_width'] is not None:
                lines = text.split('\n')
                new_lines = []
                for line in lines:
                    line = line.lstrip(' \t\r\n')
                    line_no_trailing = line.rstrip()
                    trailing = line[len(line_no_trailing):]
                    line = fill(line,
                                width=self.options['wrap_width'],
                                break_long_words=False,
                                break_on_hyphens=False)
                    new_lines.append(line + trailing)
                text = '\n'.join(new_lines)
        return '\n\n%s\n\n' % text if text else ''

    def convert_pre(self, el, text, parent_tags):
        if not text:
            return ''
        code_language = self.options['code_language']

        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language

        if self.options['strip_pre'] == STRIP:
            text = strip_pre(text)  # remove all leading/trailing newlines
        elif self.options['strip_pre'] == STRIP_ONE:
            text = strip1_pre(text)  # remove one leading/trailing newline
        elif self.options['strip_pre'] is None:
            pass  # leave leading and trailing newlines as-is
        else:
            raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])

        return '\n\n```%s\n%s\n```\n\n' % (code_language, text)

    def convert_q(self, el, text, parent_tags):
        return '"' + text + '"'

    def convert_script(self, el, text, parent_tags):
        return ''

    def convert_style(self, el, text, parent_tags):
        return ''

    convert_s = convert_del

    convert_strong = convert_b

    convert_samp = convert_code

    convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])

    convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

    def convert_table(self, el, text, parent_tags):
        return '\n\n' + text.strip() + '\n\n'

    def convert_caption(self, el, text, parent_tags):
        return text.strip() + '\n\n'

    def convert_figcaption(self, el, text, parent_tags):
        return '\n\n' + text.strip() + '\n\n'

    def convert_td(self, el, text, parent_tags):
        colspan = self._colspan(el)
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

    def convert_th(self, el, text, parent_tags):
        colspan = self._colspan(el)
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

    def convert_tr(self, el, text, parent_tags):
        cells = el.find_all(['td', 'th'])
        is_first_row = el.find_previous_sibling() is None
        is_headrow = (
            all([cell.name == 'th' for cell in cells])
            or (el.parent.name == 'thead'
                # avoid multiple tr in thead
                and len(el.parent.find_all('tr')) == 1)
        )
        is_head_row_missing = (
            (is_first_row and not el.parent.name == 'tbody')
            or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
        )
        overline = ''
        underline = ''
        # total number of columns this row spans
        full_colspan = sum(self._colspan(cell) for cell in cells)
        if ((is_headrow
             or (is_head_row_missing
                 and self.options['table_infer_header']))
                and is_first_row):
            # first row and:
            # - is headline or
            # - headline is missing and header inference is enabled
            # print headline underline
            underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
        elif ((is_head_row_missing
               and not self.options['table_infer_header'])
              or (is_first_row
                  and (el.parent.name == 'table'
                       or (el.parent.name == 'tbody'
                           and not el.parent.find_previous_sibling())))):
            # headline is missing and header inference is disabled or:
            # first row, not headline, and:
            #  - the parent is table or
            #  - the parent is tbody at the beginning of a table.
            # print empty headline above this row
            overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n'
            overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
        return overline + '|' + text + '\n' + underline
def markdownify(html, **options):
    """Convert an HTML string to Markdown.

    Shortcut for ``MarkdownConverter(**options).convert(html)``; keyword
    arguments are passed through as converter options.
    """
    return MarkdownConverter(**options).convert(html)