Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdownify/__init__.py

1from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag

2from textwrap import fill

3import re

4import six

# General-purpose regex patterns
re_convert_heading = re.compile(r'convert_h(\d+)')
# Matches every line (possibly empty) so a callback can re-indent it
re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
# Runs of tabs/spaces only (newlines excluded)
re_whitespace = re.compile(r'[\t ]+')
# Runs of tabs/spaces/CR/LF
re_all_whitespace = re.compile(r'[\t \r\n]+')
# A whitespace run that contains at least one CR or LF
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
# Heading tag names: 'h' followed by digits (anchored at the start by .match())
re_html_heading = re.compile(r'h(\d+)')
# One leading / trailing newline (plus the spaces around it) of <pre> text
re_pre_lstrip1 = re.compile(r'^ *\n')
re_pre_rstrip1 = re.compile(r'\n *$')
# All leading / trailing newlines and spaces of <pre> text
re_pre_lstrip = re.compile(r'^[ \n]*\n')
re_pre_rstrip = re.compile(r'[ \n]*$')

# Pattern for creating convert_<tag> function names from tag names
# (each of '[', ']', ':' and '-' is replaced by '_')
re_make_convert_fn_name = re.compile(r'[\[\]:-]')

# Extract (leading_nl, content, trailing_nl) from a string
# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)

# Escape miscellaneous special Markdown characters
re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])')

# Escape sequence of one or more consecutive '-', preceded
# and followed by whitespace or start/end of fragment, as it
# might be confused with an underline of a header, or with a
# list marker
re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))')

# Escape sequence of up to six consecutive '#', preceded
# and followed by whitespace or start/end of fragment, as
# it might be confused with an ATX heading
re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')

# Escape '.' or ')' preceded by up to nine digits, as it might be
# confused with a list item
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')

# Find consecutive backtick sequences in a string
re_backtick_runs = re.compile(r'`+')

# Heading styles
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
UNDERLINED = 'underlined'
SETEXT = UNDERLINED  # alias: same value as UNDERLINED

# Newline style
SPACES = 'spaces'
BACKSLASH = 'backslash'

# Strong and emphasis style
ASTERISK = '*'
UNDERSCORE = '_'

# Document/pre strip styles
LSTRIP = 'lstrip'
RSTRIP = 'rstrip'
STRIP = 'strip'
STRIP_ONE = 'strip_one'

def strip1_pre(text):
    """Remove at most one leading and one trailing newline from <pre> text.

    Spaces located between the string boundary and that newline are removed
    together with it.
    """
    without_leading = re_pre_lstrip1.sub('', text)
    return re_pre_rstrip1.sub('', without_leading)

def strip_pre(text):
    """Remove every leading and trailing newline from <pre> text.

    Leading blank lines are dropped up to and including the last newline of
    the leading run; trailing spaces and newlines are dropped entirely.
    """
    without_leading = re_pre_lstrip.sub('', text)
    return re_pre_rstrip.sub('', without_leading)

def chomp(text):
    """
    Split the whitespace off the text of an inline tag like b, a, or em.

    Returns a tuple (prefix, suffix, stripped_text) where prefix/suffix is a
    single space when the text starts/ends with a space, and '' otherwise.
    The caller places them outside the markup to prevent conversions like
        <b> foo</b> => ** foo**
    """
    leading = ' ' if text.startswith(' ') else ''
    trailing = ' ' if text.endswith(' ') else ''
    return (leading, trailing, text.strip())

def abstract_inline_conversion(markup_fn):
    """
    Build the converter for a simple inline tag (b, em, del, ...).

    markup_fn receives the converter instance and returns the opening
    markup string; it is a callable so that it can consult instance options
    such as the strong/emphasis symbol. When the markup looks like an HTML
    tag ('<...>'), the closing markup is the same string with '/' inserted
    after the '<'; otherwise opening and closing markup are identical.

    The returned function wraps the chomped text in that markup, returns the
    text untouched inside a '_noformat' context, and returns '' when nothing
    is left after chomping.
    """
    def implementation(self, el, text, parent_tags):
        opening = markup_fn(self)
        looks_like_html_tag = opening.startswith('<') and opening.endswith('>')
        closing = '</' + opening[1:] if looks_like_html_tag else opening

        if '_noformat' in parent_tags:
            return text

        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        return ''.join((prefix, opening, text, closing, suffix))
    return implementation

116

117

118def _todict(obj):

119 return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))

120

121

def should_remove_whitespace_inside(el):
    """Return whether whitespace immediately inside el should be removed.

    True for headings (names matching re_html_heading) and for the
    block-level elements listed below; False for missing or unnamed elements.
    """
    if not el or not el.name:
        return False
    is_heading = re_html_heading.match(el.name) is not None
    return is_heading or el.name in (
        'p', 'blockquote',
        'article', 'div', 'section',
        'ol', 'ul', 'li',
        'dl', 'dt', 'dd',
        'table', 'thead', 'tbody', 'tfoot',
        'tr', 'td', 'th',
    )

134

135

def should_remove_whitespace_outside(el):
    """Return a truthy value if whitespace immediately outside el should be removed.

    That is the case for every element whose inner whitespace is removed,
    and additionally for <pre>.
    """
    if should_remove_whitespace_inside(el):
        return True
    return el and el.name == 'pre'

139

140

def _is_block_content_element(el):
    """
    Tell content from non-content elements in a block context.

    - True for content elements (tags and non-whitespace text)
    - False for non-content elements (whitespace text, comments, doctypes)
      and for anything else, including None
    """
    # Comment and Doctype are subclasses of NavigableString, so they must be
    # ruled out before the generic text test.
    if isinstance(el, (Comment, Doctype)):
        return False
    if isinstance(el, Tag):
        return True
    if isinstance(el, NavigableString):
        return el.strip() != ''
    return False

156

157

def _prev_block_content_sibling(el):
    """Walk backwards over el's siblings and return the nearest content element, else None."""
    candidate = el
    while candidate is not None:
        candidate = candidate.previous_sibling
        if _is_block_content_element(candidate):
            return candidate
    return None

165

166

def _next_block_content_sibling(el):
    """Walk forwards over el's siblings and return the nearest content element, else None."""
    candidate = el
    while candidate is not None:
        candidate = candidate.next_sibling
        if _is_block_content_element(candidate):
            return candidate
    return None

174

175

176class MarkdownConverter(object):

177 class DefaultOptions:

178 autolinks = True

179 bs4_options = 'html.parser'

180 bullets = '*+-' # An iterable of bullet types.

181 code_language = ''

182 code_language_callback = None

183 convert = None

184 default_title = False

185 escape_asterisks = True

186 escape_underscores = True

187 escape_misc = False

188 heading_style = UNDERLINED

189 keep_inline_images_in = []

190 newline_style = SPACES

191 strip = None

192 strip_document = STRIP

193 strip_pre = STRIP

194 strong_em_symbol = ASTERISK

195 sub_symbol = ''

196 sup_symbol = ''

197 table_infer_header = False

198 wrap = False

199 wrap_width = 80

200

201 class Options(DefaultOptions):

202 pass

203

204 def __init__(self, **options):

205 # Create an options dictionary. Use DefaultOptions as a base so that

206 # it doesn't have to be extended.

207 self.options = _todict(self.DefaultOptions)

208 self.options.update(_todict(self.Options))

209 self.options.update(options)

210 if self.options['strip'] is not None and self.options['convert'] is not None:

211 raise ValueError('You may specify either tags to strip or tags to'

212 ' convert, but not both.')

213

214 # If a string or list is passed to bs4_options, assume it is a 'features' specification

215 if not isinstance(self.options['bs4_options'], dict):

216 self.options['bs4_options'] = {'features': self.options['bs4_options']}

217

218 # Initialize the conversion function cache

219 self.convert_fn_cache = {}

220

221 def convert(self, html):

222 soup = BeautifulSoup(html, **self.options['bs4_options'])

223 return self.convert_soup(soup)

224

225 def convert_soup(self, soup):

226 return self.process_tag(soup, parent_tags=set())

227

228 def process_element(self, node, parent_tags=None):

229 if isinstance(node, NavigableString):

230 return self.process_text(node, parent_tags=parent_tags)

231 else:

232 return self.process_tag(node, parent_tags=parent_tags)

233

234 def process_tag(self, node, parent_tags=None):

235 # For the top-level element, initialize the parent context with an empty set.

236 if parent_tags is None:

237 parent_tags = set()

238

239 # Collect child elements to process, ignoring whitespace-only text elements

240 # adjacent to the inner/outer boundaries of block elements.

241 should_remove_inside = should_remove_whitespace_inside(node)

242

243 def _can_ignore(el):

244 if isinstance(el, Tag):

245 # Tags are always processed.

246 return False

247 elif isinstance(el, (Comment, Doctype)):

248 # Comment and Doctype elements are always ignored.

249 # (subclasses of NavigableString, must test first)

250 return True

251 elif isinstance(el, NavigableString):

252 if six.text_type(el).strip() != '':

253 # Non-whitespace text nodes are always processed.

254 return False

255 elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):

256 # Inside block elements (excluding <pre>), ignore adjacent whitespace elements.

257 return True

258 elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):

259 # Outside block elements (including <pre>), ignore adjacent whitespace elements.

260 return True

261 else:

262 return False

263 elif el is None:

264 return True

265 else:

266 raise ValueError('Unexpected element type: %s' % type(el))

267

268 children_to_convert = [el for el in node.children if not _can_ignore(el)]

269

270 # Create a copy of this tag's parent context, then update it to include this tag

271 # to propagate down into the children.

272 parent_tags_for_children = set(parent_tags)

273 parent_tags_for_children.add(node.name)

274

275 # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag

276 if (

277 re_html_heading.match(node.name) is not None # headings

278 or node.name in {'td', 'th'} # table cells

279 ):

280 parent_tags_for_children.add('_inline')

281

282 # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag

283 if node.name in {'pre', 'code', 'kbd', 'samp'}:

284 parent_tags_for_children.add('_noformat')

285

286 # Convert the children elements into a list of result strings.

287 child_strings = [

288 self.process_element(el, parent_tags=parent_tags_for_children)

289 for el in children_to_convert

290 ]

291

292 # Remove empty string values.

293 child_strings = [s for s in child_strings if s]

294

295 # Collapse newlines at child element boundaries, if needed.

296 if node.name == 'pre' or node.find_parent('pre'):

297 # Inside <pre> blocks, do not collapse newlines.

298 pass

299 else:

300 # Collapse newlines at child element boundaries.

301 updated_child_strings = [''] # so the first lookback works

302 for child_string in child_strings:

303 # Separate the leading/trailing newlines from the content.

304 leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()

305

306 # If the last child had trailing newlines and this child has leading newlines,

307 # use the larger newline count, limited to 2.

308 if updated_child_strings[-1] and leading_nl:

309 prev_trailing_nl = updated_child_strings.pop() # will be replaced by the collapsed value

310 num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))

311 leading_nl = '\n' * num_newlines

312

313 # Add the results to the updated child string list.

314 updated_child_strings.extend([leading_nl, content, trailing_nl])

315

316 child_strings = updated_child_strings

317

318 # Join all child text strings into a single string.

319 text = ''.join(child_strings)

320

321 # apply this tag's final conversion function

322 convert_fn = self.get_conv_fn_cached(node.name)

323 if convert_fn is not None:

324 text = convert_fn(node, text, parent_tags=parent_tags)

325

326 return text

327

328 def convert__document_(self, el, text, parent_tags):

329 """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""

330 if self.options['strip_document'] == LSTRIP:

331 text = text.lstrip('\n') # remove leading separation newlines

332 elif self.options['strip_document'] == RSTRIP:

333 text = text.rstrip('\n') # remove trailing separation newlines

334 elif self.options['strip_document'] == STRIP:

335 text = text.strip('\n') # remove leading and trailing separation newlines

336 elif self.options['strip_document'] is None:

337 pass # leave leading and trailing separation newlines as-is

338 else:

339 raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])

340

341 return text

342

343 def process_text(self, el, parent_tags=None):

344 # For the top-level element, initialize the parent context with an empty set.

345 if parent_tags is None:

346 parent_tags = set()

347

348 text = six.text_type(el) or ''

349

350 # normalize whitespace if we're not inside a preformatted element

351 if 'pre' not in parent_tags:

352 if self.options['wrap']:

353 text = re_all_whitespace.sub(' ', text)

354 else:

355 text = re_newline_whitespace.sub('\n', text)

356 text = re_whitespace.sub(' ', text)

357

358 # escape special characters if we're not inside a preformatted or code element

359 if '_noformat' not in parent_tags:

360 text = self.escape(text, parent_tags)

361

362 # remove leading whitespace at the start or just after a

363 # block-level element; remove traliing whitespace at the end

364 # or just before a block-level element.

365 if (should_remove_whitespace_outside(el.previous_sibling)

366 or (should_remove_whitespace_inside(el.parent)

367 and not el.previous_sibling)):

368 text = text.lstrip(' \t\r\n')

369 if (should_remove_whitespace_outside(el.next_sibling)

370 or (should_remove_whitespace_inside(el.parent)

371 and not el.next_sibling)):

372 text = text.rstrip()

373

374 return text

375

376 def get_conv_fn_cached(self, tag_name):

377 """Given a tag name, return the conversion function using the cache."""

378 # If conversion function is not in cache, add it

379 if tag_name not in self.convert_fn_cache:

380 self.convert_fn_cache[tag_name] = self.get_conv_fn(tag_name)

381

382 # Return the cached entry

383 return self.convert_fn_cache[tag_name]

384

385 def get_conv_fn(self, tag_name):

386 """Given a tag name, find and return the conversion function."""

387 tag_name = tag_name.lower()

388

389 # Handle strip/convert exclusion options

390 if not self.should_convert_tag(tag_name):

391 return None

392

393 # Look for an explicitly defined conversion function by tag name first

394 convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)

395 convert_fn = getattr(self, convert_fn_name, None)

396 if convert_fn:

397 return convert_fn

398

399 # If tag is any heading, handle with convert_hN() function

400 match = re_html_heading.match(tag_name)

401 if match:

402 n = int(match.group(1)) # get value of N from <hN>

403 return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)

404

405 # No conversion function was found

406 return None

407

408 def should_convert_tag(self, tag):

409 """Given a tag name, return whether to convert based on strip/convert options."""

410 strip = self.options['strip']

411 convert = self.options['convert']

412 if strip is not None:

413 return tag not in strip

414 elif convert is not None:

415 return tag in convert

416 else:

417 return True

418

419 def escape(self, text, parent_tags):

420 if not text:

421 return ''

422 if self.options['escape_misc']:

423 text = re_escape_misc_chars.sub(r'\\\1', text)

424 text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)

425 text = re_escape_misc_hashes.sub(r'\1\\\2', text)

426 text = re_escape_misc_list_items.sub(r'\1\\\2', text)

427

428 if self.options['escape_asterisks']:

429 text = text.replace('*', r'\*')

430 if self.options['escape_underscores']:

431 text = text.replace('_', r'\_')

432 return text

433

434 def underline(self, text, pad_char):

435 text = (text or '').rstrip()

436 return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

437

438 def convert_a(self, el, text, parent_tags):

439 if '_noformat' in parent_tags:

440 return text

441 prefix, suffix, text = chomp(text)

442 if not text:

443 return ''

444 href = el.get('href')

445 title = el.get('title')

446 # For the replacement see #29: text nodes underscores are escaped

447 if (self.options['autolinks']

448 and text.replace(r'\_', '_') == href

449 and not title

450 and not self.options['default_title']):

451 # Shortcut syntax

452 return '<%s>' % href

453 if self.options['default_title'] and not title:

454 title = href

455 title_part = ' "%s"' % title.replace('"', r'\"') if title else ''

456 return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

457

458 convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

459

460 def convert_blockquote(self, el, text, parent_tags):

461 # handle some early-exit scenarios

462 text = (text or '').strip(' \t\r\n')

463 if '_inline' in parent_tags:

464 return ' ' + text + ' '

465 if not text:

466 return "\n"

467

468 # indent lines with blockquote marker

469 def _indent_for_blockquote(match):

470 line_content = match.group(1)

471 return '> ' + line_content if line_content else '>'

472 text = re_line_with_content.sub(_indent_for_blockquote, text)

473

474 return '\n' + text + '\n\n'

475

476 def convert_br(self, el, text, parent_tags):

477 if '_inline' in parent_tags:

478 return ' '

479

480 if self.options['newline_style'].lower() == BACKSLASH:

481 return '\\\n'

482 else:

483 return ' \n'

484

485 def convert_code(self, el, text, parent_tags):

486 if '_noformat' in parent_tags:

487 return text

488

489 prefix, suffix, text = chomp(text)

490 if not text:

491 return ''

492

493 # Find the maximum number of consecutive backticks in the text, then

494 # delimit the code span with one more backtick than that

495 max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)

496 markup_delimiter = '`' * (max_backticks + 1)

497

498 # If the maximum number of backticks is greater than zero, add a space

499 # to avoid interpretation of inside backticks as literals

500 if max_backticks > 0:

501 text = " " + text + " "

502

503 return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)

504

505 convert_del = abstract_inline_conversion(lambda self: '~~')

506

507 def convert_div(self, el, text, parent_tags):

508 if '_inline' in parent_tags:

509 return ' ' + text.strip() + ' '

510 text = text.strip()

511 return '\n\n%s\n\n' % text if text else ''

512

513 convert_article = convert_div

514

515 convert_section = convert_div

516

517 convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

518

519 convert_kbd = convert_code

520

521 def convert_dd(self, el, text, parent_tags):

522 text = (text or '').strip()

523 if '_inline' in parent_tags:

524 return ' ' + text + ' '

525 if not text:

526 return '\n'

527

528 # indent definition content lines by four spaces

529 def _indent_for_dd(match):

530 line_content = match.group(1)

531 return ' ' + line_content if line_content else ''

532 text = re_line_with_content.sub(_indent_for_dd, text)

533

534 # insert definition marker into first-line indent whitespace

535 text = ':' + text[1:]

536

537 return '%s\n' % text

538

539 # definition lists are formatted as follows:

540 # https://pandoc.org/MANUAL.html#definition-lists

541 # https://michelf.ca/projects/php-markdown/extra/#def-list

542 convert_dl = convert_div

543

544 def convert_dt(self, el, text, parent_tags):

545 # remove newlines from term text

546 text = (text or '').strip()

547 text = re_all_whitespace.sub(' ', text)

548 if '_inline' in parent_tags:

549 return ' ' + text + ' '

550 if not text:

551 return '\n'

552

553 # TODO - format consecutive <dt> elements as directly adjacent lines):

554 # https://michelf.ca/projects/php-markdown/extra/#def-list

555

556 return '\n\n%s\n' % text

557

558 def convert_hN(self, n, el, text, parent_tags):

559 # convert_hN() converts <hN> tags, where N is any integer

560 if '_inline' in parent_tags:

561 return text

562

563 # Markdown does not support heading depths of n > 6

564 n = max(1, min(6, n))

565

566 style = self.options['heading_style'].lower()

567 text = text.strip()

568 if style == UNDERLINED and n <= 2:

569 line = '=' if n == 1 else '-'

570 return self.underline(text, line)

571 text = re_all_whitespace.sub(' ', text)

572 hashes = '#' * n

573 if style == ATX_CLOSED:

574 return '\n\n%s %s %s\n\n' % (hashes, text, hashes)

575 return '\n\n%s %s\n\n' % (hashes, text)

576

577 def convert_hr(self, el, text, parent_tags):

578 return '\n\n---\n\n'

579

580 convert_i = convert_em

581

582 def convert_img(self, el, text, parent_tags):

583 alt = el.attrs.get('alt', None) or ''

584 src = el.attrs.get('src', None) or ''

585 title = el.attrs.get('title', None) or ''

586 title_part = ' "%s"' % title.replace('"', r'\"') if title else ''

587 if ('_inline' in parent_tags

588 and el.parent.name not in self.options['keep_inline_images_in']):

589 return alt

590

591 return '![%s](%s%s)' % (alt, src, title_part)

592

593 def convert_video(self, el, text, parent_tags):

594 if ('_inline' in parent_tags

595 and el.parent.name not in self.options['keep_inline_images_in']):

596 return text

597 src = el.attrs.get('src', None) or ''

598 if not src:

599 sources = el.find_all('source', attrs={'src': True})

600 if sources:

601 src = sources[0].attrs.get('src', None) or ''

602 poster = el.attrs.get('poster', None) or ''

603 if src and poster:

604 return '[![%s](%s)](%s)' % (text, poster, src)

605 if src:

606 return '[%s](%s)' % (text, src)

607 if poster:

608 return '![%s](%s)' % (text, poster)

609 return text

610

611 def convert_list(self, el, text, parent_tags):

612

613 # Converting a list to inline is undefined.

614 # Ignoring inline conversion parents for list.

615

616 before_paragraph = False

617 next_sibling = _next_block_content_sibling(el)

618 if next_sibling and next_sibling.name not in ['ul', 'ol']:

619 before_paragraph = True

620 if 'li' in parent_tags:

621 # remove trailing newline if we're in a nested list

622 return '\n' + text.rstrip()

623 return '\n\n' + text + ('\n' if before_paragraph else '')

624

625 convert_ul = convert_list

626 convert_ol = convert_list

627

628 def convert_li(self, el, text, parent_tags):

629 # handle some early-exit scenarios

630 text = (text or '').strip()

631 if not text:

632 return "\n"

633

634 # determine list item bullet character to use

635 parent = el.parent

636 if parent is not None and parent.name == 'ol':

637 if parent.get("start") and str(parent.get("start")).isnumeric():

638 start = int(parent.get("start"))

639 else:

640 start = 1

641 bullet = '%s.' % (start + len(el.find_previous_siblings('li')))

642 else:

643 depth = -1

644 while el:

645 if el.name == 'ul':

646 depth += 1

647 el = el.parent

648 bullets = self.options['bullets']

649 bullet = bullets[depth % len(bullets)]

650 bullet = bullet + ' '

651 bullet_width = len(bullet)

652 bullet_indent = ' ' * bullet_width

653

654 # indent content lines by bullet width

655 def _indent_for_li(match):

656 line_content = match.group(1)

657 return bullet_indent + line_content if line_content else ''

658 text = re_line_with_content.sub(_indent_for_li, text)

659

660 # insert bullet into first-line indent whitespace

661 text = bullet + text[bullet_width:]

662

663 return '%s\n' % text

664

665 def convert_p(self, el, text, parent_tags):

666 if '_inline' in parent_tags:

667 return ' ' + text.strip(' \t\r\n') + ' '

668 text = text.strip(' \t\r\n')

669 if self.options['wrap']:

670 # Preserve newlines (and preceding whitespace) resulting

671 # from <br> tags. Newlines in the input have already been

672 # replaced by spaces.

673 if self.options['wrap_width'] is not None:

674 lines = text.split('\n')

675 new_lines = []

676 for line in lines:

677 line = line.lstrip(' \t\r\n')

678 line_no_trailing = line.rstrip()

679 trailing = line[len(line_no_trailing):]

680 line = fill(line,

681 width=self.options['wrap_width'],

682 break_long_words=False,

683 break_on_hyphens=False)

684 new_lines.append(line + trailing)

685 text = '\n'.join(new_lines)

686 return '\n\n%s\n\n' % text if text else ''

687

688 def convert_pre(self, el, text, parent_tags):

689 if not text:

690 return ''

691 code_language = self.options['code_language']

692

693 if self.options['code_language_callback']:

694 code_language = self.options['code_language_callback'](el) or code_language

695

696 if self.options['strip_pre'] == STRIP:

697 text = strip_pre(text) # remove all leading/trailing newlines

698 elif self.options['strip_pre'] == STRIP_ONE:

699 text = strip1_pre(text) # remove one leading/trailing newline

700 elif self.options['strip_pre'] is None:

701 pass # leave leading and trailing newlines as-is

702 else:

703 raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])

704

705 return '\n\n```%s\n%s\n```\n\n' % (code_language, text)

706

707 def convert_q(self, el, text, parent_tags):

708 return '"' + text + '"'

709

710 def convert_script(self, el, text, parent_tags):

711 return ''

712

713 def convert_style(self, el, text, parent_tags):

714 return ''

715

716 convert_s = convert_del

717

718 convert_strong = convert_b

719

720 convert_samp = convert_code

721

722 convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])

723

724 convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

725

726 def convert_table(self, el, text, parent_tags):

727 return '\n\n' + text.strip() + '\n\n'

728

729 def convert_caption(self, el, text, parent_tags):

730 return text.strip() + '\n\n'

731

732 def convert_figcaption(self, el, text, parent_tags):

733 return '\n\n' + text.strip() + '\n\n'

734

735 def convert_td(self, el, text, parent_tags):

736 colspan = 1

737 if 'colspan' in el.attrs and el['colspan'].isdigit():

738 colspan = max(1, min(1000, int(el['colspan'])))

739 return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

740

741 def convert_th(self, el, text, parent_tags):

742 colspan = 1

743 if 'colspan' in el.attrs and el['colspan'].isdigit():

744 colspan = max(1, min(1000, int(el['colspan'])))

745 return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

746

747 def convert_tr(self, el, text, parent_tags):

748 cells = el.find_all(['td', 'th'])

749 is_first_row = el.find_previous_sibling() is None

750 is_headrow = (

751 all([cell.name == 'th' for cell in cells])

752 or (el.parent.name == 'thead'

753 # avoid multiple tr in thead

754 and len(el.parent.find_all('tr')) == 1)

755 )

756 is_head_row_missing = (

757 (is_first_row and not el.parent.name == 'tbody')

758 or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)

759 )

760 overline = ''

761 underline = ''

762 full_colspan = 0

763 for cell in cells:

764 if 'colspan' in cell.attrs and cell['colspan'].isdigit():

765 full_colspan += max(1, min(1000, int(cell['colspan'])))

766 else:

767 full_colspan += 1

768 if ((is_headrow

769 or (is_head_row_missing

770 and self.options['table_infer_header']))

771 and is_first_row):

772 # first row and:

773 # - is headline or

774 # - headline is missing and header inference is enabled

775 # print headline underline

776 underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'

777 elif ((is_head_row_missing

778 and not self.options['table_infer_header'])

779 or (is_first_row

780 and (el.parent.name == 'table'

781 or (el.parent.name == 'tbody'

782 and not el.parent.find_previous_sibling())))):

783 # headline is missing and header inference is disabled or:

784 # first row, not headline, and:

785 # - the parent is table or

786 # - the parent is tbody at the beginning of a table.

787 # print empty headline above this row

788 overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n'

789 overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'

790 return overline + '|' + text + '\n' + underline

791

792

def markdownify(html, **options):
    """Convert html to Markdown with a MarkdownConverter configured by the keyword options."""
    converter = MarkdownConverter(**options)
    return converter.convert(html)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdownify/__init__.py: 89%

474 statements