Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdownify/__init__.py: 89%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

474 statements  

1from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag 

2from textwrap import fill 

3import re 

4import six 

5 

6 

# General-purpose regex patterns
# 'convert_h' followed by one or more digits (captured).
re_convert_heading = re.compile(r'convert_h(\d+)')
# Captures the content of every line (MULTILINE); used to prefix/indent text
# line by line.
re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
# A run of tabs and/or spaces.
re_whitespace = re.compile(r'[\t ]+')
# A run of tabs, spaces, carriage returns and/or newlines.
re_all_whitespace = re.compile(r'[\t \r\n]+')
# A whitespace run that contains at least one carriage return or newline.
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
# 'h' followed by one or more digits (captured). The pattern is not anchored
# at the end, so with .match() it also accepts names that merely start
# with hN.
re_html_heading = re.compile(r'h(\d+)')
# Optional spaces followed by a newline, at the start of the string.
re_pre_lstrip1 = re.compile(r'^ *\n')
# A newline followed by optional spaces, at the end of the string.
# NOTE: '$' also matches just before a final newline.
re_pre_rstrip1 = re.compile(r'\n *$')
# Any run of spaces/newlines ending in a newline, at the start of the string.
re_pre_lstrip = re.compile(r'^[ \n]*\n')
# Any run of spaces/newlines at the end of the string.
re_pre_rstrip = re.compile(r'[ \n]*$')

# Pattern for creating convert_<tag> function names from tag names
# (each of '[', ']', ':' and '-' is replaced when building the name)
re_make_convert_fn_name = re.compile(r'[\[\]:-]')

# Extract (leading_nl, content, trailing_nl) from a string
# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)

# Escape miscellaneous special Markdown characters
# (any one of: ] \ & < ` [ > ~ = + |)
re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])')

# Escape sequence of one or more consecutive '-', preceded
# and followed by whitespace or start/end of fragment, as it
# might be confused with an underline of a header, or with a
# list marker
re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))')

# Escape sequence of up to six consecutive '#', preceded
# and followed by whitespace or start/end of fragment, as
# it might be confused with an ATX heading
re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')

# Escape '.' or ')' preceded by up to nine digits, as it might be
# confused with a list item
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')

# Find consecutive backtick sequences in a string
re_backtick_runs = re.compile(r'`+')

# Heading styles (values for the 'heading_style' option)
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
UNDERLINED = 'underlined'
SETEXT = UNDERLINED  # alias for UNDERLINED

# Newline style (values for the 'newline_style' option)
SPACES = 'spaces'
BACKSLASH = 'backslash'

# Strong and emphasis style (values for the 'strong_em_symbol' option)
ASTERISK = '*'
UNDERSCORE = '_'

# Document/pre strip styles (values for the 'strip_document' and
# 'strip_pre' options)
LSTRIP = 'lstrip'
RSTRIP = 'rstrip'
STRIP = 'strip'
STRIP_ONE = 'strip_one'

66 

67 

def strip1_pre(text):
    """Strip at most one leading and one trailing newline from a <pre> string.

    A leading run of spaces followed by a newline is removed from the start,
    and a newline followed by a run of spaces is removed from the end. Any
    further newlines are preserved.

    :param text: the text content of a <pre> element
    :returns: the text with at most one blank leading/trailing line removed
    """
    # '^' (without MULTILINE) can only match at the very start of the string.
    text = re.sub(r'^ *\n', '', text, count=1)
    # Anchor with '\Z' rather than '$': '$' also matches just before a final
    # newline, so r'\n *$' matches twice on text ending in "\n\n" and would
    # remove two trailing newlines instead of one.
    text = re.sub(r'\n *\Z', '', text, count=1)
    return text

73 

74 

def strip_pre(text):
    """Remove every leading blank line and all trailing spaces/newlines from <pre> text."""
    without_leading = re_pre_lstrip.sub('', text)
    return re_pre_rstrip.sub('', without_leading)

80 

81 

def chomp(text):
    """
    Split the text of an inline tag (b, a, em, ...) into its outer spacing
    and its stripped content.

    Returns a ``(prefix, suffix, text)`` tuple: prefix is a single space if
    the text starts with a space (else ''), suffix is a single space if the
    text ends with a space (else ''), and text is the stripped string.
    Callers put the markup around the stripped text and the spaces outside
    it, which prevents conversions like
        <b> foo</b> => ** foo**
    """
    starts_with_space = bool(text) and text[0] == ' '
    ends_with_space = bool(text) and text[-1] == ' '
    return (
        ' ' if starts_with_space else '',
        ' ' if ends_with_space else '',
        text.strip(),
    )

93 

94 

def abstract_inline_conversion(markup_fn):
    """
    Build the converter for a simple inline tag (b, em, del, ...).

    markup_fn receives the converter instance and returns the opening markup
    string; it is a callable so that the markup can depend on instance
    options such as self.options['strong_em_symbol']. The returned function
    wraps the chomped text in that markup. The closing markup equals the
    opening markup, unless the opening markup looks like an HTML tag
    ('<...>'), in which case a '/' is inserted after the '<'.
    """
    def implementation(self, el, text, parent_tags):
        opening = markup_fn(self)
        looks_like_html_tag = opening.startswith('<') and opening.endswith('>')
        closing = '</' + opening[1:] if looks_like_html_tag else opening

        # Inside preformatted/code content the text passes through untouched.
        if '_noformat' in parent_tags:
            return text

        leading, trailing, stripped = chomp(text)
        if not stripped:
            return ''
        return ''.join((leading, opening, stripped, closing, trailing))

    return implementation

116 

117 

def _todict(obj):
    """Return a dict of all attributes of obj whose names do not start with '_'."""
    public_names = [name for name in dir(obj) if not name.startswith('_')]
    return {name: getattr(obj, name) for name in public_names}

120 

121 

def should_remove_whitespace_inside(el):
    """Return whether whitespace immediately inside el should be removed.

    True for heading tags and for the block-level tags listed below; False
    for anything else, including None and nodes without a tag name.
    """
    if el and el.name:
        tag_name = el.name
        if re_html_heading.match(tag_name) is not None:
            return True
        block_level_tags = (
            'p', 'blockquote',
            'article', 'div', 'section',
            'ol', 'ul', 'li',
            'dl', 'dt', 'dd',
            'table', 'thead', 'tbody', 'tfoot',
            'tr', 'td', 'th',
        )
        return tag_name in block_level_tags
    return False

134 

135 

def should_remove_whitespace_outside(el):
    """Return whether whitespace immediately outside el should be removed.

    This holds for every element whose inner whitespace is removed, and
    additionally for <pre>. The result is truthy/falsy rather than a strict
    bool when el is falsy.
    """
    if should_remove_whitespace_inside(el):
        return True
    return el and el.name == 'pre'

139 

140 

def _is_block_content_element(el):
    """
    Tell whether el counts as content in a block context.

    - Tags and text containing non-whitespace characters are content (True).
    - Whitespace-only text, comments, doctypes and anything else are not
      content (False).
    """
    if isinstance(el, Tag):
        return True
    # Comment and Doctype are subclasses of NavigableString, so they have to
    # be excluded explicitly before treating el as plain text.
    is_plain_text = (
        isinstance(el, NavigableString)
        and not isinstance(el, (Comment, Doctype))
    )
    return is_plain_text and el.strip() != ''

156 

157 

def _prev_block_content_sibling(el):
    """Return the nearest preceding sibling that is a content element, or None."""
    if el is None:
        return None
    sibling = el.previous_sibling
    # Walk backwards past whitespace text, comments and doctypes.
    while sibling is not None and not _is_block_content_element(sibling):
        sibling = sibling.previous_sibling
    return sibling

165 

166 

def _next_block_content_sibling(el):
    """Return the nearest following sibling that is a content element, or None."""
    if el is None:
        return None
    sibling = el.next_sibling
    # Walk forwards past whitespace text, comments and doctypes.
    while sibling is not None and not _is_block_content_element(sibling):
        sibling = sibling.next_sibling
    return sibling

174 

175 

class MarkdownConverter(object):
    """Convert an HTML document or fragment to Markdown.

    The HTML is parsed with BeautifulSoup and walked recursively. For every
    tag, the already-converted text of its children is joined and passed to a
    ``convert_<tag>`` method (looked up by tag name), which returns the
    Markdown for that tag. Tags without a conversion method contribute only
    the text of their children.

    Options are given as keyword arguments to the constructor. Their defaults
    come from ``DefaultOptions`` and can be overridden in a subclass through
    the nested ``Options`` class.
    """

    class DefaultOptions:
        # Emit '<url>' when a link's text equals its href (see convert_a).
        autolinks = True
        # BeautifulSoup 'features' value, or a dict of BeautifulSoup kwargs.
        bs4_options = 'html.parser'
        bullets = '*+-'  # An iterable of bullet types.
        # Language tag written after the opening fence of <pre> blocks.
        code_language = ''
        # Optional callable taking the <pre> element; a truthy result
        # overrides code_language.
        code_language_callback = None
        # Tags to convert (all others are left unconverted); mutually
        # exclusive with 'strip'.
        convert = None
        # Use the href as the link title when no title attribute is given.
        default_title = False
        escape_asterisks = True
        escape_underscores = True
        escape_misc = False
        heading_style = UNDERLINED
        # Parent tag names in which images are kept as images even in an
        # inline context (headings, table cells).
        keep_inline_images_in = []
        newline_style = SPACES
        # Tags to leave unconverted; mutually exclusive with 'convert'.
        strip = None
        strip_document = STRIP
        strip_pre = STRIP
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
        # Treat the first table row as the header when no header row exists.
        table_infer_header = False
        wrap = False
        wrap_width = 80

    class Options(DefaultOptions):
        pass

    # Attribute names that fit the ``convert_<tag>`` naming scheme but are not
    # tag conversion functions: ``convert_soup`` is an API method with a
    # different signature and ``convert_fn_cache`` is the per-instance cache.
    # Without this exclusion the tags <soup> and <fn_cache> would resolve to
    # them and fail with a TypeError when "called" as converters.
    _RESERVED_CONVERT_NAMES = frozenset(['convert_soup', 'convert_fn_cache'])

    def __init__(self, **options):
        """Build the options dictionary and the conversion function cache.

        :raises ValueError: if both 'strip' and 'convert' are specified
        """
        # Create an options dictionary. Use DefaultOptions as a base so that
        # it doesn't have to be extended.
        self.options = _todict(self.DefaultOptions)
        self.options.update(_todict(self.Options))
        self.options.update(options)
        if self.options['strip'] is not None and self.options['convert'] is not None:
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

        # If a string or list is passed to bs4_options, assume it is a 'features' specification
        if not isinstance(self.options['bs4_options'], dict):
            self.options['bs4_options'] = {'features': self.options['bs4_options']}

        # Initialize the conversion function cache
        self.convert_fn_cache = {}

    def convert(self, html):
        """Parse the HTML string and return its Markdown conversion."""
        soup = BeautifulSoup(html, **self.options['bs4_options'])
        return self.convert_soup(soup)

    def convert_soup(self, soup):
        """Return the Markdown conversion of an already-parsed BeautifulSoup object."""
        return self.process_tag(soup, parent_tags=set())

    def process_element(self, node, parent_tags=None):
        """Convert a text node or a tag, dispatching on the node type."""
        if isinstance(node, NavigableString):
            return self.process_text(node, parent_tags=parent_tags)
        else:
            return self.process_tag(node, parent_tags=parent_tags)

    def process_tag(self, node, parent_tags=None):
        """Convert a tag: convert its children, join them, then apply the
        tag's own conversion function (if any).

        :param node: the tag (or BeautifulSoup document) to convert
        :param parent_tags: set of names of the enclosing tags, including the
            pseudo-tags '_inline' and '_noformat'; None at the top level
        :returns: the Markdown text for this tag
        """
        # For the top-level element, initialize the parent context with an empty set.
        if parent_tags is None:
            parent_tags = set()

        # Collect child elements to process, ignoring whitespace-only text elements
        # adjacent to the inner/outer boundaries of block elements.
        should_remove_inside = should_remove_whitespace_inside(node)

        def _can_ignore(el):
            if isinstance(el, Tag):
                # Tags are always processed.
                return False
            elif isinstance(el, (Comment, Doctype)):
                # Comment and Doctype elements are always ignored.
                # (subclasses of NavigableString, must test first)
                return True
            elif isinstance(el, NavigableString):
                if six.text_type(el).strip() != '':
                    # Non-whitespace text nodes are always processed.
                    return False
                elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):
                    # Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
                    return True
                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
                    # Outside block elements (including <pre>), ignore adjacent whitespace elements.
                    return True
                else:
                    return False
            elif el is None:
                return True
            else:
                raise ValueError('Unexpected element type: %s' % type(el))

        children_to_convert = [el for el in node.children if not _can_ignore(el)]

        # Create a copy of this tag's parent context, then update it to include this tag
        # to propagate down into the children.
        parent_tags_for_children = set(parent_tags)
        parent_tags_for_children.add(node.name)

        # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
        if (
            re_html_heading.match(node.name) is not None  # headings
            or node.name in {'td', 'th'}  # table cells
        ):
            parent_tags_for_children.add('_inline')

        # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
        if node.name in {'pre', 'code', 'kbd', 'samp'}:
            parent_tags_for_children.add('_noformat')

        # Convert the children elements into a list of result strings.
        child_strings = [
            self.process_element(el, parent_tags=parent_tags_for_children)
            for el in children_to_convert
        ]

        # Remove empty string values.
        child_strings = [s for s in child_strings if s]

        # Collapse newlines at child element boundaries, if needed.
        if node.name == 'pre' or node.find_parent('pre'):
            # Inside <pre> blocks, do not collapse newlines.
            pass
        else:
            # Collapse newlines at child element boundaries.
            updated_child_strings = ['']  # so the first lookback works
            for child_string in child_strings:
                # Separate the leading/trailing newlines from the content.
                leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()

                # If the last child had trailing newlines and this child has leading newlines,
                # use the larger newline count, limited to 2.
                if updated_child_strings[-1] and leading_nl:
                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
                    num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
                    leading_nl = '\n' * num_newlines

                # Add the results to the updated child string list.
                updated_child_strings.extend([leading_nl, content, trailing_nl])

            child_strings = updated_child_strings

        # Join all child text strings into a single string.
        text = ''.join(child_strings)

        # apply this tag's final conversion function
        convert_fn = self.get_conv_fn_cached(node.name)
        if convert_fn is not None:
            text = convert_fn(node, text, parent_tags=parent_tags)

        return text

    def convert__document_(self, el, text, parent_tags):
        """Final document-level formatting for BeautifulSoup object (node.name == "[document]")

        :raises ValueError: if the 'strip_document' option has an unknown value
        """
        if self.options['strip_document'] == LSTRIP:
            text = text.lstrip('\n')  # remove leading separation newlines
        elif self.options['strip_document'] == RSTRIP:
            text = text.rstrip('\n')  # remove trailing separation newlines
        elif self.options['strip_document'] == STRIP:
            text = text.strip('\n')  # remove leading and trailing separation newlines
        elif self.options['strip_document'] is None:
            pass  # leave leading and trailing separation newlines as-is
        else:
            raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])

        return text

    def process_text(self, el, parent_tags=None):
        """Convert a text node: normalize whitespace, escape special
        characters and trim whitespace next to block-level boundaries.

        :param el: the text node
        :param parent_tags: set of names of the enclosing tags; None at the
            top level
        :returns: the Markdown text for this node
        """
        # For the top-level element, initialize the parent context with an empty set.
        if parent_tags is None:
            parent_tags = set()

        text = six.text_type(el) or ''

        # normalize whitespace if we're not inside a preformatted element
        if 'pre' not in parent_tags:
            if self.options['wrap']:
                text = re_all_whitespace.sub(' ', text)
            else:
                text = re_newline_whitespace.sub('\n', text)
                text = re_whitespace.sub(' ', text)

        # escape special characters if we're not inside a preformatted or code element
        if '_noformat' not in parent_tags:
            text = self.escape(text, parent_tags)

        # remove leading whitespace at the start or just after a
        # block-level element; remove trailing whitespace at the end
        # or just before a block-level element.
        if (should_remove_whitespace_outside(el.previous_sibling)
                or (should_remove_whitespace_inside(el.parent)
                    and not el.previous_sibling)):
            text = text.lstrip(' \t\r\n')
        if (should_remove_whitespace_outside(el.next_sibling)
                or (should_remove_whitespace_inside(el.parent)
                    and not el.next_sibling)):
            # NOTE(review): rstrip() without arguments also removes non-ASCII
            # whitespace (e.g. a non-breaking space), unlike the lstrip above;
            # kept as-is to preserve existing output.
            text = text.rstrip()

        return text

    def get_conv_fn_cached(self, tag_name):
        """Given a tag name, return the conversion function using the cache."""
        # If conversion function is not in cache, add it
        if tag_name not in self.convert_fn_cache:
            self.convert_fn_cache[tag_name] = self.get_conv_fn(tag_name)

        # Return the cached entry
        return self.convert_fn_cache[tag_name]

    def get_conv_fn(self, tag_name):
        """Given a tag name, find and return the conversion function.

        :returns: a callable taking (el, text, parent_tags), or None if the
            tag is excluded by the strip/convert options or has no converter
        """
        tag_name = tag_name.lower()

        # Handle strip/convert exclusion options
        if not self.should_convert_tag(tag_name):
            return None

        # Look for an explicitly defined conversion function by tag name first
        convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
        # Skip attributes that only look like converters (see
        # _RESERVED_CONVERT_NAMES); such tags are handled like unknown tags.
        if convert_fn_name not in self._RESERVED_CONVERT_NAMES:
            convert_fn = getattr(self, convert_fn_name, None)
            if convert_fn:
                return convert_fn

        # If tag is any heading, handle with convert_hN() function
        match = re_html_heading.match(tag_name)
        if match:
            n = int(match.group(1))  # get value of N from <hN>
            return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)

        # No conversion function was found
        return None

    def should_convert_tag(self, tag):
        """Given a tag name, return whether to convert based on strip/convert options."""
        strip = self.options['strip']
        convert = self.options['convert']
        if strip is not None:
            return tag not in strip
        elif convert is not None:
            return tag in convert
        else:
            return True

    def escape(self, text, parent_tags):
        """Backslash-escape Markdown special characters in text, according to
        the escape_misc, escape_asterisks and escape_underscores options."""
        if not text:
            return ''
        if self.options['escape_misc']:
            text = re_escape_misc_chars.sub(r'\\\1', text)
            text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)
            text = re_escape_misc_hashes.sub(r'\1\\\2', text)
            text = re_escape_misc_list_items.sub(r'\1\\\2', text)

        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text

    def underline(self, text, pad_char):
        """Return text as an underlined heading block: the right-stripped
        text followed by a line of pad_char of the same length, or '' if the
        text is empty."""
        text = (text or '').rstrip()
        return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

    def convert_a(self, el, text, parent_tags):
        """Convert a link to '[text](href "title")', or to '<href>' when the
        autolink shortcut applies; without an href only the text is returned."""
        if '_noformat' in parent_tags:
            return text
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')
        # For the replacement see #29: text nodes underscores are escaped
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax
            return '<%s>' % href
        if self.options['default_title'] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

    def convert_blockquote(self, el, text, parent_tags):
        """Convert a blockquote by prefixing every line with '> ' ('>' for
        empty lines)."""
        # handle some early-exit scenarios
        text = (text or '').strip(' \t\r\n')
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return "\n"

        # indent lines with blockquote marker
        def _indent_for_blockquote(match):
            line_content = match.group(1)
            return '> ' + line_content if line_content else '>'
        text = re_line_with_content.sub(_indent_for_blockquote, text)

        return '\n' + text + '\n\n'

    def convert_br(self, el, text, parent_tags):
        """Convert a line break according to the newline_style option; in an
        inline context it becomes a single space."""
        if '_inline' in parent_tags:
            return ' '

        if self.options['newline_style'].lower() == BACKSLASH:
            return '\\\n'
        else:
            # Markdown hard line break: two trailing spaces, then a newline.
            return '  \n'

    def convert_code(self, el, text, parent_tags):
        """Convert inline code to a backtick-delimited code span, using a
        delimiter longer than any backtick run contained in the text."""
        if '_noformat' in parent_tags:
            return text

        prefix, suffix, text = chomp(text)
        if not text:
            return ''

        # Find the maximum number of consecutive backticks in the text, then
        # delimit the code span with one more backtick than that
        max_backticks = max((len(match) for match in re_backtick_runs.findall(text)), default=0)
        markup_delimiter = '`' * (max_backticks + 1)

        # If the maximum number of backticks is greater than zero, add a space
        # to avoid interpretation of inside backticks as literals
        if max_backticks > 0:
            text = " " + text + " "

        return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)

    convert_del = abstract_inline_conversion(lambda self: '~~')

    def convert_div(self, el, text, parent_tags):
        """Convert a generic block container to a paragraph-separated block."""
        if '_inline' in parent_tags:
            return ' ' + text.strip() + ' '
        text = text.strip()
        return '\n\n%s\n\n' % text if text else ''

    convert_article = convert_div

    convert_section = convert_div

    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

    convert_kbd = convert_code

    def convert_dd(self, el, text, parent_tags):
        """Convert a definition description: lines are indented by four
        spaces and the first line starts with a ':' marker."""
        text = (text or '').strip()
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return '\n'

        # indent definition content lines by four spaces
        def _indent_for_dd(match):
            line_content = match.group(1)
            return '    ' + line_content if line_content else ''
        text = re_line_with_content.sub(_indent_for_dd, text)

        # insert definition marker into first-line indent whitespace
        text = ':' + text[1:]

        return '%s\n' % text

    # definition lists are formatted as follows:
    #   https://pandoc.org/MANUAL.html#definition-lists
    #   https://michelf.ca/projects/php-markdown/extra/#def-list
    convert_dl = convert_div

    def convert_dt(self, el, text, parent_tags):
        """Convert a definition term to a single line of text."""
        # remove newlines from term text
        text = (text or '').strip()
        text = re_all_whitespace.sub(' ', text)
        if '_inline' in parent_tags:
            return ' ' + text + ' '
        if not text:
            return '\n'

        # TODO - format consecutive <dt> elements as directly adjacent lines):
        #   https://michelf.ca/projects/php-markdown/extra/#def-list

        return '\n\n%s\n' % text

    def convert_hN(self, n, el, text, parent_tags):
        """Convert a heading of level n according to the heading_style option."""
        # convert_hN() converts <hN> tags, where N is any integer
        if '_inline' in parent_tags:
            return text

        # Markdown does not support heading depths of n > 6
        n = max(1, min(6, n))

        style = self.options['heading_style'].lower()
        text = text.strip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
        text = re_all_whitespace.sub(' ', text)
        hashes = '#' * n
        if style == ATX_CLOSED:
            return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
        return '\n\n%s %s\n\n' % (hashes, text)

    def convert_hr(self, el, text, parent_tags):
        """Convert a horizontal rule."""
        return '\n\n---\n\n'

    convert_i = convert_em

    def convert_img(self, el, text, parent_tags):
        """Convert an image to '![alt](src "title")'; in an inline context
        only the alt text is kept, unless the parent tag is listed in
        keep_inline_images_in."""
        alt = el.attrs.get('alt', None) or ''
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        if ('_inline' in parent_tags
                and el.parent.name not in self.options['keep_inline_images_in']):
            return alt

        return '![%s](%s%s)' % (alt, src, title_part)

    def convert_video(self, el, text, parent_tags):
        """Convert a video to a link to its source, using the poster (if any)
        as a linked image; the source falls back to the first <source src>."""
        if ('_inline' in parent_tags
                and el.parent.name not in self.options['keep_inline_images_in']):
            return text
        src = el.attrs.get('src', None) or ''
        if not src:
            sources = el.find_all('source', attrs={'src': True})
            if sources:
                src = sources[0].attrs.get('src', None) or ''
        poster = el.attrs.get('poster', None) or ''
        if src and poster:
            return '[![%s](%s)](%s)' % (text, poster, src)
        if src:
            return '[%s](%s)' % (text, src)
        if poster:
            return '![%s](%s)' % (text, poster)
        return text

    def convert_list(self, el, text, parent_tags):
        """Convert an ordered or unordered list (its items are already
        converted), adjusting the surrounding newlines."""

        # Converting a list to inline is undefined.
        # Ignoring inline conversion parents for list.

        before_paragraph = False
        next_sibling = _next_block_content_sibling(el)
        if next_sibling and next_sibling.name not in ['ul', 'ol']:
            before_paragraph = True
        if 'li' in parent_tags:
            # remove trailing newline if we're in a nested list
            return '\n' + text.rstrip()
        return '\n\n' + text + ('\n' if before_paragraph else '')

    convert_ul = convert_list
    convert_ol = convert_list

    def convert_li(self, el, text, parent_tags):
        """Convert a list item: prefix it with a number or bullet and indent
        continuation lines by the width of that prefix."""
        # handle some early-exit scenarios
        text = (text or '').strip()
        if not text:
            return "\n"

        # determine list item bullet character to use
        parent = el.parent
        if parent is not None and parent.name == 'ol':
            start = 1
            start_attr = parent.get("start")
            if start_attr and str(start_attr).isnumeric():
                try:
                    start = int(start_attr)
                except ValueError:
                    # isnumeric() also accepts characters such as '½' or '²'
                    # that int() cannot parse, and int() rejects over-long
                    # digit strings; fall back to the default start.
                    start = 1
            bullet = '%s.' % (start + len(el.find_previous_siblings('li')))
        else:
            depth = -1
            while el:
                if el.name == 'ul':
                    depth += 1
                el = el.parent
            bullets = self.options['bullets']
            bullet = bullets[depth % len(bullets)]
        bullet = bullet + ' '
        bullet_width = len(bullet)
        bullet_indent = ' ' * bullet_width

        # indent content lines by bullet width
        def _indent_for_li(match):
            line_content = match.group(1)
            return bullet_indent + line_content if line_content else ''
        text = re_line_with_content.sub(_indent_for_li, text)

        # insert bullet into first-line indent whitespace
        text = bullet + text[bullet_width:]

        return '%s\n' % text

    def convert_p(self, el, text, parent_tags):
        """Convert a paragraph, re-wrapping its lines when the wrap option is
        enabled and wrap_width is not None."""
        if '_inline' in parent_tags:
            return ' ' + text.strip(' \t\r\n') + ' '
        text = text.strip(' \t\r\n')
        if self.options['wrap']:
            # Preserve newlines (and preceding whitespace) resulting
            # from <br> tags.  Newlines in the input have already been
            # replaced by spaces.
            if self.options['wrap_width'] is not None:
                lines = text.split('\n')
                new_lines = []
                for line in lines:
                    line = line.lstrip(' \t\r\n')
                    line_no_trailing = line.rstrip()
                    trailing = line[len(line_no_trailing):]
                    line = fill(line,
                                width=self.options['wrap_width'],
                                break_long_words=False,
                                break_on_hyphens=False)
                    new_lines.append(line + trailing)
                text = '\n'.join(new_lines)
        return '\n\n%s\n\n' % text if text else ''

    def convert_pre(self, el, text, parent_tags):
        """Convert preformatted text to a fenced code block.

        :raises ValueError: if the 'strip_pre' option has an unknown value
        """
        if not text:
            return ''
        code_language = self.options['code_language']

        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language

        if self.options['strip_pre'] == STRIP:
            text = strip_pre(text)  # remove all leading/trailing newlines
        elif self.options['strip_pre'] == STRIP_ONE:
            text = strip1_pre(text)  # remove one leading/trailing newline
        elif self.options['strip_pre'] is None:
            pass  # leave leading and trailing newlines as-is
        else:
            raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])

        return '\n\n```%s\n%s\n```\n\n' % (code_language, text)

    def convert_q(self, el, text, parent_tags):
        """Convert an inline quotation by wrapping it in double quotes."""
        return '"' + text + '"'

    def convert_script(self, el, text, parent_tags):
        """Drop script content."""
        return ''

    def convert_style(self, el, text, parent_tags):
        """Drop style content."""
        return ''

    convert_s = convert_del

    convert_strong = convert_b

    convert_samp = convert_code

    convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])

    convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

    def convert_table(self, el, text, parent_tags):
        """Convert a table (its rows are already converted) to a block."""
        return '\n\n' + text.strip() + '\n\n'

    def convert_caption(self, el, text, parent_tags):
        """Convert a table caption to a line of text followed by a blank line."""
        return text.strip() + '\n\n'

    def convert_figcaption(self, el, text, parent_tags):
        """Convert a figure caption to a paragraph-separated block."""
        return '\n\n' + text.strip() + '\n\n'

    @staticmethod
    def _get_colspan(cell):
        """Return the colspan of a table cell, clamped to the range 1..1000.

        A missing or unparsable colspan counts as 1. str.isdigit() accepts
        characters such as '²' that int() rejects, and int() rejects over-long
        digit strings, so the conversion itself is guarded as well.
        """
        if 'colspan' in cell.attrs and cell['colspan'].isdigit():
            try:
                return max(1, min(1000, int(cell['colspan'])))
            except ValueError:
                return 1
        return 1

    def convert_td(self, el, text, parent_tags):
        """Convert a table cell to ' text |', adding one extra ' |' for each
        additional spanned column."""
        colspan = self._get_colspan(el)
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

    # Header cells are rendered exactly like data cells; the header
    # separator line is produced by convert_tr.
    convert_th = convert_td

    def convert_tr(self, el, text, parent_tags):
        """Convert a table row, adding the header separator line below a
        header row or an empty header above a first row without header."""
        cells = el.find_all(['td', 'th'])
        is_first_row = el.find_previous_sibling() is None
        is_headrow = (
            all([cell.name == 'th' for cell in cells])
            or (el.parent.name == 'thead'
                # avoid multiple tr in thead
                and len(el.parent.find_all('tr')) == 1)
        )
        is_head_row_missing = (
            (is_first_row and not el.parent.name == 'tbody')
            or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
        )
        overline = ''
        underline = ''
        # Total number of columns covered by this row's cells.
        full_colspan = sum(self._get_colspan(cell) for cell in cells)
        if ((is_headrow
             or (is_head_row_missing
                 and self.options['table_infer_header']))
                and is_first_row):
            # first row and:
            # - is headline or
            # - headline is missing and header inference is enabled
            # print headline underline
            underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
        elif ((is_head_row_missing
               and not self.options['table_infer_header'])
              or (is_first_row
                  and (el.parent.name == 'table'
                       or (el.parent.name == 'tbody'
                           and not el.parent.find_previous_sibling())))):
            # headline is missing and header inference is disabled or:
            # first row, not headline, and:
            #  - the parent is table or
            #  - the parent is tbody at the beginning of a table.
            # print empty headline above this row
            overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n'
            overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
        return overline + '|' + text + '\n' + underline

791 

792 

def markdownify(html, **options):
    """Convert an HTML string to Markdown.

    The keyword options are passed unchanged to MarkdownConverter.
    """
    converter = MarkdownConverter(**options)
    return converter.convert(html)