Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdownify/__init__.py: 96%

251 statements  

« prev     ^ index     » next       coverage.py v7.3.1, created at 2023-09-25 06:04 +0000

1from bs4 import BeautifulSoup, NavigableString, Comment, Doctype 

2from textwrap import fill 

3import re 

4import six 

5 

6 

# Pre-compiled patterns shared by the converter.

# Dynamic heading converter names such as ``convert_h3``; group 1 is the level.
convert_heading_re = re.compile(r'convert_h(\d+)')

# Zero-width match at the start of every line; used to prefix/indent lines.
line_beginning_re = re.compile(r'^', re.M)

# Runs of horizontal whitespace (tabs and spaces only, no newlines).
whitespace_re = re.compile(r'[\t ]+')

# Runs of any whitespace, newlines included.
all_whitespace_re = re.compile(r'[\s]+')

# HTML heading tag names (h1 .. h6).
html_heading_re = re.compile(r'h[1-6]')

12 

13 

# Heading styles (values accepted by the ``heading_style`` option).
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
# SETEXT is an alias of UNDERLINED; both names refer to the same value.
UNDERLINED = SETEXT = 'underlined'

# Newline styles (values accepted by the ``newline_style`` option).
SPACES = 'spaces'
BACKSLASH = 'backslash'

# Strong and emphasis marker characters (``strong_em_symbol`` option).
ASTERISK = '*'
UNDERSCORE = '_'

27 

28 

def chomp(text):
    """
    Split the text of an inline tag (b, a, em, ...) into outer spacing and
    stripped content.

    Returns a ``(prefix, suffix, text)`` tuple: ``prefix`` / ``suffix`` are a
    single space when ``text`` starts / ends with a space character and the
    empty string otherwise; ``text`` is the input with all surrounding
    whitespace stripped. Callers put the markup around the stripped text and
    the spaces outside of it, which prevents conversions like
        <b> foo</b> => ** foo**
    """
    leading = ' ' if text and text.startswith(' ') else ''
    trailing = ' ' if text and text.endswith(' ') else ''
    stripped = text.strip()
    return (leading, trailing, stripped)

40 

41 

def abstract_inline_conversion(markup_fn):
    """
    Build a converter method for a simple inline tag (b, em, del, ...).

    ``markup_fn`` receives the converter instance and returns the markup
    string; it is a callable rather than a plain string so the markup can
    depend on instance options such as ``self.options['strong_em_symbol']``.

    The returned function has the usual converter signature
    ``(self, el, text, convert_as_inline)``. It wraps the chomped text in a
    pair of the markup string, keeps a leading/trailing space outside the
    markup, and returns ``''`` when nothing is left after stripping.
    """
    def implementation(self, el, text, convert_as_inline):
        # Resolve the markup first, exactly once per call.
        wrapper = markup_fn(self)
        lead, trail, body = chomp(text)
        if body:
            return '{}{}{}{}{}'.format(lead, wrapper, body, wrapper, trail)
        return ''

    return implementation

56 

57 

58def _todict(obj): 

59 return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_')) 

60 

61 

class MarkdownConverter(object):
    """
    Convert HTML to Markdown by walking a BeautifulSoup tree.

    For every tag ``<foo>`` the converter looks up a method named
    ``convert_foo(el, text, convert_as_inline)`` where ``el`` is the tag,
    ``text`` is the already-converted content of its children and
    ``convert_as_inline`` tells the method that block-level output (newlines)
    is not allowed in the current context. Tags without such a method
    contribute only the converted text of their children.

    Options are collected from ``DefaultOptions``, then the nested ``Options``
    class (meant to be overridden in subclasses), then the keyword arguments
    passed to the constructor.
    """

    class DefaultOptions:
        autolinks = True
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
        convert = None
        default_title = False
        escape_asterisks = True
        escape_underscores = True
        heading_style = UNDERLINED
        keep_inline_images_in = []  # only read by this class, never mutated
        newline_style = SPACES
        strip = None
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
        wrap = False
        wrap_width = 80

    class Options(DefaultOptions):
        pass

    def __init__(self, **options):
        """
        Build the effective option dictionary.

        Raises ValueError when both ``strip`` and ``convert`` are given,
        because the two options are mutually exclusive.
        """
        # Create an options dictionary. Use DefaultOptions as a base so that
        # it doesn't have to be extended.
        self.options = _todict(self.DefaultOptions)
        self.options.update(_todict(self.Options))
        self.options.update(options)
        if self.options['strip'] is not None and self.options['convert'] is not None:
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

    def convert(self, html):
        """Parse ``html`` with BeautifulSoup's ``html.parser`` and return Markdown."""
        soup = BeautifulSoup(html, 'html.parser')
        return self.convert_soup(soup)

    def convert_soup(self, soup):
        """Convert an already parsed BeautifulSoup tree to Markdown."""
        return self.process_tag(soup, convert_as_inline=False, children_only=True)

    def process_tag(self, node, convert_as_inline, children_only=False):
        """
        Convert ``node`` and everything below it.

        The children are converted first and concatenated; unless
        ``children_only`` is true, the result is then passed to the node's own
        ``convert_<name>`` method (if one exists and the tag is not excluded
        by the ``strip`` / ``convert`` options).
        """
        # markdown headings or cells can't include
        # block elements (elements w/newlines)
        is_heading = html_heading_re.match(node.name) is not None
        is_cell = node.name in ['td', 'th']
        convert_children_as_inline = convert_as_inline

        if not children_only and (is_heading or is_cell):
            convert_children_as_inline = True

        # Remove whitespace-only textnodes in purely nested nodes
        def is_nested_node(el):
            return el and el.name in ['ol', 'ul', 'li',
                                      'table', 'thead', 'tbody', 'tfoot',
                                      'tr', 'td', 'th']

        if is_nested_node(node):
            # Iterate over a snapshot: extract() removes the element from the
            # parent's child list, and mutating that list while iterating it
            # directly would make the iterator skip the following sibling.
            for el in list(node.children):
                # Only extract (remove) whitespace-only text node if any of the
                # conditions is true:
                # - el is the first element in its parent
                # - el is the last element in its parent
                # - el is adjacent to an nested node
                can_extract = (not el.previous_sibling
                               or not el.next_sibling
                               or is_nested_node(el.previous_sibling)
                               or is_nested_node(el.next_sibling))
                if (isinstance(el, NavigableString)
                        and six.text_type(el).strip() == ''
                        and can_extract):
                    el.extract()

        # Convert the children first
        parts = []
        for el in node.children:
            if isinstance(el, (Comment, Doctype)):
                continue
            elif isinstance(el, NavigableString):
                parts.append(self.process_text(el))
            else:
                parts.append(self.process_tag(el, convert_children_as_inline))
        text = ''.join(parts)

        if not children_only:
            convert_fn = getattr(self, 'convert_%s' % node.name, None)
            if convert_fn and self.should_convert_tag(node.name):
                text = convert_fn(node, text, convert_as_inline)

        return text

    def process_text(self, el):
        """
        Convert a text node: collapse runs of spaces/tabs, escape Markdown
        characters and trim trailing whitespace at the end of list items.
        Text inside ``<pre>`` / ``<code>`` is left unescaped, and whitespace
        is kept verbatim inside ``<pre>`` and ``<pre><code>``.
        """
        text = six.text_type(el) or ''

        # dont remove any whitespace when handling pre or code in pre
        if not (el.parent.name == 'pre'
                or (el.parent.name == 'code'
                    and el.parent.parent.name == 'pre')):
            text = whitespace_re.sub(' ', text)

        if el.parent.name != 'code' and el.parent.name != 'pre':
            text = self.escape(text)

        # remove trailing whitespaces if any of the following condition is true:
        # - current text node is the last node in li
        # - current text node is followed by an embedded list
        if (el.parent.name == 'li'
                and (not el.next_sibling
                     or el.next_sibling.name in ['ul', 'ol'])):
            text = text.rstrip()

        return text

    def __getattr__(self, attr):
        """
        Create ``convert_h<N>`` methods on demand.

        Only invoked for attributes that are not found normally. A generated
        heading converter is stored on the instance so later lookups bypass
        this hook. Any other missing attribute raises AttributeError.
        """
        # Handle headings
        m = convert_heading_re.match(attr)
        if m:
            n = int(m.group(1))

            def convert_tag(el, text, convert_as_inline):
                return self.convert_hn(n, el, text, convert_as_inline)

            convert_tag.__name__ = 'convert_h%s' % n
            setattr(self, convert_tag.__name__, convert_tag)
            return convert_tag

        raise AttributeError(attr)

    def should_convert_tag(self, tag):
        """Return whether ``tag`` may be converted given ``strip`` / ``convert``."""
        tag = tag.lower()
        strip = self.options['strip']
        convert = self.options['convert']
        if strip is not None:
            return tag not in strip
        elif convert is not None:
            return tag in convert
        else:
            return True

    def escape(self, text):
        """Backslash-escape ``*`` and ``_`` according to the escape options."""
        if not text:
            return ''
        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text

    def indent(self, text, level):
        """Prefix every line of ``text`` with ``level`` tab characters."""
        return line_beginning_re.sub('\t' * level, text) if text else ''

    def underline(self, text, pad_char):
        """Return ``text`` followed by a line of ``pad_char`` of equal length."""
        text = (text or '').rstrip()
        return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

    def convert_a(self, el, text, convert_as_inline):
        """Render a link; uses ``<url>`` shortcut syntax when text equals href."""
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')
        # For the replacement see #29: text nodes underscores are escaped
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax
            return '<%s>' % href
        if self.options['default_title'] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

    def convert_blockquote(self, el, text, convert_as_inline):
        """Prefix every line with ``> ``; inline contexts get the bare text."""
        if convert_as_inline:
            return text
        if not text:
            return ''
        return '\n' + (line_beginning_re.sub('> ', text) + '\n\n')

    def convert_br(self, el, text, convert_as_inline):
        """Render a hard line break in the configured ``newline_style``."""
        if convert_as_inline:
            return ""

        if self.options['newline_style'].lower() == BACKSLASH:
            return '\\\n'
        else:
            return '  \n'

    def convert_code(self, el, text, convert_as_inline):
        """Wrap inline code in backticks; code inside ``<pre>`` is passed through."""
        if el.parent.name == 'pre':
            return text
        converter = abstract_inline_conversion(lambda self: '`')
        return converter(self, el, text, convert_as_inline)

    convert_del = abstract_inline_conversion(lambda self: '~~')

    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

    convert_kbd = convert_code

    def convert_hn(self, n, el, text, convert_as_inline):
        """
        Render a heading of level ``n`` in the configured ``heading_style``.

        ``n`` comes from the tag name, so it is clamped to the 1..6 range that
        Markdown supports before any ``#`` run is built.
        """
        if convert_as_inline:
            return text

        # The level is parsed from arbitrary input (e.g. ``<h999999999>``);
        # without a bound, '#' * n yields invalid Markdown and can allocate
        # an enormous string.
        n = max(1, min(6, n))

        style = self.options['heading_style'].lower()
        text = text.rstrip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
        hashes = '#' * n
        if style == ATX_CLOSED:
            return '%s %s %s\n\n' % (hashes, text, hashes)
        return '%s %s\n\n' % (hashes, text)

    def convert_hr(self, el, text, convert_as_inline):
        """Render a horizontal rule."""
        return '\n\n---\n\n'

    convert_i = convert_em

    def convert_img(self, el, text, convert_as_inline):
        """
        Render an image. In inline contexts only the alt text is emitted,
        unless the parent tag is listed in ``keep_inline_images_in``.
        """
        alt = el.attrs.get('alt', None) or ''
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        if (convert_as_inline
                and el.parent.name not in self.options['keep_inline_images_in']):
            return alt

        return '![%s](%s%s)' % (alt, src, title_part)

    def convert_list(self, el, text, convert_as_inline):
        """
        Render a ``<ul>`` / ``<ol>``. A list nested inside an ``<li>`` is
        indented by one tab; a top-level list followed by a non-list sibling
        gets an extra trailing newline.
        """
        # Converting a list to inline is undefined.
        # Ignoring convert_to_inline for list.

        nested = False
        before_paragraph = False
        if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
            before_paragraph = True
        # Walk up the ancestors looking for an enclosing list item.
        while el:
            if el.name == 'li':
                nested = True
                break
            el = el.parent
        if nested:
            # remove trailing newline if nested
            return '\n' + self.indent(text, 1).rstrip()
        return text + ('\n' if before_paragraph else '')

    convert_ul = convert_list
    convert_ol = convert_list

    def convert_li(self, el, text, convert_as_inline):
        """
        Render a list item. Items of an ``<ol>`` are numbered from the list's
        ``start`` attribute (default 1); all other items get a bullet chosen
        from the ``bullets`` option by ``<ul>`` nesting depth.
        """
        parent = el.parent
        if parent is not None and parent.name == 'ol':
            # ``start`` is untrusted markup: fall back to 1 instead of letting
            # a non-numeric value abort the whole conversion.
            try:
                start = int(parent.get("start") or 1)
            except (TypeError, ValueError):
                start = 1
            # Count only preceding <li> siblings so comments or stray text
            # inside the <ol> do not shift the numbering.
            bullet = '%s.' % (start + len(el.find_previous_siblings('li')))
        else:
            depth = -1
            while el:
                if el.name == 'ul':
                    depth += 1
                el = el.parent
            bullets = self.options['bullets']
            bullet = bullets[depth % len(bullets)]
        return '%s %s\n' % (bullet, (text or '').strip())

    def convert_p(self, el, text, convert_as_inline):
        """Render a paragraph, optionally re-wrapped to ``wrap_width`` columns."""
        if convert_as_inline:
            return text
        if self.options['wrap']:
            text = fill(text,
                        width=self.options['wrap_width'],
                        break_long_words=False,
                        break_on_hyphens=False)
        return '%s\n\n' % text if text else ''

    def convert_pre(self, el, text, convert_as_inline):
        """
        Render a fenced code block. The language tag is the result of
        ``code_language_callback(el)`` when that is set and returns a truthy
        value, otherwise the ``code_language`` option.
        """
        if not text:
            return ''
        code_language = self.options['code_language']

        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language

        return '\n```%s\n%s\n```\n' % (code_language, text)

    convert_s = convert_del

    convert_strong = convert_b

    convert_samp = convert_code

    convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])

    convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

    def convert_table(self, el, text, convert_as_inline):
        """Surround the converted rows with blank lines."""
        return '\n\n' + text + '\n'

    def convert_td(self, el, text, convert_as_inline):
        """Render one table cell; the row converter adds the leading pipe."""
        return ' ' + text + ' |'

    # Header cells are rendered exactly like data cells.
    convert_th = convert_td

    def convert_tr(self, el, text, convert_as_inline):
        """
        Render a table row. A first row made only of ``<th>`` cells is
        followed by the ``---`` separator line; a first row that is not a
        header row gets an empty header plus separator emitted above it.
        """
        cells = el.find_all(['td', 'th'])
        is_headrow = all([cell.name == 'th' for cell in cells])
        overline = ''
        underline = ''
        if is_headrow and not el.previous_sibling:
            # first row and is headline: print headline underline
            underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        elif (not el.previous_sibling
              and (el.parent.name == 'table'
                   or (el.parent.name == 'tbody'
                       and not el.parent.previous_sibling))):
            # first row, not headline, and:
            # - the parent is table or
            # - the parent is tbody at the beginning of a table.
            # print empty headline above this row
            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
            overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        return overline + '|' + text + '\n' + underline

392 

393 

def markdownify(html, **options):
    """
    Convert an HTML string to Markdown.

    Shortcut for building a ``MarkdownConverter`` with the given keyword
    options and calling its ``convert`` method on ``html``.
    """
    converter = MarkdownConverter(**options)
    return converter.convert(html)