Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdownify/__init__.py

1from bs4 import BeautifulSoup, NavigableString, Comment, Doctype

2from textwrap import fill

3import re

4import six

# --- Pre-compiled patterns shared by the converter ---------------------------

# Recognises dynamically resolved heading converter names (``convert_h<N>``)
# and captures the numeric level.
convert_heading_re = re.compile(r'convert_h(\d+)')
# Zero-width match at the start of every line; used to prefix each line.
line_beginning_re = re.compile(r'^', re.MULTILINE)
# Runs of tabs and spaces (newlines excluded).
whitespace_re = re.compile(r'[\t ]+')
# Runs of any whitespace, newlines included.
all_whitespace_re = re.compile(r'[\s]+')
# Tag names of the HTML headings h1..h6.
html_heading_re = re.compile(r'h[1-6]')

# --- Heading styles ----------------------------------------------------------
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
UNDERLINED = 'underlined'
SETEXT = UNDERLINED  # alias for the underlined style

# --- Newline styles ----------------------------------------------------------
SPACES = 'spaces'
BACKSLASH = 'backslash'

# --- Strong and emphasis symbols ---------------------------------------------
ASTERISK = '*'
UNDERSCORE = '_'

def chomp(text):
    """
    Split the text of an inline tag (b, a, em, ...) into its padding and core.

    Returns a ``(prefix, suffix, text)`` tuple: ``prefix`` / ``suffix`` are a
    single space when the input starts / ends with a space (otherwise ''),
    and ``text`` is the input with surrounding whitespace stripped.
    This keeps the markup tight against the content, avoiding conversions like
        <b> foo</b> => ** foo**
    """
    lead = ''
    trail = ''
    if text:
        if text[0] == ' ':
            lead = ' '
        if text[-1] == ' ':
            trail = ' '
    stripped = text.strip()
    return (lead, trail, stripped)

def abstract_inline_conversion(markup_fn):
    """
    Build a converter method for simple inline tags (b, em, del, ...).

    The returned function wraps the chomped text in a pair of the string
    produced by ``markup_fn(self)``. ``markup_fn`` receives the converter
    instance so the markup can depend on options such as
    ``strong_em_symbol``. When nothing is left after chomping, the returned
    function yields an empty string.
    """
    def implementation(self, el, text, convert_as_inline):
        delimiter = markup_fn(self)
        lead, trail, body = chomp(text)
        if body:
            return '%s%s%s%s%s' % (lead, delimiter, body, delimiter, trail)
        return ''

    return implementation

def _todict(obj):
    """Return a dict mapping each attribute name of ``obj`` that does not
    start with an underscore to its value."""
    public_names = [name for name in dir(obj) if not name.startswith('_')]
    return {name: getattr(obj, name) for name in public_names}

class MarkdownConverter(object):
    """
    Convert HTML into Markdown.

    Each tag is handled by a ``convert_<tagname>(el, text, convert_as_inline)``
    method, looked up by name in ``process_tag``; tags without such a method
    contribute only the converted text of their children. Behaviour is
    configured through keyword options layered over ``DefaultOptions`` and
    ``Options``.
    """

    class DefaultOptions:
        # Baseline option values; every public attribute becomes an option.
        autolinks = True
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
        convert = None
        default_title = False
        escape_asterisks = True
        escape_underscores = True
        heading_style = UNDERLINED
        keep_inline_images_in = []
        newline_style = SPACES
        strip = None
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
        wrap = False
        wrap_width = 80

    class Options(DefaultOptions):
        # Override point: subclasses may redefine this to change defaults.
        pass

    def __init__(self, **options):
        """
        Build the effective options dictionary.

        Precedence (lowest to highest): ``DefaultOptions``, ``Options``,
        keyword arguments.

        Raises:
            ValueError: if both ``strip`` and ``convert`` are given.
        """
        # Create an options dictionary. Use DefaultOptions as a base so that
        # it doesn't have to be extended.
        self.options = _todict(self.DefaultOptions)
        self.options.update(_todict(self.Options))
        self.options.update(options)
        if self.options['strip'] is not None and self.options['convert'] is not None:
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

    def convert(self, html):
        """Parse ``html`` with BeautifulSoup's 'html.parser' and convert it."""
        soup = BeautifulSoup(html, 'html.parser')
        return self.convert_soup(soup)

    def convert_soup(self, soup):
        """Convert the children of an already parsed ``soup`` to Markdown."""
        return self.process_tag(soup, convert_as_inline=False, children_only=True)

    def process_tag(self, node, convert_as_inline, children_only=False):
        """
        Convert ``node`` and its descendants to Markdown text.

        Children are converted first and concatenated; unless
        ``children_only`` is set, the result is then passed to the node's
        ``convert_<name>`` method (if one exists and the tag is enabled by
        ``should_convert_tag``).
        """
        text = ''

        # markdown headings or cells can't include
        # block elements (elements w/newlines)
        isHeading = html_heading_re.match(node.name) is not None
        isCell = node.name in ['td', 'th']
        convert_children_as_inline = convert_as_inline

        if not children_only and (isHeading or isCell):
            convert_children_as_inline = True

        # Remove whitespace-only textnodes in purely nested nodes
        def is_nested_node(el):
            return el and el.name in ['ol', 'ul', 'li',
                                      'table', 'thead', 'tbody', 'tfoot',
                                      'tr', 'td', 'th']

        if is_nested_node(node):
            for el in node.children:
                # Only extract (remove) whitespace-only text node if any of the
                # conditions is true:
                # - el is the first element in its parent
                # - el is the last element in its parent
                # - el is adjacent to an nested node
                can_extract = (not el.previous_sibling
                               or not el.next_sibling
                               or is_nested_node(el.previous_sibling)
                               or is_nested_node(el.next_sibling))
                if (isinstance(el, NavigableString)
                        and six.text_type(el).strip() == ''
                        and can_extract):
                    el.extract()

        # Convert the children first
        for el in node.children:
            if isinstance(el, Comment) or isinstance(el, Doctype):
                continue
            elif isinstance(el, NavigableString):
                text += self.process_text(el)
            else:
                text += self.process_tag(el, convert_children_as_inline)

        if not children_only:
            convert_fn = getattr(self, 'convert_%s' % node.name, None)
            if convert_fn and self.should_convert_tag(node.name):
                text = convert_fn(node, text, convert_as_inline)

        return text

    def process_text(self, el):
        """
        Convert a text node to Markdown text.

        Runs of tabs/spaces are collapsed to one space except inside ``pre``
        (or ``code`` directly inside ``pre``); the text is escaped unless the
        parent is ``code`` or ``pre``.
        """
        text = six.text_type(el) or ''

        # dont remove any whitespace when handling pre or code in pre
        if not (el.parent.name == 'pre'
                or (el.parent.name == 'code'
                    and el.parent.parent.name == 'pre')):
            text = whitespace_re.sub(' ', text)

        if el.parent.name != 'code' and el.parent.name != 'pre':
            text = self.escape(text)

        # remove trailing whitespaces if any of the following condition is true:
        # - current text node is the last node in li
        # - current text node is followed by an embedded list
        if (el.parent.name == 'li'
                and (not el.next_sibling
                     or el.next_sibling.name in ['ul', 'ol'])):
            text = text.rstrip()

        return text

    def __getattr__(self, attr):
        """
        Lazily create ``convert_h<N>`` methods that delegate to ``convert_hn``.

        The generated function is cached on the instance so this hook runs
        only once per heading level. Any other missing attribute raises
        ``AttributeError``.
        """
        # Handle headings
        m = convert_heading_re.match(attr)
        if m:
            n = int(m.group(1))

            def convert_tag(el, text, convert_as_inline):
                return self.convert_hn(n, el, text, convert_as_inline)

            convert_tag.__name__ = 'convert_h%s' % n
            setattr(self, convert_tag.__name__, convert_tag)
            return convert_tag

        raise AttributeError(attr)

    def should_convert_tag(self, tag):
        """
        Tell whether ``tag`` should be converted under the ``strip`` /
        ``convert`` options: not in ``strip`` when that is set, else in
        ``convert`` when that is set, else always True.
        """
        tag = tag.lower()
        strip = self.options['strip']
        convert = self.options['convert']
        if strip is not None:
            return tag not in strip
        elif convert is not None:
            return tag in convert
        else:
            return True

    def escape(self, text):
        """Backslash-escape ``*`` and ``_`` in ``text`` according to the
        ``escape_asterisks`` / ``escape_underscores`` options."""
        if not text:
            return ''
        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text

    def indent(self, text, level):
        """Prefix every line of ``text`` with ``level`` tab characters."""
        return line_beginning_re.sub('\t' * level, text) if text else ''

    def underline(self, text, pad_char):
        """Return ``text`` followed by a line of ``pad_char`` of equal length
        and a blank line, or '' when ``text`` is empty."""
        text = (text or '').rstrip()
        return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

    def convert_a(self, el, text, convert_as_inline):
        """
        Convert an anchor to ``[text](href "title")``.

        Uses the ``<href>`` shortcut when ``autolinks`` is on, the link text
        equals the href, and no title applies. Without an href only the
        chomped text is returned.
        """
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        href = el.get('href')
        title = el.get('title')
        # For the replacement see #29: text nodes underscores are escaped
        if (self.options['autolinks']
                and text.replace(r'\_', '_') == href
                and not title
                and not self.options['default_title']):
            # Shortcut syntax
            return '<%s>' % href
        if self.options['default_title'] and not title:
            title = href
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text

    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

    def convert_blockquote(self, el, text, convert_as_inline):
        """Prefix every line of ``text`` with '> '; pass the text through
        unchanged when converting inline."""
        if convert_as_inline:
            return text

        return '\n' + (line_beginning_re.sub('> ', text) + '\n\n') if text else ''

    def convert_br(self, el, text, convert_as_inline):
        """Emit a hard line break in the configured ``newline_style``
        (nothing when converting inline)."""
        if convert_as_inline:
            return ""

        if self.options['newline_style'].lower() == BACKSLASH:
            return '\\\n'
        else:
            return '  \n'

    def convert_code(self, el, text, convert_as_inline):
        """Wrap ``text`` in backticks, unless the parent is ``pre`` (the
        ``pre`` converter emits the fence)."""
        if el.parent.name == 'pre':
            return text
        converter = abstract_inline_conversion(lambda self: '`')
        return converter(self, el, text, convert_as_inline)

    convert_del = abstract_inline_conversion(lambda self: '~~')

    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

    convert_kbd = convert_code

    def convert_hn(self, n, el, text, convert_as_inline):
        """
        Convert a level-``n`` heading using the configured ``heading_style``.

        Levels 1 and 2 use '=' / '-' underlines in the underlined style; all
        other cases use ATX hashes (closed when ``heading_style`` is
        ``ATX_CLOSED``). When converting inline the text is returned as is.
        """
        if convert_as_inline:
            return text

        # __getattr__ accepts any ``convert_h<digits>`` name (e.g. from an
        # <h7> or <h0> tag), so clamp to the 1..6 hashes this method treats
        # as a valid heading.
        n = max(1, min(6, n))

        style = self.options['heading_style'].lower()
        text = text.rstrip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
        hashes = '#' * n
        if style == ATX_CLOSED:
            return '%s %s %s\n\n' % (hashes, text, hashes)
        return '%s %s\n\n' % (hashes, text)

    def convert_hr(self, el, text, convert_as_inline):
        """Emit a horizontal rule surrounded by blank lines."""
        return '\n\n---\n\n'

    convert_i = convert_em

    def convert_img(self, el, text, convert_as_inline):
        """
        Convert an image to ``![alt](src "title")``.

        When converting inline, only the alt text is returned unless the
        parent tag is listed in ``keep_inline_images_in``.
        """
        alt = el.attrs.get('alt', None) or ''
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
        if (convert_as_inline
                and el.parent.name not in self.options['keep_inline_images_in']):
            return alt

        return '![%s](%s%s)' % (alt, src, title_part)

    def convert_list(self, el, text, convert_as_inline):
        """
        Finish a ``ul`` / ``ol``: a list nested inside an ``li`` is indented
        by one tab with trailing whitespace removed; a top-level list gets an
        extra newline when followed by something other than another list.
        """
        # Converting a list to inline is undefined.
        # Ignoring convert_to_inline for list.

        nested = False
        before_paragraph = False
        if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
            before_paragraph = True
        # Walk up the ancestors looking for an enclosing list item.
        while el:
            if el.name == 'li':
                nested = True
                break
            el = el.parent
        if nested:
            # remove trailing newline if nested
            return '\n' + self.indent(text, 1).rstrip()
        return text + ('\n' if before_paragraph else '')

    convert_ul = convert_list
    convert_ol = convert_list

    def convert_li(self, el, text, convert_as_inline):
        """
        Convert a list item to ``<bullet> <text>\\n``.

        Inside an ``ol`` the bullet is the item number, counted from the
        list's ``start`` attribute (1 when absent, empty, or not an integer).
        Otherwise the bullet is chosen from the ``bullets`` option by the
        number of enclosing ``ul`` elements.
        """
        parent = el.parent
        if parent is not None and parent.name == 'ol':
            # The attribute comes from untrusted HTML: a missing, empty or
            # non-integer ``start`` must not abort the conversion.
            try:
                start = int(parent.get("start"))
            except (TypeError, ValueError):
                start = 1
            bullet = '%s.' % (start + parent.index(el))
        else:
            depth = -1
            while el:
                if el.name == 'ul':
                    depth += 1
                el = el.parent
            bullets = self.options['bullets']
            bullet = bullets[depth % len(bullets)]
        return '%s %s\n' % (bullet, (text or '').strip())

    def convert_p(self, el, text, convert_as_inline):
        """Emit ``text`` as a paragraph followed by a blank line, re-filled
        to ``wrap_width`` when the ``wrap`` option is on."""
        if convert_as_inline:
            return text
        if self.options['wrap']:
            text = fill(text,
                        width=self.options['wrap_width'],
                        break_long_words=False,
                        break_on_hyphens=False)
        return '%s\n\n' % text if text else ''

    def convert_pre(self, el, text, convert_as_inline):
        """
        Emit ``text`` as a fenced code block.

        The fence language is ``code_language``, overridden by a truthy
        result of ``code_language_callback(el)`` when that option is set.
        """
        if not text:
            return ''
        code_language = self.options['code_language']

        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language

        return '\n```%s\n%s\n```\n' % (code_language, text)

    convert_s = convert_del

    convert_strong = convert_b

    convert_samp = convert_code

    convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol'])

    convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

    def convert_table(self, el, text, convert_as_inline):
        """Surround the converted rows with blank lines."""
        return '\n\n' + text + '\n'

    def convert_td(self, el, text, convert_as_inline):
        """Emit a table cell terminated by a pipe."""
        return ' ' + text + ' |'

    def convert_th(self, el, text, convert_as_inline):
        """Emit a header cell terminated by a pipe."""
        return ' ' + text + ' |'

    def convert_tr(self, el, text, convert_as_inline):
        """
        Emit a table row, adding the header separator where required.

        A first row consisting only of ``th`` cells is followed by a
        ``---`` separator line. A first row that is not a header row (directly
        in ``table``, or in a ``tbody`` with no previous sibling) is preceded
        by an empty header row and the separator.
        """
        cells = el.find_all(['td', 'th'])
        is_headrow = all(cell.name == 'th' for cell in cells)
        overline = ''
        underline = ''
        if is_headrow and not el.previous_sibling:
            # first row and is headline: print headline underline
            underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        elif (not el.previous_sibling
              and (el.parent.name == 'table'
                   or (el.parent.name == 'tbody'
                       and not el.parent.previous_sibling))):
            # first row, not headline, and:
            # - the parent is table or
            # - the parent is tbody at the beginning of a table.
            # print empty headline above this row
            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
            overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        return overline + '|' + text + '\n' + underline

392

393

def markdownify(html, **options):
    """Convert ``html`` to Markdown using a ``MarkdownConverter`` configured
    with the given keyword ``options``."""
    converter = MarkdownConverter(**options)
    return converter.convert(html)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdownify/__init__.py: 96%

251 statements