Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/html2text/__init__.py

1"""html2text: Turn HTML into equivalent Markdown-structured text."""

3import html.entities

4import html.parser

5import re

6import string

7import urllib.parse as urlparse

8from textwrap import wrap

9from typing import Dict, List, Optional, Tuple, Union

11from . import config

12from ._typing import OutCallback

13from ._version import __version_tuple__

14from .elements import AnchorElement, ListElement

15from .utils import (

16 control_character_replacements,

17 dumb_css_parser,

18 element_style,

19 escape_md,

20 escape_md_section,

21 google_fixed_width_font,

22 google_has_height,

23 google_list_style,

24 google_text_emphasis,

25 hn,

26 list_numbering_start,

27 pad_tables_in_text,

28 skipwrap,

29 unifiable_n,

30)

# Expose the version tuple imported from ._version as the package __version__.
__version__ = __version_tuple__

34# TODO:

35# Support decoded entities with UNIFIABLE.

38class HTML2Text(html.parser.HTMLParser):

def __init__(
    self,
    out: Optional[OutCallback] = None,
    baseurl: str = "",
    bodywidth: int = config.BODY_WIDTH,
) -> None:
    """
    Set up the converter options and the parser state.

    Input parameters:
        out: possible custom replacement for self.outtextf (which
             appends lines of text).
        baseurl: base URL of the document we process
        bodywidth: stored as self.body_width; optwrap() returns the text
             unwrapped when this value is falsy.
    """
    # Character/entity references are left unconverted so the parser hands
    # them to handle_charref() / handle_entityref().
    super().__init__(convert_charrefs=False)

    # Config options
    self.split_next_td = False
    self.td_count = 0
    self.table_start = False
    self.unicode_snob = config.UNICODE_SNOB  # covered in cli
    self.escape_snob = config.ESCAPE_SNOB  # covered in cli
    self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
    self.body_width = bodywidth  # covered in cli
    self.skip_internal_links = config.SKIP_INTERNAL_LINKS  # covered in cli
    self.inline_links = config.INLINE_LINKS  # covered in cli
    self.protect_links = config.PROTECT_LINKS  # covered in cli
    self.google_list_indent = config.GOOGLE_LIST_INDENT  # covered in cli
    self.ignore_links = config.IGNORE_ANCHORS  # covered in cli
    self.ignore_mailto_links = config.IGNORE_MAILTO_LINKS  # covered in cli
    self.ignore_images = config.IGNORE_IMAGES  # covered in cli
    self.images_as_html = config.IMAGES_AS_HTML  # covered in cli
    self.images_to_alt = config.IMAGES_TO_ALT  # covered in cli
    self.images_with_size = config.IMAGES_WITH_SIZE  # covered in cli
    self.ignore_emphasis = config.IGNORE_EMPHASIS  # covered in cli
    self.bypass_tables = config.BYPASS_TABLES  # covered in cli
    self.ignore_tables = config.IGNORE_TABLES  # covered in cli
    self.google_doc = False  # covered in cli
    self.ul_item_mark = "*"  # covered in cli
    self.emphasis_mark = "_"  # covered in cli
    self.strong_mark = "**"
    self.single_line_break = config.SINGLE_LINE_BREAK  # covered in cli
    self.use_automatic_links = config.USE_AUTOMATIC_LINKS  # covered in cli
    self.hide_strikethrough = False  # covered in cli
    self.mark_code = config.MARK_CODE
    self.backquote_code_style = config.BACKQUOTE_CODE_STYLE
    self.wrap_list_items = config.WRAP_LIST_ITEMS  # covered in cli
    self.wrap_links = config.WRAP_LINKS  # covered in cli
    self.wrap_tables = config.WRAP_TABLES
    self.pad_tables = config.PAD_TABLES  # covered in cli
    self.default_image_alt = config.DEFAULT_IMAGE_ALT  # covered in cli
    # Optional hook called by handle_tag() as
    # tag_callback(self, tag, attrs, start); returning True skips the
    # built-in handling of that tag.
    self.tag_callback = None
    self.open_quote = config.OPEN_QUOTE  # covered in cli
    self.close_quote = config.CLOSE_QUOTE  # covered in cli
    self.include_sup_sub = config.INCLUDE_SUP_SUB  # covered in cli

    # Output sink: either the caller's callback or the internal list appender.
    if out is None:
        self.out = self.outtextf
    else:
        self.out = out

    # empty list to store output characters before they are "joined"
    self.outtextlist: List[str] = []

    self.quiet = 0  # while non-zero, o() emits nothing
    self.p_p = 0  # number of newline character to print before next output
    self.outcount = 0  # incremented by o() on every chunk it writes
    self.start = True
    self.space = False  # a single space is pending before the next output
    self.a: List[AnchorElement] = []  # reference links not yet written out
    self.astack: List[Optional[Dict[str, Optional[str]]]] = []
    self.maybe_automatic_link: Optional[str] = None
    self.empty_link = False
    self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
    self.acount = 0
    self.list: List[ListElement] = []  # stack of currently open lists
    self.blockquote = 0  # current blockquote nesting depth
    self.pre = False
    self.startpre = False
    self.pre_indent = ""
    self.list_code_indent = ""
    self.code = False
    self.quote = False
    self.br_toggle = ""
    self.lastWasNL = False
    self.lastWasList = False
    self.style = 0  # nesting depth of <style> elements
    self.style_def: Dict[str, Dict[str, str]] = {}
    self.tag_stack: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] = []
    self.emphasis = 0
    self.drop_white_space = 0
    self.inheader = False
    # Current abbreviation definition
    self.abbr_title: Optional[str] = None
    # Last inner HTML (for abbr being defined)
    self.abbr_data: Optional[str] = None
    # Stack of abbreviations to write later
    self.abbr_list: Dict[str, str] = {}
    self.baseurl = baseurl
    self.stressed = False
    self.preceding_stressed = False
    self.preceding_data = ""
    self.current_tag = ""

    # NOTE: this mutates the shared, module-level config.UNIFIABLE mapping.
    # finish() later swaps the placeholder for a plain or non-breaking space.
    config.UNIFIABLE["nbsp"] = "&nbsp_place_holder;"

142

def feed(self, data: str) -> None:
    """Pass *data* to the underlying HTML parser.

    Before parsing, every occurrence of the literal text ``</' + 'script>``
    is replaced with ``</ignore>``.
    """
    cleaned = data.replace("</' + 'script>", "</ignore>")
    super().feed(cleaned)

146

def handle(self, data: str) -> str:
    """Convert the HTML in *data* and return the resulting Markdown text.

    The input is fed to the parser (followed by an empty feed), the
    collected output is finished and wrapped, and tables are padded when
    ``self.pad_tables`` is set.
    """
    self.start = True
    for chunk in (data, ""):
        self.feed(chunk)

    markdown = self.optwrap(self.finish())
    return pad_tables_in_text(markdown) if self.pad_tables else markdown

156

def outtextf(self, s: str) -> None:
    """Default output sink: append *s* to ``self.outtextlist``.

    For non-empty chunks, also records in ``self.lastWasNL`` whether the
    chunk ends with a newline.
    """
    self.outtextlist.append(s)
    if not s:
        return
    self.lastWasNL = s.endswith("\n")

161

def finish(self) -> str:
    """Close the parser, flush pending output and return the collected text.

    The nbsp placeholder is replaced by a real non-breaking space when
    ``self.unicode_snob`` is set and by a plain space otherwise.
    """
    self.close()

    self.pbr()
    self.o("", force="end")

    nbsp = html.entities.html5["nbsp;"] if self.unicode_snob else " "
    outtext = "".join(self.outtextlist).replace("&nbsp_place_holder;", nbsp)

    # Reset the buffer so its content does not leak into the next handling.
    self.outtextlist = []

    return outtext

181

def handle_charref(self, c: str) -> None:
    """Resolve a numeric character reference and emit it as entity data."""
    resolved = self.charref(c)
    self.handle_data(resolved, True)

184

def handle_entityref(self, c: str) -> None:
    """Resolve a named entity reference and emit it as entity data.

    Nothing is emitted when the entity resolves to an empty string.
    """
    ref = self.entityref(c)

    # ref may be an empty string (e.g. for &lrm;/&rlm; markers that should
    # not contribute to the final output). self.handle_data cannot handle a
    # zero-length string right after a stressed tag or mid-text within a
    # stressed tag (text get split and self.stressed/self.preceding_stressed
    # gets switched after the first part of that text).
    if not ref:
        return
    self.handle_data(ref, True)

196

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
    """Parser hook for an opening tag; forwards to handle_tag() with the
    attribute pairs converted to a dict."""
    attr_map = dict(attrs)
    self.handle_tag(tag, attr_map, start=True)

199

def handle_endtag(self, tag: str) -> None:
    """Parser hook for a closing tag; forwards to handle_tag() with an
    empty attribute dict."""
    no_attrs: dict = {}
    self.handle_tag(tag, no_attrs, start=False)

202

def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
    """
    Find an already recorded link with the same attributes.

    :type attrs: dict

    :returns: The index in self.a of the first entry whose "href" equals
        attrs["href"] and whose "title" is either absent on both sides or
        present and equal on both sides. Returns None when attrs has no
        "href" or nothing matches.
    :rtype: int
    """
    if "href" not in attrs:
        return None

    wanted_href = attrs["href"]
    wants_title = "title" in attrs

    for index, anchor in enumerate(self.a):
        known = anchor.attrs
        if "href" not in known or not known["href"] == wanted_href:
            continue

        has_title = "title" in known
        if not has_title and not wants_title:
            return index
        if has_title and wants_title and known["title"] == attrs["title"]:
            return index

    return None

230

def handle_emphasis(
    self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str]
) -> None:
    """
    Handles various text emphases

    Called by handle_tag() in google_doc mode. Emphasis is derived from the
    element's style compared with its parent's style, so a marker is only
    emitted where the emphasis begins or ends.

    :param start: True for an opening tag, False for a closing tag
    :param tag_style: style of the current element
    :param parent_style: style of the enclosing element
    """
    tag_emphasis = google_text_emphasis(tag_style)
    parent_emphasis = google_text_emphasis(parent_style)

    # handle Google's text emphasis
    strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough

    # google and others may mark a font's weight as `bold` or `700`
    bold = False
    for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
        bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis
        if bold:
            break

    italic = "italic" in tag_emphasis and "italic" not in parent_emphasis
    fixed = (
        google_fixed_width_font(tag_style)
        and not google_fixed_width_font(parent_style)
        and not self.pre
    )

    if start:
        # crossed-out text must be handled before other attributes
        # in order not to output qualifiers unnecessarily
        if bold or italic or fixed:
            self.emphasis += 1
        if strikethrough:
            # suppress all output until the matching closing tag
            self.quiet += 1
        if italic:
            self.o(self.emphasis_mark)
            self.drop_white_space += 1
        if bold:
            self.o(self.strong_mark)
            self.drop_white_space += 1
        if fixed:
            self.o("`")
            self.drop_white_space += 1
            self.code = True
    else:
        if bold or italic or fixed:
            # there must not be whitespace before closing emphasis mark
            self.emphasis -= 1
            self.space = False
        # closing marks are handled in the reverse order of the opening marks
        if fixed:
            if self.drop_white_space:
                # empty emphasis, drop it
                self.drop_white_space -= 1
            else:
                self.o("`")
            self.code = False
        if bold:
            if self.drop_white_space:
                # empty emphasis, drop it
                self.drop_white_space -= 1
            else:
                self.o(self.strong_mark)
        if italic:
            if self.drop_white_space:
                # empty emphasis, drop it
                self.drop_white_space -= 1
            else:
                self.o(self.emphasis_mark)
        # space is only allowed after *all* emphasis marks
        if (bold or italic) and not self.emphasis:
            self.o(" ")
        if strikethrough:
            self.quiet -= 1

303

def handle_tag(
    self, tag: str, attrs: Dict[str, Optional[str]], start: bool
) -> None:
    """
    Translate one opening or closing HTML tag into Markdown output.

    :param tag: tag name
    :param attrs: attributes of the tag (empty for closing tags; in
        google_doc mode the attributes saved at the opening tag are
        restored from self.tag_stack)
    :param start: True for an opening tag, False for a closing tag

    If self.tag_callback is set and returns True, the tag is considered
    handled and nothing else is done.
    """
    self.current_tag = tag

    if self.tag_callback is not None:
        if self.tag_callback(self, tag, attrs, start) is True:
            return

    # first thing inside the anchor tag is another tag
    # that produces some output
    if (
        start
        and self.maybe_automatic_link is not None
        and tag not in ["p", "div", "style", "dl", "dt"]
        and (tag != "img" or self.ignore_images)
    ):
        self.o("[")
        self.maybe_automatic_link = None
        self.empty_link = False

    if self.google_doc:
        # the attrs parameter is empty for a closing tag. in addition, we
        # need the attributes of the parent nodes in order to get a
        # complete style description for the current element. we assume
        # that google docs export well formed html.
        parent_style: Dict[str, str] = {}
        if start:
            if self.tag_stack:
                parent_style = self.tag_stack[-1][2]
            tag_style = element_style(attrs, self.style_def, parent_style)
            self.tag_stack.append((tag, attrs, tag_style))
        else:
            dummy, attrs, tag_style = (
                self.tag_stack.pop() if self.tag_stack else (None, {}, {})
            )
            if self.tag_stack:
                parent_style = self.tag_stack[-1][2]

    if hn(tag):
        # check if nh is inside of an 'a' tag (incorrect but found in the wild)
        if self.astack:
            if start:
                self.inheader = True
                # are inside link name, so only add '#' if it can appear before '['
                if self.outtextlist and self.outtextlist[-1] == "[":
                    self.outtextlist.pop()
                    self.space = False
                    self.o(hn(tag) * "#" + " ")
                    self.o("[")
            else:
                self.p_p = 0  # don't break up link name
                self.inheader = False
                return  # prevent redundant emphasis marks on headers
        else:
            self.p()
            if start:
                self.inheader = True
                self.o(hn(tag) * "#" + " ")
            else:
                self.inheader = False
                return  # prevent redundant emphasis marks on headers

    if tag in ["p", "div"]:
        if self.google_doc:
            if start and google_has_height(tag_style):
                self.p()
            else:
                self.soft_br()
        elif self.astack:
            pass
        elif self.split_next_td:
            pass
        else:
            self.p()

    if tag == "br" and start:
        # Two trailing spaces before the newline form a Markdown hard
        # line break.
        if self.blockquote > 0:
            self.o("  \n> ")
        else:
            self.o("  \n")

    if tag == "hr" and start:
        self.p()
        self.o("* * *")
        self.p()

    if tag in ["head", "style", "script"]:
        if start:
            self.quiet += 1
        else:
            self.quiet -= 1

    if tag == "style":
        if start:
            self.style += 1
        else:
            self.style -= 1

    if tag in ["body"]:
        self.quiet = 0  # sites like 9rules.com never close <head>

    if tag == "blockquote":
        if start:
            self.p()
            self.o("> ", force=True)
            self.start = True
            self.blockquote += 1
        else:
            self.blockquote -= 1
            self.p()

    if tag in ["em", "i", "u"] and not self.ignore_emphasis:
        # Separate with a space if we immediately follow an alphanumeric
        # character, since otherwise Markdown won't render the emphasis
        # marks, and we'll be left with eg 'foo_bar_' visible.
        # (Don't add a space otherwise, though, since there isn't one in the
        # original HTML.)
        if (
            start
            and self.preceding_data
            and self.preceding_data[-1] not in string.whitespace
            and self.preceding_data[-1] not in string.punctuation
        ):
            emphasis = " " + self.emphasis_mark
            self.preceding_data += " "
        else:
            emphasis = self.emphasis_mark

        self.o(emphasis)
        if start:
            self.stressed = True

    if tag in ["strong", "b"] and not self.ignore_emphasis:
        # Separate with space if we immediately follow an * character, since
        # without it, Markdown won't render the resulting *** correctly.
        # (Don't add a space otherwise, though, since there isn't one in the
        # original HTML.)
        if (
            start
            and self.preceding_data
            # When `self.strong_mark` is set to empty, the next condition
            # will cause IndexError since it's trying to match the data
            # with the first character of the `self.strong_mark`.
            and len(self.strong_mark) > 0
            and self.preceding_data[-1] == self.strong_mark[0]
        ):
            strong = " " + self.strong_mark
            self.preceding_data += " "
        else:
            strong = self.strong_mark

        self.o(strong)
        if start:
            self.stressed = True

    if tag in ["del", "strike", "s"]:
        if start and self.preceding_data and self.preceding_data[-1] == "~":
            strike = " ~~"
            self.preceding_data += " "
        else:
            strike = "~~"

        self.o(strike)
        if start:
            self.stressed = True

    if self.google_doc:
        if not self.inheader:
            # handle some font attributes, but leave headers clean
            self.handle_emphasis(start, tag_style, parent_style)

    if tag in ["kbd", "code", "tt"] and not self.pre:
        self.o("`")  # TODO: `` `this` ``
        self.code = not self.code

    if tag == "abbr":
        if start:
            self.abbr_title = None
            self.abbr_data = ""
            if "title" in attrs:
                self.abbr_title = attrs["title"]
        else:
            if self.abbr_title is not None:
                assert self.abbr_data is not None
                self.abbr_list[self.abbr_data] = self.abbr_title
                self.abbr_title = None
            self.abbr_data = None

    if tag == "q":
        if not self.quote:
            self.o(self.open_quote)
        else:
            self.o(self.close_quote)
        self.quote = not self.quote

    def link_url(self: HTML2Text, link: str, title: str = "") -> None:
        # Emit the closing part of an inline link: "](url "title")".
        url = urlparse.urljoin(self.baseurl, link)
        title = ' "{}"'.format(title) if title.strip() else ""
        self.o("]({url}{title})".format(url=escape_md(url), title=title))

    if tag == "a" and not self.ignore_links:
        if start:
            if (
                "href" in attrs
                and attrs["href"] is not None
                and not (self.skip_internal_links and attrs["href"].startswith("#"))
                and not (
                    self.ignore_mailto_links and attrs["href"].startswith("mailto:")
                )
            ):
                self.astack.append(attrs)
                self.maybe_automatic_link = attrs["href"]
                self.empty_link = True
                if self.protect_links:
                    attrs["href"] = "<" + attrs["href"] + ">"
            else:
                # keep the stack balanced for anchors that are not rendered
                self.astack.append(None)
        else:
            if self.astack:
                a = self.astack.pop()
                if self.maybe_automatic_link and not self.empty_link:
                    self.maybe_automatic_link = None
                elif a:
                    assert a["href"] is not None
                    if self.empty_link:
                        self.o("[")
                        self.empty_link = False
                        self.maybe_automatic_link = None
                    if self.inline_links:
                        self.p_p = 0
                        title = a.get("title") or ""
                        title = escape_md(title)
                        link_url(self, a["href"], title)
                    else:
                        i = self.previousIndex(a)
                        if i is not None:
                            a_props = self.a[i]
                        else:
                            self.acount += 1
                            a_props = AnchorElement(a, self.acount, self.outcount)
                            self.a.append(a_props)
                        self.o("][" + str(a_props.count) + "]")

    if tag == "img" and start and not self.ignore_images:
        if "src" in attrs and attrs["src"] is not None:
            if not self.images_to_alt:
                attrs["href"] = attrs["src"]
            alt = attrs.get("alt") or self.default_image_alt

            # If we have images_with_size, write raw html including width,
            # height, and alt attributes
            if self.images_as_html or (
                self.images_with_size and ("width" in attrs or "height" in attrs)
            ):
                self.o("<img src='" + attrs["src"] + "' ")
                if "width" in attrs and attrs["width"] is not None:
                    self.o("width='" + attrs["width"] + "' ")
                if "height" in attrs and attrs["height"] is not None:
                    self.o("height='" + attrs["height"] + "' ")
                if alt:
                    self.o("alt='" + alt + "' ")
                self.o("/>")
                return

            # If we have a link to create, output the start
            if self.maybe_automatic_link is not None:
                href = self.maybe_automatic_link
                if (
                    self.images_to_alt
                    and escape_md(alt) == href
                    and self.absolute_url_matcher.match(href)
                ):
                    self.o("<" + escape_md(alt) + ">")
                    self.empty_link = False
                    return
                else:
                    self.o("[")
                    self.maybe_automatic_link = None
                    self.empty_link = False

            # If we have images_to_alt, we discard the image itself,
            # considering only the alt text.
            if self.images_to_alt:
                self.o(escape_md(alt))
            else:
                self.o("![" + escape_md(alt) + "]")
                if self.inline_links:
                    href = attrs.get("href") or ""
                    self.o(
                        "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")"
                    )
                else:
                    i = self.previousIndex(attrs)
                    if i is not None:
                        a_props = self.a[i]
                    else:
                        self.acount += 1
                        a_props = AnchorElement(attrs, self.acount, self.outcount)
                        self.a.append(a_props)
                    self.o("[" + str(a_props.count) + "]")

    if tag == "dl" and start:
        self.p()
    if tag == "dt" and not start:
        self.pbr()
    if tag == "dd" and start:
        # NOTE(review): whitespace inside this literal may have been
        # collapsed in this copy of the file - confirm the intended
        # definition indent.
        self.o(" ")
    if tag == "dd" and not start:
        self.pbr()

    if tag in ["ol", "ul"]:
        # Google Docs create sub lists as top level lists
        if not self.list and not self.lastWasList:
            self.p()
        if start:
            if self.google_doc:
                list_style = google_list_style(tag_style)
            else:
                list_style = tag
            numbering_start = list_numbering_start(attrs)
            self.list.append(ListElement(list_style, numbering_start))
        else:
            if self.list:
                self.list.pop()
                if not self.google_doc and not self.list:
                    self.o("\n")
        self.lastWasList = True
    else:
        self.lastWasList = False

    if tag == "li":
        self.list_code_indent = ""
        self.pbr()
        if start:
            if self.list:
                li = self.list[-1]
            else:
                # <li> outside of any list: treat it as an unordered item
                li = ListElement("ul", 0)
            if self.google_doc:
                self.o("  " * self.google_nest_count(tag_style))
            else:
                # Indent two spaces per list, except use three spaces for an
                # unordered list inside an ordered list.
                # https://spec.commonmark.org/0.28/#motivation
                # TODO: line up <ol><li>s > 9 correctly.
                parent_list = None
                for list_element in self.list:
                    self.list_code_indent += (
                        "   " if parent_list == "ol" else "  "
                    )
                    parent_list = list_element.name
                self.o(self.list_code_indent)

            # list_code_indent additionally grows by the width of the item
            # marker ("* " / "1. ") so that code inside the item lines up
            # with the item text.
            if li.name == "ul":
                self.list_code_indent += "  "
                self.o(self.ul_item_mark + " ")
            elif li.name == "ol":
                li.num += 1
                self.list_code_indent += "   "
                self.o(str(li.num) + ". ")
            self.start = True

    if tag in ["table", "tr", "td", "th"]:
        if self.ignore_tables:
            if tag == "tr":
                if start:
                    pass
                else:
                    self.soft_br()
            else:
                pass

        elif self.bypass_tables:
            # pass the table markup through as raw HTML
            if start:
                self.soft_br()
            if tag in ["td", "th"]:
                if start:
                    self.o("<{}>\n\n".format(tag))
                else:
                    self.o("\n</{}>".format(tag))
            else:
                if start:
                    self.o("<{}>".format(tag))
                else:
                    self.o("</{}>".format(tag))

        else:
            if tag == "table":
                if start:
                    self.table_start = True
                    if self.pad_tables:
                        self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
                        self.o("  \n")
                else:
                    if self.pad_tables:
                        # add break in case the table is empty or its 1 row table
                        self.soft_br()
                        self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
                        self.o("  \n")
            if tag in ["td", "th"] and start:
                if self.split_next_td:
                    self.o("| ")
                self.split_next_td = True

            if tag == "tr" and start:
                self.td_count = 0
            if tag == "tr" and not start:
                self.split_next_td = False
                self.soft_br()
            if tag == "tr" and not start and self.table_start:
                # Underline table header
                self.o("|".join(["---"] * self.td_count))
                self.soft_br()
                self.table_start = False
            if tag in ["td", "th"] and start:
                self.td_count += 1

    if tag == "pre":
        if start:
            self.startpre = True
            self.pre = True
            self.pre_indent = ""
        else:
            self.pre = False
            if self.backquote_code_style:
                self.out("\n" + self.pre_indent + "```")
            if self.mark_code:
                self.out("\n[/code]")
        self.p()

    if tag in ["sup", "sub"] and self.include_sup_sub:
        if start:
            self.o("<{}>".format(tag))
        else:
            self.o("</{}>".format(tag))

738

739 # TODO: Add docstring for these one letter functions

def pbr(self) -> None:
    """Request one line break before the next output, unless a break is
    already pending."""
    if not self.p_p:
        self.p_p = 1

744

def p(self) -> None:
    """Request a paragraph break before the next output: one newline when
    ``self.single_line_break`` is set, two otherwise."""
    if self.single_line_break:
        self.p_p = 1
    else:
        self.p_p = 2

748

def soft_br(self) -> None:
    """Soft breaks

    Requests a line break (see pbr) and sets ``self.br_toggle`` to two
    spaces. ``o()`` writes br_toggle in front of the pending newline, and
    two trailing spaces are what make Markdown keep the break.
    """
    self.pbr()
    self.br_toggle = "  "

753

def o(
    self, data: str, puredata: bool = False, force: Union[bool, str] = False
) -> None:
    """
    Deal with indentation and whitespace

    Writes *data* through ``self.out``, preceded by any pending line
    breaks, blockquote/code indentation and pending space.

    :param data: text to output
    :param puredata: True for document text (as opposed to markup produced
        by the converter); outside <pre> its whitespace runs are collapsed
    :param force: a truthy value writes even when *data* is empty; the
        string "end" additionally flushes the remaining reference links
        and abbreviation definitions
    """
    if self.abbr_data is not None:
        # inside <abbr>: record the text so it can key the abbreviation list
        self.abbr_data += data

    if not self.quiet:
        if self.google_doc:
            # prevent white space immediately after 'begin emphasis'
            # marks ('**' and '_')
            lstripped_data = data.lstrip()
            if self.drop_white_space and not (self.pre or self.code):
                data = lstripped_data
            if lstripped_data != "":
                self.drop_white_space = 0

        if puredata and not self.pre:
            # This is a very dangerous call ... it could mess up
            # all handling of &nbsp; when not handled properly
            # (see entityref)
            data = re.sub(r"\s+", r" ", data)
            if data and data[0] == " ":
                self.space = True
                data = data[1:]
        if not data and not force:
            return

        if self.startpre:
            # self.out(" :") #TODO: not output when already one there
            if not data.startswith("\n") and not data.startswith("\r\n"):
                # <pre>stuff...
                data = "\n" + data
            if self.mark_code:
                self.out("\n[code]")
                self.p_p = 0

        bq = ">" * self.blockquote
        if not (force and data and data[0] == ">") and self.blockquote:
            bq += " "

        if self.pre:
            if self.list:
                bq += self.list_code_indent

            if not self.backquote_code_style:
                # A Markdown indented code block needs four spaces.
                bq += "    "

            data = data.replace("\n", "\n" + bq)
            self.pre_indent = bq

        if self.startpre:
            self.startpre = False
            if self.backquote_code_style:
                self.out("\n" + self.pre_indent + "```")
                self.p_p = 0
            elif self.list:
                # use existing initial indentation
                data = data.lstrip("\n" + self.pre_indent)

        if self.start:
            self.space = False
            self.p_p = 0
            self.start = False

        if force == "end":
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = False

        if self.p_p:
            self.out((self.br_toggle + "\n" + bq) * self.p_p)
            self.space = False
            self.br_toggle = ""

        if self.space:
            if not self.lastWasNL:
                self.out(" ")
            self.space = False

        if self.a and (
            (self.p_p == 2 and self.links_each_paragraph) or force == "end"
        ):
            if force == "end":
                self.out("\n")

            # Write the reference definitions of all links whose text has
            # already been output; keep the others for later.
            newa = []
            for link in self.a:
                if self.outcount > link.outcount:
                    # NOTE(review): the leading whitespace of this literal
                    # may have been collapsed in this copy of the file -
                    # confirm the intended indent.
                    self.out(
                        " ["
                        + str(link.count)
                        + "]: "
                        + urlparse.urljoin(self.baseurl, link.attrs["href"])
                    )
                    if "title" in link.attrs and link.attrs["title"] is not None:
                        self.out(" (" + link.attrs["title"] + ")")
                    self.out("\n")
                else:
                    newa.append(link)

            # Don't need an extra line when nothing was done.
            if self.a != newa:
                self.out("\n")

            self.a = newa

        if self.abbr_list and force == "end":
            for abbr, definition in self.abbr_list.items():
                # NOTE(review): the leading whitespace of this literal may
                # have been collapsed in this copy of the file - confirm.
                self.out(" *[" + abbr + "]: " + definition + "\n")

        self.p_p = 0
        self.out(data)
        self.outcount += 1

871

def handle_data(self, data: str, entity_char: bool = False) -> None:
    """Process a run of text from the document and pass it to ``o()``.

    :param data: the text
    :param entity_char: True when the text came from a character or entity
        reference; such text is not Markdown-escaped
    """
    if not data:
        # Data may be empty for some HTML entities. For example,
        # LEFT-TO-RIGHT MARK.
        return

    if self.stressed:
        # first text after an opening emphasis mark
        data = data.strip()
        self.stressed = False
        self.preceding_stressed = True
    elif self.preceding_stressed:
        # should match a letter or common punctuation
        needs_gap = (
            re.match(r"[^][(){}\s.!?]", data[0])
            and not hn(self.current_tag)
            and self.current_tag not in ["a", "code", "pre"]
        )
        if needs_gap:
            data = " " + data
        self.preceding_stressed = False

    if self.style:
        # inside <style>: the text is CSS, collect its definitions
        self.style_def.update(dumb_css_parser(data))

    pending_href = self.maybe_automatic_link
    if pending_href is not None:
        is_automatic = (
            pending_href == data
            and self.absolute_url_matcher.match(pending_href)
            and self.use_automatic_links
        )
        if is_automatic:
            self.o("<" + data + ">")
            self.empty_link = False
            return
        self.o("[")
        self.maybe_automatic_link = None
        self.empty_link = False

    if not (self.code or self.pre or entity_char):
        data = escape_md_section(data, snob=self.escape_snob)
    self.preceding_data = data
    self.o(data, puredata=True)

914

def charref(self, name: str) -> str:
    """Return the text for a numeric character reference.

    :param name: the reference without ``&#`` and ``;`` - decimal digits,
        or ``x``/``X`` followed by hexadecimal digits
    :returns: the replacement from ``unifiable_n`` when ``unicode_snob`` is
        off and one exists, otherwise the character itself. Invalid code
        points, surrogates and unparsable numbers yield U+FFFD.
    """
    try:
        if name[0] in ["x", "X"]:
            c = int(name[1:], 16)
        else:
            c = int(name)
    except ValueError:
        # int() refuses decimal strings longer than the interpreter's
        # integer string conversion limit; such a reference cannot name a
        # valid code point anyway.
        c = 0xFFFD  # REPLACEMENT CHARACTER

    if not 0 < c < 0x110000 or 0xD800 <= c < 0xE000:  # invalid or surrogate
        c = 0xFFFD  # REPLACEMENT CHARACTER
    c = control_character_replacements.get(c, c)

    if not self.unicode_snob and c in unifiable_n:
        return unifiable_n[c]
    return chr(c)

929

def entityref(self, c: str) -> str:
    """Return the text for the named entity *c* (given without ``&``/``;``).

    With ``unicode_snob`` off, entities listed in ``config.UNIFIABLE`` use
    that replacement. Unknown entities are returned verbatim as ``&name;``.
    ``nbsp`` always resolves to its ``config.UNIFIABLE`` entry.
    """
    if not self.unicode_snob and c in config.UNIFIABLE:
        return config.UNIFIABLE[c]

    resolved = html.entities.html5.get(c + ";")
    if resolved is None:
        return "&" + c + ";"
    if c == "nbsp":
        return config.UNIFIABLE[c]
    return resolved

938

def google_nest_count(self, style: Dict[str, str]) -> int:
    """
    Calculate the nesting count of google doc lists

    The count is the numeric part of ``margin-left`` (the value without
    its last two characters) floor-divided by ``self.google_list_indent``.
    A missing or non-integer margin, or a zero indent setting, gives 0.

    :type style: dict

    :rtype: int
    """
    margin = style.get("margin-left")
    if not margin or not self.google_list_indent:
        return 0

    try:
        # strip the two-character unit suffix
        return int(margin[:-2]) // self.google_list_indent
    except ValueError:
        # e.g. "auto" or "1.5em": no usable nesting information
        return 0

952

def optwrap(self, text: str) -> str:
    """
    Wrap all paragraphs in the provided text.

    Returns *text* unchanged when ``self.body_width`` is falsy. Lines
    inside tri-backquote code blocks and lines for which ``skipwrap()``
    is true are not wrapped.

    Side effect: when ``self.wrap_links`` is off, ``self.inline_links`` is
    set to False.

    :type text: str

    :rtype: str
    """
    if not self.body_width:
        return text

    pieces: List[str] = []
    newlines = 0
    # I cannot think of a better solution for now.
    # To avoid the non-wrap behaviour for entire paras
    # because of the presence of a link in it
    if not self.wrap_links:
        self.inline_links = False
    start_code = False
    for para in text.split("\n"):
        # If the text is between tri-backquote pairs, it's a code block;
        # don't wrap
        if self.backquote_code_style and para.lstrip().startswith("```"):
            start_code = not start_code
        if start_code:
            pieces.append(para + "\n")
        elif len(para) > 0:
            if not skipwrap(
                para, self.wrap_links, self.wrap_list_items, self.wrap_tables
            ):
                indent = ""
                if para.startswith("  " + self.ul_item_mark):
                    # list item continuation: add a double indent to the
                    # new lines
                    indent = "    "
                elif para.startswith("> "):
                    # blockquote continuation: add the greater than symbol
                    # to the new lines
                    indent = "> "
                wrapped = wrap(
                    para,
                    self.body_width,
                    break_long_words=False,
                    subsequent_indent=indent,
                )
                pieces.append("\n".join(wrapped))
                if para.endswith("  "):
                    # wrap() drops trailing whitespace; put the hard line
                    # break (two trailing spaces) back
                    pieces.append("  \n")
                    newlines = 1
                elif indent:
                    pieces.append("\n")
                    newlines = 1
                else:
                    pieces.append("\n\n")
                    newlines = 2
            else:
                # Warning for the tempted!!!
                # Be aware that obvious replacement of this with
                # line.isspace()
                # DOES NOT work! Explanations are welcome.
                if not config.RE_SPACE.match(para):
                    pieces.append(para + "\n")
                    newlines = 1
        else:
            # empty line: never emit more than two newlines in a row
            if newlines < 2:
                pieces.append("\n")
                newlines += 1
    return "".join(pieces)

1021

1022

def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
    """Convert *html* to Markdown-structured text with a fresh HTML2Text.

    :param html: the HTML document
    :param baseurl: base URL passed to the converter
    :param bodywidth: wrap width; ``config.BODY_WIDTH`` when None
    """
    width = config.BODY_WIDTH if bodywidth is None else bodywidth
    converter = HTML2Text(baseurl=baseurl, bodywidth=width)
    return converter.handle(html)

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/html2text/__init__.py: 76%

659 statements