Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/html2text/__init__.py: 76%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

659 statements  

1"""html2text: Turn HTML into equivalent Markdown-structured text.""" 

2 

3import html.entities 

4import html.parser 

5import re 

6import string 

7import urllib.parse as urlparse 

8from textwrap import wrap 

9from typing import Dict, List, Optional, Tuple, Union 

10 

11from . import config 

12from ._typing import OutCallback 

13from ._version import __version_tuple__ 

14from .elements import AnchorElement, ListElement 

15from .utils import ( 

16 control_character_replacements, 

17 dumb_css_parser, 

18 element_style, 

19 escape_md, 

20 escape_md_section, 

21 google_fixed_width_font, 

22 google_has_height, 

23 google_list_style, 

24 google_text_emphasis, 

25 hn, 

26 list_numbering_start, 

27 pad_tables_in_text, 

28 skipwrap, 

29 unifiable_n, 

30) 

31 

32__version__ = __version_tuple__ 

33 

34# TODO: 

35# Support decoded entities with UNIFIABLE. 

36 

37 

38class HTML2Text(html.parser.HTMLParser): 

def __init__(
    self,
    out: Optional[OutCallback] = None,
    baseurl: str = "",
    bodywidth: int = config.BODY_WIDTH,
) -> None:
    """
    Initialise the base parser, the conversion options and the parse state.

    Input parameters:
      out: optional replacement for self.outtextf, the default sink that
        appends every emitted string to self.outtextlist.
      baseurl: base URL of the document being processed.
      bodywidth: stored as self.body_width; optwrap() skips wrapping when
        it is falsy.
    """
    super().__init__(convert_charrefs=False)

    # Conversion options, seeded from the config module or fixed defaults.
    self.unicode_snob = config.UNICODE_SNOB  # covered in cli
    self.escape_snob = config.ESCAPE_SNOB  # covered in cli
    self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
    self.body_width = bodywidth  # covered in cli
    self.skip_internal_links = config.SKIP_INTERNAL_LINKS  # covered in cli
    self.inline_links = config.INLINE_LINKS  # covered in cli
    self.protect_links = config.PROTECT_LINKS  # covered in cli
    self.google_list_indent = config.GOOGLE_LIST_INDENT  # covered in cli
    self.ignore_links = config.IGNORE_ANCHORS  # covered in cli
    self.ignore_mailto_links = config.IGNORE_MAILTO_LINKS  # covered in cli
    self.ignore_images = config.IGNORE_IMAGES  # covered in cli
    self.images_as_html = config.IMAGES_AS_HTML  # covered in cli
    self.images_to_alt = config.IMAGES_TO_ALT  # covered in cli
    self.images_with_size = config.IMAGES_WITH_SIZE  # covered in cli
    self.ignore_emphasis = config.IGNORE_EMPHASIS  # covered in cli
    self.bypass_tables = config.BYPASS_TABLES  # covered in cli
    self.ignore_tables = config.IGNORE_TABLES  # covered in cli
    self.google_doc = False  # covered in cli
    self.ul_item_mark = "*"  # covered in cli
    self.emphasis_mark = "_"  # covered in cli
    self.strong_mark = "**"
    self.single_line_break = config.SINGLE_LINE_BREAK  # covered in cli
    self.use_automatic_links = config.USE_AUTOMATIC_LINKS  # covered in cli
    self.hide_strikethrough = False  # covered in cli
    self.mark_code = config.MARK_CODE
    self.backquote_code_style = config.BACKQUOTE_CODE_STYLE
    self.wrap_list_items = config.WRAP_LIST_ITEMS  # covered in cli
    self.wrap_links = config.WRAP_LINKS  # covered in cli
    self.wrap_tables = config.WRAP_TABLES
    self.pad_tables = config.PAD_TABLES  # covered in cli
    self.default_image_alt = config.DEFAULT_IMAGE_ALT  # covered in cli
    self.tag_callback = None
    self.open_quote = config.OPEN_QUOTE  # covered in cli
    self.close_quote = config.CLOSE_QUOTE  # covered in cli
    self.include_sup_sub = config.INCLUDE_SUP_SUB  # covered in cli

    # Output sink: the caller's callback if given, else the list appender.
    self.out = self.outtextf if out is None else out

    # Emitted strings are collected here and joined in finish().
    self.outtextlist: List[str] = []

    # Table rendering state.
    self.split_next_td = False
    self.td_count = 0
    self.table_start = False

    # Output and whitespace bookkeeping.
    self.quiet = 0
    self.p_p = 0  # number of newline character to print before next output
    self.outcount = 0
    self.start = True
    self.space = False

    # Link bookkeeping.
    self.a: List[AnchorElement] = []
    self.astack: List[Optional[Dict[str, Optional[str]]]] = []
    self.maybe_automatic_link: Optional[str] = None
    self.empty_link = False
    self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
    self.acount = 0

    # Block-level state: lists, quotes, preformatted text and code.
    self.list: List[ListElement] = []
    self.blockquote = 0
    self.pre = False
    self.startpre = False
    self.pre_indent = ""
    self.list_code_indent = ""
    self.code = False
    self.quote = False
    self.br_toggle = ""
    self.lastWasNL = False
    self.lastWasList = False

    # Style tracking (the tag stack is only filled when google_doc is set).
    self.style = 0
    self.style_def: Dict[str, Dict[str, str]] = {}
    self.tag_stack: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] = []
    self.emphasis = 0
    self.drop_white_space = 0
    self.inheader = False

    # Abbreviations: the title of the <abbr> being read, the text collected
    # inside it, and the finished definitions written out at the end.
    self.abbr_title: Optional[str] = None
    self.abbr_data: Optional[str] = None
    self.abbr_list: Dict[str, str] = {}

    self.baseurl = baseurl
    self.stressed = False
    self.preceding_stressed = False
    self.preceding_data = ""
    self.current_tag = ""

    # Module-wide side effect: &nbsp; is mapped to a placeholder that
    # finish() swaps for the final character.
    config.UNIFIABLE["nbsp"] = "&nbsp_place_holder;"

142 

def feed(self, data: str) -> None:
    """
    Hand *data* to the base parser after replacing every occurrence of the
    literal text ``</' + 'script>`` with ``</ignore>``.
    """
    cleaned = data.replace("</' + 'script>", "</ignore>")
    super().feed(cleaned)

146 

def handle(self, data: str) -> str:
    """
    Convert the HTML in *data* and return the resulting text.

    The input is fed to the parser, followed by an empty feed; the text
    returned by finish() is passed through optwrap() and, when
    self.pad_tables is set, through pad_tables_in_text().
    """
    self.start = True
    self.feed(data)
    self.feed("")
    wrapped = self.optwrap(self.finish())
    return pad_tables_in_text(wrapped) if self.pad_tables else wrapped

156 

def outtextf(self, s: str) -> None:
    """
    Default output sink: append *s* to self.outtextlist.

    For a non-empty *s*, self.lastWasNL records whether it ends with a
    newline; an empty *s* leaves the flag untouched.
    """
    self.outtextlist.append(s)
    if not s:
        return
    self.lastWasNL = s.endswith("\n")

161 

def finish(self) -> str:
    """
    Close the parser, flush pending output and return the collected text.

    The ``&nbsp_place_holder;`` marker is replaced by the nbsp character
    when self.unicode_snob is set and by a plain space otherwise.
    """
    self.close()

    self.pbr()
    self.o("", force="end")

    collected = "".join(self.outtextlist)
    nbsp = html.entities.html5["nbsp;"] if self.unicode_snob else " "
    collected = collected.replace("&nbsp_place_holder;", nbsp)

    # Start the next handling with an empty buffer so this document's
    # content does not carry over.
    self.outtextlist = []

    return collected

181 

def handle_charref(self, c: str) -> None:
    """Decode the numeric character reference *c* with charref() and emit it as entity text."""
    decoded = self.charref(c)
    self.handle_data(decoded, True)

184 

def handle_entityref(self, c: str) -> None:
    """
    Resolve the named entity *c* with entityref() and emit it as entity
    text, unless it resolves to an empty string.
    """
    resolved = self.entityref(c)

    # An entity may resolve to "" (e.g. the &lrm;/&rlm; markers, which
    # should add nothing to the output). handle_data cannot cope with a
    # zero-length string right after a stressed tag or mid-text inside
    # one: the text gets split and self.stressed/self.preceding_stressed
    # are switched after the first part. So nothing is forwarded.
    if not resolved:
        return
    self.handle_data(resolved, True)

196 

def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
    """Forward an opening tag to handle_tag() with its attribute pairs turned into a dict."""
    attr_map = dict(attrs)
    self.handle_tag(tag, attr_map, start=True)

199 

def handle_endtag(self, tag: str) -> None:
    """Forward a closing tag to handle_tag() with an empty attribute dict."""
    no_attrs: dict = {}
    self.handle_tag(tag, no_attrs, start=False)

202 

def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
    """
    Look up an already recorded link equivalent to *attrs*.

    :type attrs: dict

    :returns: The index in self.a of the first entry whose "href" equals
        attrs["href"] and whose "title" is either absent on both sides
        or present and equal on both sides. None when *attrs* has no
        "href" or no entry qualifies.
    :rtype: int
    """
    if "href" not in attrs:
        return None

    wanted_href = attrs["href"]
    new_has_title = "title" in attrs
    for position, anchor in enumerate(self.a):
        known = anchor.attrs
        if "href" not in known or known["href"] != wanted_href:
            continue
        known_has_title = "title" in known
        if not known_has_title and not new_has_title:
            return position
        if known_has_title and new_has_title and known["title"] == attrs["title"]:
            return position
    return None

230 

def handle_emphasis(
    self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str]
) -> None:
    """
    Handles various text emphases

    The emphasis reported by google_text_emphasis() for the element's
    style is compared with that of its parent's style; marks are written
    only for emphasis the element has and the parent lacks. *start*
    selects between opening and closing the marks.
    """
    own = google_text_emphasis(tag_style)
    inherited = google_text_emphasis(parent_style)

    # handle Google's text emphasis
    hidden_strike = "line-through" in own and self.hide_strikethrough

    # google and others may mark a font's weight as `bold` or `700`
    bold = any(
        marker in own and marker not in inherited
        for marker in config.BOLD_TEXT_STYLE_VALUES
    )
    italic = "italic" in own and "italic" not in inherited
    fixed = (
        google_fixed_width_font(tag_style)
        and not google_fixed_width_font(parent_style)
        and not self.pre
    )
    any_mark = bold or italic or fixed

    if start:
        # crossed-out text must be handled before other attributes
        # in order not to output qualifiers unnecessarily
        if any_mark:
            self.emphasis += 1
        if hidden_strike:
            self.quiet += 1
        if italic:
            self.o(self.emphasis_mark)
            self.drop_white_space += 1
        if bold:
            self.o(self.strong_mark)
            self.drop_white_space += 1
        if fixed:
            self.o("`")
            self.drop_white_space += 1
            self.code = True
        return

    def close_mark(mark: str) -> None:
        # A still-pending drop_white_space means the emphasis was empty:
        # consume one count instead of writing the closing mark.
        if self.drop_white_space:
            self.drop_white_space -= 1
        else:
            self.o(mark)

    if any_mark:
        # there must not be whitespace before closing emphasis mark
        self.emphasis -= 1
        self.space = False
    # Closing order mirrors the opening order: fixed, bold, italic.
    if fixed:
        close_mark("`")
        self.code = False
    if bold:
        close_mark(self.strong_mark)
    if italic:
        close_mark(self.emphasis_mark)
    # space is only allowed after *all* emphasis marks
    if (bold or italic) and not self.emphasis:
        self.o(" ")
    if hidden_strike:
        self.quiet -= 1

303 

def handle_tag(
    self, tag: str, attrs: Dict[str, Optional[str]], start: bool
) -> None:
    """
    Translate one opening or closing tag into output and state changes.

    Called by handle_starttag() (attributes as a dict, start=True) and by
    handle_endtag() (empty dict, start=False). The tag is checked against
    a sequence of independent ``if`` sections, one per tag family, so the
    order of the sections matters.

    :param tag: tag name as delivered by the parser.
    :param attrs: attribute dict; rebound from self.tag_stack for closing
        tags when self.google_doc is set.
    :param start: True for an opening tag, False for a closing tag.

    NOTE(review): this listing looks like a whitespace-collapsing export
    (at the <li> indent both string branches are identical although the
    comment promises two vs. three spaces). String literals made only of
    spaces may originally have been longer; verify against the package
    source before relying on them.
    """
    self.current_tag = tag

    # A user callback may take over the tag entirely by returning True.
    if self.tag_callback is not None:
        if self.tag_callback(self, tag, attrs, start) is True:
            return

    # first thing inside the anchor tag is another tag
    # that produces some output
    if (
        start
        and self.maybe_automatic_link is not None
        and tag not in ["p", "div", "style", "dl", "dt"]
        and (tag != "img" or self.ignore_images)
    ):
        self.o("[")
        self.maybe_automatic_link = None
        self.empty_link = False

    if self.google_doc:
        # the attrs parameter is empty for a closing tag. in addition, we
        # need the attributes of the parent nodes in order to get a
        # complete style description for the current element. we assume
        # that google docs export well formed html.
        parent_style: Dict[str, str] = {}
        if start:
            if self.tag_stack:
                parent_style = self.tag_stack[-1][2]
            tag_style = element_style(attrs, self.style_def, parent_style)
            self.tag_stack.append((tag, attrs, tag_style))
        else:
            # An empty stack falls back to empty attrs/style instead of
            # raising on pop().
            dummy, attrs, tag_style = (
                self.tag_stack.pop() if self.tag_stack else (None, {}, {})
            )
            if self.tag_stack:
                parent_style = self.tag_stack[-1][2]

    if hn(tag):
        # check if nh is inside of an 'a' tag (incorrect but found in the wild)
        if self.astack:
            if start:
                self.inheader = True
                # are inside link name, so only add '#' if it can appear before '['
                if self.outtextlist and self.outtextlist[-1] == "[":
                    self.outtextlist.pop()
                    self.space = False
                    self.o(hn(tag) * "#" + " ")
                    self.o("[")
            else:
                self.p_p = 0  # don't break up link name
                self.inheader = False
                return  # prevent redundant emphasis marks on headers
        else:
            self.p()
            if start:
                self.inheader = True
                self.o(hn(tag) * "#" + " ")
            else:
                self.inheader = False
                return  # prevent redundant emphasis marks on headers

    if tag in ["p", "div"]:
        if self.google_doc:
            if start and google_has_height(tag_style):
                self.p()
            else:
                self.soft_br()
        elif self.astack:
            # inside a link: no paragraph break
            pass
        elif self.split_next_td:
            # inside a table row: no paragraph break
            pass
        else:
            self.p()

    if tag == "br" and start:
        if self.blockquote > 0:
            self.o(" \n> ")
        else:
            self.o(" \n")

    if tag == "hr" and start:
        self.p()
        self.o("* * *")
        self.p()

    # NOTE(review): a closing head/style/script tag without a matching
    # opening tag drives self.quiet below zero; o() tests
    # `not self.quiet`, so output would then stay suppressed until a
    # <body> tag resets the counter. Confirm whether that is intended.
    if tag in ["head", "style", "script"]:
        if start:
            self.quiet += 1
        else:
            self.quiet -= 1

    # While self.style is non-zero, handle_data() feeds text to the CSS parser.
    if tag == "style":
        if start:
            self.style += 1
        else:
            self.style -= 1

    if tag in ["body"]:
        self.quiet = 0  # sites like 9rules.com never close <head>

    if tag == "blockquote":
        if start:
            self.p()
            self.o("> ", force=True)
            self.start = True
            self.blockquote += 1
        else:
            self.blockquote -= 1
            self.p()

    if tag in ["em", "i", "u"] and not self.ignore_emphasis:
        # Separate with a space if we immediately follow an alphanumeric
        # character, since otherwise Markdown won't render the emphasis
        # marks, and we'll be left with eg 'foo_bar_' visible.
        # (Don't add a space otherwise, though, since there isn't one in the
        # original HTML.)
        if (
            start
            and self.preceding_data
            and self.preceding_data[-1] not in string.whitespace
            and self.preceding_data[-1] not in string.punctuation
        ):
            emphasis = " " + self.emphasis_mark
            self.preceding_data += " "
        else:
            emphasis = self.emphasis_mark

        self.o(emphasis)
        if start:
            self.stressed = True

    if tag in ["strong", "b"] and not self.ignore_emphasis:
        # Separate with space if we immediately follow an * character, since
        # without it, Markdown won't render the resulting *** correctly.
        # (Don't add a space otherwise, though, since there isn't one in the
        # original HTML.)
        if (
            start
            and self.preceding_data
            # When `self.strong_mark` is set to empty, the next condition
            # will cause IndexError since it's trying to match the data
            # with the first character of the `self.strong_mark`.
            and len(self.strong_mark) > 0
            and self.preceding_data[-1] == self.strong_mark[0]
        ):
            strong = " " + self.strong_mark
            self.preceding_data += " "
        else:
            strong = self.strong_mark

        self.o(strong)
        if start:
            self.stressed = True

    if tag in ["del", "strike", "s"]:
        # Same idea as above: keep a directly preceding "~" apart from the mark.
        if start and self.preceding_data and self.preceding_data[-1] == "~":
            strike = " ~~"
            self.preceding_data += " "
        else:
            strike = "~~"

        self.o(strike)
        if start:
            self.stressed = True

    if self.google_doc:
        if not self.inheader:
            # handle some font attributes, but leave headers clean
            self.handle_emphasis(start, tag_style, parent_style)

    if tag in ["kbd", "code", "tt"] and not self.pre:
        self.o("`")  # TODO: `` `this` ``
        self.code = not self.code

    if tag == "abbr":
        if start:
            self.abbr_title = None
            # From here on o() appends every emitted string to abbr_data.
            self.abbr_data = ""
            if "title" in attrs:
                self.abbr_title = attrs["title"]
        else:
            if self.abbr_title is not None:
                assert self.abbr_data is not None
                self.abbr_list[self.abbr_data] = self.abbr_title
                self.abbr_title = None
            self.abbr_data = None

    if tag == "q":
        # self.quote toggles on every <q> or </q>, whichever arrives.
        if not self.quote:
            self.o(self.open_quote)
        else:
            self.o(self.close_quote)
        self.quote = not self.quote

    def link_url(self: HTML2Text, link: str, title: str = "") -> None:
        # Emit the closing "](url "title")" part of an inline link; the URL
        # is resolved against self.baseurl.
        url = urlparse.urljoin(self.baseurl, link)
        title = ' "{}"'.format(title) if title.strip() else ""
        self.o("]({url}{title})".format(url=escape_md(url), title=title))

    if tag == "a" and not self.ignore_links:
        if start:
            if (
                "href" in attrs
                and attrs["href"] is not None
                and not (self.skip_internal_links and attrs["href"].startswith("#"))
                and not (
                    self.ignore_mailto_links and attrs["href"].startswith("mailto:")
                )
            ):
                self.astack.append(attrs)
                # The opening "[" is deferred until the link content is seen.
                self.maybe_automatic_link = attrs["href"]
                self.empty_link = True
                if self.protect_links:
                    attrs["href"] = "<" + attrs["href"] + ">"
            else:
                # Keep the stack balanced for anchors that are not rendered.
                self.astack.append(None)
        else:
            if self.astack:
                a = self.astack.pop()
                if self.maybe_automatic_link and not self.empty_link:
                    self.maybe_automatic_link = None
                elif a:
                    assert a["href"] is not None
                    if self.empty_link:
                        self.o("[")
                        self.empty_link = False
                        self.maybe_automatic_link = None
                    if self.inline_links:
                        self.p_p = 0
                        title = a.get("title") or ""
                        title = escape_md(title)
                        link_url(self, a["href"], title)
                    else:
                        # Reference style: reuse the number of an
                        # equivalent earlier link when there is one.
                        i = self.previousIndex(a)
                        if i is not None:
                            a_props = self.a[i]
                        else:
                            self.acount += 1
                            a_props = AnchorElement(a, self.acount, self.outcount)
                            self.a.append(a_props)
                        self.o("][" + str(a_props.count) + "]")

    if tag == "img" and start and not self.ignore_images:
        if "src" in attrs and attrs["src"] is not None:
            if not self.images_to_alt:
                attrs["href"] = attrs["src"]
            alt = attrs.get("alt") or self.default_image_alt

            # If we have images_with_size, write raw html including width,
            # height, and alt attributes
            if self.images_as_html or (
                self.images_with_size and ("width" in attrs or "height" in attrs)
            ):
                self.o("<img src='" + attrs["src"] + "' ")
                if "width" in attrs and attrs["width"] is not None:
                    self.o("width='" + attrs["width"] + "' ")
                if "height" in attrs and attrs["height"] is not None:
                    self.o("height='" + attrs["height"] + "' ")
                if alt:
                    self.o("alt='" + alt + "' ")
                self.o("/>")
                return

            # If we have a link to create, output the start
            if self.maybe_automatic_link is not None:
                href = self.maybe_automatic_link
                if (
                    self.images_to_alt
                    and escape_md(alt) == href
                    and self.absolute_url_matcher.match(href)
                ):
                    self.o("<" + escape_md(alt) + ">")
                    self.empty_link = False
                    return
                else:
                    self.o("[")
                    self.maybe_automatic_link = None
                    self.empty_link = False

            # If we have images_to_alt, we discard the image itself,
            # considering only the alt text.
            if self.images_to_alt:
                self.o(escape_md(alt))
            else:
                self.o("![" + escape_md(alt) + "]")
                if self.inline_links:
                    href = attrs.get("href") or ""
                    self.o(
                        "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")"
                    )
                else:
                    i = self.previousIndex(attrs)
                    if i is not None:
                        a_props = self.a[i]
                    else:
                        self.acount += 1
                        a_props = AnchorElement(attrs, self.acount, self.outcount)
                        self.a.append(a_props)
                    self.o("[" + str(a_props.count) + "]")

    if tag == "dl" and start:
        self.p()
    if tag == "dt" and not start:
        self.pbr()
    if tag == "dd" and start:
        self.o(" ")
    if tag == "dd" and not start:
        self.pbr()

    if tag in ["ol", "ul"]:
        # Google Docs create sub lists as top level lists
        if not self.list and not self.lastWasList:
            self.p()
        if start:
            if self.google_doc:
                list_style = google_list_style(tag_style)
            else:
                list_style = tag
            numbering_start = list_numbering_start(attrs)
            self.list.append(ListElement(list_style, numbering_start))
        else:
            if self.list:
                self.list.pop()
                if not self.google_doc and not self.list:
                    self.o("\n")
        self.lastWasList = True
    else:
        self.lastWasList = False

    if tag == "li":
        self.list_code_indent = ""
        self.pbr()
        if start:
            if self.list:
                li = self.list[-1]
            else:
                # <li> outside any list: treat it as an unordered item.
                li = ListElement("ul", 0)
            if self.google_doc:
                self.o(" " * self.google_nest_count(tag_style))
            else:
                # Indent two spaces per list, except use three spaces for an
                # unordered list inside an ordered list.
                # https://spec.commonmark.org/0.28/#motivation
                # TODO: line up <ol><li>s > 9 correctly.
                # NOTE(review): both branches below are the same literal
                # here, which contradicts the comment above; the
                # two/three-space literals were probably collapsed by the
                # export. Verify. The loop variable also shadows the
                # builtin `list`.
                parent_list = None
                for list in self.list:
                    self.list_code_indent += " " if parent_list == "ol" else " "
                    parent_list = list.name
                self.o(self.list_code_indent)

            if li.name == "ul":
                self.list_code_indent += " "
                self.o(self.ul_item_mark + " ")
            elif li.name == "ol":
                li.num += 1
                self.list_code_indent += " "
                self.o(str(li.num) + ". ")
            self.start = True

    if tag in ["table", "tr", "td", "th"]:
        if self.ignore_tables:
            # Only the end of a row leaves a trace: a soft line break.
            if tag == "tr":
                if start:
                    pass
                else:
                    self.soft_br()
            else:
                pass

        elif self.bypass_tables:
            # Pass the table markup through as raw HTML tags.
            if start:
                self.soft_br()
            if tag in ["td", "th"]:
                if start:
                    self.o("<{}>\n\n".format(tag))
                else:
                    self.o("\n</{}>".format(tag))
            else:
                if start:
                    self.o("<{}>".format(tag))
                else:
                    self.o("</{}>".format(tag))

        else:
            if tag == "table":
                if start:
                    self.table_start = True
                    if self.pad_tables:
                        self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
                        self.o(" \n")
                else:
                    if self.pad_tables:
                        # add break in case the table is empty or its 1 row table
                        self.soft_br()
                        self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
                        self.o(" \n")
            if tag in ["td", "th"] and start:
                # Every cell after the first in a row is preceded by "| ".
                if self.split_next_td:
                    self.o("| ")
                self.split_next_td = True

            if tag == "tr" and start:
                self.td_count = 0
            if tag == "tr" and not start:
                self.split_next_td = False
                self.soft_br()
            if tag == "tr" and not start and self.table_start:
                # Underline table header
                self.o("|".join(["---"] * self.td_count))
                self.soft_br()
                self.table_start = False
            if tag in ["td", "th"] and start:
                self.td_count += 1

    if tag == "pre":
        if start:
            self.startpre = True
            self.pre = True
            self.pre_indent = ""
        else:
            self.pre = False
            # Closing markers go straight to self.out, bypassing o().
            if self.backquote_code_style:
                self.out("\n" + self.pre_indent + "```")
            if self.mark_code:
                self.out("\n[/code]")
        self.p()

    if tag in ["sup", "sub"] and self.include_sup_sub:
        if start:
            self.o("<{}>".format(tag))
        else:
            self.o("</{}>".format(tag))

738 

739 # TODO: Add docstring for these one letter functions 

def pbr(self) -> None:
    "Request one pending line break unless breaks are already pending"
    self.p_p = 1 if self.p_p == 0 else self.p_p

744 

def p(self) -> None:
    "Request a paragraph break: 1 pending newline with single_line_break, else 2"
    if self.single_line_break:
        self.p_p = 1
    else:
        self.p_p = 2

748 

def soft_br(self) -> None:
    "Request a line break and arm br_toggle, which o() writes before the newline"
    self.pbr()
    # NOTE(review): the export this listing came from appears to collapse
    # runs of spaces; confirm the intended width of this literal.
    trailing = " "
    self.br_toggle = trailing

753 

def o(
    self, data: str, puredata: bool = False, force: Union[bool, str] = False
) -> None:
    """
    Deal with indentation and whitespace

    Central output routine: writes *data* through self.out after any
    pending line breaks, blockquote prefix, <pre> indentation and the
    deferred single space.

    :param data: text to emit.
    :param puredata: True for document text; outside <pre> its
        whitespace runs are collapsed to single spaces and a leading
        space is deferred via self.space.
    :param force: a truthy value lets empty *data* through; the string
        "end" additionally flushes the collected link references and
        abbreviation definitions.

    While an <abbr> is open (self.abbr_data is not None) *data* is also
    appended to self.abbr_data. Nothing is written while self.quiet is
    non-zero.

    NOTE(review): this listing looks like a whitespace-collapsing export;
    literals made only of spaces (and the leading spaces of the link and
    abbreviation lines) may originally have been longer. Verify against
    the package source.
    """
    if self.abbr_data is not None:
        self.abbr_data += data

    if not self.quiet:
        if self.google_doc:
            # prevent white space immediately after 'begin emphasis'
            # marks ('**' and '_')
            lstripped_data = data.lstrip()
            if self.drop_white_space and not (self.pre or self.code):
                data = lstripped_data
            if lstripped_data != "":
                self.drop_white_space = 0

        if puredata and not self.pre:
            # This is a very dangerous call ... it could mess up
            # all handling of &nbsp; when not handled properly
            # (see entityref)
            data = re.sub(r"\s+", r" ", data)
            if data and data[0] == " ":
                # Remember the leading space; it is written further down
                # only if the previous output did not end in a newline.
                self.space = True
                data = data[1:]
        if not data and not force:
            return

        if self.startpre:
            # self.out(" :") #TODO: not output when already one there
            if not data.startswith("\n") and not data.startswith("\r\n"):
                # <pre>stuff...
                data = "\n" + data
            if self.mark_code:
                self.out("\n[code]")
                self.p_p = 0

        # Prefix for every new line: one ">" per open blockquote.
        bq = ">" * self.blockquote
        if not (force and data and data[0] == ">") and self.blockquote:
            bq += " "

        if self.pre:
            if self.list:
                bq += self.list_code_indent

            if not self.backquote_code_style:
                bq += " "

            data = data.replace("\n", "\n" + bq)
            # Kept so the closing fence in handle_tag() gets the same indent.
            self.pre_indent = bq

        if self.startpre:
            self.startpre = False
            if self.backquote_code_style:
                self.out("\n" + self.pre_indent + "```")
                self.p_p = 0
            elif self.list:
                # use existing initial indentation
                data = data.lstrip("\n" + self.pre_indent)

        if self.start:
            # Start of a document, blockquote or list item: drop pending
            # space and line breaks.
            self.space = False
            self.p_p = 0
            self.start = False

        if force == "end":
            # It's the end.
            self.p_p = 0
            self.out("\n")
            self.space = False

        if self.p_p:
            # Write the pending line breaks, each followed by the prefix.
            self.out((self.br_toggle + "\n" + bq) * self.p_p)
            self.space = False
            self.br_toggle = ""

        if self.space:
            if not self.lastWasNL:
                self.out(" ")
            self.space = False

        # Reference-style link definitions: written at a paragraph break
        # when links_each_paragraph is set, or at the very end.
        if self.a and (
            (self.p_p == 2 and self.links_each_paragraph) or force == "end"
        ):
            if force == "end":
                self.out("\n")

            newa = []
            for link in self.a:
                # Only links recorded before the current output step are
                # written now; the rest are kept for a later flush.
                if self.outcount > link.outcount:
                    self.out(
                        " ["
                        + str(link.count)
                        + "]: "
                        + urlparse.urljoin(self.baseurl, link.attrs["href"])
                    )
                    if "title" in link.attrs and link.attrs["title"] is not None:
                        self.out(" (" + link.attrs["title"] + ")")
                    self.out("\n")
                else:
                    newa.append(link)

            # Don't need an extra line when nothing was done.
            if self.a != newa:
                self.out("\n")

            self.a = newa

        if self.abbr_list and force == "end":
            for abbr, definition in self.abbr_list.items():
                self.out(" *[" + abbr + "]: " + definition + "\n")

        self.p_p = 0
        self.out(data)
        self.outcount += 1

871 

def handle_data(self, data: str, entity_char: bool = False) -> None:
    """
    Process a run of text (or a decoded entity when *entity_char* is
    true) and hand it to o().

    Text right after an opening emphasis tag is stripped; the text after
    that may get a separating space prepended. While self.style is
    non-zero the text is also parsed as CSS into self.style_def. A
    pending link whose text equals its absolute href is written as
    ``<url>`` when automatic links are enabled.
    """
    if not data:
        # Some entities (LEFT-TO-RIGHT MARK, for one) decode to nothing.
        return

    if self.stressed:
        data = data.strip()
        self.stressed = False
        self.preceding_stressed = True
    elif self.preceding_stressed:
        # should match a letter or common punctuation
        needs_gap = (
            re.match(r"[^][(){}\s.!?]", data[0])
            and not hn(self.current_tag)
            and self.current_tag not in ["a", "code", "pre"]
        )
        if needs_gap:
            data = " " + data
        self.preceding_stressed = False

    if self.style:
        self.style_def.update(dumb_css_parser(data))

    pending_href = self.maybe_automatic_link
    if pending_href is not None:
        if (
            pending_href == data
            and self.absolute_url_matcher.match(pending_href)
            and self.use_automatic_links
        ):
            self.o("<" + data + ">")
            self.empty_link = False
            return
        # Ordinary link text: open the bracket that was deferred.
        self.o("[")
        self.maybe_automatic_link = None
        self.empty_link = False

    if not (self.code or self.pre or entity_char):
        data = escape_md_section(data, snob=self.escape_snob)
    self.preceding_data = data
    self.o(data, puredata=True)

914 

def charref(self, name: str) -> str:
    """
    Return the text for the numeric character reference *name* (the part
    between ``&#`` and ``;``).

    A leading ``x``/``X`` marks a hexadecimal value. Values outside
    1..0x10FFFF and surrogates become U+FFFD; the result is then mapped
    through control_character_replacements and, unless self.unicode_snob
    is set, through unifiable_n.
    """
    is_hex = name[0] in ("x", "X")
    code_point = int(name[1:], 16) if is_hex else int(name)

    out_of_range = code_point <= 0 or code_point >= 0x110000
    is_surrogate = 0xD800 <= code_point < 0xE000
    if out_of_range or is_surrogate:
        code_point = 0xFFFD  # REPLACEMENT CHARACTER
    code_point = control_character_replacements.get(code_point, code_point)

    if self.unicode_snob or code_point not in unifiable_n:
        return chr(code_point)
    return unifiable_n[code_point]

929 

def entityref(self, c: str) -> str:
    """
    Return the text for the named entity *c* (given without ``&`` and ``;``).

    Unless self.unicode_snob is set, names listed in config.UNIFIABLE
    use that replacement. Names missing from html.entities.html5 are
    returned as the literal ``&name;``. "nbsp" always resolves to its
    config.UNIFIABLE entry.
    """
    if not self.unicode_snob and c in config.UNIFIABLE:
        return config.UNIFIABLE[c]

    key = c + ";"
    if key not in html.entities.html5:
        return "&" + c + ";"
    if c == "nbsp":
        return config.UNIFIABLE[c]
    return html.entities.html5[key]

938 

def google_nest_count(self, style: Dict[str, str]) -> int:
    """
    Calculate the nesting count of google doc lists

    The count is the "margin-left" value with its last two characters
    (the unit suffix) removed, floor-divided by self.google_list_indent.

    :type style: dict

    :returns: the nesting count, or 0 when *style* has no "margin-left",
        when the value is not an integer followed by a two-character
        unit (e.g. "auto", "0", "1.5em"), or when
        self.google_list_indent is zero.
    :rtype: int
    """
    if "margin-left" not in style:
        return 0

    try:
        margin = int(style["margin-left"][:-2])
    except ValueError:
        # Arbitrary CSS can end up here; an unparsable margin means
        # "no nesting" rather than a failed conversion.
        return 0

    if not self.google_list_indent:
        # Avoid ZeroDivisionError on a zero indent setting.
        return 0
    return margin // self.google_list_indent

952 

def optwrap(self, text: str) -> str:
    """
    Wrap all paragraphs in the provided text.

    Returns *text* unchanged when self.body_width is falsy. Otherwise
    each line is treated as a paragraph: lines inside a backquote fence
    are copied as they are, lines for which skipwrap() is true are
    copied unless they match config.RE_SPACE, all other non-empty lines
    are wrapped to self.body_width, and at most two consecutive empty
    lines are kept.

    Side effect: when self.wrap_links is false, self.inline_links is set
    to False.

    NOTE(review): the export this listing came from appears to collapse
    runs of spaces; the space-only literals below may originally have
    been longer. Verify against the package source.

    :type text: str

    :rtype: str
    """
    if not self.body_width:
        return text

    # I cannot think of a better solution for now.
    # To avoid the non-wrap behaviour for entire paras
    # because of the presence of a link in it
    if not self.wrap_links:
        self.inline_links = False

    pieces: List[str] = []
    blank_run = 0
    in_fence = False
    for line in text.split("\n"):
        # If the text is between tri-backquote pairs, it's a code block;
        # don't wrap
        if self.backquote_code_style and line.lstrip().startswith("```"):
            in_fence = not in_fence

        if in_fence:
            pieces.append(line + "\n")
            continue

        if not line:
            if blank_run < 2:
                pieces.append("\n")
                blank_run += 1
            continue

        if skipwrap(line, self.wrap_links, self.wrap_list_items, self.wrap_tables):
            # Warning for the tempted!!!
            # Be aware that obvious replacement of this with
            # line.isspace()
            # DOES NOT work! Explanations are welcome.
            if not config.RE_SPACE.match(line):
                pieces.append(line + "\n")
                blank_run = 1
            continue

        if line.startswith(" " + self.ul_item_mark):
            # list item continuation: add a double indent to the
            # new lines
            indent = " "
        elif line.startswith("> "):
            # blockquote continuation: add the greater than symbol
            # to the new lines
            indent = "> "
        else:
            indent = ""

        wrapped_lines = wrap(
            line,
            self.body_width,
            break_long_words=False,
            subsequent_indent=indent,
        )
        pieces.append("\n".join(wrapped_lines))

        if line.endswith(" "):
            pieces.append(" \n")
            blank_run = 1
        elif indent:
            pieces.append("\n")
            blank_run = 1
        else:
            pieces.append("\n\n")
            blank_run = 2

    return "".join(pieces)

1021 

1022 

def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
    """
    Convert the HTML string *html* with a fresh HTML2Text instance and
    return the text produced by its handle() method.

    *bodywidth* falls back to config.BODY_WIDTH when it is None;
    *baseurl* is passed to the converter unchanged.
    """
    width = config.BODY_WIDTH if bodywidth is None else bodywidth
    converter = HTML2Text(baseurl=baseurl, bodywidth=width)
    return converter.handle(html)