1"""html2text: Turn HTML into equivalent Markdown-structured text."""
2
3import html.entities
4import html.parser
5import re
6import string
7import urllib.parse as urlparse
8from textwrap import wrap
9from typing import Dict, List, Optional, Tuple, Union
10
11from . import config
12from ._typing import OutCallback
13from ._version import __version_tuple__
14from .elements import AnchorElement, ListElement
15from .utils import (
16 control_character_replacements,
17 dumb_css_parser,
18 element_style,
19 escape_md,
20 escape_md_section,
21 google_fixed_width_font,
22 google_has_height,
23 google_list_style,
24 google_text_emphasis,
25 hn,
26 list_numbering_start,
27 pad_tables_in_text,
28 skipwrap,
29 unifiable_n,
30)
31
# Public version attribute of the package, re-exported from ``._version``.
__version__ = __version_tuple__
33
34# TODO:
35# Support decoded entities with UNIFIABLE.
36
37
38class HTML2Text(html.parser.HTMLParser):
39 def __init__(
40 self,
41 out: Optional[OutCallback] = None,
42 baseurl: str = "",
43 bodywidth: int = config.BODY_WIDTH,
44 ) -> None:
45 """
46 Input parameters:
47 out: possible custom replacement for self.outtextf (which
48 appends lines of text).
49 baseurl: base URL of the document we process
50 """
51 super().__init__(convert_charrefs=False)
52
53 # Config options
54 self.split_next_td = False
55 self.td_count = 0
56 self.table_start = False
57 self.unicode_snob = config.UNICODE_SNOB # covered in cli
58 self.escape_snob = config.ESCAPE_SNOB # covered in cli
59 self.links_each_paragraph = config.LINKS_EACH_PARAGRAPH
60 self.body_width = bodywidth # covered in cli
61 self.skip_internal_links = config.SKIP_INTERNAL_LINKS # covered in cli
62 self.inline_links = config.INLINE_LINKS # covered in cli
63 self.protect_links = config.PROTECT_LINKS # covered in cli
64 self.google_list_indent = config.GOOGLE_LIST_INDENT # covered in cli
65 self.ignore_links = config.IGNORE_ANCHORS # covered in cli
66 self.ignore_mailto_links = config.IGNORE_MAILTO_LINKS # covered in cli
67 self.ignore_images = config.IGNORE_IMAGES # covered in cli
68 self.images_as_html = config.IMAGES_AS_HTML # covered in cli
69 self.images_to_alt = config.IMAGES_TO_ALT # covered in cli
70 self.images_with_size = config.IMAGES_WITH_SIZE # covered in cli
71 self.ignore_emphasis = config.IGNORE_EMPHASIS # covered in cli
72 self.bypass_tables = config.BYPASS_TABLES # covered in cli
73 self.ignore_tables = config.IGNORE_TABLES # covered in cli
74 self.google_doc = False # covered in cli
75 self.ul_item_mark = "*" # covered in cli
76 self.emphasis_mark = "_" # covered in cli
77 self.strong_mark = "**"
78 self.single_line_break = config.SINGLE_LINE_BREAK # covered in cli
79 self.use_automatic_links = config.USE_AUTOMATIC_LINKS # covered in cli
80 self.hide_strikethrough = False # covered in cli
81 self.mark_code = config.MARK_CODE
82 self.backquote_code_style = config.BACKQUOTE_CODE_STYLE
83 self.wrap_list_items = config.WRAP_LIST_ITEMS # covered in cli
84 self.wrap_links = config.WRAP_LINKS # covered in cli
85 self.wrap_tables = config.WRAP_TABLES
86 self.pad_tables = config.PAD_TABLES # covered in cli
87 self.default_image_alt = config.DEFAULT_IMAGE_ALT # covered in cli
88 self.tag_callback = None
89 self.open_quote = config.OPEN_QUOTE # covered in cli
90 self.close_quote = config.CLOSE_QUOTE # covered in cli
91 self.include_sup_sub = config.INCLUDE_SUP_SUB # covered in cli
92
93 if out is None:
94 self.out = self.outtextf
95 else:
96 self.out = out
97
98 # empty list to store output characters before they are "joined"
99 self.outtextlist: List[str] = []
100
101 self.quiet = 0
102 self.p_p = 0 # number of newline character to print before next output
103 self.outcount = 0
104 self.start = True
105 self.space = False
106 self.a: List[AnchorElement] = []
107 self.astack: List[Optional[Dict[str, Optional[str]]]] = []
108 self.maybe_automatic_link: Optional[str] = None
109 self.empty_link = False
110 self.absolute_url_matcher = re.compile(r"^[a-zA-Z+]+://")
111 self.acount = 0
112 self.list: List[ListElement] = []
113 self.blockquote = 0
114 self.pre = False
115 self.startpre = False
116 self.pre_indent = ""
117 self.list_code_indent = ""
118 self.code = False
119 self.quote = False
120 self.br_toggle = ""
121 self.lastWasNL = False
122 self.lastWasList = False
123 self.style = 0
124 self.style_def: Dict[str, Dict[str, str]] = {}
125 self.tag_stack: List[Tuple[str, Dict[str, Optional[str]], Dict[str, str]]] = []
126 self.emphasis = 0
127 self.drop_white_space = 0
128 self.inheader = False
129 # Current abbreviation definition
130 self.abbr_title: Optional[str] = None
131 # Last inner HTML (for abbr being defined)
132 self.abbr_data: Optional[str] = None
133 # Stack of abbreviations to write later
134 self.abbr_list: Dict[str, str] = {}
135 self.baseurl = baseurl
136 self.stressed = False
137 self.preceding_stressed = False
138 self.preceding_data = ""
139 self.current_tag = ""
140
141 config.UNIFIABLE["nbsp"] = " _place_holder;"
142
143 def feed(self, data: str) -> None:
144 data = data.replace("</' + 'script>", "</ignore>")
145 super().feed(data)
146
147 def handle(self, data: str) -> str:
148 self.start = True
149 self.feed(data)
150 self.feed("")
151 markdown = self.optwrap(self.finish())
152 if self.pad_tables:
153 return pad_tables_in_text(markdown)
154 else:
155 return markdown
156
157 def outtextf(self, s: str) -> None:
158 self.outtextlist.append(s)
159 if s:
160 self.lastWasNL = s[-1] == "\n"
161
162 def finish(self) -> str:
163 self.close()
164
165 self.pbr()
166 self.o("", force="end")
167
168 outtext = "".join(self.outtextlist)
169
170 if self.unicode_snob:
171 nbsp = html.entities.html5["nbsp;"]
172 else:
173 nbsp = " "
174 outtext = outtext.replace(" _place_holder;", nbsp)
175
176 # Clear self.outtextlist to avoid memory leak of its content to
177 # the next handling.
178 self.outtextlist = []
179
180 return outtext
181
182 def handle_charref(self, c: str) -> None:
183 self.handle_data(self.charref(c), True)
184
185 def handle_entityref(self, c: str) -> None:
186 ref = self.entityref(c)
187
188 # ref may be an empty string (e.g. for ‎/‏ markers that should
189 # not contribute to the final output).
190 # self.handle_data cannot handle a zero-length string right after a
191 # stressed tag or mid-text within a stressed tag (text get split and
192 # self.stressed/self.preceding_stressed gets switched after the first
193 # part of that text).
194 if ref:
195 self.handle_data(ref, True)
196
197 def handle_starttag(self, tag: str, attrs: List[Tuple[str, Optional[str]]]) -> None:
198 self.handle_tag(tag, dict(attrs), start=True)
199
200 def handle_endtag(self, tag: str) -> None:
201 self.handle_tag(tag, {}, start=False)
202
203 def previousIndex(self, attrs: Dict[str, Optional[str]]) -> Optional[int]:
204 """
205 :type attrs: dict
206
207 :returns: The index of certain set of attributes (of a link) in the
208 self.a list. If the set of attributes is not found, returns None
209 :rtype: int
210 """
211 if "href" not in attrs:
212 return None
213
214 match = False
215 for i, a in enumerate(self.a):
216 if "href" in a.attrs and a.attrs["href"] == attrs["href"]:
217 if "title" in a.attrs or "title" in attrs:
218 if (
219 "title" in a.attrs
220 and "title" in attrs
221 and a.attrs["title"] == attrs["title"]
222 ):
223 match = True
224 else:
225 match = True
226
227 if match:
228 return i
229 return None
230
231 def handle_emphasis(
232 self, start: bool, tag_style: Dict[str, str], parent_style: Dict[str, str]
233 ) -> None:
234 """
235 Handles various text emphases
236 """
237 tag_emphasis = google_text_emphasis(tag_style)
238 parent_emphasis = google_text_emphasis(parent_style)
239
240 # handle Google's text emphasis
241 strikethrough = "line-through" in tag_emphasis and self.hide_strikethrough
242
243 # google and others may mark a font's weight as `bold` or `700`
244 bold = False
245 for bold_marker in config.BOLD_TEXT_STYLE_VALUES:
246 bold = bold_marker in tag_emphasis and bold_marker not in parent_emphasis
247 if bold:
248 break
249
250 italic = "italic" in tag_emphasis and "italic" not in parent_emphasis
251 fixed = (
252 google_fixed_width_font(tag_style)
253 and not google_fixed_width_font(parent_style)
254 and not self.pre
255 )
256
257 if start:
258 # crossed-out text must be handled before other attributes
259 # in order not to output qualifiers unnecessarily
260 if bold or italic or fixed:
261 self.emphasis += 1
262 if strikethrough:
263 self.quiet += 1
264 if italic:
265 self.o(self.emphasis_mark)
266 self.drop_white_space += 1
267 if bold:
268 self.o(self.strong_mark)
269 self.drop_white_space += 1
270 if fixed:
271 self.o("`")
272 self.drop_white_space += 1
273 self.code = True
274 else:
275 if bold or italic or fixed:
276 # there must not be whitespace before closing emphasis mark
277 self.emphasis -= 1
278 self.space = False
279 if fixed:
280 if self.drop_white_space:
281 # empty emphasis, drop it
282 self.drop_white_space -= 1
283 else:
284 self.o("`")
285 self.code = False
286 if bold:
287 if self.drop_white_space:
288 # empty emphasis, drop it
289 self.drop_white_space -= 1
290 else:
291 self.o(self.strong_mark)
292 if italic:
293 if self.drop_white_space:
294 # empty emphasis, drop it
295 self.drop_white_space -= 1
296 else:
297 self.o(self.emphasis_mark)
298 # space is only allowed after *all* emphasis marks
299 if (bold or italic) and not self.emphasis:
300 self.o(" ")
301 if strikethrough:
302 self.quiet -= 1
303
304 def handle_tag(
305 self, tag: str, attrs: Dict[str, Optional[str]], start: bool
306 ) -> None:
307 self.current_tag = tag
308
309 if self.tag_callback is not None:
310 if self.tag_callback(self, tag, attrs, start) is True:
311 return
312
313 # first thing inside the anchor tag is another tag
314 # that produces some output
315 if (
316 start
317 and self.maybe_automatic_link is not None
318 and tag not in ["p", "div", "style", "dl", "dt"]
319 and (tag != "img" or self.ignore_images)
320 ):
321 self.o("[")
322 self.maybe_automatic_link = None
323 self.empty_link = False
324
325 if self.google_doc:
326 # the attrs parameter is empty for a closing tag. in addition, we
327 # need the attributes of the parent nodes in order to get a
328 # complete style description for the current element. we assume
329 # that google docs export well formed html.
330 parent_style: Dict[str, str] = {}
331 if start:
332 if self.tag_stack:
333 parent_style = self.tag_stack[-1][2]
334 tag_style = element_style(attrs, self.style_def, parent_style)
335 self.tag_stack.append((tag, attrs, tag_style))
336 else:
337 dummy, attrs, tag_style = (
338 self.tag_stack.pop() if self.tag_stack else (None, {}, {})
339 )
340 if self.tag_stack:
341 parent_style = self.tag_stack[-1][2]
342
343 if hn(tag):
344 # check if nh is inside of an 'a' tag (incorrect but found in the wild)
345 if self.astack:
346 if start:
347 self.inheader = True
348 # are inside link name, so only add '#' if it can appear before '['
349 if self.outtextlist and self.outtextlist[-1] == "[":
350 self.outtextlist.pop()
351 self.space = False
352 self.o(hn(tag) * "#" + " ")
353 self.o("[")
354 else:
355 self.p_p = 0 # don't break up link name
356 self.inheader = False
357 return # prevent redundant emphasis marks on headers
358 else:
359 self.p()
360 if start:
361 self.inheader = True
362 self.o(hn(tag) * "#" + " ")
363 else:
364 self.inheader = False
365 return # prevent redundant emphasis marks on headers
366
367 if tag in ["p", "div"]:
368 if self.google_doc:
369 if start and google_has_height(tag_style):
370 self.p()
371 else:
372 self.soft_br()
373 elif self.astack:
374 pass
375 elif self.split_next_td:
376 pass
377 else:
378 self.p()
379
380 if tag == "br" and start:
381 if self.blockquote > 0:
382 self.o(" \n> ")
383 else:
384 self.o(" \n")
385
386 if tag == "hr" and start:
387 self.p()
388 self.o("* * *")
389 self.p()
390
391 if tag in ["head", "style", "script"]:
392 if start:
393 self.quiet += 1
394 else:
395 self.quiet -= 1
396
397 if tag == "style":
398 if start:
399 self.style += 1
400 else:
401 self.style -= 1
402
403 if tag in ["body"]:
404 self.quiet = 0 # sites like 9rules.com never close <head>
405
406 if tag == "blockquote":
407 if start:
408 self.p()
409 self.o("> ", force=True)
410 self.start = True
411 self.blockquote += 1
412 else:
413 self.blockquote -= 1
414 self.p()
415
416 if tag in ["em", "i", "u"] and not self.ignore_emphasis:
417 # Separate with a space if we immediately follow an alphanumeric
418 # character, since otherwise Markdown won't render the emphasis
419 # marks, and we'll be left with eg 'foo_bar_' visible.
420 # (Don't add a space otherwise, though, since there isn't one in the
421 # original HTML.)
422 if (
423 start
424 and self.preceding_data
425 and self.preceding_data[-1] not in string.whitespace
426 and self.preceding_data[-1] not in string.punctuation
427 ):
428 emphasis = " " + self.emphasis_mark
429 self.preceding_data += " "
430 else:
431 emphasis = self.emphasis_mark
432
433 self.o(emphasis)
434 if start:
435 self.stressed = True
436
437 if tag in ["strong", "b"] and not self.ignore_emphasis:
438 # Separate with space if we immediately follow an * character, since
439 # without it, Markdown won't render the resulting *** correctly.
440 # (Don't add a space otherwise, though, since there isn't one in the
441 # original HTML.)
442 if (
443 start
444 and self.preceding_data
445 # When `self.strong_mark` is set to empty, the next condition
446 # will cause IndexError since it's trying to match the data
447 # with the first character of the `self.strong_mark`.
448 and len(self.strong_mark) > 0
449 and self.preceding_data[-1] == self.strong_mark[0]
450 ):
451 strong = " " + self.strong_mark
452 self.preceding_data += " "
453 else:
454 strong = self.strong_mark
455
456 self.o(strong)
457 if start:
458 self.stressed = True
459
460 if tag in ["del", "strike", "s"]:
461 if start and self.preceding_data and self.preceding_data[-1] == "~":
462 strike = " ~~"
463 self.preceding_data += " "
464 else:
465 strike = "~~"
466
467 self.o(strike)
468 if start:
469 self.stressed = True
470
471 if self.google_doc:
472 if not self.inheader:
473 # handle some font attributes, but leave headers clean
474 self.handle_emphasis(start, tag_style, parent_style)
475
476 if tag in ["kbd", "code", "tt"] and not self.pre:
477 self.o("`") # TODO: `` `this` ``
478 self.code = not self.code
479
480 if tag == "abbr":
481 if start:
482 self.abbr_title = None
483 self.abbr_data = ""
484 if "title" in attrs:
485 self.abbr_title = attrs["title"]
486 else:
487 if self.abbr_title is not None:
488 assert self.abbr_data is not None
489 self.abbr_list[self.abbr_data] = self.abbr_title
490 self.abbr_title = None
491 self.abbr_data = None
492
493 if tag == "q":
494 if not self.quote:
495 self.o(self.open_quote)
496 else:
497 self.o(self.close_quote)
498 self.quote = not self.quote
499
500 def link_url(self: HTML2Text, link: str, title: str = "") -> None:
501 url = urlparse.urljoin(self.baseurl, link)
502 title = ' "{}"'.format(title) if title.strip() else ""
503 self.o("]({url}{title})".format(url=escape_md(url), title=title))
504
505 if tag == "a" and not self.ignore_links:
506 if start:
507 if (
508 "href" in attrs
509 and attrs["href"] is not None
510 and not (self.skip_internal_links and attrs["href"].startswith("#"))
511 and not (
512 self.ignore_mailto_links and attrs["href"].startswith("mailto:")
513 )
514 ):
515 self.astack.append(attrs)
516 self.maybe_automatic_link = attrs["href"]
517 self.empty_link = True
518 if self.protect_links:
519 attrs["href"] = "<" + attrs["href"] + ">"
520 else:
521 self.astack.append(None)
522 else:
523 if self.astack:
524 a = self.astack.pop()
525 if self.maybe_automatic_link and not self.empty_link:
526 self.maybe_automatic_link = None
527 elif a:
528 assert a["href"] is not None
529 if self.empty_link:
530 self.o("[")
531 self.empty_link = False
532 self.maybe_automatic_link = None
533 if self.inline_links:
534 self.p_p = 0
535 title = a.get("title") or ""
536 title = escape_md(title)
537 link_url(self, a["href"], title)
538 else:
539 i = self.previousIndex(a)
540 if i is not None:
541 a_props = self.a[i]
542 else:
543 self.acount += 1
544 a_props = AnchorElement(a, self.acount, self.outcount)
545 self.a.append(a_props)
546 self.o("][" + str(a_props.count) + "]")
547
548 if tag == "img" and start and not self.ignore_images:
549 if "src" in attrs and attrs["src"] is not None:
550 if not self.images_to_alt:
551 attrs["href"] = attrs["src"]
552 alt = attrs.get("alt") or self.default_image_alt
553
554 # If we have images_with_size, write raw html including width,
555 # height, and alt attributes
556 if self.images_as_html or (
557 self.images_with_size and ("width" in attrs or "height" in attrs)
558 ):
559 self.o("<img src='" + attrs["src"] + "' ")
560 if "width" in attrs and attrs["width"] is not None:
561 self.o("width='" + attrs["width"] + "' ")
562 if "height" in attrs and attrs["height"] is not None:
563 self.o("height='" + attrs["height"] + "' ")
564 if alt:
565 self.o("alt='" + alt + "' ")
566 self.o("/>")
567 return
568
569 # If we have a link to create, output the start
570 if self.maybe_automatic_link is not None:
571 href = self.maybe_automatic_link
572 if (
573 self.images_to_alt
574 and escape_md(alt) == href
575 and self.absolute_url_matcher.match(href)
576 ):
577 self.o("<" + escape_md(alt) + ">")
578 self.empty_link = False
579 return
580 else:
581 self.o("[")
582 self.maybe_automatic_link = None
583 self.empty_link = False
584
585 # If we have images_to_alt, we discard the image itself,
586 # considering only the alt text.
587 if self.images_to_alt:
588 self.o(escape_md(alt))
589 else:
590 self.o("![" + escape_md(alt) + "]")
591 if self.inline_links:
592 href = attrs.get("href") or ""
593 self.o(
594 "(" + escape_md(urlparse.urljoin(self.baseurl, href)) + ")"
595 )
596 else:
597 i = self.previousIndex(attrs)
598 if i is not None:
599 a_props = self.a[i]
600 else:
601 self.acount += 1
602 a_props = AnchorElement(attrs, self.acount, self.outcount)
603 self.a.append(a_props)
604 self.o("[" + str(a_props.count) + "]")
605
606 if tag == "dl" and start:
607 self.p()
608 if tag == "dt" and not start:
609 self.pbr()
610 if tag == "dd" and start:
611 self.o(" ")
612 if tag == "dd" and not start:
613 self.pbr()
614
615 if tag in ["ol", "ul"]:
616 # Google Docs create sub lists as top level lists
617 if not self.list and not self.lastWasList:
618 self.p()
619 if start:
620 if self.google_doc:
621 list_style = google_list_style(tag_style)
622 else:
623 list_style = tag
624 numbering_start = list_numbering_start(attrs)
625 self.list.append(ListElement(list_style, numbering_start))
626 else:
627 if self.list:
628 self.list.pop()
629 if not self.google_doc and not self.list:
630 self.o("\n")
631 self.lastWasList = True
632 else:
633 self.lastWasList = False
634
635 if tag == "li":
636 self.list_code_indent = ""
637 self.pbr()
638 if start:
639 if self.list:
640 li = self.list[-1]
641 else:
642 li = ListElement("ul", 0)
643 if self.google_doc:
644 self.o(" " * self.google_nest_count(tag_style))
645 else:
646 # Indent two spaces per list, except use three spaces for an
647 # unordered list inside an ordered list.
648 # https://spec.commonmark.org/0.28/#motivation
649 # TODO: line up <ol><li>s > 9 correctly.
650 parent_list = None
651 for list in self.list:
652 self.list_code_indent += " " if parent_list == "ol" else " "
653 parent_list = list.name
654 self.o(self.list_code_indent)
655
656 if li.name == "ul":
657 self.list_code_indent += " "
658 self.o(self.ul_item_mark + " ")
659 elif li.name == "ol":
660 li.num += 1
661 self.list_code_indent += " "
662 self.o(str(li.num) + ". ")
663 self.start = True
664
665 if tag in ["table", "tr", "td", "th"]:
666 if self.ignore_tables:
667 if tag == "tr":
668 if start:
669 pass
670 else:
671 self.soft_br()
672 else:
673 pass
674
675 elif self.bypass_tables:
676 if start:
677 self.soft_br()
678 if tag in ["td", "th"]:
679 if start:
680 self.o("<{}>\n\n".format(tag))
681 else:
682 self.o("\n</{}>".format(tag))
683 else:
684 if start:
685 self.o("<{}>".format(tag))
686 else:
687 self.o("</{}>".format(tag))
688
689 else:
690 if tag == "table":
691 if start:
692 self.table_start = True
693 if self.pad_tables:
694 self.o("<" + config.TABLE_MARKER_FOR_PAD + ">")
695 self.o(" \n")
696 else:
697 if self.pad_tables:
698 # add break in case the table is empty or its 1 row table
699 self.soft_br()
700 self.o("</" + config.TABLE_MARKER_FOR_PAD + ">")
701 self.o(" \n")
702 if tag in ["td", "th"] and start:
703 if self.split_next_td:
704 self.o("| ")
705 self.split_next_td = True
706
707 if tag == "tr" and start:
708 self.td_count = 0
709 if tag == "tr" and not start:
710 self.split_next_td = False
711 self.soft_br()
712 if tag == "tr" and not start and self.table_start:
713 # Underline table header
714 self.o("|".join(["---"] * self.td_count))
715 self.soft_br()
716 self.table_start = False
717 if tag in ["td", "th"] and start:
718 self.td_count += 1
719
720 if tag == "pre":
721 if start:
722 self.startpre = True
723 self.pre = True
724 self.pre_indent = ""
725 else:
726 self.pre = False
727 if self.backquote_code_style:
728 self.out("\n" + self.pre_indent + "```")
729 if self.mark_code:
730 self.out("\n[/code]")
731 self.p()
732
733 if tag in ["sup", "sub"] and self.include_sup_sub:
734 if start:
735 self.o("<{}>".format(tag))
736 else:
737 self.o("</{}>".format(tag))
738
739 # TODO: Add docstring for these one letter functions
740 def pbr(self) -> None:
741 "Pretty print has a line break"
742 if self.p_p == 0:
743 self.p_p = 1
744
745 def p(self) -> None:
746 "Set pretty print to 1 or 2 lines"
747 self.p_p = 1 if self.single_line_break else 2
748
749 def soft_br(self) -> None:
750 "Soft breaks"
751 self.pbr()
752 self.br_toggle = " "
753
    def o(
        self, data: str, puredata: bool = False, force: Union[bool, str] = False
    ) -> None:
        """
        Deal with indentation and whitespace

        Low-level writer: every piece of generated text goes through here
        before it reaches self.out.

        :param data: text to write; while an <abbr> is open it is also
            appended to self.abbr_data
        :param puredata: when true (and not inside <pre>), runs of white
            space are collapsed and a leading space is turned into the
            pending self.space flag
        :param force: a truthy value lets empty data through; the special
            value "end" additionally flushes the collected link references
            and abbreviation definitions

        Nothing is written while self.quiet is non-zero.
        """
        if self.abbr_data is not None:
            self.abbr_data += data

        if not self.quiet:
            if self.google_doc:
                # prevent white space immediately after 'begin emphasis'
                # marks ('**' and '_')
                lstripped_data = data.lstrip()
                if self.drop_white_space and not (self.pre or self.code):
                    data = lstripped_data
                if lstripped_data != "":
                    self.drop_white_space = 0

            if puredata and not self.pre:
                # This is a very dangerous call ... it could mess up
                # all handling of &nbsp; when not handled properly
                # (see entityref)
                data = re.sub(r"\s+", r" ", data)
                if data and data[0] == " ":
                    # Remember the leading space instead of writing it now;
                    # it is emitted further down unless something resets it.
                    self.space = True
                    data = data[1:]
            if not data and not force:
                return

            if self.startpre:
                # self.out(" :") #TODO: not output when already one there
                if not data.startswith("\n") and not data.startswith("\r\n"):
                    # <pre>stuff...
                    data = "\n" + data
                if self.mark_code:
                    self.out("\n[code]")
                    self.p_p = 0

            # Prefix repeated after every newline: one ">" per blockquote level.
            bq = ">" * self.blockquote
            if not (force and data and data[0] == ">") and self.blockquote:
                bq += " "

            if self.pre:
                if self.list:
                    bq += self.list_code_indent

                if not self.backquote_code_style:
                    # NOTE(review): CommonMark indented code blocks require
                    # four leading spaces; confirm this single-space indent
                    # is intended.
                    bq += " "

                data = data.replace("\n", "\n" + bq)
                # Remembered so that the closing fence can reuse the prefix.
                self.pre_indent = bq

            if self.startpre:
                self.startpre = False
                if self.backquote_code_style:
                    self.out("\n" + self.pre_indent + "```")
                    self.p_p = 0
                elif self.list:
                    # use existing initial indentation
                    # NOTE(review): str.lstrip() takes a *set* of characters,
                    # so this strips any leading run of those characters, not
                    # the exact prefix.
                    data = data.lstrip("\n" + self.pre_indent)

            if self.start:
                # Nothing has been written yet: no leading space or newlines.
                self.space = False
                self.p_p = 0
                self.start = False

            if force == "end":
                # It's the end.
                self.p_p = 0
                self.out("\n")
                self.space = False

            if self.p_p:
                # Emit the pending line breaks, each followed by the prefix.
                self.out((self.br_toggle + "\n" + bq) * self.p_p)
                self.space = False
                self.br_toggle = ""

            if self.space:
                if not self.lastWasNL:
                    self.out(" ")
                self.space = False

            # Write out reference-style link definitions, either after a
            # paragraph (links_each_paragraph) or at the very end.
            if self.a and (
                (self.p_p == 2 and self.links_each_paragraph) or force == "end"
            ):
                if force == "end":
                    self.out("\n")

                newa = []
                for link in self.a:
                    # Only links whose anchor was recorded before the current
                    # output position are written; the rest are kept for later.
                    if self.outcount > link.outcount:
                        self.out(
                            " ["
                            + str(link.count)
                            + "]: "
                            + urlparse.urljoin(self.baseurl, link.attrs["href"])
                        )
                        if "title" in link.attrs and link.attrs["title"] is not None:
                            self.out(" (" + link.attrs["title"] + ")")
                        self.out("\n")
                    else:
                        newa.append(link)

                # Don't need an extra line when nothing was done.
                if self.a != newa:
                    self.out("\n")

                self.a = newa

            if self.abbr_list and force == "end":
                for abbr, definition in self.abbr_list.items():
                    self.out(" *[" + abbr + "]: " + definition + "\n")

            self.p_p = 0
            self.out(data)
            self.outcount += 1
871
872 def handle_data(self, data: str, entity_char: bool = False) -> None:
873 if not data:
874 # Data may be empty for some HTML entities. For example,
875 # LEFT-TO-RIGHT MARK.
876 return
877
878 if self.stressed:
879 data = data.strip()
880 self.stressed = False
881 self.preceding_stressed = True
882 elif self.preceding_stressed:
883 if (
884 re.match(r"[^][(){}\s.!?]", data[0])
885 and not hn(self.current_tag)
886 and self.current_tag not in ["a", "code", "pre"]
887 ):
888 # should match a letter or common punctuation
889 data = " " + data
890 self.preceding_stressed = False
891
892 if self.style:
893 self.style_def.update(dumb_css_parser(data))
894
895 if self.maybe_automatic_link is not None:
896 href = self.maybe_automatic_link
897 if (
898 href == data
899 and self.absolute_url_matcher.match(href)
900 and self.use_automatic_links
901 ):
902 self.o("<" + data + ">")
903 self.empty_link = False
904 return
905 else:
906 self.o("[")
907 self.maybe_automatic_link = None
908 self.empty_link = False
909
910 if not self.code and not self.pre and not entity_char:
911 data = escape_md_section(data, snob=self.escape_snob)
912 self.preceding_data = data
913 self.o(data, puredata=True)
914
915 def charref(self, name: str) -> str:
916 if name[0] in ["x", "X"]:
917 c = int(name[1:], 16)
918 else:
919 c = int(name)
920
921 if not 0 < c < 0x110000 or 0xD800 <= c < 0xE000: # invalid or surrogate
922 c = 0xFFFD # REPLACEMENT CHARACTER
923 c = control_character_replacements.get(c, c)
924
925 if not self.unicode_snob and c in unifiable_n:
926 return unifiable_n[c]
927 else:
928 return chr(c)
929
930 def entityref(self, c: str) -> str:
931 if not self.unicode_snob and c in config.UNIFIABLE:
932 return config.UNIFIABLE[c]
933 try:
934 ch = html.entities.html5[c + ";"]
935 except KeyError:
936 return "&" + c + ";"
937 return config.UNIFIABLE[c] if c == "nbsp" else ch
938
939 def google_nest_count(self, style: Dict[str, str]) -> int:
940 """
941 Calculate the nesting count of google doc lists
942
943 :type style: dict
944
945 :rtype: int
946 """
947 nest_count = 0
948 if "margin-left" in style:
949 nest_count = int(style["margin-left"][:-2]) // self.google_list_indent
950
951 return nest_count
952
953 def optwrap(self, text: str) -> str:
954 """
955 Wrap all paragraphs in the provided text.
956
957 :type text: str
958
959 :rtype: str
960 """
961 if not self.body_width:
962 return text
963
964 result = ""
965 newlines = 0
966 # I cannot think of a better solution for now.
967 # To avoid the non-wrap behaviour for entire paras
968 # because of the presence of a link in it
969 if not self.wrap_links:
970 self.inline_links = False
971 start_code = False
972 for para in text.split("\n"):
973 # If the text is between tri-backquote pairs, it's a code block;
974 # don't wrap
975 if self.backquote_code_style and para.lstrip().startswith("```"):
976 start_code = not start_code
977 if start_code:
978 result += para + "\n"
979 elif len(para) > 0:
980 if not skipwrap(
981 para, self.wrap_links, self.wrap_list_items, self.wrap_tables
982 ):
983 indent = ""
984 if para.startswith(" " + self.ul_item_mark):
985 # list item continuation: add a double indent to the
986 # new lines
987 indent = " "
988 elif para.startswith("> "):
989 # blockquote continuation: add the greater than symbol
990 # to the new lines
991 indent = "> "
992 wrapped = wrap(
993 para,
994 self.body_width,
995 break_long_words=False,
996 subsequent_indent=indent,
997 )
998 result += "\n".join(wrapped)
999 if para.endswith(" "):
1000 result += " \n"
1001 newlines = 1
1002 elif indent:
1003 result += "\n"
1004 newlines = 1
1005 else:
1006 result += "\n\n"
1007 newlines = 2
1008 else:
1009 # Warning for the tempted!!!
1010 # Be aware that obvious replacement of this with
1011 # line.isspace()
1012 # DOES NOT work! Explanations are welcome.
1013 if not config.RE_SPACE.match(para):
1014 result += para + "\n"
1015 newlines = 1
1016 else:
1017 if newlines < 2:
1018 result += "\n"
1019 newlines += 1
1020 return result
1021
1022
def html2text(html: str, baseurl: str = "", bodywidth: Optional[int] = None) -> str:
    """
    Convert the HTML string *html* with a fresh HTML2Text instance.

    *baseurl* is passed on to the converter; *bodywidth* falls back to
    config.BODY_WIDTH when it is None.
    """
    width = config.BODY_WIDTH if bodywidth is None else bodywidth
    converter = HTML2Text(baseurl=baseurl, bodywidth=width)
    return converter.handle(html)