Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/html2text/utils.py: 43%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import html.entities
2from typing import Dict, List, Optional
4from . import config
# Maps the code point of every entity named in config.UNIFIABLE to that
# entity's configured value; the "nbsp" entry is deliberately left out.
unifiable_n = dict(
    (html.entities.name2codepoint[name], value)
    for name, value in config.UNIFIABLE.items()
    if name != "nbsp"
)
# Replacement code points for numeric character references in the
# 0x80-0x9F range, following the table in the HTML specification:
# https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code
# 0x81, 0x8D, 0x8F, 0x90 and 0x9D have no entry in this table.
control_character_replacements = {
    # 0x80 - 0x8F
    0x80: 0x20AC,  # EURO SIGN (€)
    0x82: 0x201A,  # SINGLE LOW-9 QUOTATION MARK (‚)
    0x83: 0x0192,  # LATIN SMALL LETTER F WITH HOOK (ƒ)
    0x84: 0x201E,  # DOUBLE LOW-9 QUOTATION MARK („)
    0x85: 0x2026,  # HORIZONTAL ELLIPSIS (…)
    0x86: 0x2020,  # DAGGER (†)
    0x87: 0x2021,  # DOUBLE DAGGER (‡)
    0x88: 0x02C6,  # MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
    0x89: 0x2030,  # PER MILLE SIGN (‰)
    0x8A: 0x0160,  # LATIN CAPITAL LETTER S WITH CARON (Š)
    0x8B: 0x2039,  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
    0x8C: 0x0152,  # LATIN CAPITAL LIGATURE OE (Œ)
    0x8E: 0x017D,  # LATIN CAPITAL LETTER Z WITH CARON (Ž)
    # 0x90 - 0x9F
    0x91: 0x2018,  # LEFT SINGLE QUOTATION MARK (‘)
    0x92: 0x2019,  # RIGHT SINGLE QUOTATION MARK (’)
    0x93: 0x201C,  # LEFT DOUBLE QUOTATION MARK (“)
    0x94: 0x201D,  # RIGHT DOUBLE QUOTATION MARK (”)
    0x95: 0x2022,  # BULLET (•)
    0x96: 0x2013,  # EN DASH (–)
    0x97: 0x2014,  # EM DASH (—)
    0x98: 0x02DC,  # SMALL TILDE (˜)
    0x99: 0x2122,  # TRADE MARK SIGN (™)
    0x9A: 0x0161,  # LATIN SMALL LETTER S WITH CARON (š)
    0x9B: 0x203A,  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
    0x9C: 0x0153,  # LATIN SMALL LIGATURE OE (œ)
    0x9E: 0x017E,  # LATIN SMALL LETTER Z WITH CARON (ž)
    0x9F: 0x0178,  # LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
}
def hn(tag: str) -> int:
    """
    Return the heading level encoded in a tag name.

    :param tag: tag name, e.g. ``"h1"``

    :returns: 1 to 9 for a two-character name of the form ``h<digit>``;
        0 for anything else, including ``"h0"`` and the empty string
    """
    # Test the length first so an empty tag name never reaches the
    # character lookups (which would raise IndexError).
    if len(tag) == 2 and tag[0] == "h" and "0" < tag[1] <= "9":
        return int(tag[1])
    return 0
def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    Split a ``;``-separated list of ``name: value`` declarations.

    Declarations without a ``:`` are ignored. Names and values are
    stripped and lower-cased; a later duplicate name overrides an
    earlier one.

    :returns: A hash of css attributes
    """
    properties = {}
    for declaration in style.split(";"):
        if ":" not in declaration:
            continue
        # Split on the first ":" only, so the value may itself contain ":".
        name, value = declaration.split(":", 1)
        properties[name.strip().lower()] = value.strip().lower()
    return properties
def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    Parse a stylesheet into a mapping of selectors to their properties.

    ``@import`` statements are removed first. The remaining text is split
    on ``}``; each piece of the form ``selector { declarations`` becomes
    one entry. Pieces without a ``{`` or with more than one ``{`` (for
    example nested blocks) are skipped individually, so one rule that
    cannot be parsed does not discard the rest of the stylesheet.

    :param data: the css text

    :returns: A hash of css selectors, each of which contains a hash of
        css attributes.
    """
    # remove @import sentences; the appended ";" guarantees that the
    # search for the end of an @import statement always finds one
    data += ";"
    import_index = data.find("@import")
    while import_index != -1:
        data = data[:import_index] + data[data.find(";", import_index) + 1 :]
        import_index = data.find("@import")

    elements = {}
    for rule in data.split("}"):
        parts = rule.split("{")
        if len(parts) != 2:
            # not a plain "selector { declarations" rule; skip only this one
            continue
        selector, declarations = parts
        elements[selector.strip()] = dumb_property_dict(declarations)

    return elements
def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """
    Compute the style of an element from its parent's style, the
    stylesheet rules for its classes and its inline ``style`` attribute,
    applied in that order (later sources override earlier ones).

    :param attrs: the element's attributes; a value may be None
    :param style_def: stylesheet rules keyed by selector; classes are
        looked up as ``"." + class_name``
    :param parent_style: the style of the parent element (not modified)

    :returns: A hash of the 'final' style attributes of the element
    """
    style = parent_style.copy()

    # Attribute values are Optional: an attribute present without a value
    # carries None and contributes nothing, rather than failing an assert.
    css_classes = attrs.get("class")
    if css_classes is not None:
        for css_class in css_classes.split():
            style.update(style_def.get("." + css_class, {}))

    inline_style = attrs.get("style")
    if inline_style is not None:
        style.update(dumb_property_dict(inline_style))

    return style
def google_list_style(style: Dict[str, str]) -> str:
    """
    Decide from the ``list-style-type`` property whether a list is
    unordered or ordered.

    :type style: dict

    :returns: ``"ul"`` when the property is disc, circle, square or
        none; ``"ol"`` otherwise (including when the property is absent)
    :rtype: str
    """
    unordered_types = ("disc", "circle", "square", "none")
    return "ul" if style.get("list-style-type") in unordered_types else "ol"
def google_has_height(style: Dict[str, str]) -> bool:
    """
    Tell whether the element's style defines the 'height' attribute,
    whatever its value.

    :type style: dict

    :rtype: bool
    """
    height_defined = "height" in style
    return height_defined
def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    Collect the values of the emphasis-related properties present in the
    style, in the order text-decoration, font-style, font-weight.

    :type style: dict

    :returns: A list of all emphasis modifiers of the element
    :rtype: list
    """
    emphasis_properties = ("text-decoration", "font-style", "font-weight")
    return [style[name] for name in emphasis_properties if name in style]
def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Tell whether the element's ``font-family`` is exactly
    ``courier new`` or ``consolas``.

    :type style: dict

    :rtype: bool
    """
    fixed_width_families = ("courier new", "consolas")
    return style.get("font-family", "") in fixed_width_families
def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract numbering from list element attributes

    :param attrs: the list element's attributes; a value may be None

    :returns: the ``start`` attribute minus one, or 0 when the attribute
        is absent, has no value, or is not an integer
    :rtype: int
    """
    start = attrs.get("start")
    if start is None:
        # Absent or valueless attribute: fall back to the default
        # instead of failing an assertion.
        return 0
    try:
        return int(start) - 1
    except ValueError:
        return 0
def skipwrap(
    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
    """
    Decide whether a paragraph must be left unwrapped.

    :param para: the paragraph text
    :param wrap_links: when False, paragraphs matching config.RE_LINK
        are not wrapped
    :param wrap_list_items: when False, paragraphs starting with ``-``
        or ``*`` (but not ``**``) are not wrapped
    :param wrap_tables: when False, paragraphs matching config.RE_TABLE
        are not wrapped

    :returns: True if the paragraph should be skipped by the wrapper
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap. startswith() is also safe on an empty paragraph, where
    # indexing the first character would raise IndexError.
    if para.startswith(("    ", "\t")):
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
        return not wrap_list_items

    # If text contains a pipe character it is likely a table
    if not wrap_tables and config.RE_TABLE.search(para):
        return True

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally preceded by whitespace), it's a list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )
def escape_md(text: str) -> str:
    """
    Backslash-escape markdown-sensitive characters that appear inside
    other markdown constructs.

    Every match of config.RE_MD_CHARS_MATCHER is replaced by a backslash
    followed by the match's first group.
    """
    escaped = config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
    return escaped
def escape_md_section(text: str, snob: bool = False) -> str:
    """
    Backslash-escape markdown-sensitive characters across a whole
    document section.

    Backslashes are handled first; with ``snob`` set, everything matched
    by config.RE_MD_CHARS_MATCHER_ALL is escaped as well. The dot, plus
    and dash matchers are then applied, in that order.
    """
    text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)

    if snob:
        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)

    # These three matchers share one replacement: keep group 1 and put a
    # backslash in front of group 2.
    line_start_matchers = (
        config.RE_MD_DOT_MATCHER,
        config.RE_MD_PLUS_MATCHER,
        config.RE_MD_DASH_MATCHER,
    )
    for matcher in line_start_matchers:
        text = matcher.sub(r"\1\\\2", text)

    return text
def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Given the lines of a table, pad the cells and return the new lines.

    Each line is split on ``|`` and every cell is right-stripped. A
    column is as wide as its longest cell plus ``right_margin``. Lines
    made up only of ``-`` and ``|`` are treated as separator rows and
    padded with ``-``; all other rows are padded with spaces. Rows with
    fewer cells than the widest row keep their own cell count.

    :param lines: the raw table lines
    :param right_margin: extra width added to every column

    :returns: the padded lines; an empty list for an empty table
    """
    if not lines:
        # Nothing to measure; indexing the first line would fail.
        return []

    # Split every line once and reuse the cells for both passes.
    rows = [[cell.rstrip() for cell in line.split("|")] for line in lines]

    # find the maximum width of the columns
    max_width = [len(cell) + right_margin for cell in rows[0]]
    for cols in rows:
        known = len(max_width)
        # don't drop any data if colspan attributes result in unequal
        # lengths: extra cells open new columns
        max_width.extend(len(cell) + right_margin for cell in cols[known:])
        for index, cell in enumerate(cols[:known]):
            max_width[index] = max(max_width[index], len(cell) + right_margin)

    # reformat
    new_lines = []
    for line, cols in zip(lines, rows):
        if set(line.strip()) == set("-|"):
            filler, prefix = "-", "|-"
        else:
            filler, prefix = " ", "| "
        new_cols = [
            cell + filler * (width - len(cell))
            for cell, width in zip(cols, max_width)
        ]
        new_lines.append(prefix + "|".join(new_cols) + "|")
    return new_lines
def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """
    Provide padding for tables in the text

    A line containing config.TABLE_MARKER_FOR_PAD toggles between
    "inside a table" and "outside a table" and is itself dropped. Lines
    inside a table are buffered; when the closing marker is reached the
    buffer is passed to reformat_table and the result, followed by one
    empty line, is emitted. Lines outside tables pass through unchanged.
    Lines buffered after an opening marker that is never closed are not
    emitted.
    """
    output = []
    pending = []  # type: List[str]
    inside_table = False
    for line in text.split("\n"):
        if config.TABLE_MARKER_FOR_PAD not in line:
            # Ordinary line: buffer it while inside a table, else emit it.
            target = pending if inside_table else output
            target.append(line)
            continue

        # Marker line: flip the state and flush the table if it just ended.
        inside_table = not inside_table
        if not inside_table:
            output.extend(reformat_table(pending, right_margin))
            pending = []
            output.append("")
    return "\n".join(output)