Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/html2text/utils.py: 43%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

131 statements  

1import html.entities 

2from typing import Dict, List, Optional 

3 

4from . import config 

5 

# Re-key config.UNIFIABLE by Unicode code point instead of entity name,
# leaving out the "nbsp" entry.
unifiable_n = dict(
    (html.entities.name2codepoint[entity_name], replacement)
    for entity_name, replacement in config.UNIFIABLE.items()
    if entity_name != "nbsp"
)

11 

# Replacement code points for character references in the 0x80-0x9F range.
# See https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code
control_character_replacements = {
    code: ord(char)
    for code, char in (
        (0x80, "\u20ac"),  # EURO SIGN
        (0x82, "\u201a"),  # SINGLE LOW-9 QUOTATION MARK
        (0x83, "\u0192"),  # LATIN SMALL LETTER F WITH HOOK
        (0x84, "\u201e"),  # DOUBLE LOW-9 QUOTATION MARK
        (0x85, "\u2026"),  # HORIZONTAL ELLIPSIS
        (0x86, "\u2020"),  # DAGGER
        (0x87, "\u2021"),  # DOUBLE DAGGER
        (0x88, "\u02c6"),  # MODIFIER LETTER CIRCUMFLEX ACCENT
        (0x89, "\u2030"),  # PER MILLE SIGN
        (0x8A, "\u0160"),  # LATIN CAPITAL LETTER S WITH CARON
        (0x8B, "\u2039"),  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK
        (0x8C, "\u0152"),  # LATIN CAPITAL LIGATURE OE
        (0x8E, "\u017d"),  # LATIN CAPITAL LETTER Z WITH CARON
        (0x91, "\u2018"),  # LEFT SINGLE QUOTATION MARK
        (0x92, "\u2019"),  # RIGHT SINGLE QUOTATION MARK
        (0x93, "\u201c"),  # LEFT DOUBLE QUOTATION MARK
        (0x94, "\u201d"),  # RIGHT DOUBLE QUOTATION MARK
        (0x95, "\u2022"),  # BULLET
        (0x96, "\u2013"),  # EN DASH
        (0x97, "\u2014"),  # EM DASH
        (0x98, "\u02dc"),  # SMALL TILDE
        (0x99, "\u2122"),  # TRADE MARK SIGN
        (0x9A, "\u0161"),  # LATIN SMALL LETTER S WITH CARON
        (0x9B, "\u203a"),  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
        (0x9C, "\u0153"),  # LATIN SMALL LIGATURE OE
        (0x9E, "\u017e"),  # LATIN SMALL LETTER Z WITH CARON
        (0x9F, "\u0178"),  # LATIN CAPITAL LETTER Y WITH DIAERESIS
    )
}

42 

43 

def hn(tag: str) -> int:
    """
    Return the heading level encoded in a tag name.

    :param tag: Tag name, e.g. ``"h1"``.

    :returns: The digit following ``h`` (1-9) when ``tag`` is exactly ``h``
        plus one such digit, otherwise 0. An empty ``tag`` also yields 0.
    """
    # The length is checked first so an empty tag name cannot raise
    # IndexError on tag[0].
    if len(tag) == 2 and tag[0] == "h":
        digit = tag[1]
        # "h0" is deliberately excluded by the strict lower bound.
        if "0" < digit <= "9":
            return int(digit)
    return 0

50 

51 

def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    Parse a ``;``-separated list of ``name: value`` declarations.

    Declarations without a ``:`` are ignored; names and values are
    stripped and lower-cased.

    :returns: A hash of css attributes
    """
    properties = {}  # type: Dict[str, str]
    for declaration in style.split(";"):
        if ":" not in declaration:
            continue
        # Split on the first colon only so values may contain colons.
        name, value = declaration.split(":", 1)
        properties[name.strip().lower()] = value.strip().lower()
    return properties

60 

61 

def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    Parse a stylesheet into per-selector property dictionaries.

    :type data: str

    :returns: A hash of css selectors, each of which contains a hash of
      css attributes. Empty when a rule contains more than one ``{``.
    :rtype: dict
    """
    # Drop every "@import ...;" statement. The appended ";" guarantees that
    # each "@import" has a terminator to cut at.
    css = data + ";"
    while True:
        start = css.find("@import")
        if start == -1:
            break
        end = css.find(";", start)
        css = css[:start] + css[end + 1 :]

    # Each "}"-terminated chunk containing "{" is one "selector { body" rule.
    rules = [chunk.split("{") for chunk in css.split("}") if "{" in chunk]

    parsed = {}  # type: Dict[str, Dict[str, str]]
    try:
        for selector, body in rules:
            parsed[selector.strip()] = dumb_property_dict(body)
    except ValueError:
        # A rule did not split into exactly selector and body; give up on
        # the whole sheet (not that important).
        return {}

    return parsed

86 

87 

def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """
    Combine the parent's style, the styles of the element's css classes
    and its inline ``style`` attribute, in that order of precedence
    (later sources override earlier ones).

    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict

    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    # Work on a copy so the parent's dictionary is never modified.
    style = parent_style.copy()

    # The annotation allows attribute values to be None. Such a value is
    # treated as "nothing to apply" rather than asserted against, because
    # assert statements are removed when Python runs with -O.
    class_names = attrs.get("class")
    if class_names:
        for css_class in class_names.split():
            style.update(style_def.get("." + css_class, {}))

    inline_style = attrs.get("style")
    if inline_style:
        style.update(dumb_property_dict(inline_style))

    return style

113 

114 

def google_list_style(style: Dict[str, str]) -> str:
    """
    Classify a list as ordered or unordered from its css

    :type style: dict

    :returns: "ul" when ``list-style-type`` is disc, circle, square or
        none; "ol" otherwise (including when the property is absent)
    :rtype: str
    """
    unordered_markers = ("disc", "circle", "square", "none")
    if "list-style-type" in style and style["list-style-type"] in unordered_markers:
        return "ul"
    return "ol"

129 

130 

def google_has_height(style: Dict[str, str]) -> bool:
    """
    Tell whether the element's style carries an explicit 'height' entry

    :type style: dict

    :rtype: bool
    """
    has_height = "height" in style
    return has_height

141 

142 

def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    Collect the values of the emphasis-related css properties

    :type style: dict

    :returns: A list of all emphasis modifiers of the element, in the order
        text-decoration, font-style, font-weight (absent ones skipped)
    :rtype: list
    """
    emphasis_properties = ("text-decoration", "font-style", "font-weight")
    return [style[prop] for prop in emphasis_properties if prop in style]

159 

160 

def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Tell whether the element's css selects a fixed width font, i.e. its
    ``font-family`` is exactly "courier new" or "consolas"

    :type style: dict

    :rtype: bool
    """
    if "font-family" not in style:
        return False
    return style["font-family"] in ("courier new", "consolas")

173 

174 

def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract numbering from list element attributes

    :type attrs: dict

    :returns: The ``start`` attribute minus one, or 0 when the attribute is
        missing, has no value, or is not an integer
    :rtype: int
    """
    start = attrs.get("start")
    # The annotation allows a None value; treat it like a missing attribute
    # instead of relying on an assert, which is removed under -O.
    if start is None:
        return 0
    try:
        return int(start) - 1
    except ValueError:
        # Non-numeric start value: fall back to the default numbering.
        return 0

191 

192 

def skipwrap(
    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
    """
    Decide whether a paragraph must be left unwrapped.

    :param para: Paragraph text to inspect.
    :param wrap_links: When false, a paragraph matching ``config.RE_LINK``
        is not wrapped.
    :param wrap_list_items: When false, a paragraph starting with ``-`` or
        ``*`` (but not ``**``) is not wrapped.
    :param wrap_tables: When false, a paragraph matching
        ``config.RE_TABLE`` is not wrapped.

    :returns: True when wrapping should be skipped for ``para``.
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap. startswith() is also safe on an empty paragraph, where
    # indexing para[0] would raise IndexError.
    if para.startswith(("    ", "\t")):
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a br-inside-span case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ("-", "*") and stripped[0:2] != "**":
        return not wrap_list_items

    # If text contains a pipe character it is likely a table
    if not wrap_tables and config.RE_TABLE.search(para):
        return True

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally preceded by whitespace), it's a list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )

228 

229 

def escape_md(text: str) -> str:
    """
    Backslash-escape markdown-sensitive characters in text that sits
    inside another markdown construct (every match of
    ``config.RE_MD_CHARS_MATCHER`` gets a leading backslash).
    """
    escaped = config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
    return escaped

236 

237 

def escape_md_section(text: str, snob: bool = False) -> str:
    """
    Backslash-escape markdown-sensitive characters across a whole
    document section.

    The ``config`` matchers are applied in a fixed order: backslashes
    first, then (only when ``snob`` is true) ``RE_MD_CHARS_MATCHER_ALL``,
    then the dot, plus and dash matchers.
    """
    text = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)

    if snob:
        text = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", text)

    # These three keep their first group and escape the second one.
    prefixed_matchers = (
        config.RE_MD_DOT_MATCHER,
        config.RE_MD_PLUS_MATCHER,
        config.RE_MD_DASH_MATCHER,
    )
    for matcher in prefixed_matchers:
        text = matcher.sub(r"\1\\\2", text)

    return text

252 

253 

def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Given the lines of a table
    pads the cells and returns the new lines

    :param lines: Table rows whose cells are separated by ``|``.
    :param right_margin: Number of filler characters that follow the
        widest cell of each column.

    :returns: The padded rows, one per input line; an empty list when
        ``lines`` is empty.
    """
    if not lines:
        # No first row to measure; indexing lines[0] would raise IndexError.
        return []

    # find the maximum width of the columns
    max_width = [len(x.rstrip()) + right_margin for x in lines[0].split("|")]
    max_cols = len(max_width)
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        num_cols = len(cols)

        # don't drop any data if colspan attributes result in unequal lengths
        if num_cols < max_cols:
            cols += [""] * (max_cols - num_cols)
        elif max_cols < num_cols:
            # This row is wider than any seen so far: grow the width table.
            max_width += [len(x) + right_margin for x in cols[max_cols:]]
            max_cols = num_cols

        max_width = [
            max(len(x) + right_margin, old_len) for x, old_len in zip(cols, max_width)
        ]

    # reformat
    new_lines = []
    for line in lines:
        cols = [x.rstrip() for x in line.split("|")]
        # A row consisting solely of "-" and "|" characters is a rule row
        # and is padded with dashes; every other row is padded with spaces.
        filler = "-" if set(line.strip()) == set("-|") else " "
        new_cols = [
            x + filler * (width - len(x)) for x, width in zip(cols, max_width)
        ]
        new_lines.append("|" + filler + "|".join(new_cols) + "|")
    return new_lines

296 

297 

def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """
    Provide padding for tables in the text

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` toggle table mode and
    are removed from the output. Lines collected between an opening and a
    closing marker are passed to ``reformat_table`` and followed by one
    empty line.

    :param text: Text whose tables are delimited by marker lines.
    :param right_margin: Forwarded to ``reformat_table``.

    :returns: The text with its delimited tables padded.
    """
    table_buffer = []  # type: List[str]
    table_started = False
    new_lines = []  # type: List[str]
    for line in text.split("\n"):
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # An empty table has nothing to reformat.
                if table_buffer:
                    new_lines.extend(reformat_table(table_buffer, right_margin))
                table_buffer = []
                new_lines.append("")
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)

    # An opening marker without a closing one leaves lines in the buffer;
    # emit them unchanged rather than silently dropping them.
    new_lines.extend(table_buffer)
    return "\n".join(new_lines)

321 return "\n".join(new_lines)