Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/html2text/utils.py: 43%

1import html.entities

2from typing import Dict, List, Optional

4from . import config

# Same replacements as config.UNIFIABLE, but keyed by the code point of each
# named entity instead of by its name; the "nbsp" entry is left out.
unifiable_n = {
    html.entities.name2codepoint[entity_name]: replacement
    for entity_name, replacement in config.UNIFIABLE.items()
    if entity_name != "nbsp"
}

# Code points in the 0x80-0x9F range and the characters that replace them, per
# https://html.spec.whatwg.org/multipage/parsing.html#character-reference-code
control_character_replacements = {
    0x80: 0x20AC,  # EURO SIGN (€)
    0x82: 0x201A,  # SINGLE LOW-9 QUOTATION MARK (‚)
    0x83: 0x0192,  # LATIN SMALL LETTER F WITH HOOK (ƒ)
    0x84: 0x201E,  # DOUBLE LOW-9 QUOTATION MARK („)
    0x85: 0x2026,  # HORIZONTAL ELLIPSIS (…)
    0x86: 0x2020,  # DAGGER (†)
    0x87: 0x2021,  # DOUBLE DAGGER (‡)
    0x88: 0x02C6,  # MODIFIER LETTER CIRCUMFLEX ACCENT (ˆ)
    0x89: 0x2030,  # PER MILLE SIGN (‰)
    0x8A: 0x0160,  # LATIN CAPITAL LETTER S WITH CARON (Š)
    0x8B: 0x2039,  # SINGLE LEFT-POINTING ANGLE QUOTATION MARK (‹)
    0x8C: 0x0152,  # LATIN CAPITAL LIGATURE OE (Œ)
    0x8E: 0x017D,  # LATIN CAPITAL LETTER Z WITH CARON (Ž)
    0x91: 0x2018,  # LEFT SINGLE QUOTATION MARK (‘)
    0x92: 0x2019,  # RIGHT SINGLE QUOTATION MARK (’)
    0x93: 0x201C,  # LEFT DOUBLE QUOTATION MARK (“)
    0x94: 0x201D,  # RIGHT DOUBLE QUOTATION MARK (”)
    0x95: 0x2022,  # BULLET (•)
    0x96: 0x2013,  # EN DASH (–)
    0x97: 0x2014,  # EM DASH (—)
    0x98: 0x02DC,  # SMALL TILDE (˜)
    0x99: 0x2122,  # TRADE MARK SIGN (™)
    0x9A: 0x0161,  # LATIN SMALL LETTER S WITH CARON (š)
    0x9B: 0x203A,  # SINGLE RIGHT-POINTING ANGLE QUOTATION MARK (›)
    0x9C: 0x0153,  # LATIN SMALL LIGATURE OE (œ)
    0x9E: 0x017E,  # LATIN SMALL LETTER Z WITH CARON (ž)
    0x9F: 0x0178,  # LATIN CAPITAL LETTER Y WITH DIAERESIS (Ÿ)
}

def hn(tag: str) -> int:
    """
    Return the level of an HTML heading tag.

    :param tag: Tag name, e.g. ``"h1"``

    :returns: The digit (1-9) following the ``h`` when the tag is exactly two
      characters long, otherwise 0. An empty tag also yields 0.
    :rtype: int
    """
    # Test the length first so that an empty tag cannot raise IndexError.
    if len(tag) == 2 and tag[0] == "h":
        level = tag[1]
        # The strict lower bound excludes "h0".
        if "0" < level <= "9":
            return int(level)
    return 0

def dumb_property_dict(style: str) -> Dict[str, str]:
    """
    Split a ``;``-separated list of ``name: value`` declarations.

    Names and values are stripped and lower-cased; pieces without a ``:``
    are ignored.

    :returns: A hash of css attributes
    """
    properties = {}  # type: Dict[str, str]
    for declaration in style.split(";"):
        if ":" not in declaration:
            continue
        # Only the first ":" separates name from value.
        name, _, value = declaration.partition(":")
        properties[name.strip().lower()] = value.strip().lower()
    return properties

def dumb_css_parser(data: str) -> Dict[str, Dict[str, str]]:
    """
    Parse a stylesheet into one property dict per selector.

    :type data: str

    :returns: A hash of css selectors, each of which contains a hash of
      css attributes.
    :rtype: dict
    """
    # Remove every "@import ...;" statement. The appended ";" guarantees that
    # the search for the terminating ";" always finds something.
    data += ";"
    while True:
        import_at = data.find("@import")
        if import_at == -1:
            break
        terminator_at = data.find(";", import_at)
        data = data[:import_at] + data[terminator_at + 1 :]

    # Every chunk that precedes a "}" and contains a "{" is split into
    # its selector and its declarations.
    rules = [chunk.split("{") for chunk in data.split("}") if "{" in chunk.strip()]
    try:
        return {
            selector.strip(): dumb_property_dict(declarations)
            for selector, declarations in rules
        }
    except ValueError:
        # A chunk with more than one "{" cannot be unpacked into a pair; the
        # whole stylesheet is then dropped.
        return {}  # not that important

def element_style(
    attrs: Dict[str, Optional[str]],
    style_def: Dict[str, Dict[str, str]],
    parent_style: Dict[str, str],
) -> Dict[str, str]:
    """
    Compute the style of an element from its parent's style, the stylesheet
    rules of its classes and its inline ``style`` attribute, in that order
    of increasing precedence.

    :type attrs: dict
    :type style_def: dict
    :type parent_style: dict

    :returns: A hash of the 'final' style attributes of the element
    :rtype: dict
    """
    # Work on a copy so the caller's parent_style is never modified.
    style = parent_style.copy()

    # Attribute values may be None (see the Optional[str] annotation); such a
    # "class" or "style" contributes nothing instead of tripping an assertion.
    class_names = attrs.get("class")
    if class_names:
        for css_class in class_names.split():
            # Only ".classname" selectors are looked up.
            style.update(style_def.get("." + css_class, {}))

    inline_style = attrs.get("style")
    if inline_style:
        style.update(dumb_property_dict(inline_style))

    return style

113

114

def google_list_style(style: Dict[str, str]) -> str:
    """
    Finds out whether this is an ordered or unordered list

    :type style: dict

    :returns: "ul" when ``list-style-type`` is disc, circle, square or none;
      "ol" otherwise, including when the property is absent
    :rtype: str
    """
    unordered_types = ("disc", "circle", "square", "none")
    if style.get("list-style-type") in unordered_types:
        return "ul"
    return "ol"

129

130

def google_has_height(style: Dict[str, str]) -> bool:
    """
    Tell whether the element's style explicitly defines the 'height'
    attribute

    :type style: dict

    :rtype: bool
    """
    # Only the presence of the key matters; its value is not inspected.
    height_defined = "height" in style
    return height_defined

141

142

def google_text_emphasis(style: Dict[str, str]) -> List[str]:
    """
    :type style: dict

    :returns: A list of all emphasis modifiers of the element: the values of
      text-decoration, font-style and font-weight, in that order, for
      whichever of them are present
    :rtype: list
    """
    emphasis_properties = ("text-decoration", "font-style", "font-weight")
    return [style[name] for name in emphasis_properties if name in style]

159

160

def google_fixed_width_font(style: Dict[str, str]) -> bool:
    """
    Check if the css of the current element defines a fixed width font

    Only the exact ``font-family`` values "courier new" and "consolas" count.

    :type style: dict

    :rtype: bool
    """
    fixed_width_families = ("courier new", "consolas")
    return style.get("font-family", "") in fixed_width_families

173

174

def list_numbering_start(attrs: Dict[str, Optional[str]]) -> int:
    """
    Extract numbering from list element attributes

    :type attrs: dict

    :returns: The value of the ``start`` attribute minus one, or 0 when the
      attribute is missing, has no value, or is not an integer
    :rtype: int
    """
    start = attrs.get("start")
    # Attribute values may be None (see the Optional[str] annotation); treat
    # that like a missing attribute instead of failing an assertion.
    if start is None:
        return 0
    try:
        return int(start) - 1
    except ValueError:
        # Not an integer: fall back to the default numbering.
        return 0

191

192

def skipwrap(
    para: str, wrap_links: bool, wrap_list_items: bool, wrap_tables: bool
) -> bool:
    """
    Decide whether a paragraph must be left unwrapped.

    :param para: The paragraph text
    :param wrap_links: When false, paragraphs matching ``config.RE_LINK``
      are not wrapped
    :param wrap_list_items: When false, paragraphs starting with ``-`` or a
      single ``*`` are not wrapped
    :param wrap_tables: When false, paragraphs matching ``config.RE_TABLE``
      are not wrapped

    :returns: True if the paragraph should be skipped by the wrapper
    :rtype: bool
    """
    # If it appears to contain a link
    # don't wrap
    if not wrap_links and config.RE_LINK.search(para):
        return True
    # If the text begins with four spaces or one tab, it's a code block;
    # don't wrap. startswith() is also safe on an empty paragraph.
    if para.startswith(("    ", "\t")):
        return True

    # If the text begins with only two "--", possibly preceded by
    # whitespace, that's an emdash; so wrap.
    stripped = para.lstrip()
    if stripped[0:2] == "--" and len(stripped) > 2 and stripped[2] != "-":
        return False

    # I'm not sure what this is for; I thought it was to detect lists,
    # but there's a <br>-inside-<span> case in one of the tests that
    # also depends upon it.
    if stripped[0:1] in ("-", "*") and not stripped[0:2] == "**":
        return not wrap_list_items

    # If text contains a pipe character it is likely a table
    if not wrap_tables and config.RE_TABLE.search(para):
        return True

    # If the text begins with a single -, *, or +, followed by a space,
    # or an integer, followed by a ., followed by a space (in either
    # case optionally preceded by whitespace), it's a list; don't wrap.
    return bool(
        config.RE_ORDERED_LIST_MATCHER.match(stripped)
        or config.RE_UNORDERED_LIST_MATCHER.match(stripped)
    )

228

229

def escape_md(text: str) -> str:
    """
    Escapes markdown-sensitive characters within other markdown
    constructs.

    :returns: ``text`` with a backslash placed before the first captured
      group of every ``config.RE_MD_CHARS_MATCHER`` match
    """
    escaped = config.RE_MD_CHARS_MATCHER.sub(r"\\\1", text)
    return escaped

236

237

def escape_md_section(text: str, snob: bool = False) -> str:
    """
    Escapes markdown-sensitive characters across whole document sections.

    :param text: The section to escape
    :param snob: When true, ``config.RE_MD_CHARS_MATCHER_ALL`` matches are
      escaped as well

    :returns: The escaped text
    """
    # Backslashes go first, then (optionally) the full character set.
    escaped = config.RE_MD_BACKSLASH_MATCHER.sub(r"\\\1", text)
    if snob:
        escaped = config.RE_MD_CHARS_MATCHER_ALL.sub(r"\\\1", escaped)

    # These three matchers capture two groups; the backslash is inserted
    # between them. They are applied in the order dot, plus, dash.
    two_group_matchers = (
        config.RE_MD_DOT_MATCHER,
        config.RE_MD_PLUS_MATCHER,
        config.RE_MD_DASH_MATCHER,
    )
    for matcher in two_group_matchers:
        escaped = matcher.sub(r"\1\\\2", escaped)

    return escaped

252

253

def reformat_table(lines: List[str], right_margin: int) -> List[str]:
    """
    Given the lines of a table
    pads the cells and returns the new lines

    :param lines: The table rows, with cells separated by ``|``
    :param right_margin: Number of filler characters added after the widest
      cell of every column

    :returns: One padded line per input line; an empty list for an empty
      table
    :rtype: list
    """
    if not lines:
        # Nothing to measure; indexing the first row would raise IndexError.
        return []

    rows = [[cell.rstrip() for cell in line.split("|")] for line in lines]

    # find the maximum width of the columns
    max_width = [len(cell) + right_margin for cell in rows[0]]
    for cols in rows:
        for index, cell in enumerate(cols):
            width = len(cell) + right_margin
            if index < len(max_width):
                max_width[index] = max(max_width[index], width)
            else:
                # don't drop any data if colspan attributes result in
                # unequal lengths: a longer row adds new columns
                max_width.append(width)

    # reformat
    new_lines = []
    for line, cols in zip(lines, rows):
        # A line made up solely of "-" and "|" is a separator row and is
        # padded with dashes; every other row is padded with spaces.
        is_separator = set(line.strip()) == set("-|")
        filler = "-" if is_separator else " "
        prefix = "|-" if is_separator else "| "
        new_cols = [
            cell + filler * (width - len(cell)) for cell, width in zip(cols, max_width)
        ]
        new_lines.append(prefix + "|".join(new_cols) + "|")
    return new_lines

296

297

def pad_tables_in_text(text: str, right_margin: int = 1) -> str:
    """
    Provide padding for tables in the text

    Lines containing ``config.TABLE_MARKER_FOR_PAD`` delimit tables and are
    removed from the output. The lines between an opening and a closing
    marker are padded with ``reformat_table`` and followed by an empty line.

    :param text: The text to process
    :param right_margin: Passed through to ``reformat_table``

    :returns: The text with padded tables
    :rtype: str
    """
    lines = text.split("\n")
    table_buffer = []  # type: List[str]
    table_started = False
    new_lines = []  # type: List[str]
    for line in lines:
        # Toggle table started
        if config.TABLE_MARKER_FOR_PAD in line:
            table_started = not table_started
            if not table_started:
                # Closing marker: emit the buffered table. An empty table
                # (two markers back to back) has nothing to reformat.
                if table_buffer:
                    new_lines.extend(reformat_table(table_buffer, right_margin))
                table_buffer = []
                new_lines.append("")
            continue
        # Process lines
        if table_started:
            table_buffer.append(line)
        else:
            new_lines.append(line)

    # An opening marker without a closing one leaves lines in the buffer.
    # The extent of the table is unknown, so pass them through untouched
    # rather than dropping them.
    new_lines.extend(table_buffer)
    return "\n".join(new_lines)