Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown_it/common/utils.py: 94%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

99 statements  

1"""Utilities for parsing source text""" 

2 

3from __future__ import annotations 

4 

5import re 

6from re import Match 

7from typing import TypeVar 

8 

9from .entities import entities 

10 

11 

12def charCodeAt(src: str, pos: int) -> int | None: 

13 """ 

14 Returns the Unicode value of the character at the specified location. 

15 

16 @param - index The zero-based index of the desired character. 

17 If there is no character at the specified index, NaN is returned. 

18 

19 This was added for compatibility with python 

20 """ 

21 try: 

22 return ord(src[pos]) 

23 except IndexError: 

24 return None 

25 

26 

27def charStrAt(src: str, pos: int) -> str | None: 

28 """ 

29 Returns the Unicode value of the character at the specified location. 

30 

31 @param - index The zero-based index of the desired character. 

32 If there is no character at the specified index, NaN is returned. 

33 

34 This was added for compatibility with python 

35 """ 

36 try: 

37 return src[pos] 

38 except IndexError: 

39 return None 

40 

41 

_ItemTV = TypeVar("_ItemTV")


def arrayReplaceAt(
    src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
) -> list[_ItemTV]:
    """
    Build a new list in which the element at ``pos`` is replaced by the
    items of ``newElements``.

    ``src`` itself is not modified; the result is assembled from slices.
    Handy when one token has to be swapped for several.
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after

53 

54 

def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced by a numeric entity.

    Rejected: the surrogate range, the ``U+FDD0..U+FDEF`` block, any value
    whose low 16 bits are ``0xFFFE``/``0xFFFF``, most C0/C1 control codes
    (tab, LF, FF and CR are allowed), and anything above ``0x10FFFF``.
    """
    # broken sequence (surrogates)
    if 0xD800 <= c <= 0xDFFF:
        return False
    # never used
    if 0xFDD0 <= c <= 0xFDEF:
        return False
    if (c & 0xFFFF) in (0xFFFF, 0xFFFE):
        return False
    # control codes
    is_control = (
        0x00 <= c <= 0x08 or c == 0x0B or 0x0E <= c <= 0x1F or 0x7F <= c <= 0x9F
    )
    if is_control:
        return False
    # out of range
    return c <= 0x10FFFF

75 

76 

def fromCodePoint(c: int) -> str:
    """Turn a code point into the corresponding one-character string.

    The JavaScript original needed two string units (a surrogate pair)
    for code points above `0xFFFF`; a Python 3 string holds any code
    point as a single character, so ``chr`` is all that is required.
    """
    character = chr(c)
    return character

85 

86 

# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)

# Combination of the two patterns above, as one alternation:
#   group 1 - an ASCII punctuation character that follows a backslash
#   group 2 - the body of an `&...;` reference (text between `&` and `;`)
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Body of a decimal numeric reference: `#` plus 1 to 8 decimal digits (group 1).
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
# Body of a hexadecimal numeric reference: `#x` plus 1 to 8 hex digits
# (group 1); case-insensitive, so `#X` and upper-case digits match too.
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)

95 

96 

def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve one HTML entity reference to the text it stands for.

    See https://spec.commonmark.org/0.30/#entity-references

    :param match: the complete matched source text; returned unchanged
        when the reference cannot be resolved
    :param name: the reference body, i.e. the text between ``&`` and ``;``
    :return: the value from ``entities`` for a known name, the decoded
        character for a valid decimal/hexadecimal reference, else ``match``
    """
    if name in entities:
        return entities[name]

    decimal = DIGITAL_ENTITY_BASE10_RE.fullmatch(name)
    if decimal is not None:
        code = int(decimal.group(1), 10)
    else:
        hexadecimal = DIGITAL_ENTITY_BASE16_RE.fullmatch(name)
        if hexadecimal is None:
            # neither a known name nor a numeric reference
            return match
        code = int(hexadecimal.group(1), 16)

    return fromCodePoint(code) if isValidEntityCode(code) else match

114 

115 

def unescapeAll(string: str) -> str:
    """Undo backslash escapes and resolve entity references in ``string``.

    Every match of ``UNESCAPE_ALL_RE`` is rewritten: a backslash-escaped
    character becomes the bare character, and an entity reference is
    handed to ``replaceEntityPattern``.
    """
    # Fast path: with no backslash and no ampersand nothing can match.
    if not ("\\" in string or "&" in string):
        return string

    def _substitute(found: Match[str]) -> str:
        backslash_escaped = found.group(1)
        if backslash_escaped:
            return backslash_escaped
        return replaceEntityPattern(found.group(), found.group(2))

    return UNESCAPE_ALL_RE.sub(_substitute, string)

127 

128 

# Characters that a backslash may escape, written so the text can be
# dropped straight into a regex character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured as group 1).
ESCAPE_CHAR = re.compile(rf"\\([{ESCAPABLE}])")


def stripEscape(string: str) -> str:
    r"""Drop the backslash from every ``\<escapable char>`` pair."""

    def _bare_character(found: re.Match[str]) -> str:
        return found.group(1)

    return ESCAPE_CHAR.sub(_bare_character, string)

136 

137 

def escapeHtml(raw: str) -> str:
    """Make ``raw`` HTML-safe by encoding "&", "<", ">" and '"'.

    Works like ``html.escape`` except that single quotes are left alone.
    """
    substitutions = (
        ("&", "&amp;"),  # has to come first, the later outputs contain "&"
        ("<", "&lt;"),
        (">", "&gt;"),
        ('"', "&quot;"),
    )
    for special, encoded in substitutions:
        raw = raw.replace(special, encoded)
    return raw

146 

147 

148# ////////////////////////////////////////////////////////////////////////////// 

149 

# Every character that has a special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    r"""Backslash-escape the regex metacharacters in ``string``.

    :param string: text to be embedded literally in a regular expression
    :return: ``string`` with a backslash inserted before each character
        matched by ``REGEXP_ESCAPE_RE``

    The replacement template is ``\\`` (one literal backslash) followed by
    ``\g<0>`` (the matched character). The JavaScript spelling ``$&`` has
    no meaning in Python's ``re.sub`` and would be inserted verbatim.
    """
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)

156 

157 

158# ////////////////////////////////////////////////////////////////////////////// 

159 

160 

161def isSpace(code: int | None) -> bool: 

162 """Check if character code is a whitespace.""" 

163 return code in (0x09, 0x20) 

164 

165 

166def isStrSpace(ch: str | None) -> bool: 

167 """Check if character is a whitespace.""" 

168 return ch in ("\t", " ") 

169 

170 

# Individual whitespace code points; the contiguous run 0x2000..0x200A is
# tested separately in isWhiteSpace.
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,  # space
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]

    True when ``code`` lies in ``0x2000..0x200A`` or is one of the code
    points listed in ``MD_WHITESPACE``.
    """
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE

191 

192 

193# ////////////////////////////////////////////////////////////////////////////// 

194 

195UNICODE_PUNCT_RE = re.compile( 

196 r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]" 

197) 

198 

199 

200# Currently without astral characters support. 

201def isPunctChar(ch: str) -> bool: 

202 """Check if character is a punctuation character.""" 

203 return UNICODE_PUNCT_RE.search(ch) is not None 

204 

205 

# Code points of the 32 ASCII punctuation characters.
MD_ASCII_PUNCT = {ord(char) for char in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}


def isMdAsciiPunct(ch: int) -> bool:
    """Tell whether code point ``ch`` is Markdown ASCII punctuation.

    The accepted characters are::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \\, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    This is not the same set as unicode punctuation, which lacks some of
    the characters in the ascii range.
    """
    is_punct = ch in MD_ASCII_PUNCT
    return is_punct

255 

256 

def normalizeReference(string: str) -> str:
    """Bring a [reference label] into the canonical form used for lookups."""
    # Step 1: drop surrounding whitespace, then squeeze every inner run of
    # whitespace down to one space.
    trimmed = string.strip()
    collapsed = re.sub(r"\s+", " ", trimmed)

    # Note kept from the JavaScript original:
    # in node v10 'ẞ'.toLowerCase() === 'Ṿ', presumably a bug that v12 fixed
    # (no details found), so the JS code special-cased that letter until
    # node v10 support could be dropped:
    #
    # if ('ẞ'.toLowerCase() === 'Ṿ') {
    #   str = str.replace(/ẞ/g, 'ß')
    # }

    # Step 2: lower-case, then upper-case. Doing both removes every
    # difference between variants of the same letter.
    #
    # Lower-casing on its own fails to normalize 125 code points, and
    # upper-casing on its own fails for 6 (İ, ϴ, ẞ, Ω, K, Å: they already
    # are upper case but also have another upper-case form).
    #
    # Example with the greek letter theta, which has the upper-case forms
    # U+0398 (Θ) and U+03f4 (ϴ) and the lower-case forms U+03b8 (θ) and
    # U+03d1 (ϑ).
    #
    # Unicode entries:
    # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # A case-insensitive comparison has to consider all four equal, yet
    # .toLowerCase() leaves ϑ alone (already lower case) and
    # .toUpperCase() leaves ϴ alone (already upper case).
    #
    # Lower-casing first and upper-casing second normalizes all of them:
    # '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398'
    #
    # Note: this amounts to unicode case folding; unicode normalization is
    # a separate step and is not needed here.
    #
    # The result is upper-cased because the JavaScript original stores it
    # as an object key, where this avoids clashing with Object.prototype
    # members (most notably `__proto__`).
    lowered = collapsed.lower()
    return lowered.upper()

306 

307 

# `<a` at the very start, directly followed by `>` or whitespace.
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
# `</a`, optional whitespace, `>` at the very start.
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Tell whether ``string`` begins with an opening ``<a>`` tag."""
    return LINK_OPEN_RE.search(string) is not None


def isLinkClose(string: str) -> bool:
    """Tell whether ``string`` begins with a closing ``</a>`` tag."""
    return LINK_CLOSE_RE.search(string) is not None