Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdown_it/common/utils.py: 89%

99 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:15 +0000

1"""Utilities for parsing source text 

2""" 

3from __future__ import annotations 

4 

5import re 

6from typing import Match, TypeVar 

7 

8from .entities import entities 

9 

10 

def charCodeAt(src: str, pos: int) -> int | None:
    """Return the code point of the character at ``pos`` in ``src``.

    Mirrors JavaScript's ``String.prototype.charCodeAt``, except that an
    out-of-range position yields ``None`` instead of ``NaN``.

    :param src: the string to read from
    :param pos: index of the wanted character (normal Python indexing, so a
        negative value counts from the end)
    :return: ``ord`` of the character, or ``None`` when indexing raises
        ``IndexError``
    """
    try:
        char = src[pos]
    except IndexError:
        return None
    return ord(char)

24 

25 

def charStrAt(src: str, pos: int) -> str | None:
    """Return the character at ``pos`` in ``src`` as a one-character string.

    String-valued counterpart of ``charCodeAt``: an out-of-range position
    yields ``None`` rather than raising.

    :param src: the string to read from
    :param pos: index of the wanted character (normal Python indexing, so a
        negative value counts from the end)
    :return: ``src[pos]``, or ``None`` when indexing raises ``IndexError``
    """
    try:
        char = src[pos]
    except IndexError:
        char = None
    return char

39 

40 

_ItemTV = TypeVar("_ItemTV")


def arrayReplaceAt(
    src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
) -> list[_ItemTV]:
    """Return a new list in which the item at ``pos`` is swapped for ``newElements``.

    The single element ``src[pos]`` is dropped and every item of
    ``newElements`` is spliced in where it stood. ``src`` is left untouched;
    the result is a fresh list.
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after

52 

53 

def isValidEntityCode(c: int) -> bool:
    """Tell whether ``c`` is acceptable as a numeric character reference.

    Rejected values: the surrogate range, the U+FDD0..U+FDEF block, any code
    whose low 16 bits are 0xFFFE or 0xFFFF, the control codes listed below,
    and anything above U+10FFFF.
    """
    # broken sequence (surrogates)
    if 0xD800 <= c <= 0xDFFF:
        return False
    # never used
    if 0xFDD0 <= c <= 0xFDEF:
        return False
    if (c & 0xFFFF) in (0xFFFF, 0xFFFE):
        return False
    # control codes
    if (
        0x00 <= c <= 0x08
        or c == 0x0B
        or 0x0E <= c <= 0x1F
        or 0x7F <= c <= 0x9F
    ):
        return False
    # everything left is valid unless it is out of range
    return c <= 0x10FFFF

76 

77 

def fromCodePoint(c: int) -> str:
    """Return the one-character string for code point ``c``.

    The JavaScript original had to emit two string units for code points
    above ``0xFFFF``; a Python 3 ``str`` holds any code point in a single
    character, so this is a thin wrapper over ``chr``.
    """
    character = chr(c)
    return character

86 

87 

# Reference copies of the two sub-patterns that UNESCAPE_ALL_RE joins with "|"
# (note: the entity form used below additionally requires the trailing ";").
# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)

# Group 1: the punctuation character following a backslash escape.
# Group 2: the entity name found between "&" and ";" (may start with "#").
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# "#" followed by 1-8 decimal digits; group 1 holds the digits.
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
# "#x" followed by 1-8 hexadecimal digits (case-insensitive); group 1 holds the digits.
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)

96 

97 

def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve one HTML entity reference to its replacement text.

    See https://spec.commonmark.org/0.30/#entity-references

    :param match: the full matched text, returned unchanged when the entity
        cannot be resolved
    :param name: the entity name without the surrounding ``&`` and ``;``
    :return: the value from ``entities`` for a known name, the decoded
        character for a valid decimal/hexadecimal reference, otherwise ``match``
    """
    if name in entities:
        return entities[name]

    # Numeric reference: try decimal ("#123") first, then hexadecimal ("#x7B").
    decimal = DIGITAL_ENTITY_BASE10_RE.fullmatch(name)
    if decimal is not None:
        code = int(decimal.group(1), 10)
    else:
        hexadecimal = DIGITAL_ENTITY_BASE16_RE.fullmatch(name)
        if hexadecimal is None:
            # Neither a known name nor a numeric reference.
            return match
        code = int(hexadecimal.group(1), 16)

    return fromCodePoint(code) if isValidEntityCode(code) else match

115 

116 

def unescapeAll(string: str) -> str:
    """Undo backslash escapes and resolve entity references in ``string``.

    Every match of ``UNESCAPE_ALL_RE`` is rewritten: a backslash-escaped
    character becomes the bare character, and an ``&name;`` reference is
    handed to ``replaceEntityPattern``.
    """
    # Fast path: without a backslash or ampersand nothing can match.
    if not ("\\" in string or "&" in string):
        return string

    def _substitute(found: Match[str]) -> str:
        backslashed = found.group(1)
        if backslashed:
            return backslashed
        # Otherwise the entity alternative matched; group 2 is its name.
        return replaceEntityPattern(found.group(), found.group(2))

    return UNESCAPE_ALL_RE.sub(_substitute, string)

128 

129 

# Characters that may follow a backslash escape, written as the inside of a
# regex character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured as group 1).
ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")


def stripEscape(string: str) -> str:
    """Drop the backslash from every backslash-escaped punctuation character.

    Backslashes in front of characters outside ``ESCAPABLE`` are kept.
    """

    def _bare_character(found):
        return found.group(1)

    return ESCAPE_CHAR.sub(_bare_character, string)

137 

138 

def escapeHtml(raw: str) -> str:
    """Make ``raw`` HTML-safe by escaping ``&``, ``<``, ``>`` and ``"``.

    Behaves like ``html.escape`` except that single quotes are left alone.
    """
    # The ampersand has to be handled first, otherwise the "&" introduced by
    # the later replacements would itself be escaped again.
    return (
        raw.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace('"', "&quot;")
    )

147 

148 

149# ////////////////////////////////////////////////////////////////////////////// 

150 

# Characters that carry special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape every regex metacharacter in ``string``.

    :param string: arbitrary text that should be matched literally
    :return: ``string`` with a backslash inserted before each character
        matched by ``REGEXP_ESCAPE_RE``; text without such characters (and
        the empty string) is returned unchanged
    """
    # "\g<0>" is Python's whole-match backreference. The JavaScript-style "$&"
    # has no special meaning to ``re`` and would be inserted as literal text,
    # replacing the metacharacter instead of escaping it.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)

157 

158 

159# ////////////////////////////////////////////////////////////////////////////// 

160 

161 

def isSpace(code: int | None) -> bool:
    """Return True when ``code`` is the code point of a tab or a space.

    ``None`` (no character) is never a space.
    """
    return code == 0x09 or code == 0x20

165 

166 

def isStrSpace(ch: str | None) -> bool:
    """Return True when ``ch`` is a tab or a space character.

    ``None`` (no character) is never a space.
    """
    return ch == "\t" or ch == " "

170 

171 

# Individual whitespace code points; the contiguous run U+2000..U+200A is
# checked separately inside isWhiteSpace.
MD_WHITESPACE = {
    # ASCII: \t \n \v \f \r and space
    0x09,
    0x0A,
    0x0B,
    0x0C,
    0x0D,
    0x20,
    # Non-ASCII entries
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Whitespace test: Zs (unicode class) || [\t\f\v\r\n].

    True for any code point in U+2000..U+200A or listed in ``MD_WHITESPACE``.
    """
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE

192 

193 

194# ////////////////////////////////////////////////////////////////////////////// 

195 

# One big character class of punctuation code points, followed by alternatives
# spelled as UTF-16 surrogate pairs (presumably inherited from a JavaScript
# pattern - see the note on isPunctChar about astral characters).
UNICODE_PUNCT_RE = re.compile(
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]"  # noqa: E501
)


# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Return True when ``UNICODE_PUNCT_RE`` finds a punctuation character in ``ch``."""
    found = UNICODE_PUNCT_RE.search(ch)
    return found is not None

205 

206 

# Code points of the 32 ASCII punctuation characters, given as the four
# contiguous runs they occupy in the ASCII table.
MD_ASCII_PUNCT = {
    *range(0x21, 0x30),  # ! " # $ % & ' ( ) * + , - . /
    *range(0x3A, 0x41),  # : ; < = > ? @
    *range(0x5B, 0x61),  # [ \ ] ^ _ `
    *range(0x7B, 0x7F),  # { | } ~
}


def isMdAsciiPunct(ch: int) -> bool:
    r"""Return True when code point ``ch`` is Markdown ASCII punctuation.

    The accepted characters are::

        ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    This is not the same set as unicode punctuation, which lacks some
    characters in the ASCII range.
    """
    return ch in MD_ASCII_PUNCT

256 

257 

def normalizeReference(string: str) -> str:
    """Bring a [reference label] into a canonical form for comparison.

    Surrounding whitespace is removed, inner whitespace runs collapse to a
    single space, and letter case is unified by lower-casing and then
    upper-casing the result.
    """
    # Trim, then collapse every run of whitespace into one space.
    trimmed = string.strip()
    collapsed = re.sub(r"\s+", " ", trimmed)

    # Lower-casing followed by upper-casing removes the differences between
    # letter variants that a single conversion leaves behind.
    #
    # Example with the Greek letter theta, which has two upper-case forms,
    # U+0398 and U+03F4, and two lower-case forms, U+03B8 and U+03D1:
    #
    #   0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    #   03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    #   03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    #   03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # A case-insensitive comparison should treat all four as equal, but
    # lower-casing alone leaves U+03D1 as it is (already lower case) and
    # upper-casing alone leaves U+03F4 as it is (already upper case).
    # Doing both in sequence maps every one of them to U+0398.
    #
    # This is case folding only; unicode normalization is a separate step
    # that is not needed here.
    #
    # The result is upper case because the JavaScript original stores it as
    # an object key, where this avoids clashes with Object.prototype members
    # such as `__proto__`.
    lowered = collapsed.lower()
    return lowered.upper()

307 

308 

# "<a" at the very start, followed by ">" or whitespace (case-insensitive).
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
# "</a", optional whitespace, then ">" at the very start (case-insensitive).
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Return True when ``string`` begins with an opening ``<a`` tag."""
    return LINK_OPEN_RE.search(string) is not None


def isLinkClose(string: str) -> bool:
    """Return True when ``string`` begins with a closing ``</a>`` tag."""
    return LINK_CLOSE_RE.search(string) is not None