Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown_it/common/utils.py: 94%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

99 statements  

1"""Utilities for parsing source text""" 

2 

3from __future__ import annotations 

4 

5import re 

6from re import Match 

7from typing import TypeVar 

8import unicodedata 

9 

10from .entities import entities 

11 

12 

13def charCodeAt(src: str, pos: int) -> int | None: 

14 """ 

15 Returns the Unicode value of the character at the specified location. 

16 

17 @param - index The zero-based index of the desired character. 

18 If there is no character at the specified index, NaN is returned. 

19 

20 This was added for compatibility with python 

21 """ 

22 try: 

23 return ord(src[pos]) 

24 except IndexError: 

25 return None 

26 

27 

28def charStrAt(src: str, pos: int) -> str | None: 

29 """ 

30 Returns the Unicode value of the character at the specified location. 

31 

32 @param - index The zero-based index of the desired character. 

33 If there is no character at the specified index, NaN is returned. 

34 

35 This was added for compatibility with python 

36 """ 

37 try: 

38 return src[pos] 

39 except IndexError: 

40 return None 

41 

42 

_ItemTV = TypeVar("_ItemTV")


def arrayReplaceAt(
    src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
) -> list[_ItemTV]:
    """Return a new list in which the item at ``pos`` is replaced by ``newElements``.

    The input lists are not modified; the result is built from slices of
    ``src`` placed on either side of ``newElements``.  Handy when one token
    has to be swapped for several.
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after

54 

55 

def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced from a numeric entity.

    Returns ``False`` for surrogates, the ranges the checks below exclude,
    selected control codes, and anything above ``0x10FFFF``.
    """
    # Surrogate range (a "broken sequence" on its own).
    if 0xD800 <= c <= 0xDFFF:
        return False
    # Range that is never used.
    if 0xFDD0 <= c <= 0xFDEF:
        return False
    # Code points whose low 16 bits are 0xFFFE or 0xFFFF.
    low_word = c & 0xFFFF
    if low_word == 0xFFFF or low_word == 0xFFFE:
        return False
    # Control codes; tab (0x09), LF (0x0A), FF (0x0C) and CR (0x0D) stay valid.
    if 0x00 <= c <= 0x08:
        return False
    if c == 0x0B:
        return False
    if 0x0E <= c <= 0x1F:
        return False
    if 0x7F <= c <= 0x9F:
        return False
    # Anything beyond the Unicode range is invalid.
    return c <= 0x10FFFF

76 

77 

def fromCodePoint(c: int) -> str:
    """Return the one-character string for code point ``c``.

    The JavaScript original needed two string units for code points above
    ``0xFFFF``; a Python 3 string holds any code point as a single character,
    so ``chr`` is all that is required.
    """
    character = chr(c)
    return character

86 

87 

88# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])') 

89# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE) 

# Backslash followed by one escapable ASCII punctuation character (group 1).
_ESCAPED_PUNCT_PATTERN = r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])'
# "&...;" entity reference; the text between "&" and ";" is group 2.
_ENTITY_PATTERN = r"&([a-z#][a-z0-9]{1,31});"
# Either of the two forms above, matched case-insensitively.
UNESCAPE_ALL_RE = re.compile(
    "|".join((_ESCAPED_PUNCT_PATTERN, _ENTITY_PATTERN)),
    flags=re.IGNORECASE,
)
# Numeric entity bodies: "#" + decimal digits, or "#x" + hexadecimal digits.
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", flags=re.IGNORECASE)

96 

97 

def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve one HTML entity reference to the text it stands for.

    See https://spec.commonmark.org/0.30/#entity-references

    :param match: the complete matched source text; returned unchanged when
        the entity cannot be resolved.
    :param name: the entity body, i.e. the part between ``&`` and ``;``.
    :return: the replacement text, or ``match`` as a fallback.
    """
    # Named entities are looked up directly in the entities table.
    if name in entities:
        return entities[name]

    # Otherwise try a decimal, then a hexadecimal, numeric reference.
    decimal = DIGITAL_ENTITY_BASE10_RE.fullmatch(name)
    if decimal is not None:
        codepoint = int(decimal.group(1), 10)
    else:
        hexadecimal = DIGITAL_ENTITY_BASE16_RE.fullmatch(name)
        if hexadecimal is None:
            # Neither a known name nor a numeric reference.
            return match
        codepoint = int(hexadecimal.group(1), 16)

    if isValidEntityCode(codepoint):
        return fromCodePoint(codepoint)

    # Numeric reference to a code point that is not allowed.
    return match

115 

116 

def unescapeAll(string: str) -> str:
    """Undo backslash escapes and expand entity references in ``string``.

    Every match of ``UNESCAPE_ALL_RE`` is replaced: a backslash-escaped
    character becomes the bare character, and an entity reference is handed
    to ``replaceEntityPattern``.
    """
    # Fast path: without a backslash or an ampersand nothing can match.
    if not ("\\" in string or "&" in string):
        return string

    def _substitute(found: Match[str]) -> str:
        literal = found.group(1)
        if literal:
            # Backslash escape: keep only the escaped character.
            return literal
        # Entity reference: group 2 holds the text between "&" and ";".
        return replaceEntityPattern(found.group(), found.group(2))

    return UNESCAPE_ALL_RE.sub(_substitute, string)

128 

129 

# Characters that may be backslash-escaped, written as the inside of a regex
# character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured in group 1).
ESCAPE_CHAR = re.compile("".join((r"\\([", ESCAPABLE, r"])")))


def stripEscape(string: str) -> str:
    """Remove the backslash in front of every escapable character."""
    return ESCAPE_CHAR.sub(lambda found: found.group(1), string)

137 

138 

def escapeHtml(raw: str) -> str:
    """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences.

    Works like ``html.escape`` except that single quotes are left alone.
    """
    # "&" has to come first; otherwise the ampersands introduced by the other
    # replacements would themselves be escaped.
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ('"', "&quot;"),
    )
    for special, safe in substitutions:
        raw = raw.replace(special, safe)
    return raw

147 

148 

149# ////////////////////////////////////////////////////////////////////////////// 

150 

# Characters that have a special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape every regex metacharacter in ``string``.

    Each character matched by ``REGEXP_ESCAPE_RE`` is prefixed with a
    backslash, so the result can be embedded in a pattern and match the
    original text literally.

    :param string: text to be used literally inside a regular expression.
    :return: ``string`` with the special characters escaped.
    """
    # The template is a literal backslash followed by the whole match.
    # The JavaScript form "\\$&" must not be used here: Python's ``re`` does
    # not know "$&" and would insert the text ``\$&`` verbatim.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)

157 

158 

159# ////////////////////////////////////////////////////////////////////////////// 

160 

161 

162def isSpace(code: int | None) -> bool: 

163 """Check if character code is a whitespace.""" 

164 return code in (0x09, 0x20) 

165 

166 

167def isStrSpace(ch: str | None) -> bool: 

168 """Check if character is a whitespace.""" 

169 return ch in ("\t", " ") 

170 

171 

# Individual whitespace code points; the contiguous block 0x2000-0x200A is
# tested separately in isWhiteSpace.
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,  # space
    0xA0,  # no-break space
    0x1680,  # ogham space mark
    0x202F,  # narrow no-break space
    0x205F,  # medium mathematical space
    0x3000,  # ideographic space
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]

    Return ``True`` when ``code`` lies in 0x2000-0x200A or is listed in
    ``MD_WHITESPACE``.
    """
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE

192 

193 

194# ////////////////////////////////////////////////////////////////////////////// 

195 

196 

def isPunctChar(ch: str) -> bool:
    """Return ``True`` when ``ch`` is a punctuation or symbol character.

    The decision is based on the Unicode general category of ``ch``: any
    category starting with ``P`` (punctuation) or ``S`` (symbol) counts.
    """
    major_class = unicodedata.category(ch)[0]
    return major_class == "P" or major_class == "S"

200 

201 

# Code points of the ASCII punctuation characters, as four contiguous runs.
MD_ASCII_PUNCT = {
    *range(0x21, 0x30),  # ! " # $ % & ' ( ) * + , - . /
    *range(0x3A, 0x41),  # : ; < = > ? @
    *range(0x5B, 0x61),  # [ \ ] ^ _ `
    *range(0x7B, 0x7F),  # { | } ~
}


def isMdAsciiPunct(ch: int) -> bool:
    r"""Return ``True`` when code point ``ch`` is Markdown ASCII punctuation.

    ::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Not to be confused with Unicode punctuation, which lacks some of the
    characters in the ASCII range.
    """
    return ch in MD_ASCII_PUNCT

251 

252 

def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    Surrounding whitespace is removed, inner runs of whitespace become one
    space, and the case is normalized by lower-casing and then upper-casing.
    """
    # Trim, then collapse every run of whitespace into a single space.
    trimmed = string.strip()
    collapsed = re.sub(r"\s+", " ", trimmed)

    # The JavaScript original carries a workaround for node v10, where
    # 'ẞ'.toLowerCase() gave a wrong result; it was never ported and is not
    # applied here.

    # Lower-casing followed by upper-casing removes the differences between
    # letter variants, which neither operation does on its own.
    #
    # Lower-casing alone leaves 125 code points unnormalized, and upper-casing
    # alone leaves 6 (İ, ϴ, ẞ, Ω, K, Å: already upper case, yet they have a
    # different upper-case form).
    #
    # Example with the Greek letter theta: upper case U+0398 (Θ) and
    # U+03F4 (ϴ), lower case U+03B8 (θ) and U+03D1 (ϑ).
    #
    # Unicode entries:
    # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # A case-insensitive comparison should treat all four as equal, but
    # lower-casing leaves ϑ untouched (already lower case) and upper-casing
    # leaves ϴ untouched (already upper case).  Doing both in sequence maps
    # all of them to the same character:
    # '\u0398\u03f4\u03b8\u03d1' -> '\u0398\u0398\u0398\u0398'
    #
    # Note: this is equivalent to unicode case folding; unicode normalization
    # is a different step that is not required here.
    #
    # The result is upper case because the JavaScript original stores it as an
    # object key and must not collide with Object.prototype members such as
    # `__proto__`.
    lowered = collapsed.lower()
    return lowered.upper()

302 

303 

# "<a" followed by ">" or whitespace, at the very start of the string.
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
# "</a", optional whitespace, then ">", at the very start of the string.
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Return ``True`` when ``string`` begins with an opening ``<a`` tag."""
    # The pattern is anchored with "^", so matching at position 0 suffices.
    return LINK_OPEN_RE.match(string) is not None


def isLinkClose(string: str) -> bool:
    """Return ``True`` when ``string`` begins with a closing ``</a>`` tag."""
    # The pattern is anchored with "^", so matching at position 0 suffices.
    return LINK_CLOSE_RE.match(string) is not None