Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdown_it/common/utils.py: 92%

83 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:07 +0000

1"""Utilities for parsing source text 

2""" 

3import html 

4import re 

5from typing import Any 

6 

7from .entities import entities 

8 

9 

def charCodeAt(src: str, pos: int) -> Any:
    """Return the Unicode code point of the character at ``pos`` in ``src``.

    Python counterpart of JavaScript's ``String.prototype.charCodeAt``,
    kept for compatibility with the code this module was ported from.

    :param src: the string to read from.
    :param pos: zero-based index of the desired character.
    :return: the code point as an ``int``, or ``None`` (standing in for
        JavaScript's ``NaN``) when there is no character at ``pos``.
    """
    # Python would silently wrap a negative index around to the end of the
    # string; there is no character "at" a negative position, so report that
    # the same way as any other out-of-range index.
    if pos < 0:
        return None
    try:
        return ord(src[pos])
    except IndexError:
        return None

23 

24 

25# Merge objects 

26# 

def assign(obj):
    """Placeholder for merging source objects into ``obj``; not implemented.

    The JavaScript helper this stands in for took ``obj`` followed by any
    number of source objects, skipped falsy sources, threw a ``TypeError``
    for sources that were not objects, copied every key of each remaining
    source onto ``obj`` and returned ``obj``. None of that is done here.

    :param obj: the would-be merge target (unused).
    :raises NotImplementedError: always.
    """
    raise NotImplementedError()

45 

46 

def arrayReplaceAt(src: list, pos: int, newElements: list) -> list:
    """Return a new list where the item at ``pos`` is swapped for ``newElements``.

    ``src`` itself is left untouched; the result is built from the items
    before ``pos``, then ``newElements``, then the items after ``pos``.
    Useful for some operations with tokens.
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after

53 

54 

55###################################################################### 

56 

57 

def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced from a numeric entity.

    :param c: the candidate code point.
    :return: ``False`` for any of the rejected ranges listed below,
        ``True`` otherwise.
    """
    rejected = (
        # broken sequence (surrogate range)
        0xD800 <= c <= 0xDFFF
        # never used
        or 0xFDD0 <= c <= 0xFDEF
        # low 16 bits are FFFF or FFFE
        or (c & 0xFFFF) == 0xFFFF
        or (c & 0xFFFF) == 0xFFFE
        # control codes (tab, LF, FF and CR are deliberately not listed)
        or 0x00 <= c <= 0x08
        or c == 0x0B
        or 0x0E <= c <= 0x1F
        or 0x7F <= c <= 0x9F
        # out of range
        or c > 0x10FFFF
    )
    return not rejected

80 

81 

def fromCodePoint(c: int) -> str:
    """Return the one-character string for code point ``c``.

    The JavaScript original needed two string characters for code points
    above ``0xFFFF``; a Python 3 string holds any Unicode code point as a
    single character, so a plain ``chr`` call is enough.
    """
    character = chr(c)
    return character

90 

91 

# A backslash followed by one ASCII punctuation character (group 1).
UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
# Either a backslash escape (group 1) or an ``&name;`` entity (group 2).
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Numeric entity name: ``#`` plus 1-8 decimal digits, or ``#x`` plus 1-8 hex
# digits (group 1 holds everything after the ``#``).  The pattern is anchored
# at BOTH ends: without the trailing ``$`` a name such as ``#123abc`` passes
# the test on its ``#123`` prefix and the later ``int()`` conversion of the
# full name raises ValueError.
DIGITAL_ENTITY_TEST_RE = re.compile(
    r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))$", re.IGNORECASE
)

99 

100 

def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve an HTML entity name to the character(s) it stands for.

    :param match: the complete text that was matched (for example
        ``"&amp;"``); returned unchanged when the entity cannot be resolved.
    :param name: the entity name, i.e. the text between ``&`` and ``;``
        (for example ``"amp"``, ``"#35"`` or ``"#x23"``).
    :return: the replacement text, or ``match`` itself for unknown names and
        for numeric references that are malformed or denote a code point
        rejected by ``isValidEntityCode``.
    """
    if name in entities:
        return entities[name]

    # Require the WHOLE name to be a numeric reference and convert only the
    # captured digits.  A prefix match is not enough: names such as
    # ``#123abc`` or ``#x1g`` would otherwise reach ``int()`` with trailing
    # garbage and raise ValueError.
    numeric = DIGITAL_ENTITY_TEST_RE.fullmatch(name)
    if numeric is not None:
        digits = numeric.group(1)
        if digits[0] in "xX":
            code = int(digits[1:], 16)
        else:
            code = int(digits, 10)
        if isValidEntityCode(code):
            return fromCodePoint(code)

    return match

120 

121 

122# def replaceEntities(string): 

123# if (string.indexOf('&') < 0): 

124# return string 

125# return string.replace(ENTITY_RE, replaceEntityPattern) 

126 

127 

def unescapeMd(string: str) -> str:
    """Placeholder for removing Markdown backslash escapes; not implemented.

    :param string: the text that would be unescaped (unused).
    :raises NotImplementedError: always.
    """
    # NOTE(review): presumably meant to return ``string`` untouched when it
    # contains no backslash and otherwise replace every UNESCAPE_MD_RE match
    # with the escaped character - confirm before implementing.
    raise NotImplementedError()

133 

134 

def unescapeAll(string: str) -> str:
    """Undo backslash escapes and resolve ``&name;`` entities in ``string``.

    Every UNESCAPE_ALL_RE match is replaced: a backslash escape becomes the
    escaped character alone, an entity is handed to ``replaceEntityPattern``
    together with the full matched text.

    :param string: the text to process.
    :return: the processed text; ``string`` itself when it contains neither
        a backslash nor an ampersand.
    """
    # Fast path: nothing in the string can possibly match.
    if not ("\\" in string or "&" in string):
        return string

    def _substitute(found):
        backslashed = found.group(1)
        if backslashed:
            return backslashed
        return replaceEntityPattern(found.group(), found.group(2))

    return UNESCAPE_ALL_RE.sub(_substitute, string)

146 

147 

# Characters that may follow a backslash, spelled so the string can be
# dropped straight into a regex character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured in group 1).
ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")


def stripEscape(string: str) -> str:
    """Drop the backslash in front of every escapable character.

    Backslashes that precede any other character are kept.
    """
    return ESCAPE_CHAR.sub(lambda found: found.group(1), string)

155 

156 

157# ////////////////////////////////////////////////////////////////////////////// 

158 

159# TODO This section changed quite a lot, should re-check 

160 

161# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))") 

162# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))") 

163# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]') 

164 

165 

166# def escapeHtml(string: str): 

167 

168# if HTML_ESCAPE_REPLACE_RE.search(string): 

169 

170# string = UNESCAPE_HTML_RE.sub("&", string) 

171# string = ESCAPE_AND_HTML.sub("&amp;", string) 

172# for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items(): 

173# string = string.replace(k, v) 

174 

175# return string 

176 

177 

def escapeHtml(raw: str) -> str:
    """Escape ``&``, ``<``, ``>`` and ``"`` for use in HTML.

    Single quotes are left as they are. Existing entities in ``raw`` are not
    recognised: their ``&`` is escaped like any other.
    """
    # return html.escape(html.unescape(raw)).replace("&#x27;", "'")
    without_quotes = html.escape(raw, quote=False)
    return without_quotes.replace('"', "&quot;")

181 

182 

183# ////////////////////////////////////////////////////////////////////////////// 

184 

# Every character that has a special meaning in a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape the regex metacharacters in ``string``.

    :param string: literal text to embed in a regular expression.
    :return: ``string`` with a backslash inserted before each character
        matched by ``REGEXP_ESCAPE_RE``.
    """
    # ``\g<0>`` is Python's way to refer to the whole match.  The JavaScript
    # spelling ``$&`` has no meaning in a Python replacement template and
    # would be inserted literally in place of the matched character.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)

191 

192 

193# ////////////////////////////////////////////////////////////////////////////// 

194 

195 

def isSpace(code: object) -> bool:
    """Tell whether ``code`` is the code point of a tab or a plain space."""
    is_tab_or_space = code in {0x09, 0x20}
    return is_tab_or_space

198 

199 

# Whitespace code points outside the contiguous 0x2000-0x200A run (that run
# is tested separately in isWhiteSpace).
MD_WHITESPACE = set(range(0x09, 0x0E)) | {  # \t \n \v \f \r
    0x20,
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]

    :param code: the code point to classify.
    :return: ``True`` when ``code`` lies in 0x2000-0x200A or is listed in
        ``MD_WHITESPACE``.
    """
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE

220 

221 

222# ////////////////////////////////////////////////////////////////////////////// 

223 

# Character class of punctuation code points, followed by alternatives
# spelled as pairs of \uD8xx / \uDCxx-\uDFxx escapes.
UNICODE_PUNCT_RE = re.compile(
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]"  # noqa: E501
)


# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Tell whether ``ch`` contains a match for ``UNICODE_PUNCT_RE``."""
    # NOTE(review): the \uD8xx[...] alternatives look like UTF-16 surrogate
    # pairs; a Python str holds an astral character as one code point, so
    # they presumably never match real text - confirm before relying on them.
    return bool(UNICODE_PUNCT_RE.search(ch))

232 

233 

# Code points of the 32 ASCII punctuation characters:
# 0x21-0x2F, 0x3A-0x40, 0x5B-0x60 and 0x7B-0x7E.
MD_ASCII_PUNCT = {ord(mark) for mark in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"}


def isMdAsciiPunct(ch: int) -> bool:
    r"""Tell whether code point ``ch`` is a Markdown ASCII punctuation character.

    ::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.

    """  # noqa: E501
    return ch in MD_ASCII_PUNCT

283 

284 

def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    Surrounding whitespace is removed, every inner run of whitespace becomes
    one space, and the letters are case-normalized by lower-casing and then
    upper-casing the result.
    """
    # Trim and collapse whitespace.
    trimmed = string.strip()
    collapsed = re.sub(r"\s+", " ", trimmed)

    # Lower-casing followed by upper-casing gets rid of all differences
    # between letter variants; neither step is enough on its own.
    #
    # Example with the Greek letter theta: uppercase U+0398 and U+03F4,
    # lowercase U+03B8 and U+03D1.  A case-insensitive comparison should
    # treat all four as equivalent, but lower-casing alone leaves U+03D1
    # as it is (already lowercase) and upper-casing alone leaves U+03F4 as
    # it is (already uppercase).  Doing both in sequence maps
    # '\u0398\u03f4\u03b8\u03d1' to '\u0398\u0398\u0398\u0398'.
    #
    # The upstream JavaScript notes describe this as equivalent to Unicode
    # case folding (Unicode normalization is a different step and is not
    # required here), and ask for an uppercase final result because the
    # label is later used as an object key there, which avoids clashing
    # with members such as `__proto__`.
    lowered = collapsed.lower()
    return lowered.upper()