Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdown

1"""Utilities for parsing source text

2"""

3from __future__ import annotations

5import re

6from typing import Match, TypeVar

8from .entities import entities

def charCodeAt(src: str, pos: int) -> int | None:
    """Return the Unicode code point of the character at ``pos`` in ``src``.

    This mirrors JavaScript's ``String.prototype.charCodeAt`` for code that
    was ported to Python.

    :param src: the string to read from
    :param pos: the zero-based index of the desired character; negative
        values follow normal Python indexing and count from the end
    :return: the code point, or ``None`` (where JavaScript would give
        ``NaN``) when there is no character at ``pos``
    """
    try:
        return ord(src[pos])
    except IndexError:
        return None

def charStrAt(src: str, pos: int) -> str | None:
    """Return the character at ``pos`` in ``src``.

    String-valued counterpart of ``charCodeAt``: it hands back the character
    itself rather than its code point.

    :param src: the string to read from
    :param pos: the zero-based index of the desired character; negative
        values follow normal Python indexing and count from the end
    :return: the one-character string, or ``None`` when there is no
        character at ``pos``
    """
    try:
        return src[pos]
    except IndexError:
        return None

_ItemTV = TypeVar("_ItemTV")


def arrayReplaceAt(
    src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
) -> list[_ItemTV]:
    """Return a copy of ``src`` with the item at ``pos`` replaced by a list.

    The single element at index ``pos`` is dropped and every element of
    ``newElements`` is spliced in at that position. Useful for some
    operations with tokens.

    :param src: the source list (not modified)
    :param pos: index of the element to replace
    :param newElements: the elements to insert in its place
    :return: a new list
    """
    return src[:pos] + newElements + src[pos + 1 :]

def isValidEntityCode(c: int) -> bool:
    """Check whether code point ``c`` may be produced by a numeric entity.

    :param c: the candidate code point
    :return: ``False`` for surrogates, the never-used ranges, most control
        codes and anything above U+10FFFF; ``True`` otherwise
    """
    # broken sequence
    if 0xD800 <= c <= 0xDFFF:
        return False
    # never used
    if 0xFDD0 <= c <= 0xFDEF:
        return False
    # the last two code points of every 0x10000-sized block (xxFFFE, xxFFFF)
    if (c & 0xFFFF) in (0xFFFF, 0xFFFE):
        return False
    # control codes (tab 0x09, LF 0x0A, FF 0x0C and CR 0x0D stay allowed)
    if 0x00 <= c <= 0x08:
        return False
    if c == 0x0B:
        return False
    if 0x0E <= c <= 0x1F:
        return False
    if 0x7F <= c <= 0x9F:
        return False
    # out of range
    if c > 0x10FFFF:
        return False
    return True

def fromCodePoint(c: int) -> str:
    """Convert an ordinal to the corresponding unicode character.

    Note, in the original Javascript two string characters were required
    for codepoints larger than `0xFFFF`.
    But Python 3 can represent any unicode codepoint in one character.

    :param c: the code point to convert
    :return: a one-character string
    """
    return chr(c)

# The two halves of UNESCAPE_ALL_RE, kept for reference:
# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)

# Group 1: the character following a backslash escape.
# Group 2: the name of a `&name;` / `&#nnn;` / `&#xhhh;` entity.
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Numeric entity names (the text between `&` and `;`); group 1 is the digits.
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)

def replaceEntityPattern(match: str, name: str) -> str:
    """Convert HTML entity patterns,
    see https://spec.commonmark.org/0.30/#entity-references

    :param match: the full matched text, returned unchanged when the entity
        cannot be resolved
    :param name: the entity name, i.e. the text between ``&`` and ``;``
    :return: the replacement text for the entity, or ``match``
    """
    # Named entity: direct lookup in the imported ``entities`` table.
    if name in entities:
        return entities[name]

    # Numeric entity: decimal (``#123``) or hexadecimal (``#x7B``).
    code: None | int = None
    if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
        code = int(pat.group(1), 10)
    elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
        code = int(pat.group(1), 16)

    if code is not None and isValidEntityCode(code):
        return fromCodePoint(code)

    # Unknown name or disallowed code point: leave the source text as is.
    return match

115

116

def unescapeAll(string: str) -> str:
    """Resolve backslash escapes and HTML entities in ``string``.

    :param string: the text to process
    :return: the text with every ``UNESCAPE_ALL_RE`` match replaced: an
        escaped punctuation character loses its backslash, and an entity is
        passed through ``replaceEntityPattern``
    """

    def replacer_func(match: Match[str]) -> str:
        escaped = match.group(1)
        if escaped:
            # backslash escape: keep only the escaped character
            return escaped
        entity = match.group(2)
        return replaceEntityPattern(match.group(), entity)

    # Fast path: nothing to unescape without a backslash or an ampersand.
    if "\\" not in string and "&" not in string:
        return string
    return UNESCAPE_ALL_RE.sub(replacer_func, string)

128

129

# Characters that may be backslash-escaped, written as the inside of a regex
# character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")


def stripEscape(string: str) -> str:
    """Strip escape \\ characters.

    :param string: the text to process
    :return: the text with the backslash removed from in front of every
        escapable character; other backslashes are left in place
    """
    return ESCAPE_CHAR.sub(r"\1", string)

137

138

def escapeHtml(raw: str) -> str:
    """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences.

    Like ``html.escape``, but without escaping single quotes.

    :param raw: the text to escape
    :return: the escaped text
    """
    # "&" must be handled first, otherwise the ampersands introduced by the
    # later replacements would be escaped a second time.
    raw = raw.replace("&", "&amp;")
    raw = raw.replace("<", "&lt;")
    raw = raw.replace(">", "&gt;")
    raw = raw.replace('"', "&quot;")
    return raw

147

148

149# //////////////////////////////////////////////////////////////////////////////

150

# Characters that have a special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape the regex special characters in ``string``.

    :param string: the literal text to escape
    :return: the text with a backslash inserted before every character
        matched by ``REGEXP_ESCAPE_RE``
    """
    # ``\g<0>`` is Python's spelling of the whole match; the JavaScript form
    # ``$&`` is not understood by ``re`` and would be inserted literally.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)

157

158

159# //////////////////////////////////////////////////////////////////////////////

160

161

def isSpace(code: int | None) -> bool:
    """Check if character code is a whitespace.

    :param code: a character code, or ``None``
    :return: ``True`` only for tab (0x09) and space (0x20)
    """
    return code in (0x09, 0x20)

165

166

def isStrSpace(ch: str | None) -> bool:
    """Check if character is a whitespace.

    :param ch: a character, or ``None``
    :return: ``True`` only for a tab or a space
    """
    return ch in ("\t", " ")

170

171

# Whitespace code points other than the contiguous U+2000..U+200A run, which
# isWhiteSpace checks as a range.
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,  # space
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]

    :param code: the character code to test
    :return: ``True`` when ``code`` lies in U+2000..U+200A or is listed in
        ``MD_WHITESPACE``
    """
    if 0x2000 <= code <= 0x200A:
        return True
    return code in MD_WHITESPACE

192

193

194# //////////////////////////////////////////////////////////////////////////////

195

# NOTE(review): the `\uD8xx[...]` alternatives are UTF-16 surrogate pairs,
# presumably carried over from a JavaScript original; they cannot match a
# single astral character of a Python string.
UNICODE_PUNCT_RE = re.compile(
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]"  # noqa: E501
)


# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Check if character is a punctuation character.

    :param ch: the character to test
    :return: ``True`` when ``UNICODE_PUNCT_RE`` matches somewhere in ``ch``
    """
    return UNICODE_PUNCT_RE.search(ch) is not None

205

206

# Code points of the 32 Markdown ASCII punctuation characters:
# 0x21-0x2F, 0x3A-0x40, 0x5B-0x60 and 0x7B-0x7E.
MD_ASCII_PUNCT = {ord(_ch) for _ch in r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""}


def isMdAsciiPunct(ch: int) -> bool:
    """Markdown ASCII punctuation characters.

    ::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @,
        [, \\, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.

    :param ch: the character code to test
    :return: ``True`` when ``ch`` is one of the code points listed above
    """
    return ch in MD_ASCII_PUNCT

256

257

def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    :param string: the raw label text
    :return: the label with surrounding whitespace trimmed, inner whitespace
        runs collapsed to one space, and letter case normalised
    """
    # Trim and collapse whitespace
    string = re.sub(r"\s+", " ", string.strip())

    # The JavaScript original special-cased 'ẞ' here to work around a
    # `toLowerCase()` bug in node v10; that workaround is not needed in Python.

    # .lower().upper() should get rid of all differences between letter
    # variants.
    #
    # Simple lower-casing doesn't normalize 125 code points correctly,
    # and upper-casing doesn't normalize 6 of them (list of exceptions:
    # İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently
    # uppercased versions).
    #
    # Here's an example showing how it happens. Let's take the greek letter
    # theta: uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ),
    # U+03d1 (ϑ)
    #
    # Unicode entries:
    # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # Case-insensitive comparison should treat all of them as equivalent.
    #
    # But lower-casing doesn't change ϑ (it's already lowercase),
    # and upper-casing doesn't change ϴ (already uppercase).
    #
    # Applying first lower then upper case normalizes any character:
    # '\u0398\u03f4\u03b8\u03d1'.lower().upper() == '\u0398\u0398\u0398\u0398'
    #
    # Note: this is equivalent to unicode case folding; unicode normalization
    # is a different step that is not required here.
    #
    # The final result is uppercased because the JavaScript original stores
    # it as an object key, and uppercase avoids a conflict with
    # Object.prototype members (most notably, `__proto__`).
    return string.lower().upper()

307

308

# An opening `<a` tag at the very start of the string, followed by `>` or
# whitespace (so `<abbr>` does not qualify).
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
# A closing `</a>` tag at the very start of the string; whitespace is allowed
# before the `>`.
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Check whether ``string`` starts with an opening ``<a`` tag.

    :param string: the text to test
    :return: ``True`` when ``LINK_OPEN_RE`` matches (case-insensitive)
    """
    return LINK_OPEN_RE.search(string) is not None


def isLinkClose(string: str) -> bool:
    """Check whether ``string`` starts with a closing ``</a>`` tag.

    :param string: the text to test
    :return: ``True`` when ``LINK_CLOSE_RE`` matches (case-insensitive)
    """
    return LINK_CLOSE_RE.search(string) is not None

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdown_it/common/utils.py: 89%

99 statements