Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown

1"""Utilities for parsing source text"""

3from __future__ import annotations

5import re

6from re import Match

7from typing import TypeVar

8import unicodedata

10from .entities import entities

def charCodeAt(src: str, pos: int) -> int | None:
    """Return the Unicode code point of the character at ``pos`` in ``src``.

    The name follows JavaScript's ``String.prototype.charCodeAt``; where that
    method yields ``NaN`` for a missing character, this function returns
    ``None``.

    :param src: the string to read from.
    :param pos: index of the desired character. Ordinary Python indexing
        applies, so a negative index counts from the end of the string.
    :return: the ordinal of the character, or ``None`` if ``pos`` is out of
        range.
    """
    try:
        return ord(src[pos])
    except IndexError:
        return None

def charStrAt(src: str, pos: int) -> str | None:
    """Return the character at ``pos`` in ``src`` as a one-character string.

    :param src: the string to read from.
    :param pos: index of the desired character. Ordinary Python indexing
        applies, so a negative index counts from the end of the string.
    :return: the character, or ``None`` if ``pos`` is out of range.
    """
    try:
        return src[pos]
    except IndexError:
        return None

_ItemTV = TypeVar("_ItemTV")


def arrayReplaceAt(
    src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
) -> list[_ItemTV]:
    """Return a new list with the element at ``pos`` replaced by ``newElements``.

    Useful for some operations with tokens. ``src`` is not modified; the
    result is built from slices of it.

    :param src: the original list.
    :param pos: index of the single element to drop.
    :param newElements: items spliced in at that position (may be empty).
    :return: ``src[:pos] + newElements + src[pos + 1:]``.
    """
    return src[:pos] + newElements + src[pos + 1 :]

def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced from a numeric entity.

    :param c: the candidate code point.
    :return: ``False`` for surrogates, the never-used ranges, most control
        codes and values above ``0x10FFFF``; ``True`` otherwise.
    """
    # broken sequence
    if 0xD800 <= c <= 0xDFFF:
        return False
    # never used
    if 0xFDD0 <= c <= 0xFDEF:
        return False
    # the last two code points of every 0x10000-sized plane (xxFFFE / xxFFFF)
    if (c & 0xFFFF) in (0xFFFF, 0xFFFE):
        return False
    # control codes (0x09, 0x0A, 0x0C and 0x0D are deliberately allowed)
    if 0x00 <= c <= 0x08:
        return False
    if c == 0x0B:
        return False
    if 0x0E <= c <= 0x1F:
        return False
    if 0x7F <= c <= 0x9F:
        return False
    # out of range
    return c <= 0x10FFFF

def fromCodePoint(c: int) -> str:
    """Convert ordinal to unicode.

    Note, in the original Javascript two string characters were required,
    for codepoints larger than `0xFFFF`.
    But Python 3 can represent any unicode codepoint in one character.

    :param c: the code point to convert.
    :return: the one-character string ``chr(c)``.
    """
    return chr(c)

# The two commented-out patterns below are the building blocks of
# UNESCAPE_ALL_RE: a backslash-escaped punctuation character (group 1), or an
# entity reference ``&name;`` / ``&#...;`` (group 2, without ``&`` and ``;``).
# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Numeric entity bodies (the text between ``&`` and ``;``): decimal and hex.
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)

def replaceEntityPattern(match: str, name: str) -> str:
    """Convert HTML entity patterns,
    see https://spec.commonmark.org/0.30/#entity-references

    :param match: the full matched text, returned unchanged when the entity
        cannot be resolved.
    :param name: the entity body, i.e. the text between ``&`` and ``;``.
    :return: the replacement text for a known named entity or a valid numeric
        entity, otherwise ``match``.
    """
    # Named entity: direct table lookup.
    if name in entities:
        return entities[name]

    # Numeric entity: decimal (``#123``) or hexadecimal (``#x7B``).
    code: None | int = None
    if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name):
        code = int(pat.group(1), 10)
    elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name):
        code = int(pat.group(1), 16)

    if code is not None and isValidEntityCode(code):
        return fromCodePoint(code)

    # Unknown name or disallowed code point: leave the source text as is.
    return match

115

116

def unescapeAll(string: str) -> str:
    """Resolve backslash escapes and entity references in ``string``.

    :param string: the text to process.
    :return: ``string`` with every ``UNESCAPE_ALL_RE`` match replaced: an
        escaped punctuation character loses its backslash, and an entity
        reference is passed through ``replaceEntityPattern``.
    """

    def replacer_func(match: Match[str]) -> str:
        # Group 1 is set for a backslash escape, group 2 for an entity.
        escaped = match.group(1)
        if escaped:
            return escaped
        entity = match.group(2)
        return replaceEntityPattern(match.group(), entity)

    # Fast path: neither kind of match is possible without "\" or "&".
    if "\\" not in string and "&" not in string:
        return string
    return UNESCAPE_ALL_RE.sub(replacer_func, string)

128

129

# Characters that may follow a backslash escape, written as the inside of a
# regex character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")


def stripEscape(string: str) -> str:
    """Strip escape \\ characters

    :param string: the text to process.
    :return: ``string`` with each backslash that precedes an escapable
        character removed; other backslashes are left in place.
    """
    return ESCAPE_CHAR.sub(r"\1", string)

137

138

def escapeHtml(raw: str) -> str:
    """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences.

    :param raw: the text to escape.
    :return: ``raw`` with ``&``, ``<``, ``>`` and ``"`` replaced by their
        entity references; single quotes are left untouched.
    """
    # like html.escape, but without escaping single quotes
    raw = raw.replace("&", "&amp;")  # Must be done first!
    raw = raw.replace("<", "&lt;")
    raw = raw.replace(">", "&gt;")
    raw = raw.replace('"', "&quot;")
    return raw

147

148

# //////////////////////////////////////////////////////////////////////////////

150

# Characters that have a special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape regular-expression metacharacters in ``string``.

    :param string: the literal text to escape.
    :return: ``string`` with a backslash inserted before every character
        matched by ``REGEXP_ESCAPE_RE``.
    """
    # ``\\`` emits a literal backslash and ``\g<0>`` re-emits the matched
    # character. (``$&`` is JavaScript replacement syntax; Python's ``re``
    # would insert it verbatim.)
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)

157

158

# //////////////////////////////////////////////////////////////////////////////

160

161

def isSpace(code: int | None) -> bool:
    """Check if character code is a whitespace.

    :param code: a character code, or ``None``.
    :return: ``True`` only for tab (``0x09``) and space (``0x20``).
    """
    return code in (0x09, 0x20)

165

166

def isStrSpace(ch: str | None) -> bool:
    """Check if character is a whitespace.

    :param ch: a character, or ``None``.
    :return: ``True`` only for tab and space.
    """
    return ch in ("\t", " ")

170

171

# Individual whitespace code points; the contiguous block 0x2000-0x200A is
# handled by a range check in isWhiteSpace instead of being listed here.
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,  # space
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]

    :param code: the character code to test.
    :return: ``True`` if ``code`` lies in 0x2000-0x200A or is a member of
        ``MD_WHITESPACE``.
    """
    if 0x2000 <= code <= 0x200A:
        return True
    return code in MD_WHITESPACE

192

193

# //////////////////////////////////////////////////////////////////////////////

195

196

def isPunctChar(ch: str) -> bool:
    """Check if character is a punctuation character.

    A character counts when its Unicode general category starts with ``P``
    (punctuation) or ``S`` (symbol).

    :param ch: a single character; ``unicodedata.category`` rejects anything
        else.
    :return: ``True`` for punctuation and symbol characters.
    """
    return unicodedata.category(ch).startswith(("P", "S"))

200

201

# Code points of the ASCII punctuation characters recognised by Markdown.
MD_ASCII_PUNCT = {
    0x21,  # /* ! */
    0x22,  # /* " */
    0x23,  # /* # */
    0x24,  # /* $ */
    0x25,  # /* % */
    0x26,  # /* & */
    0x27,  # /* ' */
    0x28,  # /* ( */
    0x29,  # /* ) */
    0x2A,  # /* * */
    0x2B,  # /* + */
    0x2C,  # /* , */
    0x2D,  # /* - */
    0x2E,  # /* . */
    0x2F,  # /* / */
    0x3A,  # /* : */
    0x3B,  # /* ; */
    0x3C,  # /* < */
    0x3D,  # /* = */
    0x3E,  # /* > */
    0x3F,  # /* ? */
    0x40,  # /* @ */
    0x5B,  # /* [ */
    0x5C,  # /* \ */
    0x5D,  # /* ] */
    0x5E,  # /* ^ */
    0x5F,  # /* _ */
    0x60,  # /* ` */
    0x7B,  # /* { */
    0x7C,  # /* | */
    0x7D,  # /* } */
    0x7E,  # /* ~ */
}


def isMdAsciiPunct(ch: int) -> bool:
    """Markdown ASCII punctuation characters.

    ::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \\, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.

    :param ch: the character code (an integer, not a string) to test.
    :return: ``True`` if ``ch`` is a member of ``MD_ASCII_PUNCT``.
    """
    return ch in MD_ASCII_PUNCT

251

252

def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    :param string: the raw label text.
    :return: the label with surrounding whitespace removed, internal
        whitespace runs collapsed to a single space, and letter case
        normalised by lower-casing and then upper-casing.
    """
    # Trim and collapse whitespace
    #
    string = re.sub(r"\s+", " ", string.strip())

    # In node v10 'ẞ'.toLowerCase() === 'Ṿ', which is presumed to be a bug
    # fixed in v12 (couldn't find any details).
    #
    # So treat this one as a special case
    # (remove this when node v10 is no longer supported).
    #
    # if ('ẞ'.toLowerCase() === 'Ṿ') {
    #   str = str.replace(/ẞ/g, 'ß')
    # }

    # .toLowerCase().toUpperCase() should get rid of all differences
    # between letter variants.
    #
    # Simple .toLowerCase() doesn't normalize 125 code points correctly,
    # and .toUpperCase doesn't normalize 6 of them (list of exceptions:
    # İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently
    # uppercased versions).
    #
    # Here's an example showing how it happens. Let's take the greek letter
    # theta: uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ),
    # U+03d1 (ϑ)
    #
    # Unicode entries:
    # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # Case-insensitive comparison should treat all of them as equivalent.
    #
    # But .toLowerCase() doesn't change ϑ (it's already lowercase),
    # and .toUpperCase() doesn't change ϴ (already uppercase).
    #
    # Applying first lower then upper case normalizes any character:
    # '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398'
    #
    # Note: this is equivalent to unicode case folding; unicode normalization
    # is a different step that is not required here.
    #
    # Final result should be uppercased, because it's later stored in an object
    # (this avoids a conflict with Object.prototype members,
    # most notably, `__proto__`)
    #
    return string.lower().upper()

302

303

# ``<a`` followed by ``>`` or whitespace, and a closing ``</a>`` tag; both are
# anchored at the start of the string and ignore case.
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Tell whether ``string`` starts with an opening ``<a>`` tag.

    :param string: the markup to inspect.
    :return: ``True`` if ``LINK_OPEN_RE`` matches at the start of ``string``.
    """
    return bool(LINK_OPEN_RE.search(string))


def isLinkClose(string: str) -> bool:
    """Tell whether ``string`` starts with a closing ``</a>`` tag.

    :param string: the markup to inspect.
    :return: ``True`` if ``LINK_CLOSE_RE`` matches at the start of ``string``.
    """
    return bool(LINK_CLOSE_RE.search(string))

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown_it/common/utils.py: 89%

99 statements