Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/markdown_it/common/utils.py: 94%

Shortcuts on this page

r m x toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

99 statements

1"""Utilities for parsing source text"""

3from __future__ import annotations

5import re

6from re import Match

7from typing import TypeVar

9from .entities import entities

def charCodeAt(src: str, pos: int) -> int | None:
    """Return the Unicode code point of the character at ``pos`` in ``src``.

    Python counterpart of JavaScript's ``String.prototype.charCodeAt``; where
    JavaScript would yield NaN, this helper yields ``None``.

    :param src: string to read from
    :param pos: index of the wanted character; it is handed straight to
        ``src[pos]``, so a negative value counts from the end of the string
    :return: the code point, or ``None`` when indexing raises ``IndexError``
    """
    try:
        char = src[pos]
    except IndexError:
        return None
    return ord(char)

def charStrAt(src: str, pos: int) -> str | None:
    """Return the character at ``pos`` in ``src``, or ``None`` if there is none.

    String-valued sibling of ``charCodeAt``: the character itself is returned
    instead of its code point.

    :param src: string to read from
    :param pos: index of the wanted character; it is handed straight to
        ``src[pos]``, so a negative value counts from the end of the string
    :return: the one-character string, or ``None`` when indexing raises
        ``IndexError``
    """
    try:
        char = src[pos]
    except IndexError:
        char = None
    return char

_ItemTV = TypeVar("_ItemTV")


def arrayReplaceAt(
    src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
) -> list[_ItemTV]:
    """Build a new list in which the item at ``pos`` is swapped for a whole list.

    ``src`` itself is left untouched. Handy for replacing a single token by
    several tokens.

    :param src: original list
    :param pos: index of the item to drop
    :param newElements: items spliced in at that position
    :return: ``src[:pos]``, then ``newElements``, then ``src[pos + 1:]``
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after

def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` is acceptable as a decoded numeric entity.

    :param c: candidate code point
    :return: ``False`` for the rejected spans listed below, for the two last
        code points of every 64K plane, and for anything above ``0x10FFFF``;
        ``True`` otherwise
    """
    rejected_spans = (
        (0xD800, 0xDFFF),  # broken sequence (surrogate halves)
        (0xFDD0, 0xFDEF),  # never used
        (0x00, 0x08),  # control codes
        (0x0B, 0x0B),
        (0x0E, 0x1F),
        (0x7F, 0x9F),
    )
    if any(low <= c <= high for low, high in rejected_spans):
        return False
    # Low 16 bits equal to 0xFFFE or 0xFFFF: the masked value cannot exceed
    # 0xFFFF, so one comparison covers both.
    if (c & 0xFFFF) >= 0xFFFE:
        return False
    # out of range
    return c <= 0x10FFFF

def fromCodePoint(c: int) -> str:
    """Turn a code point into the corresponding one-character string.

    The JavaScript original needed two string units for code points above
    `0xFFFF`; a Python 3 string holds any code point as a single character,
    so ``chr`` is all that is required.

    :param c: code point to convert
    :return: the result of ``chr(c)``
    """
    character = chr(c)
    return character

87# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')

88# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)

# Matches either a backslash followed by one ASCII punctuation character
# (captured in group 1) or an entity reference ``&...;`` (the text between
# ``&`` and ``;`` is captured in group 2).
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Numeric entity bodies: ``#`` plus 1-8 decimal digits (group 1) ...
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
# ... or ``#x`` plus 1-8 hexadecimal digits (group 1), case-insensitively.
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)

def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve one HTML entity reference to the text it stands for,
    see https://spec.commonmark.org/0.30/#entity-references

    :param match: the complete matched text, returned unchanged when the
        entity cannot be resolved
    :param name: the entity body (what sits between ``&`` and ``;``)
    :return: the ``entities`` table value for a known name, the decoded
        character for a numeric reference whose code point passes
        ``isValidEntityCode``, otherwise ``match``
    """
    if name in entities:
        return entities[name]

    # Numeric reference: try the decimal form first, then the hexadecimal one.
    decimal = DIGITAL_ENTITY_BASE10_RE.fullmatch(name)
    if decimal is not None:
        code = int(decimal.group(1), 10)
    else:
        hexadecimal = DIGITAL_ENTITY_BASE16_RE.fullmatch(name)
        if hexadecimal is None:
            return match
        code = int(hexadecimal.group(1), 16)

    return fromCodePoint(code) if isValidEntityCode(code) else match

def unescapeAll(string: str) -> str:
    """Undo backslash escapes and decode entity references in ``string``.

    :param string: text to process
    :return: ``string`` itself when it contains neither a backslash nor an
        ampersand; otherwise the text with every ``UNESCAPE_ALL_RE`` match
        replaced
    """
    # Fast path: nothing in the text can possibly match.
    if not ("\\" in string or "&" in string):
        return string

    def _substitute(found: Match[str]) -> str:
        # Group 1 holds the character following a backslash; when it is
        # empty or missing the match is an entity, whose body is group 2.
        return found.group(1) or replaceEntityPattern(found.group(), found.group(2))

    return UNESCAPE_ALL_RE.sub(_substitute, string)

# Body of a regex character class listing every character that a backslash
# may escape.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured in group 1).
ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")


def stripEscape(string: str) -> str:
    """Drop the backslash in front of every escapable character.

    :param string: text possibly containing backslash escapes
    :return: the text with each ``ESCAPE_CHAR`` match replaced by the
        character that followed the backslash
    """
    return ESCAPE_CHAR.sub(lambda found: found.group(1), string)

def escapeHtml(raw: str) -> str:
    """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences.

    Works like ``html.escape`` except that single quotes are left alone.

    :param raw: text to escape
    :return: the text with the four characters replaced by their entities
    """
    # "&" must be handled first: every other replacement introduces an
    # ampersand that would otherwise be escaped a second time.
    raw = raw.replace("&", "&amp;")
    raw = raw.replace("<", "&lt;")
    raw = raw.replace(">", "&gt;")
    raw = raw.replace('"', "&quot;")
    return raw

148# //////////////////////////////////////////////////////////////////////////////

# Every character that has a special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape the regex metacharacters in ``string``.

    :param string: literal text
    :return: the text with a backslash inserted before every character
        matched by ``REGEXP_ESCAPE_RE``
    """
    # ``$&`` is JavaScript's "whole match" placeholder and means nothing to
    # Python's ``re``; the Python spelling is ``\g<0>``. The leading ``\\``
    # in the template produces the literal backslash.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)

158# //////////////////////////////////////////////////////////////////////////////

def isSpace(code: int | None) -> bool:
    """Tell whether a character code is a tab (0x09) or a space (0x20).

    :param code: character code, or ``None``
    :return: ``True`` only for the two codes above
    """
    return code == 0x09 or code == 0x20

def isStrSpace(ch: str | None) -> bool:
    """Tell whether a character is a tab or a space.

    :param ch: character, or ``None``
    :return: ``True`` only for ``"\\t"`` and ``" "``
    """
    return ch == "\t" or ch == " "

# Whitespace code points tested by ``isWhiteSpace`` through set membership;
# the contiguous span 0x2000-0x200A is checked separately in the function.
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,  # space
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]

    :param code: character code to classify
    :return: ``True`` when ``code`` lies in 0x2000-0x200A or is listed in
        ``MD_WHITESPACE``
    """
    if 0x2000 <= code <= 0x200A:
        return True
    return code in MD_WHITESPACE

193# //////////////////////////////////////////////////////////////////////////////

# Character class of punctuation code points, followed by alternatives
# written as UTF-16 surrogate pairs.
# NOTE(review): the surrogate-pair alternatives look like a carry-over from a
# UTF-16 based implementation; a Python str holds an astral character as one
# code point, so they presumably never match well-formed text - confirm.
UNICODE_PUNCT_RE = re.compile(
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]"  # noqa: E501
)


# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Check if character is a punctuation character.

    :param ch: text to test
    :return: ``True`` when ``UNICODE_PUNCT_RE`` matches anywhere in ``ch``
    """
    return UNICODE_PUNCT_RE.search(ch) is not None

# Character codes of the ASCII punctuation characters, used by
# ``isMdAsciiPunct``.
MD_ASCII_PUNCT = {
    0x21,  # /* ! */
    0x22,  # /* " */
    0x23,  # /* # */
    0x24,  # /* $ */
    0x25,  # /* % */
    0x26,  # /* & */
    0x27,  # /* ' */
    0x28,  # /* ( */
    0x29,  # /* ) */
    0x2A,  # /* * */
    0x2B,  # /* + */
    0x2C,  # /* , */
    0x2D,  # /* - */
    0x2E,  # /* . */
    0x2F,  # /* / */
    0x3A,  # /* : */
    0x3B,  # /* ; */
    0x3C,  # /* < */
    0x3D,  # /* = */
    0x3E,  # /* > */
    0x3F,  # /* ? */
    0x40,  # /* @ */
    0x5B,  # /* [ */
    0x5C,  # /* \ */
    0x5D,  # /* ] */
    0x5E,  # /* ^ */
    0x5F,  # /* _ */
    0x60,  # /* ` */
    0x7B,  # /* { */
    0x7C,  # /* | */
    0x7D,  # /* } */
    0x7E,  # /* ~ */
}


def isMdAsciiPunct(ch: int) -> bool:
    """Markdown ASCII punctuation characters.

    ::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \\, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Don't confuse with unicode punctuation !!! It lacks some chars in ascii range.

    :param ch: character code (an integer, not a string)
    :return: ``True`` when the code is listed in ``MD_ASCII_PUNCT``
    """
    return ch in MD_ASCII_PUNCT

def normalizeReference(string: str) -> str:
    """Bring a [reference label] into a canonical form for comparison.

    :param string: raw label text
    :return: the label with surrounding whitespace removed, inner whitespace
        runs collapsed to one space, and letter case unified by lowercasing
        and then uppercasing
    """
    # Trim, then collapse every whitespace run to a single space.
    trimmed = string.strip()
    collapsed = re.sub(r"\s+", " ", trimmed)

    # The JavaScript original carried a workaround here for node v10, where
    # 'ẞ'.toLowerCase() gave a wrong result:
    #
    #   if ('ẞ'.toLowerCase() === 'Ṿ') {
    #     str = str.replace(/ẞ/g, 'ß')
    #   }
    #
    # Lowercasing followed by uppercasing removes every difference between
    # letter variants. Neither step is enough on its own: lowercasing alone
    # fails to normalize 125 code points, and uppercasing alone fails for 6
    # (İ, ϴ, ẞ, Ω, K, Å - already uppercase, yet with a different uppercased
    # form).
    #
    # Example with the Greek letter theta, which has two uppercase forms,
    # U+0398 (Θ) and U+03F4 (ϴ), and two lowercase forms, U+03B8 (θ) and
    # U+03D1 (ϑ). Unicode entries:
    #
    #   0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    #   03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    #   03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    #   03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # A case-insensitive comparison has to treat all four as equal, but
    # lowercasing leaves ϑ alone (already lowercase) and uppercasing leaves ϴ
    # alone (already uppercase). Doing both, lower first, maps all of them to
    # U+0398:
    #
    #   '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398'
    #
    # This amounts to unicode case folding; unicode normalization is a
    # separate step that is not needed here.
    #
    # The result is uppercase because the JavaScript original stores it as an
    # object key, and uppercase keys cannot collide with Object.prototype
    # members such as `__proto__`.
    lowered = collapsed.lower()
    return lowered.upper()

# ``<a`` at the very start, followed by ``>`` or whitespace (any letter case).
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
# ``</a``, optional whitespace, ``>`` at the very start (any letter case).
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Tell whether ``string`` begins with an opening ``<a`` tag.

    :param string: HTML fragment to inspect
    :return: ``True`` when ``LINK_OPEN_RE`` matches at the start
    """
    # The pattern is anchored with ``^`` and compiled without MULTILINE, so
    # matching at position 0 finds exactly what a search would.
    return LINK_OPEN_RE.match(string) is not None


def isLinkClose(string: str) -> bool:
    """Tell whether ``string`` begins with a closing ``</a>`` tag.

    :param string: HTML fragment to inspect
    :return: ``True`` when ``LINK_CLOSE_RE`` matches at the start
    """
    return LINK_CLOSE_RE.match(string) is not None