Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdown_it/common/utils.py: 89%
99 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:15 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:15 +0000
1"""Utilities for parsing source text
2"""
3from __future__ import annotations
5import re
6from typing import Match, TypeVar
8from .entities import entities
def charCodeAt(src: str, pos: int) -> int | None:
    """Return the Unicode code point of the character at ``pos`` in ``src``.

    Python counterpart of JavaScript's ``String.prototype.charCodeAt``.
    Where JavaScript yields NaN for a missing character, this returns
    ``None``.  ``pos`` is used as a regular Python index, so a negative
    value that is still within range counts from the end of ``src``.

    :param src: the text to read from
    :param pos: index of the wanted character
    :return: the character's ordinal, or ``None`` if ``pos`` is out of range
    """
    size = len(src)
    if -size <= pos < size:
        return ord(src[pos])
    return None
def charStrAt(src: str, pos: int) -> str | None:
    """Return the character at ``pos`` in ``src``, or ``None`` if there is none.

    String-valued sibling of ``charCodeAt``: the character itself is
    returned instead of its code point.  ``pos`` is used as a regular
    Python index, so a negative value that is still within range counts
    from the end of ``src``.

    :param src: the text to read from
    :param pos: index of the wanted character
    :return: a one-character string, or ``None`` if ``pos`` is out of range
    """
    size = len(src)
    if -size <= pos < size:
        return src[pos]
    return None
_ItemTV = TypeVar("_ItemTV")


def arrayReplaceAt(
    src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
) -> list[_ItemTV]:
    """Return a copy of ``src`` with the item at ``pos`` swapped for ``newElements``.

    The single element at index ``pos`` is dropped and every item of
    ``newElements`` is spliced in at that position.  ``src`` itself is left
    untouched; a new list is built.  Handy when one token has to be
    replaced by several.
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after
def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` is acceptable as a decoded numeric entity.

    A value is rejected when it is a UTF-16 surrogate, a "never used"
    code point (U+FDD0..U+FDEF, or anything whose low 16 bits are
    FFFE/FFFF), a control code other than tab, line feed, form feed or
    carriage return, or larger than U+10FFFF.
    """
    low_word = c & 0xFFFF
    rejected = (
        0xD800 <= c <= 0xDFFF  # broken sequence (surrogate range)
        or 0xFDD0 <= c <= 0xFDEF  # never used
        or low_word == 0xFFFF
        or low_word == 0xFFFE
        or 0x00 <= c <= 0x08  # control codes
        or c == 0x0B
        or 0x0E <= c <= 0x1F
        or 0x7F <= c <= 0x9F
        or c > 0x10FFFF  # out of range
    )
    return not rejected
def fromCodePoint(c: int) -> str:
    """Return the one-character string for Unicode code point ``c``.

    The JavaScript original needed two string characters for code points
    above ``0xFFFF``; a Python 3 string holds any code point as a single
    character, so ``chr`` is all that is required.
    """
    character = chr(c)
    return character
88# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
89# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
# Matches either alternative below, case-insensitively.
UNESCAPE_ALL_RE = re.compile(
    "|".join(
        (
            # group 1: the punctuation character following a backslash
            r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])',
            # group 2: the body of an `&...;` entity reference
            r"&([a-z#][a-z0-9]{1,31});",
        )
    ),
    flags=re.IGNORECASE,
)
# Entity bodies of the form `#<decimal digits>` / `#x<hex digits>`;
# group 1 holds the digits.
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", flags=re.IGNORECASE)
def replaceEntityPattern(match: str, name: str) -> str:
    """Decode one HTML entity reference.

    See https://spec.commonmark.org/0.30/#entity-references

    :param match: the complete matched text; returned unchanged when the
        entity cannot be decoded
    :param name: the entity body that is looked up in ``entities`` or
        parsed as a ``#...`` / ``#x...`` numeric reference
    :return: the decoded text, or ``match`` as a fallback
    """
    # Named entity: straight table lookup.
    if name in entities:
        return entities[name]

    # Numeric entity: decimal is tried first, then hexadecimal.
    numeric_forms = (
        (DIGITAL_ENTITY_BASE10_RE, 10),
        (DIGITAL_ENTITY_BASE16_RE, 16),
    )
    for pattern, radix in numeric_forms:
        digits = pattern.fullmatch(name)
        if digits is None:
            continue
        codepoint = int(digits.group(1), radix)
        if isValidEntityCode(codepoint):
            return fromCodePoint(codepoint)
        # A recognised but invalid number is left as written.
        break

    return match
def unescapeAll(string: str) -> str:
    """Resolve backslash escapes and entity references found in ``string``.

    Each match of ``UNESCAPE_ALL_RE`` is replaced: a backslash-escaped
    character becomes the bare character, and an ``&...;`` reference is
    handed to ``replaceEntityPattern``.
    """
    # Fast path: without a backslash or an ampersand nothing can match.
    if not ("\\" in string or "&" in string):
        return string

    def _substitute(found: Match[str]) -> str:
        backslashed = found.group(1)
        if backslashed:
            return backslashed
        return replaceEntityPattern(found.group(), found.group(2))

    return UNESCAPE_ALL_RE.sub(_substitute, string)
# Body of a regex character class listing every character that a backslash
# may escape.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured in group 1).
ESCAPE_CHAR = re.compile(rf"\\([{ESCAPABLE}])")


def stripEscape(string: str) -> str:
    """Drop the backslash in front of every escapable character.

    A backslash that precedes any other character is kept as it is.
    """
    return ESCAPE_CHAR.sub(lambda found: found.group(1), string)
# Translation table for escapeHtml: each special character maps to its
# HTML entity.  The single quote is deliberately absent.
_HTML_ESCAPE_TABLE = str.maketrans(
    {
        "&": "&amp;",
        "<": "&lt;",
        ">": "&gt;",
        '"': "&quot;",
    }
)


def escapeHtml(raw: str) -> str:
    """Replace special characters "&", "<", ">" and '"' with HTML-safe sequences.

    Like ``html.escape``, but without escaping single quotes.

    :param raw: text to be embedded in HTML
    :return: ``raw`` with the four special characters turned into entities
    """
    # One translate() pass looks at each input character exactly once, so
    # the "&" introduced by a replacement is never escaped a second time.
    return raw.translate(_HTML_ESCAPE_TABLE)
149# //////////////////////////////////////////////////////////////////////////////
# Characters that carry a special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape every regex metacharacter in ``string``.

    The result can be embedded in a regular expression to match ``string``
    literally.

    :param string: text to be matched literally
    :return: ``string`` with a backslash in front of each character matched
        by ``REGEXP_ESCAPE_RE``
    """
    # `$&` is JavaScript's "whole match" token; Python's re module spells
    # it `\g<0>`.  The leading `\\` in the template emits the backslash.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)
159# //////////////////////////////////////////////////////////////////////////////
def isSpace(code: int | None) -> bool:
    """Return True when ``code`` is the code point of a tab or a space.

    ``None`` (no character) is never a space.
    """
    return code == 0x09 or code == 0x20
def isStrSpace(ch: str | None) -> bool:
    """Return True when ``ch`` is a tab or a space character.

    ``None`` (no character) is never a space.
    """
    return ch == "\t" or ch == " "
# Individual whitespace code points; the contiguous run U+2000..U+200A is
# handled separately in isWhiteSpace.
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,  # space
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]

    True when ``code`` lies in U+2000..U+200A or is listed in
    ``MD_WHITESPACE``.
    """
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE
194# //////////////////////////////////////////////////////////////////////////////
# One large character class of punctuation code points up to U+FFFF,
# followed by `\uD8xx[...]` alternatives written as surrogate pairs
# (presumably inherited from the JavaScript original - TODO confirm).
UNICODE_PUNCT_RE = re.compile(
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]"  # noqa: E501
)


# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Return True if ``UNICODE_PUNCT_RE`` finds a punctuation character in ``ch``."""
    return bool(UNICODE_PUNCT_RE.search(ch))
# Code points of the 32 ASCII punctuation characters, derived from the
# characters themselves so the list can be read at a glance.
MD_ASCII_PUNCT = {
    ord(symbol) for symbol in r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
}


def isMdAsciiPunct(ch: int) -> bool:
    r"""Check a code point against the Markdown ASCII punctuation characters.

    ::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Not to be confused with unicode punctuation, which lacks some
    characters of the ASCII range.

    """  # noqa: E501
    return ch in MD_ASCII_PUNCT
def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    Surrounding whitespace is removed, every inner run of whitespace
    becomes a single space, and letter case is normalized by lowercasing
    and then uppercasing the result.
    """
    # Trim and collapse whitespace: split() without arguments discards
    # leading/trailing whitespace and treats each whitespace run as one
    # separator.
    collapsed = " ".join(string.split())

    # Lowercasing followed by uppercasing removes the differences between
    # letter variants, which neither step achieves alone.  Take the Greek
    # theta family: uppercase U+0398 and U+03F4, lowercase U+03B8 and U+03D1.
    # lower() leaves U+03D1 untouched (it already is lowercase) and upper()
    # leaves U+03F4 untouched (it already is uppercase), but chaining them
    # maps all four to U+0398.
    #
    # Note: this is equivalent to unicode case folding; unicode
    # normalization is a different step that is not required here.
    #
    # The JavaScript original keeps the uppercased form because the result
    # is later used as an object key, and uppercase avoids clashes with
    # Object.prototype members such as `__proto__`.  It also carried a
    # node v10 workaround for the capital sharp s, which is not needed here.
    lowered = collapsed.lower()
    return lowered.upper()
# `<a` directly followed by `>` or whitespace, at the very start of the text.
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
# `</a`, optional whitespace, then `>`, at the very start of the text.
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Return True when ``string`` begins with an opening ``<a ...>`` tag."""
    return LINK_OPEN_RE.search(string) is not None


def isLinkClose(string: str) -> bool:
    """Return True when ``string`` begins with a closing ``</a>`` tag."""
    return LINK_CLOSE_RE.search(string) is not None