1"""Utilities for parsing source text"""
2
3from __future__ import annotations
4
5import re
6from re import Match
7from typing import TypeVar
8
9from .entities import entities
10
11
def charCodeAt(src: str, pos: int) -> int | None:
    """Return the Unicode code point of the character at ``pos`` in ``src``.

    Python counterpart of JavaScript's ``String.prototype.charCodeAt``.

    :param src: the string to read from.
    :param pos: index of the desired character; negative values follow
        normal Python indexing and count from the end of the string.
    :return: the integer code point, or ``None`` (where JavaScript would
        give ``NaN``) when ``pos`` is outside the string.
    """
    try:
        char = src[pos]
    except IndexError:
        # Out-of-range access is reported as "no character", not as an error.
        return None
    return ord(char)
25
26
def charStrAt(src: str, pos: int) -> str | None:
    """Return the character at ``pos`` in ``src``.

    String-valued sibling of ``charCodeAt``: it yields the character itself
    rather than its code point.

    :param src: the string to read from.
    :param pos: index of the desired character; negative values follow
        normal Python indexing and count from the end of the string.
    :return: a one-character string, or ``None`` when ``pos`` is outside
        the string.
    """
    try:
        char: str | None = src[pos]
    except IndexError:
        # Out-of-range access is reported as "no character", not as an error.
        char = None
    return char
40
41
# Element type shared by the input list, the inserted items and the result.
_ItemTV = TypeVar("_ItemTV")


def arrayReplaceAt(
    src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
) -> list[_ItemTV]:
    """Return a copy of ``src`` with the item at ``pos`` swapped for ``newElements``.

    The single element at index ``pos`` is dropped and the whole of
    ``newElements`` is spliced in at that position. ``src`` itself is not
    modified; a new list is returned. Useful for some operations with tokens.
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return [*before, *newElements, *after]
53
54
def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced by a numeric entity.

    Rejected values are the surrogate range, the ``U+FDD0..U+FDEF`` block,
    every code point whose low 16 bits are ``FFFE`` or ``FFFF``, the control
    codes (except tab, line feed, form feed and carriage return) and anything
    above ``U+10FFFF``.
    """
    # Broken sequence (surrogates) and the never-used U+FDD0..U+FDEF block.
    if 0xD800 <= c <= 0xDFFF or 0xFDD0 <= c <= 0xFDEF:
        return False
    # Last two code points of every 64K plane.
    if (c & 0xFFFF) in (0xFFFE, 0xFFFF):
        return False
    # Control codes; 0x09, 0x0A, 0x0C and 0x0D are deliberately allowed.
    if 0x00 <= c <= 0x08 or c == 0x0B or 0x0E <= c <= 0x1F or 0x7F <= c <= 0x9F:
        return False
    # Anything left is valid as long as it is inside the Unicode range.
    return c <= 0x10FFFF
75
76
def fromCodePoint(c: int) -> str:
    """Return the one-character string for the Unicode ordinal ``c``.

    The JavaScript original had to build two string characters for code
    points above ``0xFFFF``; a Python 3 string can hold any code point as a
    single character, so this is a direct ``chr`` call.
    """
    character = chr(c)
    return character
85
86
87# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
88# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
# Two alternatives:
#   group 1 - a punctuation character that was preceded by a backslash;
#   group 2 - the body of an entity reference, without the "&" and ";".
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])'
    r"|"
    r"&([a-z#][a-z0-9]{1,31});",
    flags=re.IGNORECASE,
)
# Bodies of numeric entity references: "#" + decimal digits, "#x" + hex digits.
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", flags=re.IGNORECASE)
95
96
def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve one HTML entity reference to the text it stands for.

    See https://spec.commonmark.org/0.30/#entity-references

    :param match: the full matched text; returned untouched when the
        reference cannot be resolved.
    :param name: the entity body, i.e. ``match`` without "&" and ";".
    :return: the replacement text, or ``match`` itself.
    """
    # Named entity: straight table lookup.
    if name in entities:
        return entities[name]

    # Numeric entity: decimal form first, then hexadecimal.
    decimal = DIGITAL_ENTITY_BASE10_RE.fullmatch(name)
    if decimal is not None:
        code = int(decimal.group(1), 10)
    else:
        hexadecimal = DIGITAL_ENTITY_BASE16_RE.fullmatch(name)
        if hexadecimal is None:
            # Neither a known name nor a number: leave the source text as is.
            return match
        code = int(hexadecimal.group(1), 16)

    # Numbers that are not acceptable code points are also left as written.
    return fromCodePoint(code) if isValidEntityCode(code) else match
114
115
def unescapeAll(string: str) -> str:
    """Undo backslash escapes and expand entity references in ``string``.

    A backslash followed by an escapable punctuation character is replaced
    by that character alone; entity references are passed to
    ``replaceEntityPattern``. Everything else is kept unchanged.
    """
    # Fast path: without a backslash or an ampersand nothing can match.
    if not ("\\" in string or "&" in string):
        return string

    def _substitute(found: Match[str]) -> str:
        backslashed = found.group(1)
        if backslashed:
            return backslashed
        return replaceEntityPattern(found.group(), found.group(2))

    return UNESCAPE_ALL_RE.sub(_substitute, string)
127
128
# Characters that a backslash may escape, written as the inside of a regex
# character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured as group 1).
ESCAPE_CHAR = re.compile(rf"\\([{ESCAPABLE}])")


def stripEscape(string: str) -> str:
    """Strip escape \\ characters.

    Each backslash that precedes an escapable character is removed and the
    character is kept; other backslashes are left in place.
    """
    return ESCAPE_CHAR.sub(lambda found: found.group(1), string)
136
137
def escapeHtml(raw: str) -> str:
    """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences.

    Behaves like ``html.escape`` except that single quotes are left alone.

    :param raw: text to be embedded in HTML.
    :return: ``raw`` with ``&``, ``<``, ``>`` and ``"`` replaced by
        ``&amp;``, ``&lt;``, ``&gt;`` and ``&quot;`` respectively.
    """
    # The ampersand must be handled first, otherwise the "&" introduced by
    # the other replacements would be escaped a second time.
    raw = raw.replace("&", "&amp;")
    raw = raw.replace("<", "&lt;")
    raw = raw.replace(">", "&gt;")
    raw = raw.replace('"', "&quot;")
    return raw
146
147
148# //////////////////////////////////////////////////////////////////////////////
149
# Characters that have a special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape every regex metacharacter in ``string``.

    :param string: text to be used literally inside a regular expression.
    :return: ``string`` with each character matched by ``REGEXP_ESCAPE_RE``
        preceded by a backslash.
    """
    # "$&" is JavaScript's "whole match" placeholder; Python's ``re.sub``
    # would insert it literally. The Python spelling is ``\g<0>``, and the
    # leading ``\\`` in the template yields the single backslash.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)
156
157
158# //////////////////////////////////////////////////////////////////////////////
159
160
def isSpace(code: int | None) -> bool:
    """Check if character code is a whitespace.

    Only tab (``0x09``) and space (``0x20``) qualify; ``None`` does not.
    """
    return code == 0x09 or code == 0x20
164
165
def isStrSpace(ch: str | None) -> bool:
    """Check if character is a whitespace.

    Only the tab and space characters qualify; ``None`` does not.
    """
    return ch == "\t" or ch == " "
169
170
MD_WHITESPACE = {
    # ASCII: \t, \n, \v, \f, \r and space
    0x09,
    0x0A,
    0x0B,
    0x0C,
    0x0D,
    0x20,
    # Individual non-ASCII code points; the contiguous run 0x2000..0x200A is
    # tested separately in isWhiteSpace.
    0xA0,
    0x1680,
    0x202F,
    0x205F,
    0x3000,
}


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]

    True when ``code`` lies in ``0x2000..0x200A`` or is listed in
    ``MD_WHITESPACE``.
    """
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE
191
192
193# //////////////////////////////////////////////////////////////////////////////
194
# The pattern is assembled from adjacent raw literals purely for readability;
# the resulting pattern text is one single expression.
UNICODE_PUNCT_RE = re.compile(
    # Characters of the Basic Multilingual Plane, as one character class.
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF"
    r"\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4"
    r"\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4"
    r"\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD"
    r"\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85"
    r"\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E"
    r"\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A"
    r"\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF"
    r"\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3"
    r"\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E"
    r"\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998"
    r"\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E"
    r"\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB"
    r"\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF"
    r"\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F"
    r"\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61"
    r"\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B"
    r"\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]"
    # Alternatives written as two-unit surrogate sequences (lead + trail).
    r"|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]"
    r"|\uD801\uDD6F"
    r"|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]"
    r"|\uD803[\uDF55-\uDF59]"
    r"|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]"
    r"|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]"
    r"|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]"
    r"|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]"
    r"|\uD809[\uDC70-\uDC74]"
    r"|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]"
    r"|\uD81B[\uDE97-\uDE9A]"
    r"|\uD82F\uDC9F"
    r"|\uD836[\uDE87-\uDE8B]"
    r"|\uD83A[\uDD5E\uDD5F]"
)


# Currently without astral characters support: a Python string stores a
# character above U+FFFF as one code point, which the two-unit surrogate
# alternatives of the pattern cannot match.
def isPunctChar(ch: str) -> bool:
    """Check if character is a punctuation character.

    True when ``UNICODE_PUNCT_RE`` finds a match anywhere in ``ch``.
    """
    found = UNICODE_PUNCT_RE.search(ch)
    return found is not None
204
205
# Code points of the 32 Markdown ASCII punctuation characters.
MD_ASCII_PUNCT = {
    ord(symbol) for symbol in "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
}


def isMdAsciiPunct(ch: int) -> bool:
    """Markdown ASCII punctuation characters.

    ::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \\, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    This is not the same set as unicode punctuation: the unicode
    punctuation check leaves out some characters of the ASCII range.

    :param ch: a character code (not a string).
    """
    return ch in MD_ASCII_PUNCT
255
256
def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    The label is trimmed, every run of whitespace is collapsed to a single
    space, and the result is lower-cased and then upper-cased.

    Both case conversions are needed to make letter variants compare equal.
    Take the Greek letter theta: uppercase U+0398 (Θ) and U+03F4 (ϴ),
    lowercase U+03B8 (θ) and U+03D1 (ϑ). Lower-casing alone leaves ϑ as it
    is and upper-casing alone leaves ϴ as it is, whereas lower-casing first
    and upper-casing second maps all four to U+0398.

    The final upper-casing is kept from the JavaScript original, where the
    label was stored as an object key and upper case avoided clashes with
    ``Object.prototype`` members such as ``__proto__``.
    """
    # Trim and collapse whitespace.
    collapsed = re.sub(r"\s+", " ", string.strip())

    # Lower first, then upper: see the docstring for why both are required.
    lowered = collapsed.lower()
    return lowered.upper()
306
307
# "<a" followed by ">" or whitespace, at the very start of the text.
LINK_OPEN_RE = re.compile(r"^<a[>\s]", re.IGNORECASE)
# "</a", optional whitespace, then ">", at the very start of the text.
LINK_CLOSE_RE = re.compile(r"^</a\s*>", re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Tell whether ``string`` begins with an opening ``<a>`` tag (any case)."""
    return LINK_OPEN_RE.match(string) is not None


def isLinkClose(string: str) -> bool:
    """Tell whether ``string`` begins with a closing ``</a>`` tag (any case)."""
    return LINK_CLOSE_RE.match(string) is not None