1"""Utilities for parsing source text"""
2
3from __future__ import annotations
4
5import re
6from re import Match
7from typing import TypeVar
8import unicodedata
9
10from .entities import entities
11
12
13def charCodeAt(src: str, pos: int) -> int | None:
14 """
15 Returns the Unicode value of the character at the specified location.
16
17 @param - index The zero-based index of the desired character.
18 If there is no character at the specified index, NaN is returned.
19
20 This was added for compatibility with python
21 """
22 try:
23 return ord(src[pos])
24 except IndexError:
25 return None
26
27
28def charStrAt(src: str, pos: int) -> str | None:
29 """
30 Returns the Unicode value of the character at the specified location.
31
32 @param - index The zero-based index of the desired character.
33 If there is no character at the specified index, NaN is returned.
34
35 This was added for compatibility with python
36 """
37 try:
38 return src[pos]
39 except IndexError:
40 return None
41
42
# Element type shared by the source list, the inserted items and the result.
_ItemTV = TypeVar("_ItemTV")


def arrayReplaceAt(
    src: list[_ItemTV], pos: int, newElements: list[_ItemTV]
) -> list[_ItemTV]:
    """Return a copy of ``src`` with the item at ``pos`` swapped for ``newElements``.

    Useful for some operations with tokens. ``src`` itself is not modified.

    :param src: list to take the surrounding elements from
    :param pos: index of the single element to drop
    :param newElements: elements spliced in at that position
    :return: a new list
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after
54
55
def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced by a numeric entity.

    :param c: candidate code point
    :return: ``False`` for surrogates, the never-used blocks, the listed
        control codes and values above ``0x10FFFF``; ``True`` otherwise
    """
    # broken sequence (surrogate range)
    if 0xD800 <= c <= 0xDFFF:
        return False
    # never used
    if 0xFDD0 <= c <= 0xFDEF:
        return False
    # low 16 bits equal to 0xFFFE / 0xFFFF
    if (c & 0xFFFF) in (0xFFFF, 0xFFFE):
        return False
    # control codes
    if 0x00 <= c <= 0x08 or c == 0x0B or 0x0E <= c <= 0x1F or 0x7F <= c <= 0x9F:
        return False
    # out of range
    return c <= 0x10FFFF
76
77
def fromCodePoint(c: int) -> str:
    """Return the one-character string for Unicode code point ``c``.

    The original Javascript needed two string characters for code points
    above ``0xFFFF``; Python 3 represents any code point as a single
    character, so this is a plain ``chr`` call.

    :param c: code point to convert
    :return: the corresponding character
    """
    return chr(c)
86
87
88# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
89# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
# Either a backslash followed by one escapable punctuation character
# (group 1), or an entity reference ``&name;`` / ``&#digits;`` (group 2).
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])|&([a-z#][a-z0-9]{1,31});',
    flags=re.IGNORECASE,
)
# Body of a decimal numeric entity: ``#`` then 1-8 digits (captured).
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
# Body of a hexadecimal numeric entity: ``#x`` then 1-8 hex digits (captured).
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", flags=re.IGNORECASE)
96
97
def replaceEntityPattern(match: str, name: str) -> str:
    """Decode one HTML entity reference,
    see https://spec.commonmark.org/0.30/#entity-references

    :param match: the complete matched text, returned unchanged when the
        entity cannot be decoded
    :param name: the text between ``&`` and ``;``
    :return: the decoded character(s), or ``match``
    """
    # Named entity: direct table lookup.
    if name in entities:
        return entities[name]

    # Numeric entity: decimal form is tried first, then hexadecimal.
    numeric_forms = (
        (DIGITAL_ENTITY_BASE10_RE, 10),
        (DIGITAL_ENTITY_BASE16_RE, 16),
    )
    for pattern, base in numeric_forms:
        found = pattern.fullmatch(name)
        if found is None:
            continue
        code = int(found.group(1), base)
        if isValidEntityCode(code):
            return fromCodePoint(code)
        # Recognised but not a permitted code point: leave the text alone.
        break

    return match
115
116
def unescapeAll(string: str) -> str:
    """Resolve backslash escapes and entity references found in ``string``.

    :param string: text to process
    :return: the text with every ``UNESCAPE_ALL_RE`` match substituted;
        the input object itself when it holds neither ``\\`` nor ``&``
    """

    def expand(found: Match[str]) -> str:
        # Group 1 is set for a backslash escape: keep only the escaped char.
        backslashed = found.group(1)
        if backslashed:
            return backslashed
        # Otherwise group 2 holds the entity name.
        return replaceEntityPattern(found.group(), found.group(2))

    # Only run the regex when one of the trigger characters is present.
    if "\\" in string or "&" in string:
        return UNESCAPE_ALL_RE.sub(expand, string)
    return string
128
129
# Characters that may follow a backslash escape, written as the inside of a
# regex character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured in group 1).
ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])")


def stripEscape(string: str) -> str:
    """Drop the backslash from every backslash-escaped character.

    :param string: text to process
    :return: the text with each ``ESCAPE_CHAR`` match reduced to the
        escaped character itself
    """
    return ESCAPE_CHAR.sub(lambda found: found.group(1), string)
137
138
def escapeHtml(raw: str) -> str:
    """Replace special characters "&", "<", ">" and '"' with HTML-safe sequences.

    Behaves like ``html.escape`` except that single quotes are left alone.

    :param raw: text to escape
    :return: the escaped text
    """
    # "&" must be handled first, otherwise the ampersands introduced by the
    # later replacements would themselves be escaped again.
    raw = raw.replace("&", "&amp;")
    raw = raw.replace("<", "&lt;")
    raw = raw.replace(">", "&gt;")
    raw = raw.replace('"', "&quot;")
    return raw
147
148
149# //////////////////////////////////////////////////////////////////////////////
150
# Characters that carry special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape every regex metacharacter in ``string``.

    :param string: literal text destined for use inside a regular expression
    :return: the text with each character matched by ``REGEXP_ESCAPE_RE``
        preceded by a backslash
    """
    # ``\\`` emits a literal backslash and ``\g<0>`` re-inserts the matched
    # character.  (JavaScript's ``$&`` placeholder has no meaning to
    # ``re.sub`` and would be inserted verbatim.)
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)
157
158
159# //////////////////////////////////////////////////////////////////////////////
160
161
162def isSpace(code: int | None) -> bool:
163 """Check if character code is a whitespace."""
164 return code in (0x09, 0x20)
165
166
167def isStrSpace(ch: str | None) -> bool:
168 """Check if character is a whitespace."""
169 return ch in ("\t", " ")
170
171
# Whitespace code points checked individually; the contiguous run
# 0x2000-0x200A is handled by a range test in ``isWhiteSpace``.
MD_WHITESPACE = {
    0x09,  # \t
    0x0A,  # \n
    0x0B,  # \v
    0x0C,  # \f
    0x0D,  # \r
    0x20,  # space
    0xA0,  # no-break space
    0x1680,  # ogham space mark
    0x202F,  # narrow no-break space
    0x205F,  # medium mathematical space
    0x3000,  # ideographic space
}


def isWhiteSpace(code: int) -> bool:
    r"""Tell whether ``code`` is whitespace: Zs (unicode class) || [\t\f\v\r\n]

    :param code: character code to classify
    :return: ``True`` for 0x2000-0x200A and for members of ``MD_WHITESPACE``
    """
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE
192
193
194# //////////////////////////////////////////////////////////////////////////////
195
196
def isPunctChar(ch: str) -> bool:
    """Tell whether ``ch`` is a punctuation character.

    :param ch: a single character
    :return: ``True`` when the Unicode general category of ``ch`` begins
        with ``P`` (punctuation) or ``S`` (symbol)
    """
    major_class = unicodedata.category(ch)[:1]
    return major_class in ("P", "S")
200
201
# Code points of the Markdown ASCII punctuation characters, assembled from
# the four contiguous runs they occupy in the ASCII table.
MD_ASCII_PUNCT = {
    *range(0x21, 0x30),  # ! " # $ % & ' ( ) * + , - . /
    *range(0x3A, 0x41),  # : ; < = > ? @
    *range(0x5B, 0x61),  # [ \ ] ^ _ `
    *range(0x7B, 0x7F),  # { | } ~
}


def isMdAsciiPunct(ch: int) -> bool:
    r"""Tell whether code ``ch`` is a Markdown ASCII punctuation character.

    ::

        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \, ], ^, _, `, {, |, }, or ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Not to be confused with unicode punctuation, which lacks some
    characters of the ascii range.

    :param ch: character code to classify
    :return: ``True`` when ``ch`` is a member of ``MD_ASCII_PUNCT``
    """
    return ch in MD_ASCII_PUNCT
251
252
def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    Surrounding whitespace is trimmed, internal whitespace runs collapse to
    one space, and the text is lower-cased and then upper-cased.

    :param string: raw reference label
    :return: the normalised label
    """
    # Trim and collapse whitespace.
    collapsed = re.sub(r"\s+", " ", string.strip())

    # Lower-casing followed by upper-casing removes the differences between
    # letter variants that a single conversion leaves behind.
    #
    # Take the Greek letter theta: uppercase U+0398 and U+03F4, lowercase
    # U+03B8 and U+03D1.  A case-insensitive comparison should treat all
    # four as equivalent, but lower-casing alone leaves U+03D1 untouched
    # (it is already lowercase) and upper-casing alone leaves U+03F4
    # untouched (it is already uppercase).  Applying both in sequence maps
    # all four to U+0398.
    #
    # This has the effect of a case-insensitive normalisation, comparable
    # to Unicode case folding; Unicode normalisation is a separate step and
    # is not needed here.
    #
    # The upper-cased form is the final one because the JavaScript original
    # stores the result as an object key and must avoid clashing with
    # ``Object.prototype`` members such as ``__proto__``.
    lowered = collapsed.lower()
    return lowered.upper()
302
303
# ``<a`` at the very start, followed by ``>`` or whitespace (any letter case).
LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE)
# ``</a`` at the very start, optional whitespace, then ``>`` (any letter case).
LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE)


def isLinkOpen(string: str) -> bool:
    """Tell whether ``string`` begins with an opening ``<a>`` tag.

    :param string: text to test
    :return: ``True`` when ``LINK_OPEN_RE`` matches
    """
    return LINK_OPEN_RE.search(string) is not None


def isLinkClose(string: str) -> bool:
    """Tell whether ``string`` begins with a closing ``</a>`` tag.

    :param string: text to test
    :return: ``True`` when ``LINK_CLOSE_RE`` matches
    """
    return LINK_CLOSE_RE.search(string) is not None