Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/markdown_it/common/utils.py: 92%
83 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:07 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:07 +0000
1"""Utilities for parsing source text
2"""
3import html
4import re
5from typing import Any
7from .entities import entities
def charCodeAt(src: str, pos: int) -> Any:
    """Return the Unicode code point of the character at ``pos`` in ``src``.

    Python counterpart of JavaScript's ``String.prototype.charCodeAt``.
    Where JavaScript yields ``NaN`` for an index with no character,
    this function returns ``None``.

    :param src: the string to read from
    :param pos: zero-based index of the desired character
    :return: the code point as an ``int``, or ``None`` when ``pos`` is not
        in ``0 <= pos < len(src)``
    """
    # An explicit bounds check (rather than catching IndexError) keeps negative
    # positions from wrapping around to the end of the string.
    if 0 <= pos < len(src):
        return ord(src[pos])
    return None
25# Merge objects
26#
def assign(obj):
    """Merge objects (``obj`` followed by any number of sources).

    Placeholder for the JavaScript helper of the same name: it has not been
    ported, so every call raises ``NotImplementedError``.
    """
    # JavaScript reference, kept as a porting guide:
    #
    #   sources = Array.prototype.slice.call(arguments, 1)
    #   sources.forEach(function (source) {
    #     if (!source) { return; }
    #     if (typeof source !== 'object') {
    #       throw new TypeError(source + 'must be object')
    #     }
    #     Object.keys(source).forEach(function (key) {
    #       obj[key] = source[key]
    #     })
    #   })
    #   return obj
    raise NotImplementedError
def arrayReplaceAt(src: list, pos: int, newElements: list) -> list:
    """Build a new list in which the item at ``pos`` is replaced by ``newElements``.

    The single element ``src[pos]`` is dropped and all items of
    ``newElements`` take its place. ``src`` itself is left unmodified.
    Handy when one token has to be expanded into several.
    """
    before = src[:pos]
    after = src[pos + 1 :]
    return before + newElements + after
55######################################################################
def isValidEntityCode(c: int) -> bool:
    """Tell whether code point ``c`` may be produced by a numeric entity.

    Returns ``False`` for any of the rejected ranges listed below and
    ``True`` for everything else.
    """
    low_word = c & 0xFFFF
    rejected = (
        0xD800 <= c <= 0xDFFF  # broken sequence
        or 0xFDD0 <= c <= 0xFDEF  # never used
        or low_word in (0xFFFF, 0xFFFE)
        or 0x00 <= c <= 0x08  # control codes
        or c == 0x0B
        or 0x0E <= c <= 0x1F
        or 0x7F <= c <= 0x9F
        or c > 0x10FFFF  # out of range
    )
    return not rejected
def fromCodePoint(c: int) -> str:
    """Return the one-character string for the code point ``c``.

    The JavaScript original needed two string characters for code points
    above ``0xFFFF``; a Python 3 string holds any code point as a single
    character, so ``chr`` is all that is required.
    """
    character = chr(c)
    return character
# A backslash followed by one ASCII punctuation character; group 1 captures
# the punctuation character.
UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)
# Either a backslash escape (group 1, same pattern as UNESCAPE_MD_RE) or an
# entity reference "&...;" (group 2 captures the text between "&" and ";").
# Matching is case-insensitive.
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Numeric entity body: "#" followed by 1-8 decimal digits, or by "x" and 1-8
# hexadecimal digits. The pattern is anchored at the start only, so it also
# matches when further characters follow the digits.
DIGITAL_ENTITY_TEST_RE = re.compile(r"^#((?:x[a-f0-9]{1,8}|[0-9]{1,8}))", re.IGNORECASE)
def replaceEntityPattern(match: str, name: str) -> str:
    """Resolve one HTML entity reference to the text it stands for.

    ::

        &#35;  -> #
        &#x23; -> #

    :param match: the complete reference as found in the text (``&...;``);
        returned unchanged when the reference cannot be resolved
    :param name: the part of the reference between ``&`` and ``;``
    :return: the value from ``entities`` for a known name, the character
        for a valid numeric reference, otherwise ``match``
    """
    if name in entities:
        return entities[name]

    # Require the *whole* name to be numeric. A prefix-only test accepts
    # names such as "#12ab" or "#xfg"; JavaScript's parseInt tolerates the
    # trailing junk, but Python's int() raises ValueError on it. Such
    # references are not valid numeric entities, so they are left as-is.
    numeric = DIGITAL_ENTITY_TEST_RE.fullmatch(name)
    if numeric is not None:
        digits = numeric.group(1)
        if digits[0] in "xX":
            code = int(digits[1:], 16)
        else:
            code = int(digits, 10)
        if isValidEntityCode(code):
            return fromCodePoint(code)

    return match
122# def replaceEntities(string):
123# if (string.indexOf('&') < 0):
124# return string
125# return string.replace(ENTITY_RE, replaceEntityPattern)
def unescapeMd(string: str) -> str:
    """Drop the backslash from backslash-escaped ASCII punctuation.

    Each occurrence matched by ``UNESCAPE_MD_RE`` (a backslash plus one
    punctuation character) is replaced by the punctuation character alone.
    All other text, including backslashes before non-punctuation, is kept.

    :param string: text that may contain backslash escapes
    :return: the text with those escapes resolved
    """
    # Fast path: nothing to unescape without a backslash.
    if "\\" not in string:
        return string
    # r"\1" is the Python spelling of JavaScript's "$1" replacement.
    return UNESCAPE_MD_RE.sub(r"\1", string)
def unescapeAll(string: str) -> str:
    """Resolve both backslash escapes and entity references in ``string``.

    Every match of ``UNESCAPE_ALL_RE`` is replaced: a backslash escape by
    the escaped character, an entity reference by whatever
    ``replaceEntityPattern`` returns for it.
    """

    def _resolve(found):
        # Group 1 holds the escaped character for a backslash escape;
        # otherwise group 2 holds the entity body.
        return found.group(1) or replaceEntityPattern(found.group(), found.group(2))

    if "\\" in string or "&" in string:
        return UNESCAPE_ALL_RE.sub(_resolve, string)
    # Neither trigger character present: nothing to do.
    return string
# Characters that may follow a backslash escape, written as the inside of a
# regex character class.
ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-"""
# A backslash followed by one escapable character (captured in group 1).
ESCAPE_CHAR = re.compile(rf"\\([{ESCAPABLE}])")


def stripEscape(string: str) -> str:
    """Remove the escaping backslash in front of every escapable character."""
    return ESCAPE_CHAR.sub(lambda found: found.group(1), string)
157# //////////////////////////////////////////////////////////////////////////////
159# TODO This section changed quite a lot, should re-check
161# UNESCAPE_HTML_RE = re.compile(r"\\&(?=(amp\;|lt\;|gt\;|quot\;))")
162# ESCAPE_AND_HTML = re.compile(r"&(?!(amp\;|lt\;|gt\;|quot\;))")
163# HTML_ESCAPE_REPLACE_RE = re.compile(r'[&<>"]')
166# def escapeHtml(string: str):
168# if HTML_ESCAPE_REPLACE_RE.search(string):
170# string = UNESCAPE_HTML_RE.sub("&", string)
171# string = ESCAPE_AND_HTML.sub("&amp;", string)
172# for k, v in {"<": "&lt;", ">": "&gt;", '"': "&quot;"}.items():
173# string = string.replace(k, v)
175# return string
def escapeHtml(raw: str) -> str:
    """Escape ``&``, ``<``, ``>`` and ``"`` for HTML output; keep ``'`` as is.

    The input is escaped exactly as given: entities already present in
    ``raw`` are not unescaped first, so their ``&`` is escaped again.

    :param raw: text to escape
    :return: the escaped text
    """
    # html.escape() quotes by default and therefore also turns "'" into
    # "&#x27;". Undo just that substitution. A literal "&#x27;" in the input
    # is safe because its "&" has already become "&amp;" at this point.
    return html.escape(raw).replace("&#x27;", "'")
183# //////////////////////////////////////////////////////////////////////////////
# Characters that carry special meaning inside a regular expression.
REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]")


def escapeRE(string: str) -> str:
    """Backslash-escape every regex metacharacter in ``string``.

    :param string: text to be embedded literally in a regular expression
    :return: ``string`` with a backslash inserted before each character
        matched by ``REGEXP_ESCAPE_RE``
    """
    # ``\g<0>`` is Python's reference to the whole match. JavaScript's ``$&``
    # means nothing to ``re`` and would be inserted literally, replacing the
    # metacharacter instead of escaping it.
    return REGEXP_ESCAPE_RE.sub(r"\\\g<0>", string)
193# //////////////////////////////////////////////////////////////////////////////
def isSpace(code: object) -> bool:
    """Return ``True`` when ``code`` is the code of a tab (0x09) or a space (0x20)."""
    space_codes = {0x20, 0x09}
    return code in space_codes
# Whitespace code points checked individually; the contiguous run
# 0x2000-0x200A is tested separately in isWhiteSpace().
MD_WHITESPACE = {
    # ASCII: \t \n \v \f \r and space
    0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x20,
    # non-ASCII
    0xA0, 0x1680, 0x202F, 0x205F, 0x3000,
}  # fmt: skip


def isWhiteSpace(code: int) -> bool:
    r"""Zs (unicode class) || [\t\f\v\r\n]"""
    return 0x2000 <= code <= 0x200A or code in MD_WHITESPACE
222# //////////////////////////////////////////////////////////////////////////////
# Pattern for a single punctuation character. The first alternative is one
# large character class covering code points up to U+FF65.
# NOTE(review): the remaining alternatives are spelled as two-unit sequences
# starting with \uD800-\uD83A (UTF-16 surrogate pairs), presumably carried
# over from a JavaScript source - confirm whether they can ever match here.
UNICODE_PUNCT_RE = re.compile(
    r"[!-#%-\*,-\/:;\?@\[-\]_\{\}\xA1\xA7\xAB\xB6\xB7\xBB\xBF\u037E\u0387\u055A-\u055F\u0589\u058A\u05BE\u05C0\u05C3\u05C6\u05F3\u05F4\u0609\u060A\u060C\u060D\u061B\u061E\u061F\u066A-\u066D\u06D4\u0700-\u070D\u07F7-\u07F9\u0830-\u083E\u085E\u0964\u0965\u0970\u09FD\u0A76\u0AF0\u0C84\u0DF4\u0E4F\u0E5A\u0E5B\u0F04-\u0F12\u0F14\u0F3A-\u0F3D\u0F85\u0FD0-\u0FD4\u0FD9\u0FDA\u104A-\u104F\u10FB\u1360-\u1368\u1400\u166D\u166E\u169B\u169C\u16EB-\u16ED\u1735\u1736\u17D4-\u17D6\u17D8-\u17DA\u1800-\u180A\u1944\u1945\u1A1E\u1A1F\u1AA0-\u1AA6\u1AA8-\u1AAD\u1B5A-\u1B60\u1BFC-\u1BFF\u1C3B-\u1C3F\u1C7E\u1C7F\u1CC0-\u1CC7\u1CD3\u2010-\u2027\u2030-\u2043\u2045-\u2051\u2053-\u205E\u207D\u207E\u208D\u208E\u2308-\u230B\u2329\u232A\u2768-\u2775\u27C5\u27C6\u27E6-\u27EF\u2983-\u2998\u29D8-\u29DB\u29FC\u29FD\u2CF9-\u2CFC\u2CFE\u2CFF\u2D70\u2E00-\u2E2E\u2E30-\u2E4E\u3001-\u3003\u3008-\u3011\u3014-\u301F\u3030\u303D\u30A0\u30FB\uA4FE\uA4FF\uA60D-\uA60F\uA673\uA67E\uA6F2-\uA6F7\uA874-\uA877\uA8CE\uA8CF\uA8F8-\uA8FA\uA8FC\uA92E\uA92F\uA95F\uA9C1-\uA9CD\uA9DE\uA9DF\uAA5C-\uAA5F\uAADE\uAADF\uAAF0\uAAF1\uABEB\uFD3E\uFD3F\uFE10-\uFE19\uFE30-\uFE52\uFE54-\uFE61\uFE63\uFE68\uFE6A\uFE6B\uFF01-\uFF03\uFF05-\uFF0A\uFF0C-\uFF0F\uFF1A\uFF1B\uFF1F\uFF20\uFF3B-\uFF3D\uFF3F\uFF5B\uFF5D\uFF5F-\uFF65]|\uD800[\uDD00-\uDD02\uDF9F\uDFD0]|\uD801\uDD6F|\uD802[\uDC57\uDD1F\uDD3F\uDE50-\uDE58\uDE7F\uDEF0-\uDEF6\uDF39-\uDF3F\uDF99-\uDF9C]|\uD803[\uDF55-\uDF59]|\uD804[\uDC47-\uDC4D\uDCBB\uDCBC\uDCBE-\uDCC1\uDD40-\uDD43\uDD74\uDD75\uDDC5-\uDDC8\uDDCD\uDDDB\uDDDD-\uDDDF\uDE38-\uDE3D\uDEA9]|\uD805[\uDC4B-\uDC4F\uDC5B\uDC5D\uDCC6\uDDC1-\uDDD7\uDE41-\uDE43\uDE60-\uDE6C\uDF3C-\uDF3E]|\uD806[\uDC3B\uDE3F-\uDE46\uDE9A-\uDE9C\uDE9E-\uDEA2]|\uD807[\uDC41-\uDC45\uDC70\uDC71\uDEF7\uDEF8]|\uD809[\uDC70-\uDC74]|\uD81A[\uDE6E\uDE6F\uDEF5\uDF37-\uDF3B\uDF44]|\uD81B[\uDE97-\uDE9A]|\uD82F\uDC9F|\uD836[\uDE87-\uDE8B]|\uD83A[\uDD5E\uDD5F]"  # noqa: E501
)
# Currently without astral characters support.
def isPunctChar(ch: str) -> bool:
    """Return ``True`` when ``UNICODE_PUNCT_RE`` finds a match anywhere in ``ch``."""
    found = UNICODE_PUNCT_RE.search(ch)
    return found is not None
# Code points of the 32 ASCII punctuation characters, given as four runs.
MD_ASCII_PUNCT = {
    *range(0x21, 0x30),  # ! " # $ % & ' ( ) * + , - . /
    *range(0x3A, 0x41),  # : ; < = > ? @
    *range(0x5B, 0x61),  # [ \ ] ^ _ `
    *range(0x7B, 0x7F),  # { | } ~
}


def isMdAsciiPunct(ch: int) -> bool:
    r"""Tell whether code ``ch`` is a Markdown ASCII punctuation character.

    The accepted characters are::

        ! " # $ % & ' ( ) * + , - . / : ; < = > ? @ [ \ ] ^ _ ` { | } ~

    See http://spec.commonmark.org/0.15/#ascii-punctuation-character

    Not to be confused with unicode punctuation, which lacks some
    characters in the ASCII range.
    """
    return ch in MD_ASCII_PUNCT
def normalizeReference(string: str) -> str:
    """Helper to unify [reference labels].

    Surrounding whitespace is removed, every inner run of whitespace
    becomes a single space, and the text is lowercased and then uppercased.
    """
    # Trim first, then collapse whitespace.
    trimmed = string.strip()
    collapsed = re.sub(r"\s+", " ", trimmed)

    # JavaScript-only note from the original implementation: in node v10
    # 'ẞ'.toLowerCase() === 'Ṿ' (presumed to be a bug fixed in v12), so the JS
    # code special-cases it with str.replace(/ẞ/g, 'ß'). Nothing equivalent is
    # done here.
    #
    # Lowercasing followed by uppercasing removes the differences between
    # letter variants. Lowercasing alone does not normalize 125 code points,
    # and uppercasing alone misses 6 (İ, ϴ, ẞ, Ω, K, Å: already uppercase,
    # yet each has a differently uppercased version).
    #
    # Example with the Greek letter theta: uppercase U+0398 (Θ) and
    # U+03F4 (ϴ), lowercase U+03B8 (θ) and U+03D1 (ϑ).
    #
    #   0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8
    #   03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398
    #   03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398
    #   03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8
    #
    # A case-insensitive comparison should treat all four as equal, but
    # lowercasing leaves ϑ alone (already lowercase) and uppercasing leaves ϴ
    # alone (already uppercase). Lower-then-upper maps every one of them to
    # U+0398:
    #   '\u0398\u03f4\u03b8\u03d1' -> '\u0398\u0398\u0398\u0398'
    #
    # This is equivalent to unicode case folding; unicode normalization is a
    # separate step that is not needed here.
    #
    # The result is uppercase because the JavaScript original stores it as an
    # object key, which avoids clashes with Object.prototype members (most
    # notably `__proto__`).
    lowered = collapsed.lower()
    return lowered.upper()