1"""Utilities for parsing source text""" 
    2 
    3from __future__ import annotations 
    4 
    5import re 
    6from re import Match 
    7from typing import TypeVar 
    8import unicodedata 
    9 
    10from .entities import entities 
    11 
    12 
    13def charCodeAt(src: str, pos: int) -> int | None: 
    14    """ 
    15    Returns the Unicode value of the character at the specified location. 
    16 
    17    @param - index The zero-based index of the desired character. 
    18    If there is no character at the specified index, NaN is returned. 
    19 
    20    This was added for compatibility with python 
    21    """ 
    22    try: 
    23        return ord(src[pos]) 
    24    except IndexError: 
    25        return None 
    26 
    27 
    28def charStrAt(src: str, pos: int) -> str | None: 
    29    """ 
    30    Returns the Unicode value of the character at the specified location. 
    31 
    32    @param - index The zero-based index of the desired character. 
    33    If there is no character at the specified index, NaN is returned. 
    34 
    35    This was added for compatibility with python 
    36    """ 
    37    try: 
    38        return src[pos] 
    39    except IndexError: 
    40        return None 
    41 
    42 
    43_ItemTV = TypeVar("_ItemTV") 
    44 
    45 
    46def arrayReplaceAt( 
    47    src: list[_ItemTV], pos: int, newElements: list[_ItemTV] 
    48) -> list[_ItemTV]: 
    49    """ 
    50    Remove element from array and put another array at those position. 
    51    Useful for some operations with tokens 
    52    """ 
    53    return src[:pos] + newElements + src[pos + 1 :] 
    54 
    55 
    56def isValidEntityCode(c: int) -> bool: 
    57    # broken sequence 
    58    if c >= 0xD800 and c <= 0xDFFF: 
    59        return False 
    60    # never used 
    61    if c >= 0xFDD0 and c <= 0xFDEF: 
    62        return False 
    63    if ((c & 0xFFFF) == 0xFFFF) or ((c & 0xFFFF) == 0xFFFE): 
    64        return False 
    65    # control codes 
    66    if c >= 0x00 and c <= 0x08: 
    67        return False 
    68    if c == 0x0B: 
    69        return False 
    70    if c >= 0x0E and c <= 0x1F: 
    71        return False 
    72    if c >= 0x7F and c <= 0x9F: 
    73        return False 
    74    # out of range 
    75    return not (c > 0x10FFFF) 
    76 
    77 
    78def fromCodePoint(c: int) -> str: 
    79    """Convert ordinal to unicode. 
    80 
    81    Note, in the original Javascript two string characters were required, 
    82    for codepoints larger than `0xFFFF`. 
    83    But Python 3 can represent any unicode codepoint in one character. 
    84    """ 
    85    return chr(c) 
    86 
    87 
# The two patterns below are kept for reference only; UNESCAPE_ALL_RE is the
# alternation that is actually compiled.
# UNESCAPE_MD_RE = re.compile(r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])')
# ENTITY_RE_g       = re.compile(r'&([a-z#][a-z0-9]{1,31})', re.IGNORECASE)

# Matches either a backslash followed by one punctuation character
# (captured as group 1) or an ``&name;`` style entity whose name is
# captured as group 2.  Matching is case-insensitive.
UNESCAPE_ALL_RE = re.compile(
    r'\\([!"#$%&\'()*+,\-.\/:;<=>?@[\\\]^_`{|}~])' + "|" + r"&([a-z#][a-z0-9]{1,31});",
    re.IGNORECASE,
)
# Decimal numeric entity name: ``#`` followed by 1 to 8 digits (group 1).
DIGITAL_ENTITY_BASE10_RE = re.compile(r"#([0-9]{1,8})")
# Hexadecimal numeric entity name: ``#x`` followed by 1 to 8 hex digits
# (group 1); the ``x`` and the digits are matched case-insensitively.
DIGITAL_ENTITY_BASE16_RE = re.compile(r"#x([a-f0-9]{1,8})", re.IGNORECASE)
    96 
    97 
    98def replaceEntityPattern(match: str, name: str) -> str: 
    99    """Convert HTML entity patterns, 
    100    see https://spec.commonmark.org/0.30/#entity-references 
    101    """ 
    102    if name in entities: 
    103        return entities[name] 
    104 
    105    code: None | int = None 
    106    if pat := DIGITAL_ENTITY_BASE10_RE.fullmatch(name): 
    107        code = int(pat.group(1), 10) 
    108    elif pat := DIGITAL_ENTITY_BASE16_RE.fullmatch(name): 
    109        code = int(pat.group(1), 16) 
    110 
    111    if code is not None and isValidEntityCode(code): 
    112        return fromCodePoint(code) 
    113 
    114    return match 
    115 
    116 
    117def unescapeAll(string: str) -> str: 
    118    def replacer_func(match: Match[str]) -> str: 
    119        escaped = match.group(1) 
    120        if escaped: 
    121            return escaped 
    122        entity = match.group(2) 
    123        return replaceEntityPattern(match.group(), entity) 
    124 
    125    if "\\" not in string and "&" not in string: 
    126        return string 
    127    return UNESCAPE_ALL_RE.sub(replacer_func, string) 
    128 
    129 
    130ESCAPABLE = r"""\\!"#$%&'()*+,./:;<=>?@\[\]^`{}|_~-""" 
    131ESCAPE_CHAR = re.compile(r"\\([" + ESCAPABLE + r"])") 
    132 
    133 
    134def stripEscape(string: str) -> str: 
    135    """Strip escape \\ characters""" 
    136    return ESCAPE_CHAR.sub(r"\1", string) 
    137 
    138 
    139def escapeHtml(raw: str) -> str: 
    140    """Replace special characters "&", "<", ">" and '"' to HTML-safe sequences.""" 
    141    # like html.escape, but without escaping single quotes 
    142    raw = raw.replace("&", "&")  # Must be done first! 
    143    raw = raw.replace("<", "<") 
    144    raw = raw.replace(">", ">") 
    145    raw = raw.replace('"', """) 
    146    return raw 
    147 
    148 
    149# ////////////////////////////////////////////////////////////////////////////// 
    150 
    151REGEXP_ESCAPE_RE = re.compile(r"[.?*+^$[\]\\(){}|-]") 
    152 
    153 
    154def escapeRE(string: str) -> str: 
    155    string = REGEXP_ESCAPE_RE.sub("\\$&", string) 
    156    return string 
    157 
    158 
    159# ////////////////////////////////////////////////////////////////////////////// 
    160 
    161 
    162def isSpace(code: int | None) -> bool: 
    163    """Check if character code is a whitespace.""" 
    164    return code in (0x09, 0x20) 
    165 
    166 
    167def isStrSpace(ch: str | None) -> bool: 
    168    """Check if character is a whitespace.""" 
    169    return ch in ("\t", " ") 
    170 
    171 
    172MD_WHITESPACE = { 
    173    0x09,  # \t 
    174    0x0A,  # \n 
    175    0x0B,  # \v 
    176    0x0C,  # \f 
    177    0x0D,  # \r 
    178    0x20,  # space 
    179    0xA0, 
    180    0x1680, 
    181    0x202F, 
    182    0x205F, 
    183    0x3000, 
    184} 
    185 
    186 
    187def isWhiteSpace(code: int) -> bool: 
    188    r"""Zs (unicode class) || [\t\f\v\r\n]""" 
    189    if code >= 0x2000 and code <= 0x200A: 
    190        return True 
    191    return code in MD_WHITESPACE 
    192 
    193 
    194# ////////////////////////////////////////////////////////////////////////////// 
    195 
    196 
    197def isPunctChar(ch: str) -> bool: 
    198    """Check if character is a punctuation character.""" 
    199    return unicodedata.category(ch).startswith(("P", "S")) 
    200 
    201 
    202MD_ASCII_PUNCT = { 
    203    0x21,  # /* ! */ 
    204    0x22,  # /* " */ 
    205    0x23,  # /* # */ 
    206    0x24,  # /* $ */ 
    207    0x25,  # /* % */ 
    208    0x26,  # /* & */ 
    209    0x27,  # /* ' */ 
    210    0x28,  # /* ( */ 
    211    0x29,  # /* ) */ 
    212    0x2A,  # /* * */ 
    213    0x2B,  # /* + */ 
    214    0x2C,  # /* , */ 
    215    0x2D,  # /* - */ 
    216    0x2E,  # /* . */ 
    217    0x2F,  # /* / */ 
    218    0x3A,  # /* : */ 
    219    0x3B,  # /* ; */ 
    220    0x3C,  # /* < */ 
    221    0x3D,  # /* = */ 
    222    0x3E,  # /* > */ 
    223    0x3F,  # /* ? */ 
    224    0x40,  # /* @ */ 
    225    0x5B,  # /* [ */ 
    226    0x5C,  # /* \ */ 
    227    0x5D,  # /* ] */ 
    228    0x5E,  # /* ^ */ 
    229    0x5F,  # /* _ */ 
    230    0x60,  # /* ` */ 
    231    0x7B,  # /* { */ 
    232    0x7C,  # /* | */ 
    233    0x7D,  # /* } */ 
    234    0x7E,  # /* ~ */ 
    235} 
    236 
    237 
    238def isMdAsciiPunct(ch: int) -> bool: 
    239    """Markdown ASCII punctuation characters. 
    240 
    241    :: 
    242 
    243        !, ", #, $, %, &, ', (, ), *, +, ,, -, ., /, :, ;, <, =, >, ?, @, [, \\, ], ^, _, `, {, |, }, or ~ 
    244 
    245    See http://spec.commonmark.org/0.15/#ascii-punctuation-character 
    246 
    247    Don't confuse with unicode punctuation !!! It lacks some chars in ascii range. 
    248 
    249    """ 
    250    return ch in MD_ASCII_PUNCT 
    251 
    252 
    253def normalizeReference(string: str) -> str: 
    254    """Helper to unify [reference labels].""" 
    255    # Trim and collapse whitespace 
    256    # 
    257    string = re.sub(r"\s+", " ", string.strip()) 
    258 
    259    # In node v10 'ẞ'.toLowerCase() === 'Ṿ', which is presumed to be a bug 
    260    # fixed in v12 (couldn't find any details). 
    261    # 
    262    # So treat this one as a special case 
    263    # (remove this when node v10 is no longer supported). 
    264    # 
    265    # if ('ẞ'.toLowerCase() === 'Ṿ') { 
    266    #   str = str.replace(/ẞ/g, 'ß') 
    267    # } 
    268 
    269    # .toLowerCase().toUpperCase() should get rid of all differences 
    270    # between letter variants. 
    271    # 
    272    # Simple .toLowerCase() doesn't normalize 125 code points correctly, 
    273    # and .toUpperCase doesn't normalize 6 of them (list of exceptions: 
    274    # İ, ϴ, ẞ, Ω, K, Å - those are already uppercased, but have differently 
    275    # uppercased versions). 
    276    # 
    277    # Here's an example showing how it happens. Lets take greek letter omega: 
    278    # uppercase U+0398 (Θ), U+03f4 (ϴ) and lowercase U+03b8 (θ), U+03d1 (ϑ) 
    279    # 
    280    # Unicode entries: 
    281    # 0398;GREEK CAPITAL LETTER THETA;Lu;0;L;;;;;N;;;;03B8 
    282    # 03B8;GREEK SMALL LETTER THETA;Ll;0;L;;;;;N;;;0398;;0398 
    283    # 03D1;GREEK THETA SYMBOL;Ll;0;L;<compat> 03B8;;;;N;GREEK SMALL LETTER SCRIPT THETA;;0398;;0398 
    284    # 03F4;GREEK CAPITAL THETA SYMBOL;Lu;0;L;<compat> 0398;;;;N;;;;03B8 
    285    # 
    286    # Case-insensitive comparison should treat all of them as equivalent. 
    287    # 
    288    # But .toLowerCase() doesn't change ϑ (it's already lowercase), 
    289    # and .toUpperCase() doesn't change ϴ (already uppercase). 
    290    # 
    291    # Applying first lower then upper case normalizes any character: 
    292    # '\u0398\u03f4\u03b8\u03d1'.toLowerCase().toUpperCase() === '\u0398\u0398\u0398\u0398' 
    293    # 
    294    # Note: this is equivalent to unicode case folding; unicode normalization 
    295    # is a different step that is not required here. 
    296    # 
    297    # Final result should be uppercased, because it's later stored in an object 
    298    # (this avoid a conflict with Object.prototype members, 
    299    # most notably, `__proto__`) 
    300    # 
    301    return string.lower().upper() 
    302 
    303 
    304LINK_OPEN_RE = re.compile(r"^<a[>\s]", flags=re.IGNORECASE) 
    305LINK_CLOSE_RE = re.compile(r"^</a\s*>", flags=re.IGNORECASE) 
    306 
    307 
    308def isLinkOpen(string: str) -> bool: 
    309    return bool(LINK_OPEN_RE.search(string)) 
    310 
    311 
    312def isLinkClose(string: str) -> bool: 
    313    return bool(LINK_CLOSE_RE.search(string))