1from .exceptions import EmailSyntaxError 
    2from .types import ValidatedEmail 
    3from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ 
    4    DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ 
    5    DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS 
    6 
    7import re 
    8import unicodedata 
    9import idna  # implements IDNA 2008; Python's codec is only IDNA 2003 
    10import ipaddress 
    11from typing import Optional, Tuple, TypedDict, Union 
    12 
    13 
    14def split_email(email: str) -> Tuple[Optional[str], str, str, bool]: 
    15    # Return the display name, unescaped local part, and domain part 
    16    # of the address, and whether the local part was quoted. If no 
    17    # display name was present and angle brackets do not surround 
    18    # the address, display name will be None; otherwise, it will be 
    19    # set to the display name or the empty string if there were 
    20    # angle brackets but no display name. 
    21 
    22    # Typical email addresses have a single @-sign and no quote 
    23    # characters, but the awkward "quoted string" local part form 
    24    # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear 
    25    # in the local part if the local part is quoted. 
    26 
    27    # A `display name <addr>` format is also present in MIME messages 
    28    # (RFC 5322 3.4) and this format is also often recognized in 
    29    # mail UIs. It's not allowed in SMTP commands or in typical web 
    30    # login forms, but parsing it has been requested, so it's done 
    31    # here as a convenience. It's implemented in the spirit but not 
    32    # the letter of RFC 5322 3.4 because MIME messages allow newlines 
    33    # and comments as a part of the CFWS rule, but this is typically 
    34    # not allowed in mail UIs (although comment syntax was requested 
    35    # once too). 
    36    # 
    37    # Display names are either basic characters (the same basic characters 
    38    # permitted in email addresses, but periods are not allowed and spaces 
    39    # are allowed; see RFC 5322 Appendix A.1.2), or or a quoted string with 
    40    # the same rules as a quoted local part. (Multiple quoted strings might 
    41    # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the 
    42    # email address follows in angle brackets. 
    43    # 
    44    # An initial quote is ambiguous between starting a display name or 
    45    # a quoted local part --- fun. 
    46    # 
    47    # We assume the input string is already stripped of leading and 
    48    # trailing CFWS. 
    49 
    50    def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]: 
    51        # Split the string at the first character in specials (an @-sign 
    52        # or left angle bracket) that does not occur within quotes and 
    53        # is not followed by a Unicode combining character. 
    54        # If no special character is found, raise an error. 
    55        inside_quote = False 
    56        escaped = False 
    57        left_part = "" 
    58        for i, c in enumerate(text): 
    59            # < plus U+0338 (Combining Long Solidus Overlay) normalizes to 
    60            # ≮ U+226E (Not Less-Than), and  it would be confusing to treat 
    61            # the < as the start of "<email>" syntax in that case. Likewise, 
    62            # if anything combines with an @ or ", we should probably not 
    63            # treat it as a special character. 
    64            if unicodedata.normalize("NFC", text[i:])[0] != c: 
    65                left_part += c 
    66 
    67            elif inside_quote: 
    68                left_part += c 
    69                if c == '\\' and not escaped: 
    70                    escaped = True 
    71                elif c == '"' and not escaped: 
    72                    # The only way to exit the quote is an unescaped quote. 
    73                    inside_quote = False 
    74                    escaped = False 
    75                else: 
    76                    escaped = False 
    77            elif c == '"': 
    78                left_part += c 
    79                inside_quote = True 
    80            elif c in specials: 
    81                # When unquoted, stop before a special character. 
    82                break 
    83            else: 
    84                left_part += c 
    85 
    86        # No special symbol found. The special symbols always 
    87        # include an at-sign, so this always indicates a missing 
    88        # at-sign. The other symbol is optional. 
    89        if len(left_part) == len(text): 
    90            # The full-width at-sign might occur in CJK contexts. 
    91            # We can't accept it because we only accept addresess 
    92            # that are actually valid. But if this is common we 
    93            # may want to consider accepting and normalizing full- 
    94            # width characters for the other special symbols (and 
    95            # full-width dot is already accepted in internationalized 
    96            # domains) with a new option. 
    97            # See https://news.ycombinator.com/item?id=42235268. 
    98            if "@" in text: 
    99                raise EmailSyntaxError("The email address has the \"full-width\" at-sign (@) character instead of a regular at-sign.") 
    100 
    101            # Check another near-homoglyph for good measure because 
    102            # homoglyphs in place of required characters could be 
    103            # very confusing. We may want to consider checking for 
    104            # homoglyphs anywhere we look for a special symbol. 
    105            if "﹫" in text: 
    106                raise EmailSyntaxError('The email address has the "small commercial at" character instead of a regular at-sign.') 
    107 
    108            raise EmailSyntaxError("An email address must have an @-sign.") 
    109 
    110        # The right part is whatever is left. 
    111        right_part = text[len(left_part):] 
    112 
    113        return left_part, right_part 
    114 
    115    def unquote_quoted_string(text: str) -> Tuple[str, bool]: 
    116        # Remove surrounding quotes and unescape escaped backslashes 
    117        # and quotes. Escapes are parsed liberally. I think only 
    118        # backslashes and quotes can be escaped but we'll allow anything 
    119        # to be. 
    120        quoted = False 
    121        escaped = False 
    122        value = "" 
    123        for i, c in enumerate(text): 
    124            if quoted: 
    125                if escaped: 
    126                    value += c 
    127                    escaped = False 
    128                elif c == '\\': 
    129                    escaped = True 
    130                elif c == '"': 
    131                    if i != len(text) - 1: 
    132                        raise EmailSyntaxError("Extra character(s) found after close quote: " 
    133                                               + ", ".join(safe_character_display(c) for c in text[i + 1:])) 
    134                    break 
    135                else: 
    136                    value += c 
    137            elif i == 0 and c == '"': 
    138                quoted = True 
    139            else: 
    140                value += c 
    141 
    142        return value, quoted 
    143 
    144    # Split the string at the first unquoted @-sign or left angle bracket. 
    145    left_part, right_part = split_string_at_unquoted_special(email, ("@", "<")) 
    146 
    147    # If the right part starts with an angle bracket, 
    148    # then the left part is a display name and the rest 
    149    # of the right part up to the final right angle bracket 
    150    # is the email address, . 
    151    if right_part.startswith("<"): 
    152        # Remove space between the display name and angle bracket. 
    153        left_part = left_part.rstrip() 
    154 
    155        # Unquote and unescape the display name. 
    156        display_name, display_name_quoted = unquote_quoted_string(left_part) 
    157 
    158        # Check that only basic characters are present in a 
    159        # non-quoted display name. 
    160        if not display_name_quoted: 
    161            bad_chars = { 
    162                safe_character_display(c) 
    163                for c in display_name 
    164                if (not ATEXT_RE.match(c) and c != ' ') or c == '.' 
    165            } 
    166            if bad_chars: 
    167                raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".") 
    168 
    169        # Check for other unsafe characters. 
    170        check_unsafe_chars(display_name, allow_space=True) 
    171 
    172        # Check that the right part ends with an angle bracket 
    173        # but allow spaces after it, I guess. 
    174        if ">" not in right_part: 
    175            raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.") 
    176        right_part = right_part.rstrip(" ") 
    177        if right_part[-1] != ">": 
    178            raise EmailSyntaxError("There can't be anything after the email address.") 
    179 
    180        # Remove the initial and trailing angle brackets. 
    181        addr_spec = right_part[1:].rstrip(">") 
    182 
    183        # Split the email address at the first unquoted @-sign. 
    184        local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",)) 
    185 
    186    # Otherwise there is no display name. The left part is the local 
    187    # part and the right part is the domain. 
    188    else: 
    189        display_name = None 
    190        local_part, domain_part = left_part, right_part 
    191 
    192    if domain_part.startswith("@"): 
    193        domain_part = domain_part[1:] 
    194 
    195    # Unquote the local part if it is quoted. 
    196    local_part, is_quoted_local_part = unquote_quoted_string(local_part) 
    197 
    198    return display_name, local_part, domain_part, is_quoted_local_part 
    199 
    200 
    201def get_length_reason(addr: str, limit: int) -> str: 
    202    """Helper function to return an error message related to invalid length.""" 
    203    diff = len(addr) - limit 
    204    suffix = "s" if diff > 1 else "" 
    205    return f"({diff} character{suffix} too many)" 
    206 
    207 
    208def safe_character_display(c: str) -> str: 
    209    # Return safely displayable characters in quotes. 
    210    if c == '\\': 
    211        return f"\"{c}\""  # can't use repr because it escapes it 
    212    if unicodedata.category(c)[0] in ("L", "N", "P", "S"): 
    213        return repr(c) 
    214 
    215    # Construct a hex string in case the unicode name doesn't exist. 
    216    if ord(c) < 0xFFFF: 
    217        h = f"U+{ord(c):04x}".upper() 
    218    else: 
    219        h = f"U+{ord(c):08x}".upper() 
    220 
    221    # Return the character name or, if it has no name, the hex string. 
    222    return unicodedata.name(c, h) 
    223 
    224 
    225class LocalPartValidationResult(TypedDict): 
    226    local_part: str 
    227    ascii_local_part: Optional[str] 
    228    smtputf8: bool 
    229 
    230 
    231def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False, 
    232                              quoted_local_part: bool = False, strict: bool = False) -> LocalPartValidationResult: 
    233    """Validates the syntax of the local part of an email address.""" 
    234 
    235    if len(local) == 0: 
    236        if not allow_empty_local: 
    237            raise EmailSyntaxError("There must be something before the @-sign.") 
    238 
    239        # The caller allows an empty local part. Useful for validating certain 
    240        # Postfix aliases. 
    241        return { 
    242            "local_part": local, 
    243            "ascii_local_part": local, 
    244            "smtputf8": False, 
    245        } 
    246 
    247    # Check the length of the local part by counting characters. 
    248    # (RFC 5321 4.5.3.1.1) 
    249    # We're checking the number of characters here. If the local part 
    250    # is ASCII-only, then that's the same as bytes (octets). If it's 
    251    # internationalized, then the UTF-8 encoding may be longer, but 
    252    # that may not be relevant. We will check the total address length 
    253    # instead. 
    254    if strict and len(local) > LOCAL_PART_MAX_LENGTH: 
    255        reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH) 
    256        raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.") 
    257 
    258    # Check the local part against the non-internationalized regular expression. 
    259    # Most email addresses match this regex so it's probably fastest to check this first. 
    260    # (RFC 5322 3.2.3) 
    261    # All local parts matching the dot-atom rule are also valid as a quoted string 
    262    # so if it was originally quoted (quoted_local_part is True) and this regex matches, 
    263    # it's ok. 
    264    # (RFC 5321 4.1.2 / RFC 5322 3.2.4). 
    265    if DOT_ATOM_TEXT.match(local): 
    266        # It's valid. And since it's just the permitted ASCII characters, 
    267        # it's normalized and safe. If the local part was originally quoted, 
    268        # the quoting was unnecessary and it'll be returned as normalized to 
    269        # non-quoted form. 
    270 
    271        # Return the local part and flag that SMTPUTF8 is not needed. 
    272        return { 
    273            "local_part": local, 
    274            "ascii_local_part": local, 
    275            "smtputf8": False, 
    276        } 
    277 
    278    # The local part failed the basic dot-atom check. Try the extended character set 
    279    # for internationalized addresses. It's the same pattern but with additional 
    280    # characters permitted. 
    281    # RFC 6531 section 3.3. 
    282    valid: Optional[str] = None 
    283    requires_smtputf8 = False 
    284    if DOT_ATOM_TEXT_INTL.match(local): 
    285        # But international characters in the local part may not be permitted. 
    286        if not allow_smtputf8: 
    287            # Check for invalid characters against the non-internationalized 
    288            # permitted character set. 
    289            # (RFC 5322 3.2.3) 
    290            bad_chars = { 
    291                safe_character_display(c) 
    292                for c in local 
    293                if not ATEXT_RE.match(c) 
    294            } 
    295            if bad_chars: 
    296                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") 
    297 
    298            # Although the check above should always find something, fall back to this just in case. 
    299            raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.") 
    300 
    301        # It's valid. 
    302        valid = "dot-atom" 
    303        requires_smtputf8 = True 
    304 
    305    # There are no dot-atom syntax restrictions on quoted local parts, so 
    306    # if it was originally quoted, it is probably valid. More characters 
    307    # are allowed, like @-signs, spaces, and quotes, and there are no 
    308    # restrictions on the placement of dots, as in dot-atom local parts. 
    309    elif quoted_local_part: 
    310        # Check for invalid characters in a quoted string local part. 
    311        # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete* 
    312        # characters which are *not* allowed here. RFC 6531 section 3.3 
    313        # extends the range to UTF8 strings.) 
    314        bad_chars = { 
    315            safe_character_display(c) 
    316            for c in local 
    317            if not QTEXT_INTL.match(c) 
    318        } 
    319        if bad_chars: 
    320            raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") 
    321 
    322        # See if any characters are outside of the ASCII range. 
    323        bad_chars = { 
    324            safe_character_display(c) 
    325            for c in local 
    326            if not (32 <= ord(c) <= 126) 
    327        } 
    328        if bad_chars: 
    329            requires_smtputf8 = True 
    330 
    331            # International characters in the local part may not be permitted. 
    332            if not allow_smtputf8: 
    333                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".") 
    334 
    335        # It's valid. 
    336        valid = "quoted" 
    337 
    338    # If the local part matches the internationalized dot-atom form or was quoted, 
    339    # perform additional checks for Unicode strings. 
    340    if valid: 
    341        # Check that the local part is a valid, safe, and sensible Unicode string. 
    342        # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked 
    343        # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the 
    344        # email specs, but they may not be valid, safe, or sensible Unicode strings. 
    345        # See the function for rationale. 
    346        check_unsafe_chars(local, allow_space=(valid == "quoted")) 
    347 
    348        # Try encoding to UTF-8. Failure is possible with some characters like 
    349        # surrogate code points, but those are checked above. Still, we don't 
    350        # want to have an unhandled exception later. 
    351        try: 
    352            local.encode("utf8") 
    353        except ValueError as e: 
    354            raise EmailSyntaxError("The email address contains an invalid character.") from e 
    355 
    356        # If this address passes only by the quoted string form, re-quote it 
    357        # and backslash-escape quotes and backslashes (removing any unnecessary 
    358        # escapes). Per RFC 5321 4.1.2, "all quoted forms MUST be treated as equivalent, 
    359        # and the sending system SHOULD transmit the form that uses the minimum quoting possible." 
    360        if valid == "quoted": 
    361            local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"' 
    362 
    363        return { 
    364            "local_part": local, 
    365            "ascii_local_part": local if not requires_smtputf8 else None, 
    366            "smtputf8": requires_smtputf8, 
    367        } 
    368 
    369    # It's not a valid local part. Let's find out why. 
    370    # (Since quoted local parts are all valid or handled above, these checks 
    371    # don't apply in those cases.) 
    372 
    373    # Check for invalid characters. 
    374    # (RFC 5322 3.2.3, plus RFC 6531 3.3) 
    375    bad_chars = { 
    376        safe_character_display(c) 
    377        for c in local 
    378        if not ATEXT_INTL_DOT_RE.match(c) 
    379    } 
    380    if bad_chars: 
    381        raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".") 
    382 
    383    # Check for dot errors imposted by the dot-atom rule. 
    384    # (RFC 5322 3.2.3) 
    385    check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False) 
    386 
    387    # All of the reasons should already have been checked, but just in case 
    388    # we have a fallback message. 
    389    raise EmailSyntaxError("The email address contains invalid characters before the @-sign.") 
    390 
    391 
    392def check_unsafe_chars(s: str, allow_space: bool = False) -> None: 
    393    # Check for unsafe characters or characters that would make the string 
    394    # invalid or non-sensible Unicode. 
    395    bad_chars = set() 
    396    for i, c in enumerate(s): 
    397        category = unicodedata.category(c) 
    398        if category[0] in ("L", "N", "P", "S"): 
    399            # Letters, numbers, punctuation, and symbols are permitted. 
    400            pass 
    401        elif category[0] == "M": 
    402            # Combining character in first position would combine with something 
    403            # outside of the email address if concatenated, so they are not safe. 
    404            # We also check if this occurs after the @-sign, which would not be 
    405            # sensible because it would modify the @-sign. 
    406            if i == 0: 
    407                bad_chars.add(c) 
    408        elif category == "Zs": 
    409            # Spaces outside of the ASCII range are not specifically disallowed in 
    410            # internationalized addresses as far as I can tell, but they violate 
    411            # the spirit of the non-internationalized specification that email 
    412            # addresses do not contain ASCII spaces when not quoted. Excluding 
    413            # ASCII spaces when not quoted is handled directly by the atom regex. 
    414            # 
    415            # In quoted-string local parts, spaces are explicitly permitted, and 
    416            # the ASCII space has category Zs, so we must allow it here, and we'll 
    417            # allow all Unicode spaces to be consistent. 
    418            if not allow_space: 
    419                bad_chars.add(c) 
    420        elif category[0] == "Z": 
    421            # The two line and paragraph separator characters (in categories Zl and Zp) 
    422            # are not specifically disallowed in internationalized addresses 
    423            # as far as I can tell, but they violate the spirit of the non-internationalized 
    424            # specification that email addresses do not contain line breaks when not quoted. 
    425            bad_chars.add(c) 
    426        elif category[0] == "C": 
    427            # Control, format, surrogate, private use, and unassigned code points (C) 
    428            # are all unsafe in various ways. Control and format characters can affect 
    429            # text rendering if the email address is concatenated with other text. 
    430            # Bidirectional format characters are unsafe, even if used properly, because 
    431            # they cause an email address to render as a different email address. 
    432            # Private use characters do not make sense for publicly deliverable 
    433            # email addresses. 
    434            bad_chars.add(c) 
    435        else: 
    436            # All categories should be handled above, but in case there is something new 
    437            # to the Unicode specification in the future, reject all other categories. 
    438            bad_chars.add(c) 
    439    if bad_chars: 
    440        raise EmailSyntaxError("The email address contains unsafe characters: " 
    441                               + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".") 
    442 
    443 
    444def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None: 
    445    # RFC 5322 3.2.3 
    446    if label.endswith("."): 
    447        raise EmailSyntaxError(end_descr.format("period")) 
    448    if label.startswith("."): 
    449        raise EmailSyntaxError(start_descr.format("period")) 
    450    if ".." in label: 
    451        raise EmailSyntaxError("An email address cannot have two periods in a row.") 
    452 
    453    if is_hostname: 
    454        # RFC 952 
    455        if label.endswith("-"): 
    456            raise EmailSyntaxError(end_descr.format("hyphen")) 
    457        if label.startswith("-"): 
    458            raise EmailSyntaxError(start_descr.format("hyphen")) 
    459        if ".-" in label or "-." in label: 
    460            raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.") 
    461 
    462 
    463def uts46_valid_char(char: str) -> bool: 
    464    # By exhaustively searching for characters rejected by 
    465    # for c in (chr(i) for i in range(0x110000)): 
    466    #   idna.uts46_remap(c, std3_rules=False, transitional=False) 
    467    # I found the following rules are pretty close. 
    468    c = ord(char) 
    469    if 0x80 <= c <= 0x9f: 
    470        # 8-bit ASCII range. 
    471        return False 
    472    elif ((0x2010 <= c <= 0x2060 and not (0x2024 <= c <= 0x2026) and not (0x2028 <= c <= 0x202E)) 
    473          or c in (0x00AD, 0x2064, 0xFF0E) 
    474          or 0x200B <= c <= 0x200D 
    475          or 0x1BCA0 <= c <= 0x1BCA3): 
    476        # Characters that are permitted but fall into one of the 
    477        # tests below. 
    478        return True 
    479    elif unicodedata.category(chr(c)) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"): 
    480        # There are a bunch of Zs characters including regular space 
    481        # that are allowed by UTS46 but are not allowed in domain 
    482        # names anyway. 
    483        # 
    484        # There are some Cn (unassigned) characters that the idna 
    485        # package doesn't reject but we can, I think. 
    486        return False 
    487    elif "002E" in unicodedata.decomposition(chr(c)).split(" "): 
    488        # Characters that decompose into a sequence with a dot. 
    489        return False 
    490    return True 
    491 
    492 
    493class DomainNameValidationResult(TypedDict): 
    494    ascii_domain: str 
    495    domain: str 
    496 
    497 
    498def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult: 
    499    """Validates the syntax of the domain part of an email address.""" 
    500 
    501    # Check for invalid characters. 
    502    # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses) 
    503    bad_chars = { 
    504        safe_character_display(c) 
    505        for c in domain 
    506        if not ATEXT_HOSTNAME_INTL.match(c) 
    507    } 
    508    if bad_chars: 
    509        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") 
    510 
    511    # Check for unsafe characters. 
    512    # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked 
    513    # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but 
    514    # they may not be valid, safe, or sensible Unicode strings. 
    515    check_unsafe_chars(domain) 
    516 
    517    # Reject characters that would be rejected by UTS-46 normalization next but 
    518    # with an error message under our control. 
    519    bad_chars = { 
    520        safe_character_display(c) for c in domain 
    521        if not uts46_valid_char(c) 
    522    } 
    523    if bad_chars: 
    524        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") 
    525 
    526    # Perform UTS-46 normalization, which includes casefolding, NFC normalization, 
    527    # and converting all label separators (the period/full stop, fullwidth full stop, 
    528    # ideographic full stop, and halfwidth ideographic full stop) to regular dots. 
    529    # It will also raise an exception if there is an invalid character in the input, 
    530    # such as "⒈" which is invalid because it would expand to include a dot and 
    531    # U+1FEF which normalizes to a backtick, which is not an allowed hostname character. 
    532    # Since several characters *are* normalized to a dot, this has to come before 
    533    # checks related to dots, like check_dot_atom which comes next. 
    534    original_domain = domain 
    535    try: 
    536        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False) 
    537    except idna.IDNAError as e: 
    538        raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e 
    539 
    540    # Check for invalid characters after Unicode normalization which are not caught 
    541    # by uts46_remap (see tests for examples). 
    542    bad_chars = { 
    543        safe_character_display(c) 
    544        for c in domain 
    545        if not ATEXT_HOSTNAME_INTL.match(c) 
    546    } 
    547    if bad_chars: 
    548        raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".") 
    549 
    550    # The domain part is made up dot-separated "labels." Each label must 
    551    # have at least one character and cannot start or end with dashes, which 
    552    # means there are some surprising restrictions on periods and dashes. 
    553    # Check that before we do IDNA encoding because the IDNA library gives 
    554    # unfriendly errors for these cases, but after UTS-46 normalization because 
    555    # it can insert periods and hyphens (from fullwidth characters). 
    556    # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3) 
    557    check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True) 
    558 
    559    # Check for RFC 5890's invalid R-LDH labels, which are labels that start 
    560    # with two characters other than "xn" and two dashes. 
    561    for label in domain.split("."): 
    562        if re.match(r"(?!xn)..--", label, re.I): 
    563            raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.") 
    564 
    565    if DOT_ATOM_TEXT_HOSTNAME.match(domain): 
    566        # This is a valid non-internationalized domain. 
    567        ascii_domain = domain 
    568    else: 
    569        # If international characters are present in the domain name, convert 
    570        # the domain to IDNA ASCII. If internationalized characters are present, 
    571        # the MTA must either support SMTPUTF8 or the mail client must convert the 
    572        # domain name to IDNA before submission. 
    573        # 
    574        # For ASCII-only domains, the transformation does nothing and is safe to 
    575        # apply. However, to ensure we don't rely on the idna library for basic 
    576        # syntax checks, we don't use it if it's not needed. 
    577        # 
    578        # idna.encode also checks the domain name length after encoding but it 
    579        # doesn't give a nice error, so we call the underlying idna.alabel method 
    580        # directly. idna.alabel checks label length and doesn't give great messages, 
    581        # but we can't easily go to lower level methods. 
    582        try: 
    583            ascii_domain = ".".join( 
    584                idna.alabel(label).decode("ascii") 
    585                for label in domain.split(".") 
    586            ) 
    587        except idna.IDNAError as e: 
    588            # Some errors would have already been raised by idna.uts46_remap. 
    589            raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e 
    590 
    591        # Check the syntax of the string returned by idna.encode. 
    592        # It should never fail. 
    593        if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain): 
    594            raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.") 
    595 
    596    # Check the length of the domain name in bytes. 
    597    # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2) 
    598    # We're checking the number of bytes ("octets") here, which can be much 
    599    # higher than the number of characters in internationalized domains, 
    600    # on the assumption that the domain may be transmitted without SMTPUTF8 
    601    # as IDNA ASCII. (This is also checked by idna.encode, so this exception 
    602    # is never reached for internationalized domains.) 
    603    if len(ascii_domain) > DOMAIN_MAX_LENGTH: 
    604        if ascii_domain == original_domain: 
    605            reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH) 
    606            raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.") 
    607        else: 
    608            diff = len(ascii_domain) - DOMAIN_MAX_LENGTH 
    609            s = "" if diff == 1 else "s" 
    610            raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).") 
    611 
    612    # Also check the label length limit. 
    613    # (RFC 1035 2.3.1) 
    614    for label in ascii_domain.split("."): 
    615        if len(label) > DNS_LABEL_LENGTH_LIMIT: 
    616            reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT) 
    617            raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.") 
    618 
    619    if globally_deliverable: 
    620        # All publicly deliverable addresses have domain names with at least 
    621        # one period, at least for gTLDs created since 2013 (per the ICANN Board 
    622        # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en). 
    623        # We'll consider the lack of a period a syntax error 
    624        # since that will match people's sense of what an email address looks 
    625        # like. We'll skip this in test environments to allow '@test' email 
    626        # addresses. 
    627        if "." not in ascii_domain and not (ascii_domain == "test" and test_environment): 
    628            raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.") 
    629 
    630        # We also know that all TLDs currently end with a letter. 
    631        if not DOMAIN_NAME_REGEX.search(ascii_domain): 
    632            raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.") 
    633 
    634    # Check special-use and reserved domain names. 
    635    # Some might fail DNS-based deliverability checks, but that 
    636    # can be turned off, so we should fail them all sooner. 
    637    # See the references in __init__.py. 
    638    from . import SPECIAL_USE_DOMAIN_NAMES 
    639    for d in SPECIAL_USE_DOMAIN_NAMES: 
    640        # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES. 
    641        if d == "test" and test_environment: 
    642            continue 
    643 
    644        if ascii_domain == d or ascii_domain.endswith("." + d): 
    645            raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.") 
    646 
    647    # We may have been given an IDNA ASCII domain to begin with. Check 
    648    # that the domain actually conforms to IDNA. It could look like IDNA 
    649    # but not be actual IDNA. For ASCII-only domains, the conversion out 
    650    # of IDNA just gives the same thing back. 
    651    # 
    652    # This gives us the canonical internationalized form of the domain, 
    653    # which we return to the caller as a part of the normalized email 
    654    # address. 
    655    try: 
    656        domain_i18n = idna.decode(ascii_domain.encode('ascii')) 
    657    except idna.IDNAError as e: 
    658        raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e 
    659 
    660    # Check that this normalized domain name has not somehow become 
    661    # an invalid domain name. All of the checks before this point 
    662    # using the idna package probably guarantee that we now have 
    663    # a valid international domain name in most respects. But it 
    664    # doesn't hurt to re-apply some tests to be sure. See the similar 
    665    # tests above. 
    666 
    667    # Check for invalid and unsafe characters. We have no test 
    668    # case for this. 
    669    bad_chars = { 
    670        safe_character_display(c) 
    671        for c in domain_i18n 
    672        if not ATEXT_HOSTNAME_INTL.match(c) 
    673    } 
    674    if bad_chars: 
    675        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".") 
    676    check_unsafe_chars(domain_i18n) 
    677 
    678    # Check that it can be encoded back to IDNA ASCII. We have no test 
    679    # case for this. 
    680    try: 
    681        idna.encode(domain_i18n) 
    682    except idna.IDNAError as e: 
    683        raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e 
    684 
    685    # Return the IDNA ASCII-encoded form of the domain, which is how it 
    686    # would be transmitted on the wire (except when used with SMTPUTF8 
    687    # possibly), as well as the canonical Unicode form of the domain, 
    688    # which is better for display purposes. This should also take care 
    689    # of RFC 6532 section 3.1's suggestion to apply Unicode NFC 
    690    # normalization to addresses. 
    691    return { 
    692        "ascii_domain": ascii_domain, 
    693        "domain": domain_i18n, 
    694    } 
    695 
    696 
    697def validate_email_length(addrinfo: ValidatedEmail) -> None: 
    698    # There are three forms of the email address whose length must be checked: 
    699    # 
    700    # 1) The original email address string. Since callers may continue to use 
    701    #    this string, even though we recommend using the normalized form, we 
    702    #    should not pass validation when the original input is not valid. This 
    703    #    form is checked first because it is the original input. 
    704    # 2) The normalized email address. We perform Unicode NFC normalization of 
    705    #    the local part, we normalize the domain to internationalized characters 
    706    #    (if originally IDNA ASCII) which also includes Unicode normalization, 
    707    #    and we may remove quotes in quoted local parts. We recommend that 
    708    #    callers use this string, so it must be valid. 
    709    # 3) The email address with the IDNA ASCII representation of the domain 
    710    #    name, since this string may be used with email stacks that don't 
    711    #    support UTF-8. Since this is the least likely to be used by callers, 
    712    #    it is checked last. Note that ascii_email will only be set if the 
    713    #    local part is ASCII, but conceivably the caller may combine a 
    714    #    internationalized local part with an ASCII domain, so we check this 
    715    #    on that combination also. Since we only return the normalized local 
    716    #    part, we use that (and not the unnormalized local part). 
    717    # 
    718    # In all cases, the length is checked in UTF-8 because the SMTPUTF8 
    719    # extension to SMTP validates the length in bytes. 
    720 
    721    addresses_to_check = [ 
    722        (addrinfo.original, None), 
    723        (addrinfo.normalized, "after normalization"), 
    724        ((addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain, "when the part after the @-sign is converted to IDNA ASCII"), 
    725    ] 
    726 
    727    for addr, reason in addresses_to_check: 
    728        addr_len = len(addr) 
    729        addr_utf8_len = len(addr.encode("utf8")) 
    730        diff = addr_utf8_len - EMAIL_MAX_LENGTH 
    731        if diff > 0: 
    732            if reason is None and addr_len == addr_utf8_len: 
    733                # If there is no normalization or transcoding, 
    734                # we can give a simple count of the number of 
    735                # characters over the limit. 
    736                reason = get_length_reason(addr, limit=EMAIL_MAX_LENGTH) 
    737            elif reason is None: 
    738                # If there is no normalization but there is 
    739                # some transcoding to UTF-8, we can compute 
    740                # the minimum number of characters over the 
    741                # limit by dividing the number of bytes over 
    742                # the limit by the maximum number of bytes 
    743                # per character. 
    744                mbpc = max(len(c.encode("utf8")) for c in addr) 
    745                mchars = max(1, diff // mbpc) 
    746                suffix = "s" if diff > 1 else "" 
    747                if mchars == diff: 
    748                    reason = f"({diff} character{suffix} too many)" 
    749                else: 
    750                    reason = f"({mchars}-{diff} character{suffix} too many)" 
    751            else: 
    752                # Since there is normalization, the number of 
    753                # characters in the input that need to change is 
    754                # impossible to know. 
    755                suffix = "s" if diff > 1 else "" 
    756                reason += f" ({diff} byte{suffix} too many)" 
    757            raise EmailSyntaxError(f"The email address is too long {reason}.") 
    758 
    759 
class DomainLiteralValidationResult(TypedDict):
    # Shape of the dict returned by validate_email_domain_literal.

    # The parsed IP address object (IPv4 or IPv6).
    domain_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address]
    # The domain literal re-rendered from the parsed address, including
    # the surrounding square brackets, e.g. "[127.0.0.1]" or "[IPv6:::1]".
    domain: str
    763 
    764 
    765def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult: 
    766    # This is obscure domain-literal syntax. Parse it and return 
    767    # a compressed/normalized address. 
    768    # RFC 5321 4.1.3 and RFC 5322 3.4.1. 
    769 
    770    addr: Union[ipaddress.IPv4Address, ipaddress.IPv6Address] 
    771 
    772    # Try to parse the domain literal as an IPv4 address. 
    773    # There is no tag for IPv4 addresses, so we can never 
    774    # be sure if the user intends an IPv4 address. 
    775    if re.match(r"^[0-9\.]+$", domain_literal): 
    776        try: 
    777            addr = ipaddress.IPv4Address(domain_literal) 
    778        except ValueError as e: 
    779            raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") from e 
    780 
    781        # Return the IPv4Address object and the domain back unchanged. 
    782        return { 
    783            "domain_address": addr, 
    784            "domain": f"[{addr}]", 
    785        } 
    786 
    787    # If it begins with "IPv6:" it's an IPv6 address. 
    788    if domain_literal.startswith("IPv6:"): 
    789        try: 
    790            addr = ipaddress.IPv6Address(domain_literal[5:]) 
    791        except ValueError as e: 
    792            raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") from e 
    793 
    794        # Return the IPv6Address object and construct a normalized 
    795        # domain literal. 
    796        return { 
    797            "domain_address": addr, 
    798            "domain": f"[IPv6:{addr.compressed}]", 
    799        } 
    800 
    801    # Nothing else is valid. 
    802 
    803    if ":" not in domain_literal: 
    804        raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.") 
    805 
    806    # The tag (the part before the colon) has character restrictions, 
    807    # but since it must come from a registry of tags (in which only "IPv6" is defined), 
    808    # there's no need to check the syntax of the tag. See RFC 5321 4.1.2. 
    809 
    810    # Check for permitted ASCII characters. This actually doesn't matter 
    811    # since there will be an exception after anyway. 
    812    bad_chars = { 
    813        safe_character_display(c) 
    814        for c in domain_literal 
    815        if not DOMAIN_LITERAL_CHARS.match(c) 
    816    } 
    817    if bad_chars: 
    818        raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".") 
    819 
    820    # There are no other domain literal tags. 
    821    # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml 
    822    raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")