1from .exceptions import EmailSyntaxError
2from .types import ValidatedEmail
3from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
4 DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \
5 DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS
6
7import re
8import unicodedata
9import idna # implements IDNA 2008; Python's codec is only IDNA 2003
10import ipaddress
11from typing import Optional, Tuple, TypedDict, Union
12
13
def split_email(email: str) -> Tuple[Optional[str], str, str, bool]:
    """Split an address string into display name, local part, and domain.

    Returns a 4-tuple ``(display_name, local_part, domain_part,
    is_quoted_local_part)``:

    * ``display_name`` is None if the address is not wrapped in angle
      brackets; otherwise it is the (unquoted, unescaped) display name,
      or the empty string if there were angle brackets but no name.
    * ``local_part`` is the unquoted, unescaped text before the @-sign.
    * ``domain_part`` is the text after the first unquoted @-sign.
    * ``is_quoted_local_part`` tells whether the local part was given
      in the quoted-string form.

    Raises EmailSyntaxError if there is no unquoted @-sign, if the
    display name or angle brackets are malformed, or if characters
    follow a closing quote.
    """
    # Typical email addresses have a single @-sign and no quote
    # characters, but the awkward "quoted string" local part form
    # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear
    # in the local part if the local part is quoted.

    # A `display name <addr>` format is also present in MIME messages
    # (RFC 5322 3.4) and this format is also often recognized in
    # mail UIs. It's not allowed in SMTP commands or in typical web
    # login forms, but parsing it has been requested, so it's done
    # here as a convenience. It's implemented in the spirit but not
    # the letter of RFC 5322 3.4 because MIME messages allow newlines
    # and comments as a part of the CFWS rule, but this is typically
    # not allowed in mail UIs (although comment syntax was requested
    # once too).
    #
    # Display names are either basic characters (the same basic characters
    # permitted in email addresses, but periods are not allowed and spaces
    # are allowed; see RFC 5322 Appendix A.1.2), or a quoted string with
    # the same rules as a quoted local part. (Multiple quoted strings might
    # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the
    # email address follows in angle brackets.
    #
    # An initial quote is ambiguous between starting a display name or
    # a quoted local part --- fun.
    #
    # We assume the input string is already stripped of leading and
    # trailing CFWS.

    def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]:
        # Split the string at the first character in specials (an @-sign
        # or left angle bracket) that does not occur within quotes and
        # is not followed by a Unicode combining character.
        # If no special character is found, raise an error.
        inside_quote = False
        escaped = False
        split_at = len(text)  # stays len(text) when no special is found
        for i, c in enumerate(text):
            # < plus U+0338 (Combining Long Solidus Overlay) normalizes to
            # U+226E (Not Less-Than), and it would be confusing to treat
            # the < as the start of "<email>" syntax in that case. Likewise,
            # if anything combines with an @ or ", we should probably not
            # treat it as a special character.
            if unicodedata.normalize("NFC", text[i:])[0] != c:
                # The character is taken literally. If it was preceded by
                # a backslash inside quotes, it is the escaped character,
                # so the escape ends here. Without this reset the escape
                # would leak onto the next character and a following
                # close quote would be missed.
                escaped = False

            elif inside_quote:
                if c == '\\' and not escaped:
                    escaped = True
                elif c == '"' and not escaped:
                    # The only way to exit the quote is an unescaped quote.
                    inside_quote = False
                else:
                    escaped = False
            elif c == '"':
                inside_quote = True
            elif c in specials:
                # When unquoted, stop before a special character.
                split_at = i
                break

        # No special symbol found. The special symbols always
        # include an at-sign, so this always indicates a missing
        # at-sign. The other symbol is optional.
        if split_at == len(text):
            # The full-width at-sign (U+FF20) might occur in CJK contexts.
            # We can't accept it because we only accept addresses
            # that are actually valid. But if this is common we
            # may want to consider accepting and normalizing full-
            # width characters for the other special symbols (and
            # full-width dot is already accepted in internationalized
            # domains) with a new option.
            # See https://news.ycombinator.com/item?id=42235268.
            if "\uFF20" in text:
                raise EmailSyntaxError("The email address has the \"full-width\" at-sign (\uFF20) character instead of a regular at-sign.")

            # Check another near-homoglyph (U+FE6B, small commercial at)
            # for good measure because homoglyphs in place of required
            # characters could be very confusing. We may want to consider
            # checking for homoglyphs anywhere we look for a special symbol.
            if "\uFE6B" in text:
                raise EmailSyntaxError('The email address has the "small commercial at" character instead of a regular at-sign.')

            raise EmailSyntaxError("An email address must have an @-sign.")

        # The left part is everything before the special character and
        # the right part is whatever is left, starting with the special.
        return text[:split_at], text[split_at:]

    def unquote_quoted_string(text: str) -> Tuple[str, bool]:
        # Remove surrounding quotes and unescape escaped backslashes
        # and quotes. Escapes are parsed liberally. I think only
        # backslashes and quotes can be escaped but we'll allow anything
        # to be.
        quoted = False
        escaped = False
        pieces = []
        for i, c in enumerate(text):
            if quoted:
                if escaped:
                    pieces.append(c)
                    escaped = False
                elif c == '\\':
                    escaped = True
                elif c == '"':
                    # The close quote must be the last character.
                    if i != len(text) - 1:
                        raise EmailSyntaxError("Extra character(s) found after close quote: "
                                               + ", ".join(safe_character_display(c) for c in text[i + 1:]))
                    break
                else:
                    pieces.append(c)
            elif i == 0 and c == '"':
                quoted = True
            else:
                pieces.append(c)

        return "".join(pieces), quoted

    # Split the string at the first unquoted @-sign or left angle bracket.
    left_part, right_part = split_string_at_unquoted_special(email, ("@", "<"))

    # If the right part starts with an angle bracket,
    # then the left part is a display name and the rest
    # of the right part up to the final right angle bracket
    # is the email address.
    display_name: Optional[str]
    if right_part.startswith("<"):
        # Remove space between the display name and angle bracket.
        left_part = left_part.rstrip()

        # Unquote and unescape the display name.
        display_name, display_name_quoted = unquote_quoted_string(left_part)

        # Check that only basic characters are present in a
        # non-quoted display name.
        if not display_name_quoted:
            bad_chars = {
                safe_character_display(c)
                for c in display_name
                if (not ATEXT_RE.match(c) and c != ' ') or c == '.'
            }
            if bad_chars:
                raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".")

        # Check for other unsafe characters.
        check_unsafe_chars(display_name, allow_space=True)

        # Check that the right part ends with an angle bracket
        # but allow spaces after it, I guess.
        if ">" not in right_part:
            raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.")
        right_part = right_part.rstrip(" ")
        if right_part[-1] != ">":
            raise EmailSyntaxError("There can't be anything after the email address.")

        # Remove exactly one initial and one trailing angle bracket.
        # Stripping every trailing ">" would silently accept input
        # such as "<user@example.com>>".
        addr_spec = right_part[1:-1]

        # Split the email address at the first unquoted @-sign.
        local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",))

    # Otherwise there is no display name. The left part is the local
    # part and the right part is the domain.
    else:
        display_name = None
        local_part, domain_part = left_part, right_part

    # The split leaves the @-sign at the start of the right part.
    if domain_part.startswith("@"):
        domain_part = domain_part[1:]

    # Unquote the local part if it is quoted.
    local_part, is_quoted_local_part = unquote_quoted_string(local_part)

    return display_name, local_part, domain_part, is_quoted_local_part
199
200
def get_length_reason(addr: str, limit: int) -> str:
    """Describe, in parentheses, how many characters *addr* is over *limit*.

    The result is meant to be embedded in a "too long" error message.
    The noun is pluralized only when the excess is greater than one.
    """
    excess = len(addr) - limit
    plural = "" if excess <= 1 else "s"
    return "({} character{} too many)".format(excess, plural)
206
207
def safe_character_display(c: str) -> str:
    """Return a form of the single character *c* that is safe to display.

    Letters, numbers, punctuation, and symbols are returned in quotes.
    Any other character is returned as its Unicode name, or as a
    ``U+XXXX`` hex string when it has no name.
    """
    # Return safely displayable characters in quotes.
    if c == '\\':
        return f"\"{c}\""  # can't use repr because it escapes it
    if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
        return repr(c)

    # Construct a hex string in case the unicode name doesn't exist.
    # Code points up to and including U+FFFF get four hex digits;
    # everything above gets eight.
    codepoint = ord(c)
    width = 4 if codepoint <= 0xFFFF else 8
    h = f"U+{codepoint:0{width}X}"

    # Return the character name or, if it has no name, the hex string.
    return unicodedata.name(c, h)
223
224
class LocalPartValidationResult(TypedDict):
    """Shape of the dict returned by validate_email_local_part."""
    # The validated local part; re-quoted and escaped when it is only
    # valid in the quoted-string form.
    local_part: str
    # Same as local_part when SMTPUTF8 is not required, otherwise None.
    ascii_local_part: Optional[str]
    # True when the local part contains internationalized characters.
    smtputf8: bool
229
230
def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False,
                              quoted_local_part: bool = False) -> LocalPartValidationResult:
    """Validate the syntax of the part of an email address before the @-sign.

    *local* is the unquoted local part and *quoted_local_part* tells
    whether it was originally written as a quoted string. Returns the
    normalized local part, its ASCII form (None when SMTPUTF8 is needed),
    and whether SMTPUTF8 is needed. Raises EmailSyntaxError when the
    local part is empty (unless *allow_empty_local*), too long, contains
    invalid or unsafe characters, breaks the dot-atom rules, or needs
    SMTPUTF8 while *allow_smtputf8* is false.
    """

    def offending_chars(is_offending):
        # Sorted, de-duplicated display forms of the characters of the
        # local part for which the predicate holds.
        return sorted({safe_character_display(ch) for ch in local if is_offending(ch)})

    if not local:
        if allow_empty_local:
            # The caller allows an empty local part. Useful for validating
            # certain Postfix aliases.
            return {
                "local_part": local,
                "ascii_local_part": local,
                "smtputf8": False,
            }
        raise EmailSyntaxError("There must be something before the @-sign.")

    # Length limit from RFC 5321 4.5.3.1.1, counted in characters. For an
    # ASCII-only local part that equals the number of octets. For an
    # internationalized one the UTF-8 encoding may be longer, but the
    # total address length is checked elsewhere instead.
    if len(local) > LOCAL_PART_MAX_LENGTH:
        too_long_by = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long before the @-sign {too_long_by}.")

    # Fast path: the plain ASCII dot-atom form (RFC 5322 3.2.3), which most
    # addresses match. Anything matching it is also valid as a quoted
    # string (RFC 5321 4.1.2 / RFC 5322 3.2.4), so if the input was quoted
    # the quoting was unnecessary and the unquoted form is returned.
    # Being only permitted ASCII characters, it is normalized and safe.
    if DOT_ATOM_TEXT.match(local):
        return {
            "local_part": local,
            "ascii_local_part": local,
            "smtputf8": False,
        }

    # Next try the same pattern with the extended character set for
    # internationalized addresses (RFC 6531 section 3.3).
    is_intl_dot_atom = bool(DOT_ATOM_TEXT_INTL.match(local))

    if not is_intl_dot_atom and not quoted_local_part:
        # It's not a valid local part. Find out why. (Quoted local parts
        # are handled below, so these checks do not apply to them.)

        # Invalid characters (RFC 5322 3.2.3, plus RFC 6531 3.3).
        listing = offending_chars(lambda ch: not ATEXT_INTL_DOT_RE.match(ch))
        if listing:
            raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(listing) + ".")

        # Dot errors imposed by the dot-atom rule (RFC 5322 3.2.3).
        check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)

        # All of the reasons should already have been checked, but just
        # in case there is a fallback message.
        raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")

    if is_intl_dot_atom:
        # International characters in the local part may not be permitted.
        if not allow_smtputf8:
            # Report the characters outside the non-internationalized
            # permitted character set (RFC 5322 3.2.3).
            listing = offending_chars(lambda ch: not ATEXT_RE.match(ch))
            if listing:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(listing) + ".")

            # The check above should always find something; fall back to
            # this just in case.
            raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")

        requires_smtputf8 = True
    else:
        # Quoted local part. There are no dot-atom restrictions here:
        # more characters are allowed, like @-signs, spaces, and quotes,
        # and dots may be placed anywhere.
        #
        # Invalid characters in a quoted string local part (RFC 5321 4.1.2.
        # RFC 5322 lists additional permitted *obsolete* characters which
        # are *not* allowed here. RFC 6531 section 3.3 extends the range
        # to UTF8 strings.)
        listing = offending_chars(lambda ch: not QTEXT_INTL.match(ch))
        if listing:
            raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(listing) + ".")

        # Any character outside of the printable ASCII range makes the
        # address require SMTPUTF8, which may not be permitted.
        listing = offending_chars(lambda ch: ord(ch) < 32 or ord(ch) > 126)
        requires_smtputf8 = bool(listing)
        if requires_smtputf8 and not allow_smtputf8:
            raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(listing) + ".")

    # The local part matched the internationalized dot-atom form or was
    # quoted. Check that it is a valid, safe, and sensible Unicode string.
    # Some of this may be redundant with the range U+0080 to U+10FFFF that
    # is checked by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Spaces are only
    # acceptable in the quoted form.
    check_unsafe_chars(local, allow_space=not is_intl_dot_atom)

    # Try encoding to UTF-8. Failure is possible with some characters like
    # surrogate code points, but those are checked above. Still, an
    # unhandled exception later is not wanted.
    try:
        local.encode("utf8")
    except ValueError as e:
        raise EmailSyntaxError("The email address contains an invalid character.") from e

    # If this address passes only by the quoted string form, re-quote it
    # and backslash-escape quotes and backslashes (removing any unnecessary
    # escapes). Per RFC 5321 4.1.2, "all quoted forms MUST be treated as
    # equivalent, and the sending system SHOULD transmit the form that uses
    # the minimum quoting possible."
    if not is_intl_dot_atom:
        local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"'

    return {
        "local_part": local,
        "ascii_local_part": None if requires_smtputf8 else local,
        "smtputf8": requires_smtputf8,
    }
390
391
def check_unsafe_chars(s: str, allow_space: bool = False) -> None:
    """Raise EmailSyntaxError if *s* contains unsafe or non-sensible characters.

    Characters are judged by their Unicode general category. When
    *allow_space* is true, space separators (category Zs) are accepted.
    Returns None when every character is acceptable.
    """

    def is_unsafe(position: int, char: str) -> bool:
        category = unicodedata.category(char)
        major = category[0]

        # Letters, numbers, punctuation, and symbols are permitted.
        if major in ("L", "N", "P", "S"):
            return False

        # A combining character in first position would combine with
        # something outside of the string if concatenated, so it is not
        # safe. The domain is passed to this function on its own, so this
        # also rejects a combining character right after the @-sign, which
        # would modify the @-sign.
        if major == "M":
            return position == 0

        # Spaces outside of the ASCII range are not specifically disallowed
        # in internationalized addresses as far as I can tell, but they
        # violate the spirit of the non-internationalized specification
        # that email addresses do not contain ASCII spaces when not quoted.
        # (Excluding ASCII spaces when not quoted is handled directly by
        # the atom regex.) In quoted-string local parts spaces are
        # explicitly permitted, and the ASCII space has category Zs, so it
        # must be allowed there; all Unicode spaces are allowed to be
        # consistent.
        if category == "Zs":
            return not allow_space

        # Everything else is rejected:
        # - Line and paragraph separators (Zl, Zp) violate the spirit of
        #   the rule that addresses do not contain line breaks.
        # - Control, format, surrogate, private use, and unassigned code
        #   points (C) are unsafe in various ways: control and format
        #   characters can affect text rendering when the address is
        #   concatenated with other text, bidirectional format characters
        #   make an address render as a different address, and private use
        #   characters make no sense for publicly deliverable addresses.
        # - Any category added to Unicode in the future.
        return True

    unsafe = {char for position, char in enumerate(s) if is_unsafe(position, char)}
    if unsafe:
        raise EmailSyntaxError("The email address contains unsafe characters: "
                               + ", ".join(safe_character_display(c) for c in sorted(unsafe)) + ".")
442
443
def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None:
    """Raise EmailSyntaxError if *label* breaks the dot (and hyphen) placement rules.

    *start_descr* and *end_descr* are message templates with one ``{}``
    placeholder that receives "period" or "hyphen". The hyphen rules
    are only applied when *is_hostname* is true. Returns None when no
    rule is violated.
    """
    problem = None

    # RFC 5322 3.2.3: no trailing, leading, or doubled periods.
    if label[-1:] == ".":
        problem = end_descr.format("period")
    elif label[:1] == ".":
        problem = start_descr.format("period")
    elif ".." in label:
        problem = "An email address cannot have two periods in a row."
    elif is_hostname:
        # RFC 952: the same edge rules for hyphens, and no hyphen
        # directly next to a period.
        if label[-1:] == "-":
            problem = end_descr.format("hyphen")
        elif label[:1] == "-":
            problem = start_descr.format("hyphen")
        elif ".-" in label or "-." in label:
            problem = "An email address cannot have a period and a hyphen next to each other."

    if problem is not None:
        raise EmailSyntaxError(problem)
461
462
def uts46_valid_char(char: str) -> bool:
    """Approximate whether idna.uts46_remap would accept the character.

    The rules were found by exhaustively searching for characters
    rejected by
        for c in (chr(i) for i in range(0x110000)):
            idna.uts46_remap(c, std3_rules=False, transitional=False)
    and are pretty close to that behavior, not exact.
    """
    codepoint = ord(char)

    # 8-bit ASCII range.
    if 0x80 <= codepoint <= 0x9f:
        return False

    # Characters that are permitted but would otherwise fall into one of
    # the tests below.
    in_permitted_punctuation = (
        0x2010 <= codepoint <= 0x2060
        and not (0x2024 <= codepoint <= 0x2026 or 0x2028 <= codepoint <= 0x202E)
    )
    if (in_permitted_punctuation
            or codepoint in (0x00AD, 0x2064, 0xFF0E)
            or 0x200B <= codepoint <= 0x200D
            or 0x1BCA0 <= codepoint <= 0x1BCA3):
        return True

    # There are a bunch of Zs characters including regular space that are
    # allowed by UTS46 but are not allowed in domain names anyway.
    #
    # There are some Cn (unassigned) characters that the idna package
    # doesn't reject but we can, I think.
    if unicodedata.category(char) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"):
        return False

    # Reject characters that decompose into a sequence with a dot.
    return "002E" not in unicodedata.decomposition(char).split(" ")
491
492
class DomainNameValidationResult(TypedDict):
    """Shape of the dict returned by validate_email_domain_name."""
    # The IDNA ASCII-encoded form of the domain.
    ascii_domain: str
    # The internationalized (Unicode) form obtained by decoding ascii_domain
    # with idna.decode.
    domain: str
496
497
def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult:
    """Validate the syntax of the part of an email address after the @-sign.

    Returns the IDNA ASCII form of the domain together with its
    internationalized (Unicode) form. Raises EmailSyntaxError when the
    domain contains invalid or unsafe characters, breaks the dot and
    hyphen rules, fails IDNA processing, is too long, or (when
    *globally_deliverable* is true) lacks a period, lacks a valid-looking
    top-level domain, or is a special-use or reserved name.
    *test_environment* permits the "test" name.
    """

    def reject_non_hostname_chars(text: str, message_prefix: str) -> None:
        # Raise with a sorted list of the characters in text that are not
        # permitted hostname characters.
        offenders = sorted({
            safe_character_display(ch)
            for ch in text
            if not ATEXT_HOSTNAME_INTL.match(ch)
        })
        if offenders:
            raise EmailSyntaxError(message_prefix + ", ".join(offenders) + ".")

    # Check for invalid characters.
    # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
    reject_non_hostname_chars(domain, "The part after the @-sign contains invalid characters: ")

    # Check for unsafe characters. Some of this may be redundant with the
    # range U+0080 to U+10FFFF that is checked by DOT_ATOM_TEXT_INTL. Other
    # characters may be permitted by the email specs, but they may not be
    # valid, safe, or sensible Unicode strings.
    check_unsafe_chars(domain)

    # Reject characters that would be rejected by UTS-46 normalization
    # next, but with an error message under our control.
    uts46_offenders = sorted({
        safe_character_display(ch)
        for ch in domain
        if not uts46_valid_char(ch)
    })
    if uts46_offenders:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(uts46_offenders) + ".")

    # Perform UTS-46 normalization, which includes casefolding, NFC
    # normalization, and converting all label separators (the period/full
    # stop, fullwidth full stop, ideographic full stop, and halfwidth
    # ideographic full stop) to regular dots. It will also raise an
    # exception if there is an invalid character in the input, such as
    # U+2488 which is invalid because it would expand to include a dot and
    # U+1FEF which normalizes to a backtick, which is not an allowed
    # hostname character. Since several characters *are* normalized to a
    # dot, this has to come before checks related to dots, like
    # check_dot_atom which comes next.
    original_domain = domain
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e

    # Check for invalid characters after Unicode normalization which are
    # not caught by uts46_remap (see tests for examples).
    reject_non_hostname_chars(domain, "The part after the @-sign contains invalid characters after Unicode normalization: ")

    # The domain part is made up of dot-separated "labels." Each label must
    # have at least one character and cannot start or end with dashes,
    # which means there are some surprising restrictions on periods and
    # dashes. Check that before IDNA encoding because the IDNA library
    # gives unfriendly errors for these cases, but after UTS-46
    # normalization because it can insert periods and hyphens (from
    # fullwidth characters).
    # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3)
    check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)

    # Check for RFC 5890's invalid R-LDH labels, which are labels that
    # start with two characters other than "xn" and two dashes.
    if any(re.match(r"(?!xn)..--", part, re.I) for part in domain.split(".")):
        raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")

    if DOT_ATOM_TEXT_HOSTNAME.match(domain):
        # This is a valid non-internationalized domain.
        ascii_domain = domain
    else:
        # If international characters are present in the domain name,
        # convert the domain to IDNA ASCII. If internationalized characters
        # are present, the MTA must either support SMTPUTF8 or the mail
        # client must convert the domain name to IDNA before submission.
        #
        # For ASCII-only domains, the transformation does nothing and is
        # safe to apply. However, to ensure we don't rely on the idna
        # library for basic syntax checks, it is not used when not needed.
        #
        # idna.encode also checks the domain name length after encoding but
        # it doesn't give a nice error, so the underlying idna.alabel
        # method is called directly. idna.alabel checks label length and
        # doesn't give great messages, but we can't easily go to lower
        # level methods.
        try:
            encoded_labels = [
                idna.alabel(part).decode("ascii")
                for part in domain.split(".")
            ]
        except idna.IDNAError as e:
            # Some errors would have already been raised by idna.uts46_remap.
            raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e
        ascii_domain = ".".join(encoded_labels)

        # Check the syntax of the IDNA-encoded string.
        # It should never fail.
        if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain):
            raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")

    # Check the length of the domain name in bytes.
    # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
    # This counts bytes ("octets"), which can be much higher than the
    # number of characters in internationalized domains, on the assumption
    # that the domain may be transmitted without SMTPUTF8 as IDNA ASCII.
    excess = len(ascii_domain) - DOMAIN_MAX_LENGTH
    if excess > 0:
        if ascii_domain == original_domain:
            reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
            raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.")
        plural = "" if excess == 1 else "s"
        raise EmailSyntaxError(f"The email address is too long after the @-sign ({excess} byte{plural} too many after IDNA encoding).")

    # Also check the label length limit.
    # (RFC 1035 2.3.1)
    for part in ascii_domain.split("."):
        if len(part) > DNS_LABEL_LENGTH_LIMIT:
            reason = get_length_reason(part, limit=DNS_LABEL_LENGTH_LIMIT)
            raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")

    if globally_deliverable:
        # All publicly deliverable addresses have domain names with at
        # least one period, at least for gTLDs created since 2013 (per the
        # ICANN Board New gTLD Program Committee,
        # https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
        # The lack of a period is considered a syntax error since that
        # will match people's sense of what an email address looks like.
        # This is skipped in test environments to allow '@test' email
        # addresses.
        if "." not in ascii_domain and (ascii_domain != "test" or not test_environment):
            raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")

        # We also know that all TLDs currently end with a letter.
        if not DOMAIN_NAME_REGEX.search(ascii_domain):
            raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")

    # Check special-use and reserved domain names.
    # Some might fail DNS-based deliverability checks, but that
    # can be turned off, so fail them all sooner.
    # See the references in __init__.py.
    from . import SPECIAL_USE_DOMAIN_NAMES
    for reserved in SPECIAL_USE_DOMAIN_NAMES:
        # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
        if test_environment and reserved == "test":
            continue

        if ascii_domain == reserved or ascii_domain.endswith("." + reserved):
            raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.")

    # We may have been given an IDNA ASCII domain to begin with. Check
    # that the domain actually conforms to IDNA. It could look like IDNA
    # but not be actual IDNA. For ASCII-only domains, the conversion out
    # of IDNA just gives the same thing back.
    #
    # This gives the canonical internationalized form of the domain,
    # which is returned to the caller as a part of the normalized email
    # address.
    try:
        domain_i18n = idna.decode(ascii_domain.encode('ascii'))
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e

    # Check that this normalized domain name has not somehow become
    # an invalid domain name. All of the checks before this point
    # using the idna package probably guarantee that this is now
    # a valid international domain name in most respects. But it
    # doesn't hurt to re-apply some tests to be sure.

    # Check for invalid and unsafe characters. There is no test
    # case for this.
    reject_non_hostname_chars(domain_i18n, "The part after the @-sign contains invalid characters: ")
    check_unsafe_chars(domain_i18n)

    # Check that it can be encoded back to IDNA ASCII. There is no test
    # case for this.
    try:
        idna.encode(domain_i18n)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e

    # Return the IDNA ASCII-encoded form of the domain, which is how it
    # would be transmitted on the wire (except when used with SMTPUTF8
    # possibly), as well as the canonical Unicode form of the domain,
    # which is better for display purposes. This should also take care
    # of RFC 6532 section 3.1's suggestion to apply Unicode NFC
    # normalization to addresses.
    return {
        "ascii_domain": ascii_domain,
        "domain": domain_i18n,
    }
695
696
def validate_email_length(addrinfo: ValidatedEmail) -> None:
    """Raise EmailSyntaxError if any form of the address is too long.

    Three forms of the email address are checked, in this order:

    1) The original email address string. Callers may continue to use
       this string, even though the normalized form is recommended, so
       validation must not pass when the original input is not valid.
    2) The normalized email address, which is the string callers are
       recommended to use, so it must be valid.
    3) The email address with the IDNA ASCII representation of the domain
       name, since this string may be used with email stacks that don't
       support UTF-8. The ASCII local part is used when set; otherwise
       the normalized local part is combined with the ASCII domain,
       since a caller could conceivably build that combination.

    In all cases the length is checked in UTF-8 because the SMTPUTF8
    extension to SMTP validates the length in bytes.
    """
    candidates = (
        (addrinfo.original, None),
        (addrinfo.normalized, "after normalization"),
        ((addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain, "when the part after the @-sign is converted to IDNA ASCII"),
    )

    for address, context in candidates:
        char_count = len(address)
        byte_count = len(address.encode("utf8"))
        overage = byte_count - EMAIL_MAX_LENGTH
        if overage <= 0:
            continue

        plural = "s" if overage > 1 else ""
        if context is not None:
            # Since there is normalization, the number of characters in
            # the input that need to change is impossible to know.
            detail = context + f" ({overage} byte{plural} too many)"
        elif char_count == byte_count:
            # No normalization or transcoding: give a simple count of the
            # number of characters over the limit.
            detail = get_length_reason(address, limit=EMAIL_MAX_LENGTH)
        else:
            # No normalization but some transcoding to UTF-8: estimate the
            # minimum number of characters over the limit by dividing the
            # number of bytes over the limit by the maximum number of
            # bytes per character.
            widest = max(len(ch.encode("utf8")) for ch in address)
            fewest = max(1, overage // widest)
            if fewest == overage:
                detail = f"({overage} character{plural} too many)"
            else:
                detail = f"({fewest}-{overage} character{plural} too many)"

        raise EmailSyntaxError(f"The email address is too long {detail}.")
758
759
class DomainLiteralValidationResult(TypedDict):
    # Shape of the dict returned by validate_email_domain_literal.

    # The IP address parsed out of the domain literal.
    domain_address: Union[
        ipaddress.IPv4Address,
        ipaddress.IPv6Address,
    ]
    # The domain literal rebuilt as text, including the square brackets.
    domain: str
763
764
def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult:
    # This is obscure domain-literal syntax. Parse it and return
    # a compressed/normalized address.
    # RFC 5321 4.1.3 and RFC 5322 3.4.1.
    #
    # domain_literal is the text found between the square brackets,
    # without the brackets themselves. On success the result holds the
    # parsed address object and a normalized, re-bracketed domain string.
    # Anything that is not an IPv4 address or an "IPv6:"-tagged IPv6
    # address raises EmailSyntaxError.

    addr: Union[ipaddress.IPv4Address, ipaddress.IPv6Address]

    # Try to parse the domain literal as an IPv4 address.
    # There is no tag for IPv4 addresses, so we can never
    # be sure if the user intends an IPv4 address. fullmatch is
    # used (rather than match with "$") so that a trailing newline
    # cannot satisfy the pattern.
    if re.fullmatch(r"[0-9.]+", domain_literal):
        try:
            addr = ipaddress.IPv4Address(domain_literal)
        except ValueError as e:
            raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") from e

        # Return the IPv4Address object and the domain literal
        # rebuilt from the parsed address.
        return {
            "domain_address": addr,
            "domain": f"[{addr}]",
        }

    # If it begins with "IPv6:" it's an IPv6 address.
    if domain_literal.startswith("IPv6:"):
        ipv6_text = domain_literal[5:]

        # The ipaddress module (Python 3.9+) accepts an RFC 4007 zone
        # identifier after a "%", which the IPv6 address grammar in
        # RFC 5321 4.1.3 does not allow. Because this branch returns
        # before the permitted-character check below, whatever follows
        # the "%" would otherwise end up in the returned domain
        # unchecked, so reject it here on every Python version.
        if "%" in ipv6_text:
            raise EmailSyntaxError("The IPv6 address in brackets after the @-sign is not valid (zone identifiers are not permitted).")

        try:
            addr = ipaddress.IPv6Address(ipv6_text)
        except ValueError as e:
            raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") from e

        # Return the IPv6Address object and construct a normalized
        # domain literal.
        return {
            "domain_address": addr,
            "domain": f"[IPv6:{addr.compressed}]",
        }

    # Nothing else is valid. Everything below only selects the most
    # helpful error message.

    if ":" not in domain_literal:
        raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")

    # The tag (the part before the colon) has character restrictions,
    # but since it must come from a registry of tags (in which only "IPv6" is defined),
    # there's no need to check the syntax of the tag. See RFC 5321 4.1.2.

    # Check for permitted ASCII characters. This actually doesn't matter
    # since there will be an exception after anyway; it only lets the
    # message name the offending characters.
    bad_chars = {
        safe_character_display(c)
        for c in domain_literal
        if not DOMAIN_LITERAL_CHARS.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".")

    # There are no other domain literal tags.
    # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml
    raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")