Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/email_validator/syntax.py: 55%
175 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:32 +0000
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-07 06:32 +0000
1from .exceptions_types import EmailSyntaxError
2from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \
3 DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \
4 DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS
6import re
7import unicodedata
8import idna # implements IDNA 2008; Python's codec is only IDNA 2003
9import ipaddress
10from typing import Optional
def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH):
    """Build the parenthesized "N characters too many" phrase for length errors.

    *addr* is the over-long string and *limit* the maximum it was checked
    against. When *utf8* is true the count is prefixed with "at least ".
    """
    excess = len(addr) - limit
    qualifier = "at least " if utf8 else ""
    # Only pluralize when more than one character is over the limit.
    noun = "characters" if excess > 1 else "character"
    return f"({qualifier}{excess} {noun} too many)"
def safe_character_display(c):
    """Return a form of the single character *c* that is safe to show in a message.

    Letters, numbers, punctuation and symbols are returned quoted. Any other
    character is returned as its Unicode name or, when it has no name, as a
    "U+XXXX" code point string (four hex digits up to and including U+FFFF,
    eight hex digits above that).
    """
    # A backslash is quoted by hand because repr() would escape it.
    if c == '\\':
        return f"\"{c}\""
    if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
        return repr(c)

    # Construct a hex string in case the Unicode name doesn't exist.
    # U+FFFF itself is still a four-digit code point, hence "<=".
    codepoint = ord(c)
    width = 4 if codepoint <= 0xFFFF else 8
    fallback = f"U+{codepoint:0{width}X}"

    # Return the character name or, if it has no name, the hex string.
    return unicodedata.name(c, fallback)
def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False,
                              quoted_local_part: bool = False):
    """Validate the syntax of the part of an email address before the @-sign.

    Returns a dict with the keys "local_part" (the normalized local part,
    re-quoted when it is only valid in quoted form), "ascii_local_part" (the
    same string, or None when SMTPUTF8 is required) and "smtputf8" (bool).
    Raises EmailSyntaxError when the local part is not acceptable.
    """
    if not local:
        if allow_empty_local:
            # The caller allows an empty local part, which is useful for
            # validating certain Postfix aliases.
            return {
                "local_part": local,
                "ascii_local_part": local,
                "smtputf8": False,
            }
        raise EmailSyntaxError("There must be something before the @-sign.")

    # Length limit (RFC 5321 4.5.3.1.1). Characters are counted here, not
    # bytes; for ASCII-only input the two are the same.
    if len(local) > LOCAL_PART_MAX_LENGTH:
        reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.")

    # Fast path: plain ASCII dot-atom (RFC 5322 3.2.3). Anything matching
    # this is also valid as a quoted string, so an originally quoted local
    # part that matches is simply returned in unquoted form.
    if DOT_ATOM_TEXT.match(local):
        return {
            "local_part": local,
            "ascii_local_part": local,
            "smtputf8": False,
        }

    # Not plain ASCII dot-atom: try the internationalized dot-atom form
    # (RFC 6531 section 3.3), then the quoted-string form.
    form: Optional[str] = None
    needs_smtputf8 = False

    if DOT_ATOM_TEXT_INTL.match(local):
        if not allow_smtputf8:
            # Name the characters outside the ASCII atext set (RFC 5322 3.2.3).
            non_ascii = {
                safe_character_display(ch)
                for ch in local
                if not ATEXT_RE.match(ch)
            }
            if non_ascii:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(non_ascii)) + ".")

            # The set above should never be empty; this is only a fallback.
            raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")

        form = "dot-atom"
        needs_smtputf8 = True

    elif quoted_local_part:
        # Quoted local parts only have per-character restrictions
        # (RFC 5321 4.1.2, extended to UTF-8 by RFC 6531 section 3.3).
        invalid = {
            safe_character_display(ch)
            for ch in local
            if not QTEXT_INTL.match(ch)
        }
        if invalid:
            raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(invalid)) + ".")

        # Anything outside printable ASCII means SMTPUTF8 is needed.
        non_ascii = {
            safe_character_display(ch)
            for ch in local
            if ord(ch) < 32 or ord(ch) > 126
        }
        if non_ascii:
            needs_smtputf8 = True
            if not allow_smtputf8:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(non_ascii)) + ".")

        form = "quoted"

    if form:
        # RFC 6532 section 3.1: apply Unicode NFC normalization and return
        # the normalized string.
        normalized = unicodedata.normalize("NFC", local)

        # Reject unsafe or non-sensible Unicode; spaces are tolerated only
        # in the quoted form.
        check_unsafe_chars(normalized, allow_space=(form == "quoted"))

        # Make sure a later UTF-8 encode cannot raise an unhandled exception.
        try:
            normalized.encode("utf8")
        except ValueError:
            raise EmailSyntaxError("The email address contains an invalid character.")

        # A local part valid only in quoted form is re-quoted, with quotes
        # and backslashes backslash-escaped (RFC 5321 4.1.2).
        if form == "quoted":
            normalized = '"' + re.sub(r'(["\\])', r'\\\1', normalized) + '"'

        return {
            "local_part": normalized,
            "ascii_local_part": None if needs_smtputf8 else normalized,
            "smtputf8": needs_smtputf8,
        }

    # Invalid unquoted local part: work out the most helpful error.

    # Invalid characters (RFC 5322 3.2.3, plus RFC 6531 3.3).
    invalid = {
        safe_character_display(ch)
        for ch in local
        if not ATEXT_INTL_RE.match(ch)
    }
    if invalid:
        raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(invalid)) + ".")

    # Dot placement errors imposed by the dot-atom rule (RFC 5322 3.2.3).
    check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)

    # Every reason should have been caught above; this is a fallback message.
    raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")
def check_unsafe_chars(s, allow_space=False):
    """Raise EmailSyntaxError if *s* contains unsafe or non-sensible Unicode.

    Letters, numbers, punctuation and symbols are always accepted. A
    combining mark is rejected only in the first position. Space separators
    (category Zs, which includes the ASCII space) are accepted only when
    *allow_space* is true. Every other category is rejected.
    """
    unsafe = set()
    for position, char in enumerate(s):
        category = unicodedata.category(char)
        major = category[0]

        if major in ("L", "N", "P", "S"):
            continue

        if major == "M":
            # A leading combining character would combine with whatever text
            # precedes the string if it is concatenated, so it is not safe.
            if position == 0:
                unsafe.add(char)
            continue

        # Spaces are explicitly permitted in quoted-string local parts, so
        # the caller may opt in to them.
        if category == "Zs" and allow_space:
            continue

        # Everything left is rejected: disallowed spaces, line and paragraph
        # separators, control/format/surrogate/private-use/unassigned code
        # points, and any category not listed above.
        unsafe.add(char)

    if not unsafe:
        return

    listing = ", ".join(safe_character_display(ch) for ch in sorted(unsafe))
    raise EmailSyntaxError("The email address contains unsafe characters: "
                           + listing + ".")
def check_dot_atom(label, start_descr, end_descr, is_hostname):
    """Raise EmailSyntaxError for misplaced periods (and, for hostnames, hyphens).

    *start_descr* and *end_descr* are message templates with one "{}"
    placeholder, filled with "period" or "hyphen" when *label* starts or
    ends with that character. Hyphen rules apply only when *is_hostname*
    is true.
    """
    def reject_at_edges(char, name):
        # The end of the label is checked before the start.
        if label.endswith(char):
            raise EmailSyntaxError(end_descr.format(name))
        if label.startswith(char):
            raise EmailSyntaxError(start_descr.format(name))

    # RFC 5322 3.2.3
    reject_at_edges(".", "period")
    if ".." in label:
        raise EmailSyntaxError("An email address cannot have two periods in a row.")

    if not is_hostname:
        return

    # RFC 952
    reject_at_edges("-", "hyphen")
    if any(pair in label for pair in (".-", "-.")):
        raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")
def validate_email_domain_name(domain, test_environment=False, globally_deliverable=True):
    """Validate the syntax of the domain name after the @-sign.

    Returns a dict with "ascii_domain" (the IDNA ASCII form) and "domain"
    (the Unicode form obtained by decoding that ASCII form). Raises
    EmailSyntaxError when the domain is not acceptable.
    """
    # Reject invalid characters before normalization.
    # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
    invalid = {
        safe_character_display(ch)
        for ch in domain
        if not ATEXT_HOSTNAME_INTL.match(ch)
    }
    if invalid:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(invalid)) + ".")

    # Characters permitted by the email specs may still not be valid, safe,
    # or sensible Unicode.
    check_unsafe_chars(domain)

    # UTS-46 normalization: casefolding, NFC, and mapping the alternative
    # label separators to a plain period. It raises for characters that are
    # invalid in the input.
    try:
        remapped = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).")

    # Check periods and hyphens after UTS-46 normalization (which can insert
    # them) but before IDNA encoding (whose errors for these cases are
    # unfriendly). (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3)
    check_dot_atom(remapped, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)

    # RFC 5890 invalid R-LDH labels: two characters other than "xn"
    # followed by two dashes.
    if any(re.match(r"(?!xn)..--", part, re.I) for part in remapped.split(".")):
        raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")

    if DOT_ATOM_TEXT_HOSTNAME.match(remapped):
        # Already a valid non-internationalized domain; the idna library is
        # deliberately not used when it is not needed.
        ascii_domain = remapped
    else:
        # International characters are present: convert to IDNA ASCII.
        try:
            ascii_domain = idna.encode(remapped, uts46=False).decode("ascii")
        except idna.IDNAError as e:
            if "Domain too long" in str(e):
                # The length check was applied to the normalized string, not
                # the one the user supplied, so no count is reported.
                raise EmailSyntaxError("The email address is too long after the @-sign.")
            raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).")

        # The encoded result should always be syntactically valid.
        if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain):
            raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")

    # Domain length in the ASCII form (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2).
    if len(ascii_domain) > DOMAIN_MAX_LENGTH:
        reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.")

    # Per-label length limit (RFC 1035 2.3.1).
    for part in ascii_domain.split("."):
        if len(part) > DNS_LABEL_LENGTH_LIMIT:
            reason = get_length_reason(part, limit=DNS_LABEL_LENGTH_LIMIT)
            raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")

    if globally_deliverable:
        # Publicly deliverable domains have at least one period. The only
        # exception is '@test' when running in a test environment.
        dotless_test_domain = ascii_domain == "test" and test_environment
        if "." not in ascii_domain and not dotless_test_domain:
            raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")

        # All TLDs currently end with a letter.
        if not DOMAIN_NAME_REGEX.search(ascii_domain):
            raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")

    # Reject special-use and reserved domain names here, because DNS-based
    # deliverability checks can be turned off.
    from . import SPECIAL_USE_DOMAIN_NAMES
    for reserved in SPECIAL_USE_DOMAIN_NAMES:
        if test_environment and reserved == "test":
            continue

        if ascii_domain == reserved or ascii_domain.endswith("." + reserved):
            raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.")

    # The input may itself have been IDNA ASCII; decoding confirms that it
    # really conforms to IDNA and yields the Unicode form.
    try:
        domain_i18n = idna.decode(ascii_domain.encode('ascii'))
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).")

    # Repeat the character checks on the UTS-46-normalized domain. These
    # should never fail.
    invalid = {
        safe_character_display(ch)
        for ch in remapped
        if not ATEXT_HOSTNAME_INTL.match(ch)
    }
    if invalid:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(invalid)) + ".")
    check_unsafe_chars(remapped)

    # The ASCII form is how the domain would be transmitted on the wire
    # (except possibly with SMTPUTF8); the Unicode form is better for display.
    return {
        "ascii_domain": ascii_domain,
        "domain": domain_i18n,
    }
def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
    """Validate a bracketed domain literal (RFC 5321 4.1.3 and RFC 5322 3.4.1).

    *domain_literal* is the text between the brackets. On success returns a
    dict with "domain_address" (an ipaddress.IPv4Address or IPv6Address) and
    "domain" (the normalized bracketed literal). Raises EmailSyntaxError when
    the literal cannot be parsed, uses an unknown tag, or is valid but
    *allow_domain_literal* is false.
    """
    # Try to parse the domain literal as an IPv4 address.
    # There is no tag for IPv4 addresses, so we can never
    # be sure if the user intends an IPv4 address.
    if re.match(r"^[0-9\.]+$", domain_literal):
        try:
            addr = ipaddress.IPv4Address(domain_literal)
        except ValueError as e:
            raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.")
        if not allow_domain_literal:
            raise EmailSyntaxError("A bracketed IPv4 address after the @-sign is not allowed here.")

        # Return the IPv4Address object and the re-bracketed address.
        return {
            "domain_address": addr,
            "domain": f"[{addr}]",
        }

    # If it begins with the "IPv6:" tag it's an IPv6 address. ABNF string
    # literals are case-insensitive (RFC 5234 2.3), so "ipv6:" and "IPV6:"
    # are accepted too. Only the five-character prefix is lowercased so that
    # the slice below always removes exactly the tag.
    ipv6_tag = "ipv6:"
    if domain_literal[:len(ipv6_tag)].lower() == ipv6_tag:
        try:
            addr = ipaddress.IPv6Address(domain_literal[len(ipv6_tag):])
        except ValueError as e:
            raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).")
        if not allow_domain_literal:
            raise EmailSyntaxError("A bracketed IPv6 address after the @-sign is not allowed here.")

        # Return the IPv6Address object and construct a normalized
        # domain literal with the canonical tag spelling.
        return {
            "domain_address": addr,
            "domain": f"[IPv6:{addr.compressed}]",
        }

    if ":" not in domain_literal:
        raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")

    # The tag (the part before the colon) has character restrictions,
    # but since it must come from a registry of tags (in which only "IPv6" is defined),
    # there's no need to check the syntax of the tag. See RFC 5321 4.1.2.

    # Check for permitted ASCII characters. This only affects which error
    # message is produced, since an exception is raised below regardless.
    bad_chars = {
        safe_character_display(c)
        for c in domain_literal
        if not DOMAIN_LITERAL_CHARS.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".")

    # There are no other domain literal tags.
    # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml
    raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")