Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/email

1from .exceptions_types import EmailSyntaxError, ValidatedEmail

2from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \

3 DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \

4 DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS

6import re

7import unicodedata

8import idna # implements IDNA 2008; Python's codec is only IDNA 2003

9import ipaddress

10from typing import Optional, Tuple, TypedDict, Union

def split_email(email: str) -> Tuple[Optional[str], str, str, bool]:
    """Split an address into display name, local part, domain and quoted flag.

    Returns a tuple ``(display_name, local_part, domain_part, is_quoted)``:

    * ``display_name`` is None when the input has no ``<...>`` wrapper;
      otherwise it is the unquoted display name, or the empty string if
      there were angle brackets but no display name.
    * ``local_part`` is the unescaped local part with surrounding quotes
      removed.
    * ``domain_part`` is everything after the first unquoted @-sign.
    * ``is_quoted`` tells whether the local part was a quoted string.

    Raises EmailSyntaxError when no unquoted @-sign is found, when a
    non-quoted display name has invalid characters, when the angle
    brackets are unbalanced, or when characters follow a closing quote.

    The input is assumed to be already stripped of leading and trailing
    CFWS.
    """
    # Typical email addresses have a single @-sign and no quote
    # characters, but the awkward "quoted string" local part form
    # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear
    # in the local part if the local part is quoted.
    #
    # A `display name <addr>` format is also present in MIME messages
    # (RFC 5322 3.4) and is often recognized in mail UIs. It is implemented
    # here in the spirit but not the letter of RFC 5322 3.4: newlines and
    # comments (part of the CFWS rule) are not supported.
    #
    # Display names are either basic characters (the same basic characters
    # permitted in email addresses, but periods are not allowed and spaces
    # are allowed; see RFC 5322 Appendix A.1.2), or a quoted string with
    # the same rules as a quoted local part. Optional space and then the
    # email address follows in angle brackets.
    #
    # An initial quote is ambiguous between starting a display name or
    # a quoted local part.

    def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]:
        # Split the string at the first character in specials (an @-sign
        # or left angle bracket) that does not occur within quotes and
        # is not followed by a Unicode combining character.
        # If no special character is found, raise an error.
        inside_quote = False
        escaped = False
        for i, c in enumerate(text):
            # < plus U+0338 (Combining Long Solidus Overlay) normalizes to
            # U+226E (Not Less-Than), and it would be confusing to treat
            # the < as the start of "<email>" syntax in that case. Likewise,
            # if anything combines with an @ or ", we should not treat it
            # as a special character. Such a character is kept as ordinary
            # text and does not touch the quote/escape state.
            if unicodedata.normalize("NFC", text[i:])[0] != c:
                continue

            if inside_quote:
                if c == '\\' and not escaped:
                    escaped = True
                elif c == '"' and not escaped:
                    # The only way to exit the quote is an unescaped quote.
                    inside_quote = False
                    escaped = False
                else:
                    escaped = False
            elif c == '"':
                inside_quote = True
            elif c in specials:
                # When unquoted, stop before a special character. The
                # right part is whatever is left, special included.
                return text[:i], text[i:]

        # The whole string was consumed without meeting a special.
        raise EmailSyntaxError("An email address must have an @-sign.")

    def unquote_quoted_string(text: str) -> Tuple[str, bool]:
        # Remove surrounding quotes and unescape escaped backslashes
        # and quotes. Escapes are parsed liberally: any character may
        # follow a backslash. Returns the unquoted text and whether the
        # input started with a quote.
        quoted = False
        escaped = False
        pieces = []
        for i, c in enumerate(text):
            if quoted:
                if escaped:
                    pieces.append(c)
                    escaped = False
                elif c == '\\':
                    escaped = True
                elif c == '"':
                    # The closing quote must be the last character.
                    if i != len(text) - 1:
                        raise EmailSyntaxError("Extra character(s) found after close quote: "
                                               + ", ".join(safe_character_display(c) for c in text[i + 1:]))
                    break
                else:
                    pieces.append(c)
            elif i == 0 and c == '"':
                quoted = True
            else:
                pieces.append(c)

        return "".join(pieces), quoted

    # Split the string at the first unquoted @-sign or left angle bracket.
    left_part, right_part = split_string_at_unquoted_special(email, ("@", "<"))

    # If the right part starts with an angle bracket, then the left part
    # is a display name and the rest of the right part up to the final
    # right angle bracket is the email address.
    if right_part.startswith("<"):
        # Remove space between the display name and angle bracket.
        left_part = left_part.rstrip()

        # Unquote and unescape the display name.
        display_name, display_name_quoted = unquote_quoted_string(left_part)

        # Check that only basic characters are present in a
        # non-quoted display name.
        if not display_name_quoted:
            bad_chars = {
                safe_character_display(c)
                for c in display_name
                if (not ATEXT_RE.match(c) and c != ' ') or c == '.'
            }
            if bad_chars:
                raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".")

        # Check for other unsafe characters.
        check_unsafe_chars(display_name, allow_space=True)

        # Check that the right part ends with an angle bracket
        # but allow spaces after it.
        if ">" not in right_part:
            raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.")
        right_part = right_part.rstrip(" ")
        if right_part[-1] != ">":
            raise EmailSyntaxError("There can't be anything after the email address.")

        # Remove the initial and trailing angle brackets. Exactly one
        # closing bracket is removed so that stray extra brackets
        # ("<a@b.com>>") stay in the domain part and are rejected by the
        # domain checks instead of being silently discarded.
        addr_spec = right_part[1:-1]

        # Split the email address at the first unquoted @-sign.
        local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",))

    # Otherwise there is no display name. The left part is the local
    # part and the right part is the domain.
    else:
        display_name = None
        local_part, domain_part = left_part, right_part

    # The split keeps the @-sign on the right-hand side; drop it.
    if domain_part.startswith("@"):
        domain_part = domain_part[1:]

    # Unquote the local part if it is quoted.
    local_part, is_quoted_local_part = unquote_quoted_string(local_part)

    return display_name, local_part, domain_part, is_quoted_local_part

177

178

def get_length_reason(addr: str, limit: int) -> str:
    """Return a parenthesized note saying how many characters ``addr`` is over ``limit``."""
    excess = len(addr) - limit
    # Only an excess of two or more takes the plural form.
    noun = "characters" if excess > 1 else "character"
    return f"({excess} {noun} too many)"

184

185

def safe_character_display(c: str) -> str:
    """Return a representation of the single character ``c`` that is safe to show in an error message.

    * A backslash is returned wrapped in double quotes (repr would
      double it).
    * Letters, numbers, punctuation and symbols are returned via repr().
    * Anything else is returned as its Unicode character name or, when
      it has no name, as a ``U+XXXX`` hex code point: four hex digits
      for code points up to and including U+FFFF, eight above that.
    """
    if c == '\\':
        return f"\"{c}\""  # can't use repr because it escapes it
    if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
        return repr(c)

    # Construct a hex string in case the unicode name doesn't exist.
    # U+FFFF itself still belongs to the four-digit range.
    code_point = ord(c)
    width = 4 if code_point <= 0xFFFF else 8
    fallback = f"U+{code_point:0{width}X}"

    # Return the character name or, if it has no name, the hex string.
    return unicodedata.name(c, fallback)

201

202

class LocalPartValidationResult(TypedDict):
    """Dictionary shape returned by validate_email_local_part."""

    # The validated local part (re-quoted when only valid as a quoted string).
    local_part: str
    # The same value when it is ASCII-only; None when SMTPUTF8 is required.
    ascii_local_part: Optional[str]
    # Whether the local part needs the SMTPUTF8 extension.
    smtputf8: bool

207

208

def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False,
                              quoted_local_part: bool = False) -> LocalPartValidationResult:
    """Validates the syntax of the local part of an email address.

    ``local`` is the unquoted, unescaped local part; ``quoted_local_part``
    says whether it was written as a quoted string. Returns a dict with
    the (possibly re-quoted) local part, its ASCII form (None when
    SMTPUTF8 is required) and the SMTPUTF8 flag. Raises EmailSyntaxError
    for every invalid input.
    """

    if not local:
        # An empty local part is only acceptable when the caller opts in
        # (useful for validating certain Postfix aliases).
        if allow_empty_local:
            return {
                "local_part": local,
                "ascii_local_part": local,
                "smtputf8": False,
            }
        raise EmailSyntaxError("There must be something before the @-sign.")

    # Length limit, counted in characters (RFC 5321 4.5.3.1.1). For
    # internationalized local parts the UTF-8 encoding may be longer;
    # the total address length is checked separately.
    if len(local) > LOCAL_PART_MAX_LENGTH:
        reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.")

    # Fast path: plain ASCII dot-atom (RFC 5322 3.2.3). Every dot-atom is
    # also a valid quoted string, so an originally-quoted value that
    # matches here is simply returned in its unquoted, normalized form
    # (RFC 5321 4.1.2 / RFC 5322 3.2.4). SMTPUTF8 is not needed.
    if DOT_ATOM_TEXT.match(local):
        return {
            "local_part": local,
            "ascii_local_part": local,
            "smtputf8": False,
        }

    # Which grammar accepted the value, if any: "dot-atom" or "quoted".
    form: Optional[str] = None
    needs_smtputf8 = False

    if DOT_ATOM_TEXT_INTL.match(local):
        # Internationalized dot-atom (RFC 6531 section 3.3), which may
        # not be permitted by the caller.
        if not allow_smtputf8:
            # Name the characters outside the plain ASCII set
            # (RFC 5322 3.2.3).
            offending = sorted({
                safe_character_display(ch)
                for ch in local
                if not ATEXT_RE.match(ch)
            })
            if offending:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(offending) + ".")

            # Although the check above should always find something, fall back to this just in case.
            raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")

        form = "dot-atom"
        needs_smtputf8 = True

    elif quoted_local_part:
        # Quoted local parts have no syntactic restrictions beyond the
        # permitted character set: @-signs, spaces and quotes are allowed
        # and dots may appear anywhere. (RFC 5321 4.1.2; the obsolete
        # characters of RFC 5322 are not allowed; RFC 6531 section 3.3
        # extends the range to UTF8 strings.)
        offending = sorted({
            safe_character_display(ch)
            for ch in local
            if not QTEXT_INTL.match(ch)
        })
        if offending:
            raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(offending) + ".")

        # Anything outside printable ASCII requires SMTPUTF8.
        non_ascii = sorted({
            safe_character_display(ch)
            for ch in local
            if ord(ch) < 32 or ord(ch) > 126
        })
        if non_ascii:
            needs_smtputf8 = True

            # International characters in the local part may not be permitted.
            if not allow_smtputf8:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(non_ascii) + ".")

        form = "quoted"

    if form is None:
        # Neither grammar matched, so find out why. (Quoted local parts
        # are all valid or handled above, so these checks concern the
        # dot-atom form only.)

        # Invalid characters (RFC 5322 3.2.3, plus RFC 6531 3.3).
        offending = sorted({
            safe_character_display(ch)
            for ch in local
            if not ATEXT_INTL_DOT_RE.match(ch)
        })
        if offending:
            raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(offending) + ".")

        # Dot placement errors imposed by the dot-atom rule (RFC 5322 3.2.3).
        check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)

        # All of the reasons should already have been checked, but just in case
        # we have a fallback message.
        raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")

    # The value matched the internationalized dot-atom form or was quoted:
    # make sure it is also a valid, safe and sensible Unicode string.
    # This may overlap with the ranges checked by the regexes above.
    check_unsafe_chars(local, allow_space=(form == "quoted"))

    # Try encoding to UTF-8. Failure is possible with some characters like
    # surrogate code points; those are checked above, but an unhandled
    # exception later must be avoided.
    try:
        local.encode("utf8")
    except ValueError as e:
        raise EmailSyntaxError("The email address contains an invalid character.") from e

    # A value that only passes as a quoted string is re-quoted with just
    # quotes and backslashes escaped. Per RFC 5321 4.1.2 all quoted forms
    # are equivalent and the minimally quoted one should be transmitted.
    if form == "quoted":
        local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"'

    return {
        "local_part": local,
        "ascii_local_part": None if needs_smtputf8 else local,
        "smtputf8": needs_smtputf8,
    }

368

369

def check_unsafe_chars(s: str, allow_space: bool = False) -> None:
    """Raise EmailSyntaxError if ``s`` contains unsafe or non-sensible Unicode characters.

    Letters, numbers, punctuation and symbols are always accepted.
    Combining marks are accepted except in first position. Space
    separators (category Zs, ASCII space included) are accepted only
    when ``allow_space`` is true. Everything else is rejected.
    """
    always_allowed = ("L", "N", "P", "S")
    unsafe = set()

    for position, ch in enumerate(s):
        category = unicodedata.category(ch)
        major = category[0]

        if major in always_allowed:
            continue

        if major == "M":
            # A combining character in first position would combine with
            # whatever precedes the string once it is concatenated with
            # other text, so only the leading position is unsafe.
            if position == 0:
                unsafe.add(ch)
            continue

        if category == "Zs" and allow_space:
            # Spaces are explicitly permitted in quoted-string local
            # parts; all Unicode spaces are allowed for consistency.
            continue

        # What remains is rejected: spaces when not allowed, the line and
        # paragraph separators (Zl, Zp), control/format/surrogate/private
        # use/unassigned code points (C), and any category a future
        # Unicode version might introduce.
        unsafe.add(ch)

    if not unsafe:
        return

    raise EmailSyntaxError("The email address contains unsafe characters: "
                           + ", ".join(safe_character_display(c) for c in sorted(unsafe)) + ".")

420

421

def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None:
    """Raise EmailSyntaxError when ``label`` breaks the dot-atom placement rules.

    ``start_descr`` and ``end_descr`` are message templates with one
    ``{}`` placeholder that receives the name of the offending character
    ("period" or "hyphen"). The hyphen rules are applied only when
    ``is_hostname`` is true.
    """
    # Period placement (RFC 5322 3.2.3). The trailing position is tested
    # before the leading one.
    if label.endswith("."):
        raise EmailSyntaxError(end_descr.format("period"))
    if label.startswith("."):
        raise EmailSyntaxError(start_descr.format("period"))
    if ".." in label:
        raise EmailSyntaxError("An email address cannot have two periods in a row.")

    if not is_hostname:
        return

    # Hyphen placement for hostnames (RFC 952).
    if label.endswith("-"):
        raise EmailSyntaxError(end_descr.format("hyphen"))
    if label.startswith("-"):
        raise EmailSyntaxError(start_descr.format("hyphen"))
    period_next_to_hyphen = ".-" in label or "-." in label
    if period_next_to_hyphen:
        raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")

439

440

class DomainNameValidationResult(TypedDict):
    """Dictionary shape returned by validate_email_domain_name."""

    # The IDNA ASCII-encoded form of the domain.
    ascii_domain: str
    # The internationalized (Unicode) form of the domain.
    domain: str

444

445

def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult:
    """Validates the syntax of the domain part of an email address.

    Parameters:
        domain: the text after the @-sign (not a bracketed domain literal).
        test_environment: when true, the special-use name "test" is
            permitted.
        globally_deliverable: when true, require a period in the domain, a
            plausible top-level domain, and reject special-use and
            reserved names.

    Returns a dict with ``ascii_domain`` (the IDNA ASCII form) and
    ``domain`` (the internationalized form produced by idna.decode).

    Raises EmailSyntaxError for any syntax, length or IDNA problem.
    """

    # Check for invalid characters.
    # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
    bad_chars = {
        safe_character_display(c)
        for c in domain
        if not ATEXT_HOSTNAME_INTL.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")

    # Check for unsafe characters.
    # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
    # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but
    # they may not be valid, safe, or sensible Unicode strings.
    check_unsafe_chars(domain)

    # Perform UTS-46 normalization, which includes casefolding, NFC normalization,
    # and converting all label separators (the period/full stop, fullwidth full stop,
    # ideographic full stop, and halfwidth ideographic full stop) to regular dots.
    # It will also raise an exception if there is an invalid character in the input.
    # Since several characters *are* normalized to a dot, this has to come before
    # checks related to dots, like check_dot_atom which comes next.
    original_domain = domain
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e

    # Check for invalid characters after Unicode normalization which are not caught
    # by uts46_remap.
    bad_chars = {
        safe_character_display(c)
        for c in domain
        if not ATEXT_HOSTNAME_INTL.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".")

    # The domain part is made up of dot-separated "labels." Each label must
    # have at least one character and cannot start or end with dashes, which
    # means there are some surprising restrictions on periods and dashes.
    # Check that before we do IDNA encoding because the IDNA library gives
    # unfriendly errors for these cases, but after UTS-46 normalization because
    # it can insert periods and hyphens (from fullwidth characters).
    # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3)
    check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)

    # Check for RFC 5890's invalid R-LDH labels, which are labels that start
    # with two characters other than "xn" and two dashes.
    for label in domain.split("."):
        if re.match(r"(?!xn)..--", label, re.I):
            raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")

    if DOT_ATOM_TEXT_HOSTNAME.match(domain):
        # This is a valid non-internationalized domain.
        ascii_domain = domain
    else:
        # If international characters are present in the domain name, convert
        # the domain to IDNA ASCII. The idna library is deliberately not used
        # for ASCII-only domains so that basic syntax checks do not rely on it.
        #
        # idna.encode also checks the domain name length after encoding but it
        # doesn't give a nice error, so idna.alabel is called per label.
        try:
            ascii_domain = ".".join(
                idna.alabel(label).decode("ascii")
                for label in domain.split(".")
            )
        except idna.IDNAError as e:
            # Some errors would have already been raised by idna.uts46_remap.
            raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e

        # Check the syntax of the encoded string. It should never fail.
        if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain):
            raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")

    # Check the length of the domain name in bytes.
    # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
    # The IDNA ASCII form is measured, on the assumption that the domain may
    # be transmitted without SMTPUTF8.
    if len(ascii_domain) > DOMAIN_MAX_LENGTH:
        if ascii_domain == original_domain:
            reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
            raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.")
        else:
            diff = len(ascii_domain) - DOMAIN_MAX_LENGTH
            s = "" if diff == 1 else "s"
            raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).")

    # Also check the label length limit.
    # (RFC 1035 2.3.1)
    for label in ascii_domain.split("."):
        if len(label) > DNS_LABEL_LENGTH_LIMIT:
            reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT)
            raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")

    if globally_deliverable:
        # All publicly deliverable addresses have domain names with at least
        # one period, at least for gTLDs created since 2013 (per the ICANN Board
        # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
        # The lack of a period is treated as a syntax error, except for
        # '@test' addresses in test environments.
        if "." not in ascii_domain and not (ascii_domain == "test" and test_environment):
            raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")

        # We also know that all TLDs currently end with a letter.
        if not DOMAIN_NAME_REGEX.search(ascii_domain):
            raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")

        # Check special-use and reserved domain names.
        # Some might fail DNS-based deliverability checks, but that
        # can be turned off, so we should fail them all sooner.
        # The import is local to the function (the list lives in the
        # package's __init__.py).
        from . import SPECIAL_USE_DOMAIN_NAMES
        for d in SPECIAL_USE_DOMAIN_NAMES:
            # "test" is tolerated in test environments.
            if d == "test" and test_environment:
                continue

            if ascii_domain == d or ascii_domain.endswith("." + d):
                raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.")

    # We may have been given an IDNA ASCII domain to begin with. Check
    # that the domain actually conforms to IDNA. It could look like IDNA
    # but not be actual IDNA. For ASCII-only domains, the conversion out
    # of IDNA just gives the same thing back.
    #
    # This gives us the canonical internationalized form of the domain,
    # which we return to the caller as a part of the normalized email
    # address.
    try:
        domain_i18n = idna.decode(ascii_domain.encode('ascii'))
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e

    # Check that this normalized domain name has not somehow become
    # an invalid domain name. The idna package probably guarantees that
    # already, but it doesn't hurt to re-apply some tests to be sure.
    # The checks are run on domain_i18n, the decoded form that is
    # returned to the caller: `domain` was already checked above, so
    # re-checking it here would verify nothing new.
    bad_chars = {
        safe_character_display(c)
        for c in domain_i18n
        if not ATEXT_HOSTNAME_INTL.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
    check_unsafe_chars(domain_i18n)

    # Check that it can be encoded back to IDNA ASCII.
    try:
        idna.encode(domain_i18n)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e

    # Return the IDNA ASCII-encoded form of the domain, which is how it
    # would be transmitted on the wire (except when used with SMTPUTF8
    # possibly), as well as the canonical Unicode form of the domain,
    # which is better for display purposes. This should also take care
    # of RFC 6532 section 3.1's suggestion to apply Unicode NFC
    # normalization to addresses.
    return {
        "ascii_domain": ascii_domain,
        "domain": domain_i18n,
    }

634

635

def validate_email_length(addrinfo: ValidatedEmail) -> None:
    """Raise EmailSyntaxError if any form of the address exceeds EMAIL_MAX_LENGTH bytes.

    Three strings are measured, in this order:

    1) The original input, since callers may keep using it.
    2) The normalized address, which callers are recommended to use.
    3) The local part joined to the IDNA ASCII domain, for email stacks
       without UTF-8 support. The ASCII local part is used when set,
       otherwise the normalized local part.

    In all cases the length is measured in UTF-8 bytes because the
    SMTPUTF8 extension validates the length in bytes.
    """
    original_form = addrinfo.original
    normalized_form = addrinfo.normalized
    idna_form = (addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain

    candidates = (
        (original_form, None),
        (normalized_form, "after normalization"),
        (idna_form, "when the part after the @-sign is converted to IDNA ASCII"),
    )

    for addr, reason in candidates:
        char_count = len(addr)
        byte_count = len(addr.encode("utf8"))
        diff = byte_count - EMAIL_MAX_LENGTH
        if diff <= 0:
            continue

        plural = "s" if diff > 1 else ""

        if reason is not None:
            # Normalization took place, so the number of characters of
            # the input that need to change is impossible to know; report
            # bytes instead.
            reason += f" ({diff} byte{plural} too many)"
        elif char_count == byte_count:
            # No normalization and no multi-byte characters: a simple
            # count of the characters over the limit.
            reason = get_length_reason(addr, limit=EMAIL_MAX_LENGTH)
        else:
            # No normalization but some multi-byte characters: the minimum
            # number of characters over the limit is the number of bytes
            # over the limit divided by the maximum number of bytes per
            # character.
            widest = max(len(ch.encode("utf8")) for ch in addr)
            fewest_chars = max(1, diff // widest)
            if fewest_chars == diff:
                reason = f"({diff} character{plural} too many)"
            else:
                reason = f"({fewest_chars}-{diff} character{plural} too many)"

        raise EmailSyntaxError(f"The email address is too long {reason}.")

697

698

class DomainLiteralValidationResult(TypedDict):
    """Dictionary shape returned by validate_email_domain_literal."""

    # The parsed IP address object.
    domain_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address]
    # The normalized domain literal, including the square brackets.
    domain: str

702

703

def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult:
    """Parse the contents of a bracketed domain literal and return a normalized form.

    ``domain_literal`` is the text inside the square brackets. Only bare
    IPv4 addresses and ``IPv6:``-tagged IPv6 addresses are accepted;
    everything else raises EmailSyntaxError.
    (RFC 5321 4.1.3 and RFC 5322 3.4.1.)
    """

    addr: Union[ipaddress.IPv4Address, ipaddress.IPv6Address]

    # IPv4 addresses carry no tag, so anything made only of digits and
    # periods is treated as an attempted IPv4 address.
    looks_like_ipv4 = re.match(r"^[0-9\.]+$", domain_literal)
    if looks_like_ipv4:
        try:
            addr = ipaddress.IPv4Address(domain_literal)
        except ValueError as e:
            raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") from e

        # Return the IPv4Address object and the bracketed string form of it.
        return {
            "domain_address": addr,
            "domain": "[" + str(addr) + "]",
        }

    # An "IPv6:" tag introduces an IPv6 address.
    ipv6_tag = "IPv6:"
    if domain_literal.startswith(ipv6_tag):
        try:
            addr = ipaddress.IPv6Address(domain_literal[len(ipv6_tag):])
        except ValueError as e:
            raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") from e

        # Return the IPv6Address object and a normalized literal built
        # from its compressed form.
        return {
            "domain_address": addr,
            "domain": "[IPv6:" + addr.compressed + "]",
        }

    # Nothing else is valid; the rest only selects the error message.

    if ":" not in domain_literal:
        raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")

    # The tag (the part before the colon) has character restrictions,
    # but since it must come from a registry of tags (in which only "IPv6" is defined),
    # there's no need to check the syntax of the tag. See RFC 5321 4.1.2.

    # Report characters outside the permitted ASCII set first. This only
    # changes which message is raised, since an error follows either way.
    offending = sorted({
        safe_character_display(ch)
        for ch in domain_literal
        if not DOMAIN_LITERAL_CHARS.match(ch)
    })
    if offending:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(offending) + ".")

    # There are no other domain literal tags.
    # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml
    raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/email_validator/syntax.py: 70%

273 statements