Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/email_validator/syntax.py: 84%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

292 statements  

1from .exceptions import EmailSyntaxError 

2from .types import ValidatedEmail 

3from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ 

4 DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_DOT_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ 

5 DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS 

6 

7import re 

8import unicodedata 

9import idna # implements IDNA 2008; Python's codec is only IDNA 2003 

10import ipaddress 

11from typing import Optional, Tuple, TypedDict, Union 

12 

13 

def split_email(email: str) -> Tuple[Optional[str], str, str, bool]:
    """Split an address string into its syntactic parts.

    Returns a tuple ``(display_name, local_part, domain_part, is_quoted_local_part)``:

    * ``display_name`` is None when the input has no ``display name <addr>``
      angle-bracket form; otherwise it is the unquoted/unescaped display
      name (the empty string if there were angle brackets but no name).
    * ``local_part`` is the unquoted, unescaped local part.
    * ``domain_part`` is everything after the first unquoted @-sign.
    * ``is_quoted_local_part`` is True if the local part was a quoted string.

    Raises EmailSyntaxError if no unquoted @-sign is found, if the display
    name or angle brackets are malformed, or if text follows a close quote.
    """
    # Typical email addresses have a single @-sign and no quote
    # characters, but the awkward "quoted string" local part form
    # (RFC 5321 4.1.2) allows @-signs and escaped quotes to appear
    # in the local part if the local part is quoted.

    # A `display name <addr>` format is also present in MIME messages
    # (RFC 5322 3.4) and this format is also often recognized in
    # mail UIs. It's not allowed in SMTP commands or in typical web
    # login forms, but parsing it has been requested, so it's done
    # here as a convenience. It's implemented in the spirit but not
    # the letter of RFC 5322 3.4 because MIME messages allow newlines
    # and comments as a part of the CFWS rule, but this is typically
    # not allowed in mail UIs (although comment syntax was requested
    # once too).
    #
    # Display names are either basic characters (the same basic characters
    # permitted in email addresses, but periods are not allowed and spaces
    # are allowed; see RFC 5322 Appendix A.1.2), or a quoted string with
    # the same rules as a quoted local part. (Multiple quoted strings might
    # be allowed? Unclear.) Optional space (RFC 5322 3.4 CFWS) and then the
    # email address follows in angle brackets.
    #
    # An initial quote is ambiguous between starting a display name or
    # a quoted local part --- fun.
    #
    # We assume the input string is already stripped of leading and
    # trailing CFWS.

    def split_string_at_unquoted_special(text: str, specials: Tuple[str, ...]) -> Tuple[str, str]:
        # Split the string at the first character in specials (an @-sign
        # or left angle bracket) that does not occur within quotes and
        # is not followed by a Unicode combining character.
        # If no special character is found, raise an error.
        inside_quote = False
        escaped = False

        # Index of the first unquoted special character; len(text) means
        # that none was found.
        split_at = len(text)

        for i, c in enumerate(text):
            # < plus U+0338 (Combining Long Solidus Overlay) normalizes to
            # U+226E (Not Less-Than), and it would be confusing to treat
            # the < as the start of "<email>" syntax in that case. Likewise,
            # if anything combines with an @ or ", we should probably not
            # treat it as a special character. Such characters are kept as
            # ordinary text and do not change the quoting state.
            if unicodedata.normalize("NFC", text[i:])[0] != c:
                continue

            if inside_quote:
                if c == '\\' and not escaped:
                    escaped = True
                elif c == '"' and not escaped:
                    # The only way to exit the quote is an unescaped quote.
                    inside_quote = False
                    escaped = False
                else:
                    escaped = False
            elif c == '"':
                inside_quote = True
            elif c in specials:
                # When unquoted, stop before a special character.
                split_at = i
                break

        # No special symbol found. The special symbols always
        # include an at-sign, so this always indicates a missing
        # at-sign. The other symbol is optional.
        if split_at == len(text):
            # The full-width at-sign (U+FF20) might occur in CJK contexts.
            # We can't accept it because we only accept addresses
            # that are actually valid. But if this is common we
            # may want to consider accepting and normalizing full-
            # width characters for the other special symbols (and
            # full-width dot is already accepted in internationalized
            # domains) with a new option.
            # See https://news.ycombinator.com/item?id=42235268.
            # The character is written as an escape so that it cannot be
            # confused with (or silently converted to) the ASCII at-sign.
            if "\uFF20" in text:
                raise EmailSyntaxError("The email address has the \"full-width\" at-sign (\uFF20) character instead of a regular at-sign.")

            # Check another near-homoglyph (U+FE6B, small commercial at)
            # for good measure because homoglyphs in place of required
            # characters could be very confusing. We may want to consider
            # checking for homoglyphs anywhere we look for a special symbol.
            if "\uFE6B" in text:
                raise EmailSyntaxError('The email address has the "small commercial at" character instead of a regular at-sign.')

            raise EmailSyntaxError("An email address must have an @-sign.")

        # The left part is everything before the special character and
        # the right part is whatever is left (starting with the special).
        return text[:split_at], text[split_at:]

    def unquote_quoted_string(text: str) -> Tuple[str, bool]:
        # Remove surrounding quotes and unescape escaped backslashes
        # and quotes. Escapes are parsed liberally. I think only
        # backslashes and quotes can be escaped but we'll allow anything
        # to be. Returns the unquoted value and whether it was quoted.
        quoted = False
        escaped = False
        value = ""
        for i, c in enumerate(text):
            if quoted:
                if escaped:
                    value += c
                    escaped = False
                elif c == '\\':
                    escaped = True
                elif c == '"':
                    # The closing quote must be the last character.
                    if i != len(text) - 1:
                        raise EmailSyntaxError("Extra character(s) found after close quote: "
                                               + ", ".join(safe_character_display(c) for c in text[i + 1:]))
                    break
                else:
                    value += c
            elif i == 0 and c == '"':
                quoted = True
            else:
                value += c

        return value, quoted

    # Split the string at the first unquoted @-sign or left angle bracket.
    left_part, right_part = split_string_at_unquoted_special(email, ("@", "<"))

    # If the right part starts with an angle bracket,
    # then the left part is a display name and the rest
    # of the right part up to the final right angle bracket
    # is the email address.
    if right_part.startswith("<"):
        # Remove space between the display name and angle bracket.
        left_part = left_part.rstrip()

        # Unquote and unescape the display name.
        display_name, display_name_quoted = unquote_quoted_string(left_part)

        # Check that only basic characters are present in a
        # non-quoted display name.
        if not display_name_quoted:
            bad_chars = {
                safe_character_display(c)
                for c in display_name
                if (not ATEXT_RE.match(c) and c != ' ') or c == '.'
            }
            if bad_chars:
                raise EmailSyntaxError("The display name contains invalid characters when not quoted: " + ", ".join(sorted(bad_chars)) + ".")

        # Check for other unsafe characters.
        check_unsafe_chars(display_name, allow_space=True)

        # Check that the right part ends with an angle bracket
        # but allow spaces after it, I guess.
        if ">" not in right_part:
            raise EmailSyntaxError("An open angle bracket at the start of the email address has to be followed by a close angle bracket at the end.")
        right_part = right_part.rstrip(" ")
        if right_part[-1] != ">":
            raise EmailSyntaxError("There can't be anything after the email address.")

        # Remove exactly one initial and one trailing angle bracket.
        # Stripping every trailing ">" would silently accept input such
        # as "<user@example.com>>"; any extra bracket is left in the
        # domain part so that domain validation can reject it.
        addr_spec = right_part[1:-1]

        # Split the email address at the first unquoted @-sign.
        local_part, domain_part = split_string_at_unquoted_special(addr_spec, ("@",))

    # Otherwise there is no display name. The left part is the local
    # part and the right part is the domain.
    else:
        display_name = None
        local_part, domain_part = left_part, right_part

    # The right-hand piece still begins with the @-sign it was split at.
    if domain_part.startswith("@"):
        domain_part = domain_part[1:]

    # Unquote the local part if it is quoted.
    local_part, is_quoted_local_part = unquote_quoted_string(local_part)

    return display_name, local_part, domain_part, is_quoted_local_part

199 

200 

def get_length_reason(addr: str, limit: int) -> str:
    """Describe, in parentheses, by how many characters `addr` exceeds `limit`."""
    excess = len(addr) - limit
    # Singular for an excess of one (or less), plural otherwise.
    noun = "character" if excess <= 1 else "characters"
    return f"({excess} {noun} too many)"

206 

207 

def safe_character_display(c: str) -> str:
    """Return a representation of the single character `c` that is safe to show in an error message.

    Letters, numbers, punctuation, and symbols are returned quoted (via
    repr, except the backslash, which is quoted by hand). Any other
    character is returned as its Unicode name or, if it has no name, as
    a ``U+XXXX`` hex string (``U+XXXXXXXX`` above the Basic Multilingual
    Plane).
    """
    # Return safely displayable characters in quotes.
    if c == '\\':
        return f"\"{c}\""  # can't use repr because it escapes it
    if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
        return repr(c)

    # Construct a hex string in case the unicode name doesn't exist.
    # Code points up to and including U+FFFF fit in four hex digits.
    codepoint = ord(c)
    if codepoint <= 0xFFFF:
        h = f"U+{codepoint:04X}"
    else:
        h = f"U+{codepoint:08X}"

    # Return the character name or, if it has no name, the hex string.
    return unicodedata.name(c, h)

223 

224 

class LocalPartValidationResult(TypedDict):
    # Shape of the mapping returned by validate_email_local_part.

    # The validated local part; re-quoted and re-escaped when it is only
    # valid in quoted-string form.
    local_part: str
    # Same value as local_part when SMTPUTF8 is not required, else None.
    ascii_local_part: Optional[str]
    # True when the local part is only valid as an internationalized
    # local part or contains characters outside printable ASCII.
    smtputf8: bool

229 

230 

def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False,
                              quoted_local_part: bool = False) -> LocalPartValidationResult:
    """Validates the syntax of the local part of an email address.

    `local` is the already-unquoted local part and `quoted_local_part`
    says whether it was originally written as a quoted string. Returns
    the (possibly re-quoted) local part, its ASCII form if it has one,
    and whether SMTPUTF8 is required. Raises EmailSyntaxError otherwise.
    """

    def ascii_only_result(value: str) -> LocalPartValidationResult:
        # Result for a local part that needs no SMTPUTF8 support.
        return {
            "local_part": value,
            "ascii_local_part": value,
            "smtputf8": False,
        }

    def rejected_by(pattern) -> list:
        # Sorted display forms of the characters of `local` that the
        # given single-character pattern does not match.
        return sorted({
            safe_character_display(ch)
            for ch in local
            if not pattern.match(ch)
        })

    if not local:
        # An empty local part is only acceptable when the caller opts in,
        # which is useful for validating certain Postfix aliases.
        if allow_empty_local:
            return ascii_only_result(local)
        raise EmailSyntaxError("There must be something before the @-sign.")

    # Length limit, counted in characters (RFC 5321 4.5.3.1.1). For ASCII
    # this equals the octet count. For internationalized local parts the
    # UTF-8 form may be longer, but the total address length is checked
    # elsewhere instead.
    if len(local) > LOCAL_PART_MAX_LENGTH:
        reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.")

    # Fast path: the plain ASCII dot-atom form (RFC 5322 3.2.3), which most
    # addresses match. Anything matching it is also valid as a quoted
    # string (RFC 5321 4.1.2 / RFC 5322 3.2.4), so an originally quoted
    # local part is simply returned in its normalized, unquoted form.
    if DOT_ATOM_TEXT.match(local):
        return ascii_only_result(local)

    # Which syntax form the local part satisfied, if any.
    form: Optional[str] = None
    needs_smtputf8 = False

    if DOT_ATOM_TEXT_INTL.match(local):
        # Same dot-atom pattern with the extended character set of
        # RFC 6531 section 3.3 -- acceptable only if SMTPUTF8 is allowed.
        if not allow_smtputf8:
            # Report which characters fall outside the plain ASCII
            # atext set (RFC 5322 3.2.3).
            offenders = rejected_by(ATEXT_RE)
            if offenders:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(offenders) + ".")

            # The check above should always find something; this is
            # only a fallback.
            raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")

        form = "dot-atom"
        needs_smtputf8 = True

    elif quoted_local_part:
        # Quoted local parts have no dot-atom restrictions: @-signs,
        # spaces, quotes, and freely placed dots are all permitted.
        # Only the characters themselves are checked (RFC 5321 4.1.2;
        # the obsolete extras of RFC 5322 are *not* allowed; RFC 6531
        # section 3.3 extends the range to UTF-8 strings).
        offenders = rejected_by(QTEXT_INTL)
        if offenders:
            raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(offenders) + ".")

        # Anything outside printable ASCII requires SMTPUTF8.
        non_ascii = sorted({
            safe_character_display(ch)
            for ch in local
            if ord(ch) < 32 or ord(ch) > 126
        })
        if non_ascii:
            needs_smtputf8 = True
            if not allow_smtputf8:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(non_ascii) + ".")

        form = "quoted"

    if form is not None:
        # Extra screening for Unicode strings: the email specs may permit
        # characters that are still not valid, safe, or sensible. Some of
        # this overlaps with what the INTL patterns already check.
        check_unsafe_chars(local, allow_space=(form == "quoted"))

        # Make sure the string can be UTF-8 encoded so that nothing blows
        # up later (e.g. surrogates, although those are rejected above).
        try:
            local.encode("utf8")
        except ValueError as e:
            raise EmailSyntaxError("The email address contains an invalid character.") from e

        # A local part that is valid only as a quoted string is re-quoted
        # with the minimum escaping (quotes and backslashes only), per
        # RFC 5321 4.1.2: "all quoted forms MUST be treated as equivalent,
        # and the sending system SHOULD transmit the form that uses the
        # minimum quoting possible."
        if form == "quoted":
            local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"'

        return {
            "local_part": local,
            "ascii_local_part": None if needs_smtputf8 else local,
            "smtputf8": needs_smtputf8,
        }

    # Not valid in any form. Work out the most helpful explanation.
    # (Quoted local parts never reach this point.)

    # Invalid characters (RFC 5322 3.2.3, plus RFC 6531 3.3).
    offenders = rejected_by(ATEXT_INTL_DOT_RE)
    if offenders:
        raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(offenders) + ".")

    # Dot placement errors imposed by the dot-atom rule (RFC 5322 3.2.3).
    check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)

    # Every reason should have been covered already; generic fallback.
    raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")

390 

391 

def check_unsafe_chars(s: str, allow_space: bool = False) -> None:
    """Raise EmailSyntaxError if `s` contains unsafe or non-sensible Unicode characters.

    Unicode spaces (category Zs) are tolerated only when `allow_space`
    is true. Returns None when nothing objectionable is found.
    """
    unsafe = set()
    for position, char in enumerate(s):
        category = unicodedata.category(char)
        major_class = category[0]

        # Letters, numbers, punctuation, and symbols are always fine.
        if major_class in ("L", "N", "P", "S"):
            continue

        # A combining mark is only a problem in first position, where it
        # would combine with whatever text precedes the string if the
        # string is concatenated with something else.
        if major_class == "M":
            if position == 0:
                unsafe.add(char)
            continue

        # Non-ASCII spaces are not specifically disallowed in
        # internationalized addresses, but they violate the spirit of the
        # rule that unquoted addresses contain no spaces (the ASCII space
        # itself is excluded by the atom regex). Quoted-string local parts
        # explicitly permit spaces, and for consistency every Unicode
        # space is allowed there.
        if category == "Zs" and allow_space:
            continue

        # Everything else is rejected:
        #  * Zs when spaces are not allowed;
        #  * line and paragraph separators (Zl, Zp), which go against the
        #    spirit of "no line breaks in an address";
        #  * control, format, surrogate, private use, and unassigned code
        #    points (C*), which can alter text rendering (bidirectional
        #    controls can make an address display as a different one) or
        #    make no sense in a publicly deliverable address;
        #  * any category a future Unicode version might introduce.
        unsafe.add(char)

    if not unsafe:
        return

    described = ", ".join(safe_character_display(c) for c in sorted(unsafe))
    raise EmailSyntaxError("The email address contains unsafe characters: "
                           + described + ".")

442 

443 

def check_dot_atom(label: str, start_descr: str, end_descr: str, is_hostname: bool) -> None:
    """Raise EmailSyntaxError for misplaced periods (and, for hostnames, hyphens).

    `start_descr` and `end_descr` are message templates with one `{}`
    placeholder that receives the name of the offending character.
    """

    def reject_at_edges(symbol: str, symbol_name: str) -> None:
        # The trailing position is checked before the leading one.
        if label.endswith(symbol):
            raise EmailSyntaxError(end_descr.format(symbol_name))
        if label.startswith(symbol):
            raise EmailSyntaxError(start_descr.format(symbol_name))

    # RFC 5322 3.2.3
    reject_at_edges(".", "period")
    if ".." in label:
        raise EmailSyntaxError("An email address cannot have two periods in a row.")

    if not is_hostname:
        return

    # RFC 952
    reject_at_edges("-", "hyphen")
    if any(pair in label for pair in (".-", "-.")):
        raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")

461 

462 

def uts46_valid_char(char: str) -> bool:
    """Approximate whether the single character `char` is accepted by UTS-46 remapping.

    The rules were found by exhaustively searching for characters rejected by
        for c in (chr(i) for i in range(0x110000)):
            idna.uts46_remap(c, std3_rules=False, transitional=False)
    and are described by the original author as "pretty close".
    """
    codepoint = ord(char)

    # U+0080 through U+009F are rejected outright.
    if 0x80 <= codepoint <= 0x9f:
        return False

    # Characters that are permitted even though they would fail one of
    # the tests further down.
    in_permitted_punctuation = (
        0x2010 <= codepoint <= 0x2060
        and not (0x2024 <= codepoint <= 0x2026 or 0x2028 <= codepoint <= 0x202E)
    )
    if (in_permitted_punctuation
            or codepoint in (0x00AD, 0x2064, 0xFF0E)
            or 0x200B <= codepoint <= 0x200D
            or 0x1BCA0 <= codepoint <= 0x1BCA3):
        return True

    # There are a bunch of Zs characters including regular space that are
    # allowed by UTS46 but are not allowed in domain names anyway. There
    # are also some Cn (unassigned) characters that the idna package
    # doesn't reject but we can, I think.
    if unicodedata.category(char) in ("Cf", "Cn", "Co", "Cs", "Zs", "Zl", "Zp"):
        return False

    # Reject characters that decompose into a sequence containing a dot.
    decomposition_parts = unicodedata.decomposition(char).split(" ")
    return "002E" not in decomposition_parts

491 

492 

class DomainNameValidationResult(TypedDict):
    # Shape of the mapping returned by validate_email_domain_name.

    # The IDNA ASCII-encoded form of the domain.
    ascii_domain: str
    # The Unicode form of the domain, obtained by IDNA-decoding ascii_domain.
    domain: str

496 

497 

def validate_email_domain_name(domain: str, test_environment: bool = False, globally_deliverable: bool = True) -> DomainNameValidationResult:
    """Validates the syntax of the domain part of an email address.

    The domain is checked for invalid and unsafe characters, normalized
    with UTS-46, checked for dot/hyphen placement, converted to IDNA
    ASCII, checked against length limits and (optionally) rules for
    globally deliverable names, checked against special-use names, and
    finally decoded back to its internationalized form.

    `test_environment` permits the "test" name where it would otherwise
    be rejected. `globally_deliverable` enables the checks that the
    domain has a period and ends in what looks like a valid TLD.

    Returns a mapping with the IDNA ASCII form ("ascii_domain") and the
    internationalized form ("domain"). Raises EmailSyntaxError on any
    failure.
    """

    # Check for invalid characters.
    # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
    bad_chars = {
        safe_character_display(c)
        for c in domain
        if not ATEXT_HOSTNAME_INTL.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")

    # Check for unsafe characters.
    # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
    # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but
    # they may not be valid, safe, or sensible Unicode strings.
    check_unsafe_chars(domain)

    # Reject characters that would be rejected by UTS-46 normalization next but
    # with an error message under our control.
    bad_chars = {
        safe_character_display(c) for c in domain
        if not uts46_valid_char(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")

    # Perform UTS-46 normalization, which includes casefolding, NFC normalization,
    # and converting all label separators (the period/full stop, fullwidth full stop,
    # ideographic full stop, and halfwidth ideographic full stop) to regular dots.
    # It will also raise an exception if there is an invalid character in the input,
    # such as "⒈" which is invalid because it would expand to include a dot and
    # U+1FEF which normalizes to a backtick, which is not an allowed hostname character.
    # Since several characters *are* normalized to a dot, this has to come before
    # checks related to dots, like check_dot_atom which comes next.
    # The un-normalized input is kept so that the length error below can tell
    # whether the domain was changed by normalization/encoding.
    original_domain = domain
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e

    # Check for invalid characters after Unicode normalization which are not caught
    # by uts46_remap (see tests for examples).
    bad_chars = {
        safe_character_display(c)
        for c in domain
        if not ATEXT_HOSTNAME_INTL.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters after Unicode normalization: " + ", ".join(sorted(bad_chars)) + ".")

    # The domain part is made up of dot-separated "labels." Each label must
    # have at least one character and cannot start or end with dashes, which
    # means there are some surprising restrictions on periods and dashes.
    # Check that before we do IDNA encoding because the IDNA library gives
    # unfriendly errors for these cases, but after UTS-46 normalization because
    # it can insert periods and hyphens (from fullwidth characters).
    # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3)
    check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)

    # Check for RFC 5890's invalid R-LDH labels, which are labels that start
    # with two characters other than "xn" and two dashes.
    for label in domain.split("."):
        if re.match(r"(?!xn)..--", label, re.I):
            raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")

    if DOT_ATOM_TEXT_HOSTNAME.match(domain):
        # This is a valid non-internationalized domain.
        ascii_domain = domain
    else:
        # If international characters are present in the domain name, convert
        # the domain to IDNA ASCII. If internationalized characters are present,
        # the MTA must either support SMTPUTF8 or the mail client must convert the
        # domain name to IDNA before submission.
        #
        # For ASCII-only domains, the transformation does nothing and is safe to
        # apply. However, to ensure we don't rely on the idna library for basic
        # syntax checks, we don't use it if it's not needed.
        #
        # idna.encode also checks the domain name length after encoding but it
        # doesn't give a nice error, so we call the underlying idna.alabel method
        # directly, one label at a time. idna.alabel checks label length and
        # doesn't give great messages, but we can't easily go to lower level methods.
        try:
            ascii_domain = ".".join(
                idna.alabel(label).decode("ascii")
                for label in domain.split(".")
            )
        except idna.IDNAError as e:
            # Some errors would have already been raised by idna.uts46_remap.
            raise EmailSyntaxError(f"The part after the @-sign is invalid ({e}).") from e

        # Check the syntax of the IDNA-encoded string assembled above.
        # It should never fail.
        if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain):
            raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")

    # Check the length of the domain name in bytes.
    # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
    # We're checking the number of bytes ("octets") here, which can be much
    # higher than the number of characters in internationalized domains,
    # on the assumption that the domain may be transmitted without SMTPUTF8
    # as IDNA ASCII.
    # NOTE(review): an earlier version of this comment said this exception is
    # never reached for internationalized domains because idna.encode performs
    # the same check, but the encoding above goes through idna.alabel per
    # label -- confirm whether the total length is still checked there.
    if len(ascii_domain) > DOMAIN_MAX_LENGTH:
        if ascii_domain == original_domain:
            # The input was not changed, so report the excess in characters.
            reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
            raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.")
        else:
            # The input was changed by normalization and/or IDNA encoding,
            # so report the excess in bytes of the encoded form.
            diff = len(ascii_domain) - DOMAIN_MAX_LENGTH
            s = "" if diff == 1 else "s"
            raise EmailSyntaxError(f"The email address is too long after the @-sign ({diff} byte{s} too many after IDNA encoding).")

    # Also check the label length limit.
    # (RFC 1035 2.3.1)
    for label in ascii_domain.split("."):
        if len(label) > DNS_LABEL_LENGTH_LIMIT:
            reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT)
            raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")

    if globally_deliverable:
        # All publicly deliverable addresses have domain names with at least
        # one period, at least for gTLDs created since 2013 (per the ICANN Board
        # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
        # We'll consider the lack of a period a syntax error
        # since that will match people's sense of what an email address looks
        # like. We'll skip this in test environments to allow '@test' email
        # addresses.
        if "." not in ascii_domain and not (ascii_domain == "test" and test_environment):
            raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")

        # We also know that all TLDs currently end with a letter.
        if not DOMAIN_NAME_REGEX.search(ascii_domain):
            raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")

    # Check special-use and reserved domain names.
    # Some might fail DNS-based deliverability checks, but that
    # can be turned off, so we should fail them all sooner.
    # See the references in __init__.py.
    # NOTE(review): imported inside the function rather than at the top of
    # the module, presumably to avoid a circular import with the package's
    # __init__ -- confirm before moving it.
    from . import SPECIAL_USE_DOMAIN_NAMES
    for d in SPECIAL_USE_DOMAIN_NAMES:
        # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
        if d == "test" and test_environment:
            continue

        # Match the name itself or any subdomain of it.
        if ascii_domain == d or ascii_domain.endswith("." + d):
            raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.")

    # We may have been given an IDNA ASCII domain to begin with. Check
    # that the domain actually conforms to IDNA. It could look like IDNA
    # but not be actual IDNA. For ASCII-only domains, the conversion out
    # of IDNA just gives the same thing back.
    #
    # This gives us the canonical internationalized form of the domain,
    # which we return to the caller as a part of the normalized email
    # address.
    try:
        domain_i18n = idna.decode(ascii_domain.encode('ascii'))
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e

    # Check that this normalized domain name has not somehow become
    # an invalid domain name. All of the checks before this point
    # using the idna package probably guarantee that we now have
    # a valid international domain name in most respects. But it
    # doesn't hurt to re-apply some tests to be sure. See the similar
    # tests above.

    # Check for invalid and unsafe characters. We have no test
    # case for this.
    bad_chars = {
        safe_character_display(c)
        for c in domain_i18n
        if not ATEXT_HOSTNAME_INTL.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
    check_unsafe_chars(domain_i18n)

    # Check that it can be encoded back to IDNA ASCII. We have no test
    # case for this.
    try:
        idna.encode(domain_i18n)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign became invalid after normalizing to international characters ({e}).") from e

    # Return the IDNA ASCII-encoded form of the domain, which is how it
    # would be transmitted on the wire (except when used with SMTPUTF8
    # possibly), as well as the canonical Unicode form of the domain,
    # which is better for display purposes. This should also take care
    # of RFC 6532 section 3.1's suggestion to apply Unicode NFC
    # normalization to addresses.
    return {
        "ascii_domain": ascii_domain,
        "domain": domain_i18n,
    }

695 

696 

def validate_email_length(addrinfo: ValidatedEmail) -> None:
    """Raise EmailSyntaxError if any form of the address exceeds EMAIL_MAX_LENGTH.

    Three strings are measured, in this order, and the first one that is
    too long aborts validation:

    1) ``addrinfo.original`` -- the input as given. Callers may keep using
       it even though the normalized form is recommended, so it must be
       valid on its own.
    2) ``addrinfo.normalized`` -- the form callers are recommended to use.
    3) The local part (``ascii_local_part`` if set, otherwise
       ``local_part``) joined to ``ascii_domain``. ``ascii_email`` is only
       set when the local part is ASCII, but a caller could conceivably
       combine an internationalized local part with the ASCII domain, so
       that combination is measured too.

    Lengths are always measured in UTF-8 bytes because the SMTPUTF8
    extension to SMTP validates the length in bytes.
    """
    idna_ascii_form = (addrinfo.ascii_local_part or addrinfo.local_part or "") + "@" + addrinfo.ascii_domain
    candidates = (
        (addrinfo.original, None),
        (addrinfo.normalized, "after normalization"),
        (idna_ascii_form, "when the part after the @-sign is converted to IDNA ASCII"),
    )

    for candidate, context in candidates:
        char_count = len(candidate)
        byte_count = len(candidate.encode("utf8"))
        excess = byte_count - EMAIL_MAX_LENGTH
        if excess <= 0:
            # This form fits; move on to the next one.
            continue

        plural = "s" if excess > 1 else ""

        if context is not None:
            # The string was produced by normalization, so how many
            # characters of the *input* would have to change is
            # impossible to know. Report the overage in bytes.
            detail = context + f" ({excess} byte{plural} too many)"
        elif char_count == byte_count:
            # Untouched input in which every character is one byte:
            # a plain character count is exact.
            detail = get_length_reason(candidate, limit=EMAIL_MAX_LENGTH)
        else:
            # Untouched input containing multi-byte characters. The
            # lower bound on the number of characters over the limit
            # is the byte overage divided by the widest character's
            # byte length (never reported as less than one); the upper
            # bound is the byte overage itself.
            widest_char = max(len(ch.encode("utf8")) for ch in candidate)
            fewest_chars = max(1, excess // widest_char)
            if fewest_chars == excess:
                detail = f"({excess} character{plural} too many)"
            else:
                detail = f"({fewest_chars}-{excess} character{plural} too many)"

        raise EmailSyntaxError(f"The email address is too long {detail}.")

758 

759 

class DomainLiteralValidationResult(TypedDict):
    """Return shape of validate_email_domain_literal."""

    # The parsed address object for the literal (IPv4 or IPv6).
    domain_address: Union[ipaddress.IPv4Address, ipaddress.IPv6Address]

    # The normalized domain literal text, including the square brackets.
    domain: str

763 

764 

def validate_email_domain_literal(domain_literal: str) -> DomainLiteralValidationResult:
    """Parse the text of a domain literal and return its normalized form.

    This is the obscure domain-literal syntax of RFC 5321 4.1.3 and
    RFC 5322 3.4.1. ``domain_literal`` is the text *without* the
    surrounding square brackets; the returned ``domain`` has them added
    back.

    Returns a dict with:
      * ``domain_address`` -- the parsed ``ipaddress.IPv4Address`` or
        ``ipaddress.IPv6Address`` object.
      * ``domain`` -- ``"[<ipv4>]"`` or ``"[IPv6:<compressed ipv6>]"``.

    Raises EmailSyntaxError if the text is not a valid IPv4 address, is
    not a valid ``IPv6:``-tagged address, has no tag, contains characters
    not permitted in a domain literal, or uses any other tag.
    """

    addr: Union[ipaddress.IPv4Address, ipaddress.IPv6Address]

    # Try to parse the domain literal as an IPv4 address.
    # There is no tag for IPv4 addresses, so we can never
    # be sure if the user intends an IPv4 address.
    if re.match(r"^[0-9\.]+$", domain_literal):
        try:
            addr = ipaddress.IPv4Address(domain_literal)
        except ValueError as e:
            raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") from e

        # Return the IPv4Address object and the address re-bracketed.
        return {
            "domain_address": addr,
            "domain": f"[{addr}]",
        }

    # If it begins with "IPv6:" it's an IPv6 address.
    if domain_literal.startswith("IPv6:"):
        ipv6_text = domain_literal[5:]

        # Since Python 3.9, ipaddress.IPv6Address accepts a "%scope_id"
        # suffix whose content is almost unrestricted. The IPv6-addr
        # grammar of RFC 5321 4.1.3 has no zone identifier, and letting
        # one through would copy unchecked text into the normalized
        # domain, so reject it before parsing.
        if "%" in ipv6_text:
            raise EmailSyntaxError("The IPv6 address in brackets after the @-sign is not valid (a scope/zone identifier is not permitted).")

        try:
            addr = ipaddress.IPv6Address(ipv6_text)
        except ValueError as e:
            raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") from e

        # Return the IPv6Address object and construct a normalized
        # domain literal.
        return {
            "domain_address": addr,
            "domain": f"[IPv6:{addr.compressed}]",
        }

    # Nothing else is valid.

    if ":" not in domain_literal:
        raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")

    # The tag (the part before the colon) has character restrictions,
    # but since it must come from a registry of tags (in which only "IPv6" is defined),
    # there's no need to check the syntax of the tag. See RFC 5321 4.1.2.

    # Check for permitted ASCII characters. This only affects which
    # error message is raised, since an exception follows either way.
    bad_chars = {
        safe_character_display(c)
        for c in domain_literal
        if not DOMAIN_LITERAL_CHARS.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".")

    # There are no other domain literal tags.
    # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml
    raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")