Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/email_validator/syntax.py: 55%

175 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-07 06:32 +0000

1from .exceptions_types import EmailSyntaxError 

2from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \ 

3 DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \ 

4 DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS 

5 

6import re 

7import unicodedata 

8import idna # implements IDNA 2008; Python's codec is only IDNA 2003 

9import ipaddress 

10from typing import Optional 

11 

12 

def get_length_reason(addr, utf8=False, limit=EMAIL_MAX_LENGTH):
    """Describe, in parentheses, by how many characters ``addr`` exceeds ``limit``.

    Args:
        addr: The string whose length is being reported.
        utf8: When true, the count is qualified with "at least ".
        limit: The maximum permitted length that ``addr`` is compared against.

    Returns:
        A fragment such as ``"(3 characters too many)"`` meant to be embedded
        in a longer error message.
    """
    excess = len(addr) - limit
    qualifier = "" if not utf8 else "at least "
    # Only pluralize when more than one character is over the limit.
    noun = "characters" if excess > 1 else "character"
    return "(" + qualifier + str(excess) + " " + noun + " too many)"

19 

20 

def safe_character_display(c):
    """Return a rendering of the single character ``c`` that is safe to show in a message.

    Args:
        c: A one-character string.

    Returns:
        * a backslash is returned verbatim inside double quotes;
        * letters, numbers, punctuation, and symbols (Unicode major
          categories L, N, P, S) are returned as their ``repr()``;
        * anything else is returned as its Unicode character name, or, when
          it has no name, as ``U+XXXX`` (code points up to and including
          U+FFFF) or ``U+XXXXXXXX`` (code points above U+FFFF).
    """
    # repr() would escape the backslash, so quote it by hand.
    if c == '\\':
        return f"\"{c}\""

    if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
        return repr(c)

    # Construct a hex string in case the Unicode name doesn't exist.
    # The four-digit form covers the whole Basic Multilingual Plane, which
    # ends at U+FFFF inclusive (the previous `<` comparison pushed U+FFFF
    # itself into the eight-digit form).
    codepoint = ord(c)
    width = 4 if codepoint <= 0xFFFF else 8
    fallback = f"U+{codepoint:0{width}X}"

    # Return the character name or, if it has no name, the hex string.
    return unicodedata.name(c, fallback)

36 

37 

def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False,
                              quoted_local_part: bool = False):
    """Check the syntax of an address's local part and return its normalized form.

    Args:
        local: The part of the address before the @-sign.
        allow_smtputf8: Whether characters outside of the plain ASCII set
            are acceptable in the local part.
        allow_empty_local: Whether an empty local part is acceptable.
        quoted_local_part: Whether the local part was originally written as
            a quoted string, which relaxes the permitted characters.

    Returns:
        A dict with the keys ``"local_part"`` (normalized, re-quoted when only
        the quoted form is valid), ``"ascii_local_part"`` (the same string, or
        ``None`` when SMTPUTF8 is required), and ``"smtputf8"`` (bool).

    Raises:
        EmailSyntaxError: If the local part is not syntactically acceptable.
    """

    if len(local) == 0:
        # An empty local part is only acceptable when the caller opts in,
        # which is useful for validating certain Postfix aliases.
        if allow_empty_local:
            return {
                "local_part": local,
                "ascii_local_part": local,
                "smtputf8": False,
            }
        raise EmailSyntaxError("There must be something before the @-sign.")

    # Length limit on the local part (RFC 5321 4.5.3.1.1). Characters are
    # counted here, which equals octets for ASCII-only local parts. For
    # internationalized ones the UTF-8 encoding may be longer, but the total
    # address length is checked instead.
    if len(local) > LOCAL_PART_MAX_LENGTH:
        too_long_by = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long before the @-sign {too_long_by}.")

    # Fast path: the plain ASCII dot-atom form (RFC 5322 3.2.3), which most
    # addresses match. Anything matching dot-atom is also valid as a quoted
    # string (RFC 5321 4.1.2 / RFC 5322 3.2.4), so an originally quoted local
    # part that matches is simply returned in unquoted form. It consists only
    # of permitted ASCII characters, so no further normalization is needed
    # and SMTPUTF8 is not required.
    if DOT_ATOM_TEXT.match(local):
        return {
            "local_part": local,
            "ascii_local_part": local,
            "smtputf8": False,
        }

    # Which syntax, if any, the local part is accepted under.
    form: Optional[str] = None
    needs_smtputf8 = False

    if DOT_ATOM_TEXT_INTL.match(local):
        # Same dot-atom pattern with the extended character set of
        # internationalized addresses (RFC 6531 section 3.3).
        if not allow_smtputf8:
            # Report which characters fall outside of the
            # non-internationalized set (RFC 5322 3.2.3).
            offenders = {
                safe_character_display(ch)
                for ch in local
                if not ATEXT_RE.match(ch)
            }
            if offenders:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(offenders)) + ".")

            # The scan above should always find something; this is a fallback.
            raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")

        form = "dot-atom"
        needs_smtputf8 = True

    elif quoted_local_part:
        # Quoted local parts have no structural restrictions (dots, spaces,
        # @-signs, and quotes may appear anywhere), only a permitted
        # character set. RFC 5321 4.1.2 applies; the additional *obsolete*
        # characters of RFC 5322 are not allowed; RFC 6531 section 3.3
        # extends the range to UTF-8 strings.
        offenders = {
            safe_character_display(ch)
            for ch in local
            if not QTEXT_INTL.match(ch)
        }
        if offenders:
            raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(offenders)) + ".")

        # Anything outside of the printable ASCII range requires SMTPUTF8.
        non_ascii = {
            safe_character_display(ch)
            for ch in local
            if ord(ch) < 32 or ord(ch) > 126
        }
        if non_ascii:
            needs_smtputf8 = True

            # International characters in the local part may not be permitted.
            if not allow_smtputf8:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(non_ascii)) + ".")

        form = "quoted"

    if form is None:
        # Not a valid local part under any syntax: work out why. (Quoted
        # local parts are all either valid or rejected above, so these
        # checks only concern the dot-atom form.)

        # Invalid characters (RFC 5322 3.2.3, plus RFC 6531 3.3).
        offenders = {
            safe_character_display(ch)
            for ch in local
            if not ATEXT_INTL_RE.match(ch)
        }
        if offenders:
            raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(offenders)) + ".")

        # Dot placement errors imposed by the dot-atom rule (RFC 5322 3.2.3).
        check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)

        # All of the reasons should already have been checked, but just in
        # case there is a fallback message.
        raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")

    # The local part matched the internationalized dot-atom form or was
    # quoted: normalize it and apply the Unicode-level checks.
    is_quoted = form == "quoted"

    # RFC 6532 section 3.1 says that Unicode NFC normalization should be
    # applied, so the normalized local part is what gets returned.
    local = unicodedata.normalize("NFC", local)

    # Reject strings that the email specs may permit but that are not valid,
    # safe, or sensible Unicode. Partly redundant with the U+0080 to U+10FFFF
    # range accepted by DOT_ATOM_TEXT_INTL and QTEXT_INTL.
    check_unsafe_chars(local, allow_space=is_quoted)

    # Encoding to UTF-8 can fail for characters such as surrogate code
    # points. Those are rejected above, but make sure no unhandled exception
    # can surface later.
    try:
        local.encode("utf8")
    except ValueError:
        raise EmailSyntaxError("The email address contains an invalid character.")

    # When only the quoted form is valid, re-quote it and backslash-escape
    # quotes and backslashes (dropping any unnecessary escapes). Per RFC 5321
    # 4.1.2, "all quoted forms MUST be treated as equivalent, and the sending
    # system SHOULD transmit the form that uses the minimum quoting possible."
    if is_quoted:
        local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"'

    return {
        "local_part": local,
        "ascii_local_part": None if needs_smtputf8 else local,
        "smtputf8": needs_smtputf8,
    }

203 

204 

def check_unsafe_chars(s, allow_space=False):
    """Reject strings containing unsafe or non-sensible Unicode characters.

    Args:
        s: The string to inspect.
        allow_space: Whether characters in Unicode category Zs (spaces,
            including the ASCII space) are acceptable.

    Raises:
        EmailSyntaxError: If any character of ``s`` is considered unsafe;
            the message lists the offending characters.
    """
    rejected = set()
    for position, ch in enumerate(s):
        category = unicodedata.category(ch)
        major = category[0]

        # Letters, numbers, punctuation, and symbols are permitted.
        if major in ("L", "N", "P", "S"):
            continue

        # A combining character in first position would combine with
        # whatever precedes the string if it is concatenated with other text,
        # so it is not safe there. Elsewhere it is fine.
        if major == "M":
            if position == 0:
                rejected.add(ch)
            continue

        # Spaces outside of the ASCII range are not specifically disallowed
        # in internationalized addresses, but they violate the spirit of the
        # rule that unquoted addresses contain no ASCII spaces. In quoted
        # local parts spaces are explicitly permitted, and since the ASCII
        # space is itself in Zs, all Unicode spaces are allowed there for
        # consistency.
        if category == "Zs":
            if not allow_space:
                rejected.add(ch)
            continue

        # Everything else is rejected:
        #  * line and paragraph separators (Zl, Zp), which violate the spirit
        #    of the rule that addresses contain no line breaks;
        #  * control, format, surrogate, private use, and unassigned code
        #    points (C), which can alter text rendering (bidirectional
        #    format characters can make an address render as a different
        #    address) or make no sense for publicly deliverable addresses;
        #  * any category a future Unicode version might introduce.
        rejected.add(ch)

    if rejected:
        raise EmailSyntaxError("The email address contains unsafe characters: "
                               + ", ".join(safe_character_display(c) for c in sorted(rejected)) + ".")

255 

256 

def check_dot_atom(label, start_descr, end_descr, is_hostname):
    """Enforce the period (and, for hostnames, hyphen) placement rules.

    Args:
        label: The string to check.
        start_descr: Message template, with one ``{}`` placeholder for the
            character name, used when ``label`` starts with a bad character.
        end_descr: Message template, with one ``{}`` placeholder for the
            character name, used when ``label`` ends with a bad character.
        is_hostname: Whether the additional hyphen rules apply.

    Raises:
        EmailSyntaxError: On the first rule that ``label`` violates.
    """
    # Period rules of the dot-atom form (RFC 5322 3.2.3).
    if label.endswith("."):
        raise EmailSyntaxError(end_descr.format("period"))
    if label.startswith("."):
        raise EmailSyntaxError(start_descr.format("period"))
    if ".." in label:
        raise EmailSyntaxError("An email address cannot have two periods in a row.")

    if not is_hostname:
        return

    # Hyphen rules for hostnames (RFC 952).
    if label.endswith("-"):
        raise EmailSyntaxError(end_descr.format("hyphen"))
    if label.startswith("-"):
        raise EmailSyntaxError(start_descr.format("hyphen"))
    if any(pair in label for pair in (".-", "-.")):
        raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")

274 

275 

def validate_email_domain_name(domain, test_environment=False, globally_deliverable=True):
    """Check the syntax of an address's domain name and return its normalized forms.

    Args:
        domain: The part of the address after the @-sign.
        test_environment: When true, the name ``test`` is exempted from the
            missing-period check and from the special-use name check.
        globally_deliverable: When true, the domain must contain a period and
            must match ``DOMAIN_NAME_REGEX``.

    Returns:
        A dict with the keys ``"ascii_domain"`` (the IDNA ASCII form) and
        ``"domain"`` (the form obtained by decoding that back with IDNA).

    Raises:
        EmailSyntaxError: If the domain is not syntactically acceptable.
    """

    # Invalid characters before normalization
    # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses).
    invalid = {
        safe_character_display(ch)
        for ch in domain
        if not ATEXT_HOSTNAME_INTL.match(ch)
    }
    if invalid:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(invalid)) + ".")

    # Unsafe characters. Partly redundant with the U+0080 to U+10FFFF range
    # accepted by DOT_ATOM_TEXT_INTL, but the email specs may permit
    # characters that are not valid, safe, or sensible Unicode.
    check_unsafe_chars(domain)

    # UTS-46 normalization: casefolding, NFC normalization, and conversion of
    # all label separators (period/full stop, fullwidth full stop,
    # ideographic full stop, halfwidth ideographic full stop) to basic
    # periods. It also raises on invalid input characters such as "⒈", which
    # would expand to include a period.
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).")

    # The domain is made up of period-separated "labels". Each label must
    # have at least one character and cannot start or end with a dash, which
    # leads to some surprising restrictions on periods and dashes. This runs
    # before IDNA encoding because the IDNA library gives unfriendly errors
    # for these cases, but after UTS-46 normalization because that can insert
    # periods and hyphens (from fullwidth characters).
    # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3)
    check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)

    # RFC 5890's invalid R-LDH labels: labels that start with two characters
    # other than "xn" followed by two dashes.
    if any(re.match(r"(?!xn)..--", label, re.I) for label in domain.split(".")):
        raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")

    # A valid non-internationalized domain is already its own ASCII form.
    ascii_domain = domain
    if not DOT_ATOM_TEXT_HOSTNAME.match(domain):
        # International characters are present, so convert the domain to IDNA
        # ASCII: the MTA must either support SMTPUTF8 or the mail client must
        # convert the domain name to IDNA before submission.
        #
        # This step incorrectly 'fixes' leading periods by removing them and
        # gives a funky error ("No input") for two periods in a row, which is
        # why both are checked above.
        #
        # The transformation would be a harmless no-op for ASCII-only
        # domains, but it is skipped for them so that basic syntax checks do
        # not rely on the idna library.
        try:
            ascii_domain = idna.encode(domain, uts46=False).decode("ascii")
        except idna.IDNAError as e:
            if "Domain too long" in str(e):
                # No more specific message is possible: UTS-46 normalization
                # means the length check applied to a string that differs
                # from what the user supplied, and it is unclear whether the
                # limit applies to the internationalized form, the IDNA ASCII
                # form, or both.
                raise EmailSyntaxError("The email address is too long after the @-sign.")
            raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).")

        # Check the syntax of the string returned by idna.encode.
        # It should never fail.
        if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain):
            raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")

    # Domain length (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2), measured on the
    # ASCII form on the assumption that the domain may be transmitted without
    # SMTPUTF8 as IDNA ASCII; that can be much longer than the character
    # count of an internationalized domain. idna.encode checks this too, so
    # this error is never reached for internationalized domains.
    if len(ascii_domain) > DOMAIN_MAX_LENGTH:
        too_long_by = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long after the @-sign {too_long_by}.")

    # Per-label length limit (RFC 1035 2.3.1); report the first offender.
    oversized = next(
        (label for label in ascii_domain.split(".") if len(label) > DNS_LABEL_LENGTH_LIMIT),
        None,
    )
    if oversized is not None:
        too_long_by = get_length_reason(oversized, limit=DNS_LABEL_LENGTH_LIMIT)
        raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {too_long_by}.")

    if globally_deliverable:
        # All publicly deliverable addresses have domain names with at least
        # one period, at least for gTLDs created since 2013 (per the ICANN Board
        # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
        # A missing period is treated as a syntax error since that matches
        # people's sense of what an email address looks like. Test
        # environments are exempt so that '@test' addresses work.
        is_test_name = ascii_domain == "test" and test_environment
        if "." not in ascii_domain and not is_test_name:
            raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")

        # All TLDs currently end with a letter.
        if not DOMAIN_NAME_REGEX.search(ascii_domain):
            raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")

    # Special-use and reserved domain names. Some would fail DNS-based
    # deliverability checks, but those can be turned off, so fail them all
    # here. See the references in __init__.py.
    from . import SPECIAL_USE_DOMAIN_NAMES
    for reserved in SPECIAL_USE_DOMAIN_NAMES:
        # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
        if reserved == "test" and test_environment:
            continue

        if ascii_domain == reserved or ascii_domain.endswith("." + reserved):
            raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.")

    # The input may have been an IDNA ASCII domain to begin with, and it
    # could look like IDNA without being actual IDNA, so decode it to check
    # conformance. For ASCII-only domains decoding gives the same thing back.
    # The result is the canonical internationalized form of the domain.
    try:
        domain_i18n = idna.decode(ascii_domain.encode('ascii'))
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).")

    # Invalid characters after normalization. These should never arise; see
    # the similar checks at the top of the function.
    # NOTE(review): this inspects `domain` (the UTS-46-remapped input), not
    # `domain_i18n` (the decoded form that is returned) -- confirm that is
    # the intended target.
    invalid = {
        safe_character_display(ch)
        for ch in domain
        if not ATEXT_HOSTNAME_INTL.match(ch)
    }
    if invalid:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(invalid)) + ".")
    check_unsafe_chars(domain)

    # Return the IDNA ASCII-encoded form, which is how the domain would be
    # transmitted on the wire (except possibly with SMTPUTF8), together with
    # the canonical Unicode form, which is better for display. This should
    # also cover RFC 6532 section 3.1's suggestion to apply Unicode NFC
    # normalization to addresses.
    return {
        "ascii_domain": ascii_domain,
        "domain": domain_i18n,
    }

432 

433 

def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
    """Parse an address literal and return it in normalized form.

    This is the obscure domain-literal syntax of RFC 5321 4.1.3 and
    RFC 5322 3.4.1.

    Args:
        domain_literal: The content of the literal without the enclosing
            brackets (it is handed directly to the ``ipaddress`` parsers).
        allow_domain_literal: Whether a syntactically valid literal is
            acceptable. When false, valid IPv4/IPv6 literals are rejected.

    Returns:
        A dict with the keys ``"domain_address"`` (an ``ipaddress.IPv4Address``
        or ``ipaddress.IPv6Address``) and ``"domain"`` (the bracketed,
        normalized literal, e.g. ``"[IPv6:::1]"``).

    Raises:
        EmailSyntaxError: If the literal cannot be parsed, uses an unknown
            tag, or literals are not allowed.
    """

    # Try to parse the domain literal as an IPv4 address.
    # There is no tag for IPv4 addresses, so we can never
    # be sure if the user intends an IPv4 address.
    if re.match(r"^[0-9\.]+$", domain_literal):
        try:
            addr = ipaddress.IPv4Address(domain_literal)
        except ValueError as e:
            raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.")
        if not allow_domain_literal:
            raise EmailSyntaxError("A bracketed IPv4 address after the @-sign is not allowed here.")

        # Return the IPv4Address object and the domain back unchanged.
        return {
            "domain_address": addr,
            "domain": f"[{addr}]",
        }

    # If it begins with the "IPv6:" tag it's an IPv6 address. The tag is a
    # quoted ABNF literal in RFC 5321 4.1.3, and ABNF literals match
    # case-insensitively (RFC 5234 2.3), so "ipv6:" and "IPV6:" are accepted
    # too. The returned literal always uses the canonical "IPv6:" spelling.
    tag_length = len("IPv6:")
    if domain_literal[:tag_length].lower() == "ipv6:":
        try:
            addr = ipaddress.IPv6Address(domain_literal[tag_length:])
        except ValueError as e:
            raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).")
        if not allow_domain_literal:
            raise EmailSyntaxError("A bracketed IPv6 address after the @-sign is not allowed here.")

        # Return the IPv6Address object and construct a normalized
        # domain literal.
        return {
            "domain_address": addr,
            "domain": f"[IPv6:{addr.compressed}]",
        }

    if ":" not in domain_literal:
        raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")

    # The tag (the part before the colon) has character restrictions,
    # but since it must come from a registry of tags (in which only "IPv6" is defined),
    # there's no need to check the syntax of the tag. See RFC 5321 4.1.2.

    # Check for permitted ASCII characters. This only affects which message
    # is reported, since an exception is raised below in any case.
    bad_chars = {
        safe_character_display(c)
        for c in domain_literal
        if not DOMAIN_LITERAL_CHARS.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".")

    # There are no other domain literal tags.
    # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml
    raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")