Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/email

1from .exceptions_types import EmailSyntaxError

2from .rfc_constants import EMAIL_MAX_LENGTH, LOCAL_PART_MAX_LENGTH, DOMAIN_MAX_LENGTH, \

3 DOT_ATOM_TEXT, DOT_ATOM_TEXT_INTL, ATEXT_RE, ATEXT_INTL_RE, ATEXT_HOSTNAME_INTL, QTEXT_INTL, \

4 DNS_LABEL_LENGTH_LIMIT, DOT_ATOM_TEXT_HOSTNAME, DOMAIN_NAME_REGEX, DOMAIN_LITERAL_CHARS

6import re

7import unicodedata

8import idna # implements IDNA 2008; Python's codec is only IDNA 2003

9import ipaddress

10from typing import Optional

def get_length_reason(addr: str, utf8: bool = False, limit: int = EMAIL_MAX_LENGTH) -> str:
    """Helper function to return an error message related to invalid length.

    Returns a parenthesized fragment stating by how many characters
    ``addr`` exceeds ``limit``, e.g. ``"(3 characters too many)"``.
    When ``utf8`` is true the count is prefixed with "at least ".
    """
    diff = len(addr) - limit
    prefix = "at least " if utf8 else ""
    # Pluralize "character" only when more than one is over the limit.
    suffix = "s" if diff > 1 else ""
    return f"({prefix}{diff} character{suffix} too many)"

def safe_character_display(c: str) -> str:
    """Return a representation of the character ``c`` that is safe to show in an error message.

    Letters, numbers, punctuation, and symbols are returned quoted via
    ``repr``. Any other character is returned as its Unicode name or,
    if it has no name, as a ``U+XXXX`` hex code point string.
    """
    # Return safely displayable characters in quotes.
    if c == '\\':
        return f"\"{c}\""  # can't use repr because it escapes it
    if unicodedata.category(c)[0] in ("L", "N", "P", "S"):
        return repr(c)

    # Construct a hex string in case the unicode name doesn't exist.
    # Code points up to and including U+FFFF use the four-digit form;
    # everything above uses eight digits.
    codepoint = ord(c)
    if codepoint <= 0xFFFF:
        h = f"U+{codepoint:04X}"
    else:
        h = f"U+{codepoint:08X}"

    # Return the character name or, if it has no name, the hex string.
    return unicodedata.name(c, h)

def validate_email_local_part(local: str, allow_smtputf8: bool = True, allow_empty_local: bool = False,
                              quoted_local_part: bool = False):
    """Validates the syntax of the local part of an email address.

    ``local`` is the text before the @-sign (with any surrounding quotes
    already removed; ``quoted_local_part`` says whether it was quoted).

    Returns a dict with the keys ``local_part`` (the normalized local part,
    re-quoted if it is only valid as a quoted string), ``ascii_local_part``
    (the same value, or None if SMTPUTF8 is required), and ``smtputf8``
    (whether non-ASCII characters are present).

    Raises EmailSyntaxError with a human-readable reason if the local part
    is not valid.
    """

    if len(local) == 0:
        if not allow_empty_local:
            raise EmailSyntaxError("There must be something before the @-sign.")

        # The caller allows an empty local part. Useful for validating certain
        # Postfix aliases.
        return {
            "local_part": local,
            "ascii_local_part": local,
            "smtputf8": False,
        }

    # Check the length of the local part by counting characters.
    # (RFC 5321 4.5.3.1.1)
    # We're checking the number of characters here. If the local part
    # is ASCII-only, then that's the same as bytes (octets). If it's
    # internationalized, then the UTF-8 encoding may be longer, but
    # that may not be relevant. We will check the total address length
    # instead.
    if len(local) > LOCAL_PART_MAX_LENGTH:
        reason = get_length_reason(local, limit=LOCAL_PART_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long before the @-sign {reason}.")

    # Check the local part against the non-internationalized regular expression.
    # Most email addresses match this regex so it's probably fastest to check this first.
    # (RFC 5322 3.2.3)
    # All local parts matching the dot-atom rule are also valid as a quoted string
    # so if it was originally quoted (quoted_local_part is True) and this regex matches,
    # it's ok.
    # (RFC 5321 4.1.2 / RFC 5322 3.2.4).
    if DOT_ATOM_TEXT.match(local):
        # It's valid. And since it's just the permitted ASCII characters,
        # it's normalized and safe. If the local part was originally quoted,
        # the quoting was unnecessary and it'll be returned as normalized to
        # non-quoted form.

        # Return the local part and flag that SMTPUTF8 is not needed.
        return {
            "local_part": local,
            "ascii_local_part": local,
            "smtputf8": False,
        }

    # The local part failed the basic dot-atom check. Try the extended character set
    # for internationalized addresses. It's the same pattern but with additional
    # characters permitted.
    # RFC 6531 section 3.3.
    valid: Optional[str] = None
    requires_smtputf8 = False
    if DOT_ATOM_TEXT_INTL.match(local):
        # But international characters in the local part may not be permitted.
        if not allow_smtputf8:
            # Check for invalid characters against the non-internationalized
            # permitted character set.
            # (RFC 5322 3.2.3)
            bad_chars = {
                safe_character_display(c)
                for c in local
                if not ATEXT_RE.match(c)
            }
            if bad_chars:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".")

            # Although the check above should always find something, fall back to this just in case.
            raise EmailSyntaxError("Internationalized characters before the @-sign are not supported.")

        # It's valid.
        valid = "dot-atom"
        requires_smtputf8 = True

    # There are no syntactic restrictions on quoted local parts, so if
    # it was originally quoted, it is probably valid. More characters
    # are allowed, like @-signs, spaces, and quotes, and there are no
    # restrictions on the placement of dots, as in dot-atom local parts.
    elif quoted_local_part:
        # Check for invalid characters in a quoted string local part.
        # (RFC 5321 4.1.2. RFC 5322 lists additional permitted *obsolete*
        # characters which are *not* allowed here. RFC 6531 section 3.3
        # extends the range to UTF8 strings.)
        bad_chars = {
            safe_character_display(c)
            for c in local
            if not QTEXT_INTL.match(c)
        }
        if bad_chars:
            raise EmailSyntaxError("The email address contains invalid characters in quotes before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")

        # See if any characters are outside of the ASCII range.
        bad_chars = {
            safe_character_display(c)
            for c in local
            if not (32 <= ord(c) <= 126)
        }
        if bad_chars:
            requires_smtputf8 = True

            # International characters in the local part may not be permitted.
            if not allow_smtputf8:
                raise EmailSyntaxError("Internationalized characters before the @-sign are not supported: " + ", ".join(sorted(bad_chars)) + ".")

        # It's valid.
        valid = "quoted"

    # If the local part matches the internationalized dot-atom form or was quoted,
    # perform normalization and additional checks for Unicode strings.
    if valid:
        # RFC 6532 section 3.1 says that Unicode NFC normalization should be applied,
        # so we'll return the normalized local part in the return value.
        local = unicodedata.normalize("NFC", local)

        # Check that the local part is a valid, safe, and sensible Unicode string.
        # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
        # by DOT_ATOM_TEXT_INTL and QTEXT_INTL. Other characters may be permitted by the
        # email specs, but they may not be valid, safe, or sensible Unicode strings.
        # See the function for rationale.
        check_unsafe_chars(local, allow_space=(valid == "quoted"))

        # Try encoding to UTF-8. Failure is possible with some characters like
        # surrogate code points, but those are checked above. Still, we don't
        # want to have an unhandled exception later.
        try:
            local.encode("utf8")
        except ValueError as e:
            raise EmailSyntaxError("The email address contains an invalid character.") from e

        # If this address passes only by the quoted string form, re-quote it
        # and backslash-escape quotes and backslashes (removing any unnecessary
        # escapes). Per RFC 5321 4.1.2, "all quoted forms MUST be treated as equivalent,
        # and the sending system SHOULD transmit the form that uses the minimum quoting possible."
        if valid == "quoted":
            local = '"' + re.sub(r'(["\\])', r'\\\1', local) + '"'

        return {
            "local_part": local,
            "ascii_local_part": local if not requires_smtputf8 else None,
            "smtputf8": requires_smtputf8,
        }

    # It's not a valid local part. Let's find out why.
    # (Since quoted local parts are all valid or handled above, these checks
    # don't apply in those cases.)

    # Check for invalid characters.
    # (RFC 5322 3.2.3, plus RFC 6531 3.3)
    bad_chars = {
        safe_character_display(c)
        for c in local
        if not ATEXT_INTL_RE.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The email address contains invalid characters before the @-sign: " + ", ".join(sorted(bad_chars)) + ".")

    # Check for dot errors imposed by the dot-atom rule.
    # (RFC 5322 3.2.3)
    check_dot_atom(local, 'An email address cannot start with a {}.', 'An email address cannot have a {} immediately before the @-sign.', is_hostname=False)

    # All of the reasons should already have been checked, but just in case
    # we have a fallback message.
    raise EmailSyntaxError("The email address contains invalid characters before the @-sign.")

203

204

def check_unsafe_chars(s, allow_space=False):
    """Raise EmailSyntaxError if ``s`` contains unsafe or non-sensible Unicode characters.

    Characters are classified by Unicode general category. ``allow_space``
    permits space separators (category Zs), which are legal inside
    quoted-string local parts. Returns None when nothing unsafe is found.
    """
    # Check for unsafe characters or characters that would make the string
    # invalid or non-sensible Unicode.
    bad_chars = set()
    for i, c in enumerate(s):
        category = unicodedata.category(c)
        if category[0] in ("L", "N", "P", "S"):
            # Letters, numbers, punctuation, and symbols are permitted.
            pass
        elif category[0] == "M":
            # Combining character in first position would combine with something
            # outside of the email address if concatenated, so they are not safe.
            # We also check if this occurs after the @-sign, which would not be
            # sensible.
            if i == 0:
                bad_chars.add(c)
        elif category == "Zs":
            # Spaces outside of the ASCII range are not specifically disallowed in
            # internationalized addresses as far as I can tell, but they violate
            # the spirit of the non-internationalized specification that email
            # addresses do not contain ASCII spaces when not quoted. Excluding
            # ASCII spaces when not quoted is handled directly by the atom regex.
            #
            # In quoted-string local parts, spaces are explicitly permitted, and
            # the ASCII space has category Zs, so we must allow it here, and we'll
            # allow all Unicode spaces to be consistent.
            if not allow_space:
                bad_chars.add(c)
        elif category[0] == "Z":
            # The two line and paragraph separator characters (in categories Zl and Zp)
            # are not specifically disallowed in internationalized addresses
            # as far as I can tell, but they violate the spirit of the non-internationalized
            # specification that email addresses do not contain line breaks when not quoted.
            bad_chars.add(c)
        elif category[0] == "C":
            # Control, format, surrogate, private use, and unassigned code points (C)
            # are all unsafe in various ways. Control and format characters can affect
            # text rendering if the email address is concatenated with other text.
            # Bidirectional format characters are unsafe, even if used properly, because
            # they cause an email address to render as a different email address.
            # Private use characters do not make sense for publicly deliverable
            # email addresses.
            bad_chars.add(c)
        else:
            # All categories should be handled above, but in case there is something new
            # to the Unicode specification in the future, reject all other categories.
            bad_chars.add(c)
    if bad_chars:
        raise EmailSyntaxError("The email address contains unsafe characters: "
                               + ", ".join(safe_character_display(c) for c in sorted(bad_chars)) + ".")

255

256

def check_dot_atom(label, start_descr, end_descr, is_hostname):
    """Raise EmailSyntaxError if ``label`` breaks the period (and hyphen) placement rules.

    ``start_descr`` and ``end_descr`` are message templates containing a
    single ``{}`` placeholder, which is filled with "period" or "hyphen"
    when the label starts or ends with that character. The hyphen rules
    are only applied when ``is_hostname`` is true. Returns None when the
    label passes.
    """
    # RFC 5322 3.2.3
    if label.endswith("."):
        raise EmailSyntaxError(end_descr.format("period"))
    if label.startswith("."):
        raise EmailSyntaxError(start_descr.format("period"))
    if ".." in label:
        raise EmailSyntaxError("An email address cannot have two periods in a row.")

    if is_hostname:
        # RFC 952
        if label.endswith("-"):
            raise EmailSyntaxError(end_descr.format("hyphen"))
        if label.startswith("-"):
            raise EmailSyntaxError(start_descr.format("hyphen"))
        if ".-" in label or "-." in label:
            raise EmailSyntaxError("An email address cannot have a period and a hyphen next to each other.")

274

275

def validate_email_domain_name(domain, test_environment=False, globally_deliverable=True):
    """Validates the syntax of the domain part of an email address.

    ``domain`` is the text after the @-sign. ``test_environment`` permits
    the special-use name "test". ``globally_deliverable`` additionally
    requires a period in the domain and a match against DOMAIN_NAME_REGEX.

    Returns a dict with ``ascii_domain`` (the IDNA ASCII form) and
    ``domain`` (the form decoded back from IDNA ASCII).

    Raises EmailSyntaxError with a human-readable reason if the domain
    is not valid.
    """

    # Check for invalid characters before normalization.
    # (RFC 952 plus RFC 6531 section 3.3 for internationalized addresses)
    bad_chars = {
        safe_character_display(c)
        for c in domain
        if not ATEXT_HOSTNAME_INTL.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")

    # Check for unsafe characters.
    # Some of this may be redundant with the range U+0080 to U+10FFFF that is checked
    # by DOT_ATOM_TEXT_INTL. Other characters may be permitted by the email specs, but
    # they may not be valid, safe, or sensible Unicode strings.
    check_unsafe_chars(domain)

    # Perform UTS-46 normalization, which includes casefolding, NFC normalization,
    # and converting all label separators (the period/full stop, fullwidth full stop,
    # ideographic full stop, and halfwidth ideographic full stop) to basic periods.
    # It will also raise an exception if there is an invalid character in the input,
    # such as "⒈" which is invalid because it would expand to include a period.
    try:
        domain = idna.uts46_remap(domain, std3_rules=False, transitional=False)
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e

    # The domain part is made up of period-separated "labels." Each label must
    # have at least one character and cannot start or end with dashes, which
    # means there are some surprising restrictions on periods and dashes.
    # Check that before we do IDNA encoding because the IDNA library gives
    # unfriendly errors for these cases, but after UTS-46 normalization because
    # it can insert periods and hyphens (from fullwidth characters).
    # (RFC 952, RFC 1123 2.1, RFC 5322 3.2.3)
    check_dot_atom(domain, 'An email address cannot have a {} immediately after the @-sign.', 'An email address cannot end with a {}.', is_hostname=True)

    # Check for RFC 5890's invalid R-LDH labels, which are labels that start
    # with two characters other than "xn" and two dashes.
    for label in domain.split("."):
        if re.match(r"(?!xn)..--", label, re.I):
            raise EmailSyntaxError("An email address cannot have two letters followed by two dashes immediately after the @-sign or after a period, except Punycode.")

    if DOT_ATOM_TEXT_HOSTNAME.match(domain):
        # This is a valid non-internationalized domain.
        ascii_domain = domain
    else:
        # If international characters are present in the domain name, convert
        # the domain to IDNA ASCII. If internationalized characters are present,
        # the MTA must either support SMTPUTF8 or the mail client must convert the
        # domain name to IDNA before submission.
        #
        # Unfortunately this step incorrectly 'fixes' domain names with leading
        # periods by removing them, so we have to check for this above. It also gives
        # a funky error message ("No input") when there are two periods in a
        # row, also checked separately above.
        #
        # For ASCII-only domains, the transformation does nothing and is safe to
        # apply. However, to ensure we don't rely on the idna library for basic
        # syntax checks, we don't use it if it's not needed.
        try:
            ascii_domain = idna.encode(domain, uts46=False).decode("ascii")
        except idna.IDNAError as e:
            if "Domain too long" in str(e):
                # We can't really be more specific because UTS-46 normalization means
                # the length check is applied to a string that is different from the
                # one the user supplied. Also I'm not sure if the length check applies
                # to the internationalized form, the IDNA ASCII form, or even both!
                raise EmailSyntaxError("The email address is too long after the @-sign.") from e
            raise EmailSyntaxError(f"The part after the @-sign contains invalid characters ({e}).") from e

        # Check the syntax of the string returned by idna.encode.
        # It should never fail.
        if not DOT_ATOM_TEXT_HOSTNAME.match(ascii_domain):
            raise EmailSyntaxError("The email address contains invalid characters after the @-sign after IDNA encoding.")

    # Check the length of the domain name in bytes.
    # (RFC 1035 2.3.4 and RFC 5321 4.5.3.1.2)
    # We're checking the number of bytes ("octets") here, which can be much
    # higher than the number of characters in internationalized domains,
    # on the assumption that the domain may be transmitted without SMTPUTF8
    # as IDNA ASCII. (This is also checked by idna.encode, so this exception
    # is never reached for internationalized domains.)
    if len(ascii_domain) > DOMAIN_MAX_LENGTH:
        reason = get_length_reason(ascii_domain, limit=DOMAIN_MAX_LENGTH)
        raise EmailSyntaxError(f"The email address is too long after the @-sign {reason}.")

    # Also check the label length limit.
    # (RFC 1035 2.3.1)
    for label in ascii_domain.split("."):
        if len(label) > DNS_LABEL_LENGTH_LIMIT:
            reason = get_length_reason(label, limit=DNS_LABEL_LENGTH_LIMIT)
            raise EmailSyntaxError(f"After the @-sign, periods cannot be separated by so many characters {reason}.")

    if globally_deliverable:
        # All publicly deliverable addresses have domain names with at least
        # one period, at least for gTLDs created since 2013 (per the ICANN Board
        # New gTLD Program Committee, https://www.icann.org/en/announcements/details/new-gtld-dotless-domain-names-prohibited-30-8-2013-en).
        # We'll consider the lack of a period a syntax error
        # since that will match people's sense of what an email address looks
        # like. We'll skip this in test environments to allow '@test' email
        # addresses.
        if "." not in ascii_domain and not (ascii_domain == "test" and test_environment):
            raise EmailSyntaxError("The part after the @-sign is not valid. It should have a period.")

        # We also know that all TLDs currently end with a letter.
        if not DOMAIN_NAME_REGEX.search(ascii_domain):
            raise EmailSyntaxError("The part after the @-sign is not valid. It is not within a valid top-level domain.")

    # Check special-use and reserved domain names.
    # Some might fail DNS-based deliverability checks, but that
    # can be turned off, so we should fail them all sooner.
    # See the references in __init__.py.
    from . import SPECIAL_USE_DOMAIN_NAMES
    for d in SPECIAL_USE_DOMAIN_NAMES:
        # See the note near the definition of SPECIAL_USE_DOMAIN_NAMES.
        if d == "test" and test_environment:
            continue

        if ascii_domain == d or ascii_domain.endswith("." + d):
            raise EmailSyntaxError("The part after the @-sign is a special-use or reserved name that cannot be used with email.")

    # We may have been given an IDNA ASCII domain to begin with. Check
    # that the domain actually conforms to IDNA. It could look like IDNA
    # but not be actual IDNA. For ASCII-only domains, the conversion out
    # of IDNA just gives the same thing back.
    #
    # This gives us the canonical internationalized form of the domain.
    try:
        domain_i18n = idna.decode(ascii_domain.encode('ascii'))
    except idna.IDNAError as e:
        raise EmailSyntaxError(f"The part after the @-sign is not valid IDNA ({e}).") from e

    # Check for invalid characters after normalization. These
    # should never arise. See the similar checks above.
    # NOTE(review): these checks inspect `domain` (the UTS-46 remapped input),
    # not `domain_i18n` (the value actually returned). For Punycode input the
    # decoded Unicode form is therefore not re-checked here — confirm whether
    # that is intended before changing it, since it could reject more input.
    bad_chars = {
        safe_character_display(c)
        for c in domain
        if not ATEXT_HOSTNAME_INTL.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters: " + ", ".join(sorted(bad_chars)) + ".")
    check_unsafe_chars(domain)

    # Return the IDNA ASCII-encoded form of the domain, which is how it
    # would be transmitted on the wire (except when used with SMTPUTF8
    # possibly), as well as the canonical Unicode form of the domain,
    # which is better for display purposes. This should also take care
    # of RFC 6532 section 3.1's suggestion to apply Unicode NFC
    # normalization to addresses.
    return {
        "ascii_domain": ascii_domain,
        "domain": domain_i18n,
    }

432

433

def validate_email_domain_literal(domain_literal, allow_domain_literal=False):
    """Validates a bracketed domain literal (the text between ``[`` and ``]``).

    Accepts an IPv4 address, or an IPv6 address prefixed with the
    ``IPv6:`` tag (matched case-insensitively). Returns a dict with
    ``domain_address`` (an ``ipaddress`` address object) and ``domain``
    (the normalized, re-bracketed literal).

    Raises EmailSyntaxError if the literal is malformed, uses an unknown
    tag, or if ``allow_domain_literal`` is false.
    """
    # This is obscure domain-literal syntax. Parse it and return
    # a compressed/normalized address.
    # RFC 5321 4.1.3 and RFC 5322 3.4.1.

    # Try to parse the domain literal as an IPv4 address.
    # There is no tag for IPv4 addresses, so we can never
    # be sure if the user intends an IPv4 address.
    if re.match(r"^[0-9\.]+$", domain_literal):
        try:
            addr = ipaddress.IPv4Address(domain_literal)
        except ValueError as e:
            raise EmailSyntaxError(f"The address in brackets after the @-sign is not valid: It is not an IPv4 address ({e}) or is missing an address literal tag.") from e
        if not allow_domain_literal:
            raise EmailSyntaxError("A bracketed IPv4 address after the @-sign is not allowed here.")

        # Return the IPv4Address object and the address re-wrapped in brackets.
        return {
            "domain_address": addr,
            "domain": f"[{addr}]",
        }

    # If it begins with "IPv6:" it's an IPv6 address. The tag is an ABNF
    # string literal, which is case-insensitive (RFC 5234 2.3), so accept
    # any capitalization and normalize it to "IPv6:" in the result.
    if domain_literal.lower().startswith("ipv6:"):
        try:
            addr = ipaddress.IPv6Address(domain_literal[5:])
        except ValueError as e:
            raise EmailSyntaxError(f"The IPv6 address in brackets after the @-sign is not valid ({e}).") from e
        if not allow_domain_literal:
            raise EmailSyntaxError("A bracketed IPv6 address after the @-sign is not allowed here.")

        # Return the IPv6Address object and construct a normalized
        # domain literal.
        return {
            "domain_address": addr,
            "domain": f"[IPv6:{addr.compressed}]",
        }

    if ":" not in domain_literal:
        raise EmailSyntaxError("The part after the @-sign in brackets is not an IPv4 address and has no address literal tag.")

    # The tag (the part before the colon) has character restrictions,
    # but since it must come from a registry of tags (in which only "IPv6" is defined),
    # there's no need to check the syntax of the tag. See RFC 5321 4.1.2.

    # Check for permitted ASCII characters. This actually doesn't matter
    # since there will be an exception after anyway.
    bad_chars = {
        safe_character_display(c)
        for c in domain_literal
        if not DOMAIN_LITERAL_CHARS.match(c)
    }
    if bad_chars:
        raise EmailSyntaxError("The part after the @-sign contains invalid characters in brackets: " + ", ".join(sorted(bad_chars)) + ".")

    # There are no other domain literal tags.
    # https://www.iana.org/assignments/address-literal-tags/address-literal-tags.xhtml
    raise EmailSyntaxError("The part after the @-sign contains an invalid address literal tag in brackets.")

Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/email_validator/syntax.py: 55%

175 statements