1"""Domain."""
2
3# standard
4from os import environ
5from pathlib import Path
6import re
7from typing import Optional, Set
8
9# local
10from .utils import validator
11
12
class _IanaTLD:
    """Look up top-level domains in the bundled IANA list, with optional caching.

    Lookups are exact string comparisons, so callers are expected to pass the
    TLD in the same letter case as the stored entries (the built-in popular
    entries are upper case).
    """

    # Lazily populated with every entry of `_tld.txt`; stays `None` until the
    # first lookup that runs while `PYVLD_CACHE_TLD` is set to "True".
    _full_cache: Optional[Set[str]] = None
    # Short-circuit set consulted before touching the file.
    # source: https://www.statista.com/statistics/265677
    _popular_cache = {"COM", "ORG", "RU", "DE", "NET", "BR", "UK", "JP", "FR", "IT"} | {"ONION"}

    @classmethod
    def _retrieve(cls):
        """Yield each whitespace-stripped entry of the `_tld.txt` file beside this module."""
        tld_path = Path(__file__).with_name("_tld.txt")
        with tld_path.open() as handle:
            next(handle)  # the first line is not an entry; discard it
            for raw_entry in handle:
                yield raw_entry.strip()

    @classmethod
    def check(cls, tld: str):
        """Return whether `tld` is one of the popular entries or listed in `_tld.txt`."""
        if tld in cls._popular_cache:
            return True

        # A previously built cache answers the lookup without any file access.
        if cls._full_cache is not None:
            return tld in cls._full_cache

        # Caching is opt-in: without it, scan the file for this lookup only.
        if environ.get("PYVLD_CACHE_TLD") != "True":
            return tld in cls._retrieve()

        cls._full_cache = set(cls._retrieve())
        return tld in cls._full_cache
38
39
@validator
def domain(
    value: str, /, *, consider_tld: bool = False, rfc_1034: bool = False, rfc_2782: bool = False
):
    """Return whether or not given value is a valid domain.

    Examples:
        >>> domain('example.com')
        True
        >>> domain('example.com/')
        ValidationError(func=domain, args={'value': 'example.com/'})
        >>> # Supports IDN domains as well::
        >>> domain('xn----gtbspbbmkef.xn--p1ai')
        True

    Args:
        value:
            Domain string to validate.
        consider_tld:
            Restrict domain to TLDs allowed by IANA.
            The TLD is taken from the IDNA-encoded (ASCII) form of `value`,
            so internationalized TLDs are compared in their `xn--` spelling.
        rfc_1034:
            Allows optional trailing dot in the domain name.
            Ref: [RFC 1034](https://www.rfc-editor.org/rfc/rfc1034).
        rfc_2782:
            Domain name is of type service record.
            Allows optional underscores in the domain name.
            Ref: [RFC 2782](https://www.rfc-editor.org/rfc/rfc2782).


    Returns:
        (Literal[True]): If `value` is a valid domain name.
        (ValidationError): If `value` is an invalid domain name.

    Raises:
        (UnicodeError): If `value` cannot be encoded into `idna` or decoded into `utf-8`.
    """
    if not value:
        return False

    try:
        # Every later check runs on the ASCII form, so encode exactly once.
        ascii_value = value.encode("idna").decode("utf-8")
    except UnicodeError as err:
        raise UnicodeError(f"Unable to encode/decode {value}") from err

    if consider_tld:
        # Extract the TLD from the encoded form: a Unicode TLD typed by the
        # user only matches an upper-cased `XN--...` entry after IDNA encoding.
        tld = ascii_value.rstrip(".").rsplit(".", 1)[-1].upper()
        if not _IanaTLD.check(tld):
            return False

    service_record = r"_" if rfc_2782 else ""
    trailing_dot = r"\.?$" if rfc_1034 else r"$"

    # Whitespace and runs of two or more underscores are rejected on the raw input.
    # NOTE(review): the gTLD class below admits "_" and "-" inside the last
    # label regardless of `rfc_2782` - confirm whether that is intended.
    return not re.search(r"\s|__+", value) and re.match(
        # First character of the domain
        rf"^(?:[a-z0-9{service_record}]"
        # Sub-domain
        + rf"(?:[a-z0-9-{service_record}]{{0,61}}"
        # Hostname
        + rf"[a-z0-9{service_record}])?\.)"
        # First 61 characters of the gTLD
        + r"+[a-z0-9][a-z0-9-_]{0,61}"
        # Last character of the gTLD
        + rf"[a-z]{trailing_dot}",
        ascii_value,
        re.IGNORECASE,
    )