Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/idna/core.py: 45%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import bisect
2import re
3import unicodedata
4import warnings
5from typing import Optional, Union
7from . import idnadata
8from .intranges import intranges_contain
# Canonical combining class that identifies a virama (used by the CONTEXTJ rules).
_virama_combining_class = 9
# ACE prefix that marks a Punycode-encoded A-label.
_alabel_prefix = b"xn--"
# Defensive upper bound on the length of any input accepted by this module.
_max_input_length = 1024
# Label separators: FULL STOP plus the ideographic/fullwidth/halfwidth variants.
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")

# Bidi category sets from RFC 5893, built once at import time so that the
# per-codepoint loops only perform set membership tests.
_bidi_rtl_first = frozenset(("R", "AL"))
_bidi_rtl_numeric = frozenset(("AN", "EN"))
_bidi_rtl_categories = _bidi_rtl_first | {"AN"}
_bidi_rtl_valid_ending = _bidi_rtl_first | _bidi_rtl_numeric
_bidi_ltr_valid_ending = frozenset(("L", "EN"))
_bidi_rtl_allowed = _bidi_rtl_valid_ending | {"ES", "CS", "ET", "ON", "BN", "NSM"}
_bidi_ltr_allowed = _bidi_ltr_valid_ending | {"ES", "CS", "ET", "ON", "BN", "NSM"}
# Joining-type sets consulted on either side of a ZERO WIDTH NON-JOINER.
_bidi_joiner_l_or_d = frozenset(("L", "D"))
_bidi_joiner_r_or_d = frozenset(("R", "D"))
def _joining_type(cp: int) -> Optional[str]:
    """Return the joining type recorded for codepoint ``cp``, if any.

    Scans ``idnadata.joining_types`` in iteration order and yields the key of
    the first entry whose ranges contain ``cp``.

    :param cp: The codepoint, as an integer.
    :returns: The matching joining-type key, or ``None`` when no entry
        contains ``cp``.
    """
    matches = (name for name, spans in idnadata.joining_types.items() if intranges_contain(cp, spans))
    return next(matches, None)
class IDNAError(UnicodeError):
    """Root of the exception hierarchy for IDNA encoding and decoding failures."""
class IDNABidiError(IDNAError):
    """Raised when a label violates the bidirectional-text requirements."""
class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""
class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not permitted in the context where it appears."""
def _combining_class(cp: int) -> int:
    """Return the canonical combining class of codepoint ``cp``.

    A combining class of ``0`` is ambiguous: it is reported both for real
    characters and for codepoints ``unicodedata`` knows nothing about. The
    two are told apart by checking whether the character has a name.

    :param cp: The codepoint, as an integer.
    :returns: The canonical combining class.
    :raises ValueError: If the combining class is ``0`` and the character
        has no name in ``unicodedata``.
    """
    char = chr(cp)
    combining = unicodedata.combining(char)
    # unicodedata.name() raises on its own when no default is supplied, which
    # made the explicit check below unreachable; pass a default so the
    # intended, descriptive error is the one that is raised.
    if combining == 0 and unicodedata.name(char, None) is None:
        raise ValueError("Unknown character in unicodedata")
    return combining
def _is_script(cp: str, script: str) -> bool:
    """Tell whether character ``cp`` falls in the ranges stored for ``script``.

    :param cp: A single character.
    :param script: Key into ``idnadata.scripts`` naming the script to test.
    :returns: The result of the range-membership test for ``ord(cp)``.
    """
    script_ranges = idnadata.scripts[script]
    code_point = ord(cp)
    return intranges_contain(code_point, script_ranges)
def _punycode(s: str) -> bytes:
    """Encode ``s`` with Python's built-in ``punycode`` codec.

    :param s: The text to encode.
    :returns: The Punycode form, without any ACE prefix.
    """
    encoded = s.encode(encoding="punycode")
    return encoded
def _unot(s: int) -> str:
    """Format codepoint ``s`` in ``U+XXXX`` notation.

    The hexadecimal value is upper-case and zero-padded to at least four
    digits.

    :param s: The codepoint, as an integer.
    :returns: The formatted notation, e.g. ``U+0041``.
    """
    return "U+{:04X}".format(s)
def valid_label_length(label: Union[bytes, str]) -> bool:
    """Report whether a label fits within the 63-unit DNS label limit.

    The limit comes from :rfc:`1035` (see also :rfc:`5891` §4.2.4). The
    length is whatever ``len()`` reports: characters for a :class:`str`
    U-label, octets for a :class:`bytes` A-label.

    :param label: The label to measure.
    :returns: ``True`` when the label is at most 63 units long, otherwise
        ``False``.
    """
    max_label_length = 63
    size = len(label)
    return size <= max_label_length
def valid_string_length(domain: Union[bytes, str], trailing_dot: bool) -> bool:
    """Report whether a whole domain name fits within the DNS length limit.

    Following :rfc:`1035`, the limit is 253 units without a trailing dot and
    254 units when the trailing dot is part of ``domain``.

    :param domain: The full (possibly multi-label) domain name.
    :param trailing_dot: ``True`` if ``domain`` includes a trailing ``.``.
    :returns: ``True`` when the domain does not exceed the applicable limit,
        otherwise ``False``.
    """
    if trailing_dot:
        limit = 254
    else:
        limit = 253
    return len(domain) <= limit
def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Check a single label against the Bidi Rule of :rfc:`5893`.

    The rule is enforced only when the label contains at least one character
    of bidirectional category ``R``, ``AL`` or ``AN``, unless ``check_ltr``
    asks for left-to-right-only labels to be checked as well.

    :param label: The label to validate, as a Unicode string.
    :param check_ltr: If true, apply the rule even when the label contains
        no right-to-left characters.
    :returns: ``True`` if the label passes (or the rule does not apply).
    :raises IDNABidiError: If a codepoint has no known directionality or any
        of Bidi Rule conditions 1-6 is violated.
    :raises IDNAError: If ``label`` exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")

    # First pass: every codepoint needs a known directionality, and the Bidi
    # Rule only applies if at least one of them is right-to-left.
    has_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(f"Unknown directionality in label {label!r} at position {position}")
        if bidi_class in _bidi_rtl_categories:
            has_rtl = True
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1: the first codepoint fixes the direction of the label.
    first_class = unicodedata.bidirectional(label[0])
    if first_class in _bidi_rtl_first:
        rtl = True
    elif first_class == "L":
        rtl = False
    else:
        raise IDNABidiError(f"First codepoint in label {label!r} must be directionality L, R or AL")

    ends_validly = False
    numeral_kind: Optional[str] = None
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if not rtl:
            # Bidi rule 5
            if bidi_class not in _bidi_ltr_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {position} in a left-to-right label")
            # Bidi rule 6: trailing NSMs do not change the verdict
            if bidi_class != "NSM":
                ends_validly = bidi_class in _bidi_ltr_valid_ending
            continue

        # Bidi rule 2
        if bidi_class not in _bidi_rtl_allowed:
            raise IDNABidiError(f"Invalid direction for codepoint at position {position} in a right-to-left label")
        # Bidi rule 3: trailing NSMs do not change the verdict
        if bidi_class != "NSM":
            ends_validly = bidi_class in _bidi_rtl_valid_ending
        # Bidi rule 4: only one kind of numeral (AN or EN) per label
        if bidi_class in _bidi_rtl_numeric:
            if numeral_kind is None:
                numeral_kind = bidi_class
            elif numeral_kind != bidi_class:
                raise IDNABidiError("Can not mix numeral types in a right-to-left label")

    if not ends_validly:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True
def check_initial_combiner(label: str) -> bool:
    """Reject labels that begin with a combining mark.

    Per :rfc:`5891` §4.2.3.2 a label must not start with a character of
    Unicode general category ``M`` (Mark).

    An empty label has no first character and therefore passes this check
    instead of failing with an unrelated :class:`IndexError`.

    :param label: The label to check.
    :returns: ``True`` if the label does not start with a combining mark.
    :raises IDNAError: If the label begins with a combining character.
    """
    # Guard on emptiness so that label[0] can never raise IndexError.
    if label and unicodedata.category(label[0]).startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True
def check_hyphen_ok(label: str) -> bool:
    """Validate the hyphen restrictions for a label.

    Per :rfc:`5891` §4.2.3.1 a label must not start or end with a hyphen
    (``U+002D``), and must not have hyphens in both the third and fourth
    positions (the prefix reserved for A-labels).

    An empty label contains no hyphens and therefore passes this check
    instead of failing with an unrelated :class:`IndexError`.

    :param label: The label to check.
    :returns: ``True`` if the hyphen restrictions are satisfied.
    :raises IDNAError: If any of the hyphen restrictions are violated.
    """
    if label[2:4] == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    # startswith/endswith are safe on an empty string, unlike label[0]/label[-1].
    if label.startswith("-") or label.endswith("-"):
        raise IDNAError("Label must not start or end with a hyphen")
    return True
def check_nfc(label: str) -> None:
    """Ensure that a label is already in Unicode Normalization Form C.

    :param label: The label to check.
    :raises IDNAError: If ``label`` exceeds the defensive input length
        limit, or if normalising it to NFC would change it.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    normalized = unicodedata.normalize("NFC", label)
    if normalized != label:
        raise IDNAError("Label must be in Normalization Form C")
def valid_contextj(label: str, pos: int) -> bool:
    """Evaluate the CONTEXTJ rule of :rfc:`5892` Appendix A at ``pos``.

    Two joiner codepoints are handled:

    * ``U+200D`` (ZERO WIDTH JOINER) is valid only directly after a
      character whose combining class is that of a virama.
    * ``U+200C`` (ZERO WIDTH NON-JOINER) is valid directly after such a
      character, or when, skipping characters of joining type ``T``, the
      nearest character before it has joining type ``L`` or ``D`` and the
      nearest character after it has joining type ``R`` or ``D``.

    :param label: The label containing the codepoint.
    :param pos: Index of the joiner codepoint within ``label``.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its rule,
        ``False`` otherwise (including when it is not one of the two
        joiners).
    :raises ValueError: If the combining class of the preceding codepoint
        cannot be determined.
    :raises IDNAError: If ``label`` exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    joiner = ord(label[pos])

    if joiner == 0x200D:
        return pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class

    if joiner != 0x200C:
        return False

    # ZERO WIDTH NON-JOINER: a preceding virama is sufficient on its own.
    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True

    # Otherwise look left for the nearest non-transparent character ...
    left_ok = False
    for idx in reversed(range(pos)):
        kind = _joining_type(ord(label[idx]))
        if kind == "T":
            continue
        left_ok = kind in _bidi_joiner_l_or_d
        break
    if not left_ok:
        return False

    # ... and right for the nearest non-transparent character.
    for idx in range(pos + 1, len(label)):
        kind = _joining_type(ord(label[idx]))
        if kind != "T":
            return kind in _bidi_joiner_r_or_d
    return False
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Evaluate the CONTEXTO rule of :rfc:`5892` Appendix A at ``pos``.

    Rules implemented, keyed by the codepoint found at ``pos``:

    * ``U+00B7`` (MIDDLE DOT): must sit between two ``l`` (``U+006C``).
    * ``U+0375``: must be followed by a Greek-script character.
    * ``U+05F3`` / ``U+05F4``: must be preceded by a Hebrew-script character.
    * ``U+30FB``: the label must contain at least one other character in the
      Hiragana, Katakana or Han scripts.
    * ``U+0660``-``U+0669``: the label must contain no ``U+06F0``-``U+06F9``.
    * ``U+06F0``-``U+06F9``: the label must contain no ``U+0660``-``U+0669``.

    :param label: The label containing the codepoint.
    :param pos: Index of the codepoint within ``label``.
    :param exception: Not used by this function.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its rule,
        ``False`` otherwise (including when no rule covers the codepoint).
    :raises IDNAError: If ``label`` exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    code = ord(label[pos])
    last_index = len(label) - 1

    if code == 0x00B7:
        if not 0 < pos < last_index:
            return False
        return label[pos - 1] == "\u006c" and label[pos + 1] == "\u006c"

    if code == 0x0375:
        if len(label) > 1 and pos < last_index:
            return _is_script(label[pos + 1], "Greek")
        return False

    if code in {0x05F3, 0x05F4}:
        return _is_script(label[pos - 1], "Hebrew") if pos > 0 else False

    if code == 0x30FB:
        return any(
            _is_script(char, "Hiragana") or _is_script(char, "Katakana") or _is_script(char, "Han")
            for char in label
            if char != "\u30fb"
        )

    if 0x660 <= code <= 0x669:
        return all(not (0x6F0 <= ord(char) <= 0x06F9) for char in label)

    if 0x6F0 <= code <= 0x6F9:
        return all(not (0x660 <= ord(char) <= 0x0669) for char in label)

    return False
def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Run the full set of IDNA 2008 validity checks on a single label.

    Applies, in order: NFC normalisation (:func:`check_nfc`), hyphen
    restrictions (:func:`check_hyphen_ok`), the no-leading-combiner rule
    (:func:`check_initial_combiner`), per-codepoint validity (PVALID,
    CONTEXTJ, CONTEXTO classes from :rfc:`5892`), and the Bidi Rule
    (:func:`check_bidi`).

    :param label: The label to validate. ``bytes`` or ``bytearray`` input
        is decoded as UTF-8 first.
    :raises IDNAError: If the label is empty, too long, fails a structural
        rule, or has a joiner next to a codepoint whose combining class
        cannot be determined.
    :raises InvalidCodepoint: If the label contains a codepoint that is
        neither PVALID, CONTEXTJ nor CONTEXTO.
    :raises InvalidCodepointContext: If a CONTEXTJ or CONTEXTO codepoint
        is not valid in its context.
    :raises IDNABidiError: If the Bidi Rule is violated.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    # Reject on domain length rather than label length to support some UTS 46
    # use cases, while still reducing processing of label contextual rules.
    if not valid_string_length(label, trailing_dot=True):
        raise IDNAError("Label too long")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        if intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Only the lookup itself is guarded. IDNAError derives from
            # UnicodeError, which is a ValueError, so raising
            # InvalidCodepointContext inside this try would be caught below
            # and misreported as an "unknown codepoint" failure.
            try:
                joiner_ok = valid_contextj(label, pos)
            except IDNAError:
                # Already a precise IDNA failure; do not relabel it.
                raise
            except ValueError as err:
                raise IDNAError(
                    f"Unknown codepoint adjacent to joiner {_unot(cp_value)} at position {pos + 1} in {label!r}"
                ) from err
            if not joiner_ok:
                raise InvalidCodepointContext(f"Joiner {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(f"Codepoint {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(cp_value)} at position {pos + 1} of {label!r} not allowed")

    check_bidi(label)
def alabel(label: str) -> bytes:
    """Produce the A-label (ASCII-compatible) form of a single label.

    A label that is already pure ASCII is validated through :func:`ulabel`
    and returned as :class:`bytes`. Any other label is validated with
    :func:`check_label`, Punycode-encoded and prefixed with ``xn--``
    (:rfc:`5891` §4).

    :param label: The label to convert, as a Unicode string.
    :returns: The A-label as ASCII-encoded :class:`bytes`.
    :raises IDNAError: If the label is invalid, or the result is longer
        than 63 octets.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")

    try:
        ascii_form = label.encode("ascii")
    except UnicodeEncodeError:
        ascii_form = None

    if ascii_form is not None:
        # Already ASCII: validate only (the decoded value is not needed).
        ulabel(ascii_form)
        if not valid_label_length(ascii_form):
            raise IDNAError("Label too long")
        return ascii_form

    check_label(label)
    ace_form = _alabel_prefix + _punycode(label)
    if not valid_label_length(ace_form):
        raise IDNAError("Label too long")
    return ace_form
def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Produce the U-label (Unicode) form of a single label.

    This is the inverse of :func:`alabel`. A label carrying the ``xn--``
    prefix (compared case-insensitively) is Punycode-decoded and validated;
    a non-ASCII string is validated and returned unchanged; any other ASCII
    label is lower-cased, validated and returned as a string.

    :param label: The label to convert. ``bytes`` or ``bytearray`` input
        is treated as ASCII.
    :returns: The U-label as a Unicode string.
    :raises IDNAError: If the label is malformed or fails validation.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")

    if isinstance(label, (bytes, bytearray)):
        raw = bytes(label)
    else:
        try:
            raw = label.encode("ascii")
        except UnicodeEncodeError:
            # Not ASCII, so it is already in Unicode form: just validate it.
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode("ascii")

    payload = raw[len(_alabel_prefix) :]
    if not payload:
        raise IDNAError("Malformed A-label, no Punycode eligible content found")
    if payload.endswith(b"-"):
        raise IDNAError("A-label must not end with a hyphen")

    try:
        decoded = payload.decode("punycode")
    except UnicodeError as err:
        raise IDNAError("Invalid A-label") from err
    check_label(decoded)
    return decoded
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Apply the UTS #46 character mapping to a domain string.

    Implements the mapping table from `UTS #46 §4
    <https://www.unicode.org/reports/tr46/>`_: each character is kept,
    replaced, or rejected based on its status (``V``, ``M``, ``D``, ``3``,
    ``I``). The result is returned in Normalisation Form C.

    :param domain: The full domain name to remap.
    :param std3_rules: If ``True``, apply the stricter STD3 ASCII rules
        (status ``3`` codepoints raise instead of being kept or mapped).
    :param transitional: If ``True``, use transitional processing (status
        ``D`` codepoints are mapped instead of kept). Transitional
        processing has been removed from UTS #46 and this option is
        retained only for backwards compatibility.
    :returns: The remapped domain, in Normalisation Form C.
    :raises InvalidCodepoint: If the domain contains a disallowed
        codepoint under the chosen rules.
    :raises IDNAError: If ``domain`` exceeds the defensive input length limit.
    """
    if len(domain) > _max_input_length:
        raise IDNAError("Domain too long")
    from .uts46data import uts46_replacements, uts46_starts, uts46_statuses

    # Collect output fragments and join once, rather than growing a string
    # with ``+=`` on every character.
    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        # Codepoints below 256 index the tables directly; others are located
        # by binary search over the range start values.
        i = code_point if code_point < 256 else bisect.bisect_right(uts46_starts, code_point) - 1
        status = chr(uts46_statuses[i])
        replacement: Optional[str] = uts46_replacements[i]

        # UTS #46 §4: V is always valid, D is deviation (kept unless transitional),
        # 3 is disallowed-STD3 (kept unmapped if std3_rules is off and no mapping).
        keep_as_is = (
            status == "V" or (status == "D" and not transitional) or (status == "3" and not std3_rules and replacement is None)
        )
        # M is mapped, 3-with-replacement and transitional D fall through to the
        # same replacement output path.
        use_replacement = replacement is not None and (
            status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
        )

        if keep_as_is:
            pieces.append(char)
        elif use_replacement:
            assert replacement is not None  # narrowed by use_replacement
            pieces.append(replacement)
        elif status == "I":
            # Ignored codepoints contribute nothing to the output.
            continue
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(code_point)} not allowed at position {pos + 1} in {domain!r}")

    return unicodedata.normalize("NFC", "".join(pieces))
def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Convert a domain name to its ASCII (A-label) representation.

    The input is split into labels, each label is converted with
    :func:`alabel`, and the results are joined with ``.``. With ``strict``
    only ``U+002E`` separates labels; otherwise ``U+3002``, ``U+FF0E`` and
    ``U+FF61`` are recognised too. A trailing separator is preserved as a
    trailing ``.``. When ``uts46`` is set the input first passes through
    :func:`uts46_remap`.

    :param s: The domain name to encode. Non-``str`` input is decoded as
        ASCII.
    :param strict: If ``True``, only ``U+002E`` is recognised as a label
        separator.
    :param uts46: If ``True``, apply UTS #46 mapping before encoding.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :param transitional: Forwarded to :func:`uts46_remap` when ``uts46``
        is ``True``. Deprecated: a true value emits a
        :class:`DeprecationWarning`.
    :returns: The encoded domain as ASCII :class:`bytes`.
    :raises IDNAError: If the domain is empty, contains an empty or invalid
        label, cannot be decoded as ASCII, or is too long.
    """
    if transitional:
        warnings.warn(
            "Transitional processing has been removed from UTS #46. "
            "The transitional argument will be removed in a future version.",
            DeprecationWarning,
            stacklevel=2,
        )

    if isinstance(s, str):
        domain = s
    else:
        try:
            domain = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.") from err

    if len(domain) > _max_input_length:
        raise IDNAError("Domain too long")
    if uts46:
        domain = uts46_remap(domain, std3_rules, transitional)

    # Bail out early on over-long input so that no per-label work is done
    # for a domain that can never be valid.
    if not valid_string_length(domain, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = domain.split(".") if strict else _unicode_dots_re.split(domain)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final label means the domain ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        labels.pop()

    encoded_labels = []
    for label in labels:
        ace = alabel(label)
        if not ace:
            raise IDNAError("Empty label")
        encoded_labels.append(ace)
    if trailing_dot:
        encoded_labels.append(b"")

    encoded = b".".join(encoded_labels)
    if not valid_string_length(encoded, trailing_dot):
        raise IDNAError("Domain too long")
    return encoded
def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Convert an A-label-encoded domain name back to Unicode.

    The input is split into labels, each label is converted with
    :func:`ulabel`, and the results are joined with ``.``. With ``strict``
    only ``U+002E`` separates labels; otherwise ``U+3002``, ``U+FF0E`` and
    ``U+FF61`` are recognised too. A trailing separator is preserved as a
    trailing ``.``. When ``uts46`` is set the input first passes through
    :func:`uts46_remap` (never in transitional mode).

    :param s: The domain name to decode. Non-``str`` input is decoded as
        ASCII.
    :param strict: If ``True``, only ``U+002E`` is recognised as a label
        separator.
    :param uts46: If ``True``, apply UTS #46 mapping before decoding.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :returns: The decoded domain as a Unicode string.
    :raises IDNAError: If the input is not valid ASCII, is too long, is
        empty, or contains an empty or invalid label.
    """
    if isinstance(s, str):
        domain = s
    else:
        try:
            domain = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("Invalid ASCII in A-label") from err

    if len(domain) > _max_input_length:
        raise IDNAError("Domain too long")
    if uts46:
        domain = uts46_remap(domain, std3_rules, False)

    # Bail out early on over-long input so that no per-label work is done
    # for a domain that can never be valid.
    if not valid_string_length(domain, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = domain.split(".") if strict else _unicode_dots_re.split(domain)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final label means the domain ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        labels.pop()

    decoded_labels = []
    for label in labels:
        text = ulabel(label)
        if not text:
            raise IDNAError("Empty label")
        decoded_labels.append(text)
    if trailing_dot:
        decoded_labels.append("")

    return ".".join(decoded_labels)