Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/idna/core.py: 14%
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
Shortcuts on this page
r m x toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1import bisect
2import re
3import unicodedata
4import warnings
5from typing import Optional, Union
7from . import idnadata
8from .intranges import intranges_contain
10_virama_combining_class = 9
11_alabel_prefix = b"xn--"
12_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")
15# Bidi category sets from RFC 5893, hoisted out of the per-codepoint loop
16_bidi_rtl_first = frozenset({"R", "AL"})
17_bidi_rtl_categories = frozenset({"R", "AL", "AN"})
18_bidi_rtl_allowed = frozenset({"R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})
19_bidi_rtl_valid_ending = frozenset({"R", "AL", "EN", "AN"})
20_bidi_rtl_numeric = frozenset({"AN", "EN"})
21_bidi_ltr_allowed = frozenset({"L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})
22_bidi_ltr_valid_ending = frozenset({"L", "EN"})
23_bidi_joiner_l_or_d = frozenset({ord("L"), ord("D")})
24_bidi_joiner_r_or_d = frozenset({ord("R"), ord("D")})
class IDNAError(UnicodeError):
    """Root of the exception hierarchy used for every IDNA processing failure."""
class IDNABidiError(IDNAError):
    """Raised when a label does not meet the bidirectional (Bidi Rule) requirements."""
class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""
class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not permitted in the context where it appears."""
43def _combining_class(cp: int) -> int:
44 v = unicodedata.combining(chr(cp))
45 if v == 0 and not unicodedata.name(chr(cp)):
46 raise ValueError("Unknown character in unicodedata")
47 return v
def _is_script(cp: str, script: str) -> bool:
    """Report whether the character ``cp`` lies in the ranges listed for ``script``.

    :param cp: A single character.
    :param script: Key into ``idnadata.scripts``.
    """
    codepoint = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(codepoint, script_ranges)
54def _punycode(s: str) -> bytes:
55 return s.encode("punycode")
58def _unot(s: int) -> str:
59 return f"U+{s:04X}"
def valid_label_length(label: Union[bytes, str]) -> bool:
    """Tell whether a label stays within the 63-unit DNS label limit.

    The limit comes from :rfc:`1035` (see also :rfc:`5891` §4.2.4). The
    length is whatever ``len(label)`` reports: characters for a
    :class:`str` (U-label), octets for :class:`bytes` (A-label).

    :param label: The label to measure.
    :returns: ``True`` if the label has at most 63 units, otherwise
        ``False``.
    """
    too_long = len(label) > 63
    return not too_long
def valid_string_length(domain: Union[bytes, str], trailing_dot: bool) -> bool:
    """Tell whether a whole domain name stays within the DNS length limit.

    Following :rfc:`1035`, the limit is 253 units without a trailing dot
    and 254 when the trailing dot is counted.

    :param domain: The full (possibly multi-label) domain name.
    :param trailing_dot: ``True`` if ``domain`` includes a trailing ``.``.
    :returns: ``True`` if ``len(domain)`` does not exceed the applicable
        limit, otherwise ``False``.
    """
    if trailing_dot:
        return len(domain) <= 254
    return len(domain) <= 253
def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Check a single label against the Bidi Rule of :rfc:`5893`.

    The bidirectional category of every codepoint is looked up first.
    Unless ``check_ltr`` is set, the six rule conditions are enforced only
    when at least one codepoint has category ``R``, ``AL`` or ``AN``.

    :param label: The label to validate, as a Unicode string.
    :param check_ltr: Enforce the rule even when the label contains no
        right-to-left codepoint.
    :returns: ``True`` when the rule does not apply or is satisfied.
    :raises IDNABidiError: If a codepoint has no bidirectional category, or
        if one of the Bidi Rule conditions 1-6 is violated.
    """
    # Single lookup pass: remember each category and note whether any
    # right-to-left codepoint is present.
    categories = []
    saw_rtl = False
    for position, char in enumerate(label, 1):
        category = unicodedata.bidirectional(char)
        if category == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(f"Unknown directionality in label {label!r} at position {position}")
        saw_rtl = saw_rtl or category in _bidi_rtl_categories
        categories.append(category)
    if not (saw_rtl or check_ltr):
        return True

    # Bidi rule 1: the first codepoint decides the direction of the label.
    leading = unicodedata.bidirectional(label[0])
    if leading in _bidi_rtl_first:
        rtl = True
    elif leading == "L":
        rtl = False
    else:
        raise IDNABidiError(f"First codepoint in label {label!r} must be directionality L, R or AL")

    ends_validly = False
    if rtl:
        numeral_kind: Optional[str] = None
        for position, category in enumerate(categories, 1):
            # Bidi rule 2
            if category not in _bidi_rtl_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {position} in a right-to-left label")
            # Bidi rule 3 (NSM codepoints leave the current verdict untouched)
            if category in _bidi_rtl_valid_ending:
                ends_validly = True
            elif category != "NSM":
                ends_validly = False
            # Bidi rule 4
            if category in _bidi_rtl_numeric:
                if not numeral_kind:
                    numeral_kind = category
                elif numeral_kind != category:
                    raise IDNABidiError("Can not mix numeral types in a right-to-left label")
    else:
        for position, category in enumerate(categories, 1):
            # Bidi rule 5
            if category not in _bidi_ltr_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {position} in a left-to-right label")
            # Bidi rule 6 (NSM codepoints leave the current verdict untouched)
            if category in _bidi_ltr_valid_ending:
                ends_validly = True
            elif category != "NSM":
                ends_validly = False

    if not ends_validly:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True
def check_initial_combiner(label: str) -> bool:
    """Reject labels that begin with a combining mark.

    Per :rfc:`5891` §4.2.3.2 a label must not start with a character of
    Unicode general category ``M`` (Mark). An empty label has no first
    character and therefore passes this check.

    :param label: The label to check.
    :returns: ``True`` if the label is empty or its first character is not
        a combining mark.
    :raises IDNAError: If the label begins with a combining character.
    """
    # Guard on emptiness so an empty label yields a result instead of an
    # IndexError from ``label[0]``.
    if label and unicodedata.category(label[0]).startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True
def check_hyphen_ok(label: str) -> bool:
    """Validate the hyphen restrictions for a label.

    Per :rfc:`5891` §4.2.3.1 a label must not start or end with a hyphen
    (``U+002D``), and must not have hyphens in both the third and fourth
    positions (the prefix reserved for A-labels). An empty label contains
    no hyphen and therefore passes this check.

    :param label: The label to check.
    :returns: ``True`` if the hyphen restrictions are satisfied.
    :raises IDNAError: If any of the hyphen restrictions are violated.
    """
    if label[2:4] == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    # startswith/endswith are safe on an empty label, unlike label[0] and
    # label[-1], which raise IndexError.
    if label.startswith("-") or label.endswith("-"):
        raise IDNAError("Label must not start or end with a hyphen")
    return True
def check_nfc(label: str) -> None:
    """Insist that a label is already in Unicode Normalization Form C.

    :param label: The label to check.
    :raises IDNAError: If normalising ``label`` to NFC changes it.
    """
    normalised = unicodedata.normalize("NFC", label)
    if normalised == label:
        return
    raise IDNAError("Label must be in Normalization Form C")
def _nearest_joining_type(chars):
    """Return the joining type of the first non-transparent character.

    Characters whose joining type is ``T`` are skipped. ``None`` is returned
    when ``chars`` runs out, or when the first non-transparent character has
    no entry in the joining-type table.
    """
    for char in chars:
        joining_type = idnadata.joining_types().get(ord(char))
        if joining_type != ord("T"):
            return joining_type
    return None


def valid_contextj(label: str, pos: int) -> bool:
    """Apply the CONTEXTJ rules of :rfc:`5892` Appendix A to one codepoint.

    Two joiners are handled: ``U+200C`` (ZERO WIDTH NON-JOINER, Appendix
    A.1) and ``U+200D`` (ZERO WIDTH JOINER, Appendix A.2). Both are accepted
    directly after a character of the virama combining class. ``U+200C`` is
    additionally accepted when, ignoring transparent characters, it is
    preceded by a character of joining type ``L`` or ``D`` and followed by
    one of joining type ``R`` or ``D``.

    :param label: The label containing the codepoint.
    :param pos: Index of the joiner codepoint within ``label``.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its rule,
        ``False`` otherwise (including when it is neither joiner).
    :raises ValueError: If the combining class of the preceding codepoint
        cannot be determined.
    """
    codepoint = ord(label[pos])
    if codepoint not in (0x200C, 0x200D):
        return False

    follows_virama = pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class
    if codepoint == 0x200D:
        return follows_virama
    if follows_virama:
        return True

    # U+200C only: walk outwards from the joiner in both directions.
    before = _nearest_joining_type(label[i] for i in range(pos - 1, -1, -1))
    if before not in _bidi_joiner_l_or_d:
        return False
    after = _nearest_joining_type(label[i] for i in range(pos + 1, len(label)))
    return after in _bidi_joiner_r_or_d
def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Apply the CONTEXTO rules of :rfc:`5892` Appendix A to one codepoint.

    Handled codepoints: ``U+00B7`` (must sit between two ``l``), ``U+0375``
    (must be followed by a Greek character), ``U+05F3`` / ``U+05F4`` (must be
    preceded by a Hebrew character), ``U+30FB`` (the label must contain a
    Hiragana, Katakana or Han character), and the digit ranges
    ``U+0660..U+0669`` and ``U+06F0..U+06F9`` (which must not be mixed within
    one label).

    :param label: The label containing the codepoint.
    :param pos: Index of the codepoint within ``label``.
    :param exception: Not used by this function.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its rule,
        ``False`` otherwise (including when no rule exists for it).
    """
    codepoint = ord(label[pos])
    last_index = len(label) - 1

    if codepoint == 0x00B7:
        if not 0 < pos < last_index:
            return False
        return ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C

    if codepoint == 0x0375:
        has_follower = len(label) > 1 and pos < last_index
        return _is_script(label[pos + 1], "Greek") if has_follower else False

    if codepoint in {0x05F3, 0x05F4}:
        return _is_script(label[pos - 1], "Hebrew") if pos > 0 else False

    if codepoint == 0x30FB:
        return any(
            _is_script(char, "Hiragana") or _is_script(char, "Katakana") or _is_script(char, "Han")
            for char in label
            if char != "\u30fb"
        )

    if 0x660 <= codepoint <= 0x669:
        return all(not 0x6F0 <= ord(char) <= 0x06F9 for char in label)

    if 0x6F0 <= codepoint <= 0x6F9:
        return all(not 0x660 <= ord(char) <= 0x0669 for char in label)

    return False
def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Run the full set of IDNA 2008 validity checks on a single label.

    Applies, in order: NFC normalisation (:func:`check_nfc`), hyphen
    restrictions (:func:`check_hyphen_ok`), the no-leading-combiner rule
    (:func:`check_initial_combiner`), per-codepoint validity (PVALID,
    CONTEXTJ, CONTEXTO classes from :rfc:`5892`), and the Bidi Rule
    (:func:`check_bidi`).

    :param label: The label to validate. ``bytes`` or ``bytearray`` input
        is decoded as UTF-8 first.
    :raises IDNAError: If the label is empty, too long, fails a structural
        rule, or has a codepoint next to a joiner whose combining class
        cannot be determined.
    :raises InvalidCodepoint: If the label contains a codepoint that is
        neither PVALID, CONTEXTJ nor CONTEXTO.
    :raises InvalidCodepointContext: If a CONTEXTJ or CONTEXTO codepoint
        is not valid in its context.
    :raises IDNABidiError: If the Bidi Rule is violated.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if not label:
        raise IDNAError("Empty Label")

    # Reject on domain length rather than label length to support some UTS 46
    # use cases, while still reducing processing of label contextual rules
    if not valid_string_length(label, trailing_dot=True):
        raise IDNAError("Label too long")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        if intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Guard only the lookup. InvalidCodepointContext is itself a
            # ValueError (through UnicodeError), so raising it inside the try
            # would be caught below and misreported as an unknown codepoint.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError as err:
                raise IDNAError(
                    f"Unknown codepoint adjacent to joiner {_unot(cp_value)} at position {pos + 1} in {label!r}"
                ) from err
            if not joiner_ok:
                raise InvalidCodepointContext(f"Joiner {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(f"Codepoint {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(cp_value)} at position {pos + 1} of {label!r} not allowed")

    check_bidi(label)
def alabel(label: str) -> bytes:
    """Turn a single U-label into its A-label (ASCII-Compatible Encoding).

    A label that is already pure ASCII is validated through :func:`ulabel`
    and handed back unchanged as :class:`bytes`. Any other label is
    validated with :func:`check_label`, Punycode-encoded and prefixed with
    ``xn--`` (see :rfc:`5891` §4).

    :param label: The label to convert, as a Unicode string.
    :returns: The A-label as ASCII-encoded :class:`bytes`.
    :raises IDNAError: If the label is invalid or the resulting A-label
        exceeds 63 octets.
    """
    try:
        ascii_form = label.encode("ascii")
    except UnicodeEncodeError:
        ascii_form = None

    if ascii_form is not None:
        # Already ASCII: validate (return value not needed) and pass through.
        ulabel(ascii_form)
        if not valid_label_length(ascii_form):
            raise IDNAError("Label too long")
        return ascii_form

    check_label(label)
    encoded = _alabel_prefix + _punycode(label)
    if not valid_label_length(encoded):
        raise IDNAError("Label too long")
    return encoded
def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Turn a single A-label into its U-label form.

    This is the inverse of :func:`alabel`. A label that starts with
    ``xn--`` (compared case-insensitively) is Punycode-decoded and
    validated. A non-ASCII string is validated and returned as given. Plain
    ASCII without the prefix is lower-cased, validated and returned as a
    string.

    :param label: The label to convert. ``bytes`` or ``bytearray`` input
        is treated as ASCII.
    :returns: The U-label as a Unicode string.
    :raises IDNAError: If the label is malformed or fails validation.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = bytes(label)
    else:
        try:
            raw = label.encode("ascii")
        except UnicodeEncodeError:
            # Not ASCII, so it is already a candidate U-label.
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode("ascii")

    payload = raw[len(_alabel_prefix) :]
    if not payload:
        raise IDNAError("Malformed A-label, no Punycode eligible content found")
    if payload.endswith(b"-"):
        raise IDNAError("A-label must not end with a hyphen")

    try:
        decoded = payload.decode("punycode")
    except UnicodeError as err:
        raise IDNAError("Invalid A-label") from err
    check_label(decoded)
    return decoded
def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Apply the UTS #46 character mapping to a domain string.

    Implements the mapping table from `UTS #46 §4
    <https://www.unicode.org/reports/tr46/>`_: each character is kept,
    replaced, dropped, or rejected based on its status (``V``, ``M``,
    ``D``, ``3``, ``I``). The result is returned in Normalisation Form C.

    :param domain: The full domain name to remap.
    :param std3_rules: If ``True``, apply the stricter STD3 ASCII rules
        (status ``3`` codepoints raise instead of being kept or mapped).
    :param transitional: If ``True``, use transitional processing (status
        ``D`` codepoints are mapped instead of kept). Transitional
        processing has been removed from UTS #46 and this option is
        retained only for backwards compatibility.
    :returns: The remapped domain, in Normalisation Form C.
    :raises InvalidCodepoint: If the domain contains a disallowed
        codepoint under the chosen rules.
    """
    from .uts46data import uts46data

    # Collect the pieces and join once: callers run this before any length
    # check, so repeated ``str +=`` could go quadratic on long input.
    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        # The first 256 rows are indexed directly by codepoint; beyond that
        # the row is located by bisecting on the codepoint.
        if code_point < 256:
            uts46row = uts46data[code_point]
        else:
            uts46row = uts46data[bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
        status = uts46row[1]
        replacement: Optional[str] = None
        if len(uts46row) == 3:
            replacement = uts46row[2]  # ty: ignore[index-out-of-bounds]

        # UTS #46 §4: V is always valid, D is deviation (kept unless transitional),
        # 3 is disallowed-STD3 (kept unmapped if std3_rules is off and no mapping).
        keep_as_is = (
            status == "V" or (status == "D" and not transitional) or (status == "3" and not std3_rules and replacement is None)
        )
        # M is mapped, 3-with-replacement and transitional D fall through to the
        # same replacement output path.
        use_replacement = replacement is not None and (
            status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
        )

        if keep_as_is:
            pieces.append(char)
        elif use_replacement:
            assert replacement is not None  # narrowed by use_replacement
            pieces.append(replacement)
        elif status != "I":
            # Status I (ignored) contributes nothing; anything else is rejected.
            raise InvalidCodepoint(f"Codepoint {_unot(code_point)} not allowed at position {pos + 1} in {domain!r}")

    return unicodedata.normalize("NFC", "".join(pieces))
def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Encode a domain name into its ASCII (A-label) form.

    The input is split into labels, each label is converted with
    :func:`alabel`, and the results are joined with ``.``. With ``strict``
    only ``U+002E`` separates labels; otherwise ``U+3002``, ``U+FF0E`` and
    ``U+FF61`` are accepted as well. A single trailing separator is
    preserved as a trailing ``.`` in the output.

    :param s: The domain name to encode. Non-``str`` input is decoded as
        ASCII first.
    :param strict: If ``True``, only ``U+002E`` is recognised as a label
        separator.
    :param uts46: If ``True``, run the input through :func:`uts46_remap`
        before splitting.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :param transitional: Forwarded to :func:`uts46_remap` when ``uts46``
        is ``True``. Deprecated: a :class:`DeprecationWarning` is emitted
        whenever it is set.
    :returns: The encoded domain as ASCII :class:`bytes`.
    :raises IDNAError: If the input cannot be decoded as ASCII, the domain
        or one of its labels is empty or invalid, or the domain exceeds
        the maximum length.
    """
    if transitional:
        warnings.warn(
            "Transitional processing has been removed from UTS #46. "
            "The transitional argument will be removed in a future version.",
            DeprecationWarning,
            stacklevel=2,
        )

    if isinstance(s, str):
        domain = s
    else:
        try:
            domain = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.") from err
    if uts46:
        domain = uts46_remap(domain, std3_rules, transitional)

    # Cheap up-front bound so over-long input is rejected before the
    # per-label work begins.
    if not valid_string_length(domain, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = domain.split(".") if strict else _unicode_dots_re.split(domain)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final piece means the input ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        labels.pop()

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b"")

    result = b".".join(encoded_labels)
    if not valid_string_length(result, trailing_dot):
        raise IDNAError("Domain too long")
    return result
def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode a domain name from its A-label form back to Unicode.

    The input is split into labels using the same separator rules as
    :func:`encode`, each label is converted with :func:`ulabel`, and the
    results are joined with ``.``. A single trailing separator is preserved
    as a trailing ``.`` in the output.

    :param s: The domain name to decode. Non-``str`` input is decoded as
        ASCII first.
    :param strict: If ``True``, only ``U+002E`` is recognised as a label
        separator.
    :param uts46: If ``True``, run the input through :func:`uts46_remap`
        (non-transitional) before splitting.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :returns: The decoded domain as a Unicode string.
    :raises IDNAError: If the input is not valid ASCII, is too long, is
        empty, or contains an empty or invalid label.
    """
    if isinstance(s, str):
        domain = s
    else:
        try:
            domain = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("Invalid ASCII in A-label") from err
    if uts46:
        domain = uts46_remap(domain, std3_rules, False)

    # Cheap up-front bound so over-long input is rejected before the
    # per-label work begins.
    if not valid_string_length(domain, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = domain.split(".") if strict else _unicode_dots_re.split(domain)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final piece means the input ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        labels.pop()

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append("")
    return ".".join(decoded_labels)