Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/idna/core.py: 45%

1import bisect

2import re

3import unicodedata

4import warnings

5from typing import Optional, Union

7from . import idnadata

8from .intranges import intranges_contain

10_virama_combining_class = 9

11_alabel_prefix = b"xn--"

12_max_input_length = 1024

13_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")

16# Bidi category sets from RFC 5893, hoisted out of the per-codepoint loop

17_bidi_rtl_first = frozenset({"R", "AL"})

18_bidi_rtl_categories = frozenset({"R", "AL", "AN"})

19_bidi_rtl_allowed = frozenset({"R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})

20_bidi_rtl_valid_ending = frozenset({"R", "AL", "EN", "AN"})

21_bidi_rtl_numeric = frozenset({"AN", "EN"})

22_bidi_ltr_allowed = frozenset({"L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})

23_bidi_ltr_valid_ending = frozenset({"L", "EN"})

24_bidi_joiner_l_or_d = frozenset({"L", "D"})

25_bidi_joiner_r_or_d = frozenset({"R", "D"})

def _joining_type(cp: int) -> Optional[str]:
    """Return the joining type recorded for a codepoint, if any.

    Walks ``idnadata.joining_types`` (joining type -> codepoint ranges) and
    returns the key of the first entry whose ranges contain ``cp``.

    :param cp: The codepoint, as an integer.
    :returns: The joining type key, or ``None`` if no entry contains ``cp``.
    """
    for jt, ranges in idnadata.joining_types.items():
        if intranges_contain(cp, ranges):
            return jt
    return None

class IDNAError(UnicodeError):
    """Base exception for all IDNA-encoding related problems"""

class IDNABidiError(IDNAError):
    """Exception when bidirectional requirements are not satisfied"""

class InvalidCodepoint(IDNAError):
    """Exception when a disallowed or unallocated codepoint is used"""

class InvalidCodepointContext(IDNAError):
    """Exception when the codepoint is not valid in the context it is used"""

51def _combining_class(cp: int) -> int:

52 v = unicodedata.combining(chr(cp))

53 if v == 0 and not unicodedata.name(chr(cp)):

54 raise ValueError("Unknown character in unicodedata")

55 return v

def _is_script(cp: str, script: str) -> bool:
    """Return whether character ``cp`` falls in the ranges of ``script``.

    :param cp: A single character.
    :param script: A key of ``idnadata.scripts`` (e.g. ``"Greek"``).
    :raises KeyError: If ``script`` is not a key of ``idnadata.scripts``.
    """
    return intranges_contain(ord(cp), idnadata.scripts[script])

62def _punycode(s: str) -> bytes:

63 return s.encode("punycode")

66def _unot(s: int) -> str:

67 return f"U+{s:04X}"

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Check that a label does not exceed the maximum permitted length.

    Per :rfc:`1035` (and :rfc:`5891` §4.2.4) a DNS label must not exceed
    63 octets. The argument may be either a :class:`str` (a U-label, where
    length is measured in characters) or :class:`bytes` (an A-label, where
    length is measured in octets).

    :param label: The label to check.
    :returns: ``True`` if the label is within the length limit, otherwise
        ``False``.
    """
    return len(label) <= 63

def valid_string_length(domain: Union[bytes, str], trailing_dot: bool) -> bool:
    """Check that a full domain name does not exceed the maximum length.

    Per :rfc:`1035`, a domain name is limited to 253 octets when no trailing
    dot is present, or 254 octets when one is included.

    :param domain: The full (possibly multi-label) domain name.
    :param trailing_dot: ``True`` if ``domain`` includes a trailing ``.``.
    :returns: ``True`` if the domain is within the length limit, otherwise
        ``False``.
    """
    limit = 254 if trailing_dot else 253
    return len(domain) <= limit

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate the Bidi Rule from :rfc:`5893` for a single label.

    The Bidi Rule constrains how bidirectional characters (Hebrew, Arabic,
    etc.) may appear within a label. By default the check is only applied
    when the label contains at least one right-to-left character (Unicode
    bidirectional categories ``R``, ``AL``, or ``AN``); set ``check_ltr``
    to ``True`` to apply it to LTR-only labels as well.

    :param label: The label to validate, as a Unicode string.
    :param check_ltr: If ``True``, apply the rules even when the label
        contains no RTL characters.
    :returns: ``True`` if the label satisfies the Bidi Rule.
    :raises IDNABidiError: If any of Bidi Rule conditions 1-6 are violated,
        or if the directional category of a codepoint cannot be determined.
    :raises IDNAError: If ``label`` exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")

    # Bidi rules should only be applied if string contains RTL characters
    bidi_label = False
    for idx, cp in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)
        if direction == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(f"Unknown directionality in label {label!r} at position {idx}")
        if direction in _bidi_rtl_categories:
            bidi_label = True
    if not bidi_label and not check_ltr:
        return True

    # Bidi rule 1: the first codepoint fixes the direction of the whole label
    direction = unicodedata.bidirectional(label[0])
    if direction in _bidi_rtl_first:
        rtl = True
    elif direction == "L":
        rtl = False
    else:
        raise IDNABidiError(f"First codepoint in label {label!r} must be directionality L, R or AL")

    # valid_ending tracks the last non-NSM codepoint seen so far, so after the
    # loop it describes the end of the label with trailing NSMs ignored.
    valid_ending = False
    number_type: Optional[str] = None
    for idx, cp in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)

        if rtl:
            # Bidi rule 2
            if direction not in _bidi_rtl_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {idx} in a right-to-left label")
            # Bidi rule 3
            if direction in _bidi_rtl_valid_ending:
                valid_ending = True
            elif direction != "NSM":
                valid_ending = False
            # Bidi rule 4: the first numeric category seen must be the only one
            if direction in _bidi_rtl_numeric:
                if not number_type:
                    number_type = direction
                elif number_type != direction:
                    raise IDNABidiError("Can not mix numeral types in a right-to-left label")
        else:
            # Bidi rule 5
            if direction not in _bidi_ltr_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {idx} in a left-to-right label")
            # Bidi rule 6
            if direction in _bidi_ltr_valid_ending:
                valid_ending = True
            elif direction != "NSM":
                valid_ending = False

    if not valid_ending:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True

172

173

def check_initial_combiner(label: str) -> bool:
    """Reject labels that begin with a combining mark.

    Per :rfc:`5891` §4.2.3.2 a label must not start with a character of
    Unicode general category ``M`` (Mark).

    :param label: The label to check; must be non-empty.
    :returns: ``True`` if the first character is not a combining mark.
    :raises IDNAError: If the label begins with a combining character.
    """
    # General categories are two letters (Mn, Mc, Me, ...); the first letter
    # alone identifies the major class.
    if unicodedata.category(label[0])[0] == "M":
        raise IDNAError("Label begins with an illegal combining character")
    return True

187

188

def check_hyphen_ok(label: str) -> bool:
    """Validate the hyphen restrictions for a label.

    Per :rfc:`5891` §4.2.3.1 a label must not start or end with a hyphen
    (``U+002D``), and must not have hyphens in both the third and fourth
    positions (the prefix reserved for A-labels).

    :param label: The label to check; must be non-empty.
    :returns: ``True`` if the hyphen restrictions are satisfied.
    :raises IDNAError: If any of the hyphen restrictions are violated.
    """
    if label[2:4] == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    if label[0] == "-" or label[-1] == "-":
        raise IDNAError("Label must not start or end with a hyphen")
    return True

205

206

def check_nfc(label: str) -> None:
    """Require that a label is in Unicode Normalization Form C.

    :param label: The label to check.
    :raises IDNAError: If ``label`` differs from its NFC normalisation, or
        exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    if unicodedata.normalize("NFC", label) != label:
        raise IDNAError("Label must be in Normalization Form C")

217

218

def valid_contextj(label: str, pos: int) -> bool:
    """Validate the CONTEXTJ rules from :rfc:`5892` Appendix A.

    These rules govern the contextual use of the joiner codepoints
    ``U+200C`` (ZERO WIDTH NON-JOINER, Appendix A.1) and ``U+200D``
    (ZERO WIDTH JOINER, Appendix A.2) within a label.

    :param label: The label containing the codepoint.
    :param pos: Index of the joiner codepoint within ``label``.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its CONTEXTJ
        rule, ``False`` otherwise (including when the codepoint at
        ``pos`` is not a recognised joiner).
    :raises ValueError: If an adjacent codepoint has no Unicode name when
        determining its combining class.
    :raises IDNAError: If ``label`` exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    cp_value = ord(label[pos])

    if cp_value == 0x200C:
        # Accepted outright when directly preceded by a virama.
        if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
            return True

        # Otherwise scan backwards, skipping joining type T: the first other
        # character must have joining type L or D.
        ok = False
        for i in range(pos - 1, -1, -1):
            joining_type = _joining_type(ord(label[i]))
            if joining_type == "T":
                continue
            if joining_type in _bidi_joiner_l_or_d:
                ok = True
            break

        if not ok:
            return False

        # ...and scan forwards the same way: the first non-T character must
        # have joining type R or D.
        ok = False
        for i in range(pos + 1, len(label)):
            joining_type = _joining_type(ord(label[i]))
            if joining_type == "T":
                continue
            if joining_type in _bidi_joiner_r_or_d:
                ok = True
            break
        return ok

    if cp_value == 0x200D:
        # Only valid directly after a virama.
        return pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class

    return False

271

272

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Validate the CONTEXTO rules from :rfc:`5892` Appendix A.

    Covers the contextual rules for codepoints such as MIDDLE DOT
    (``U+00B7``), Greek lower numeral sign, Hebrew punctuation, Katakana
    middle dot, and the Arabic-Indic / Extended Arabic-Indic digit ranges.

    :param label: The label containing the codepoint.
    :param pos: Index of the codepoint within ``label``.
    :param exception: Reserved for forward compatibility; currently unused.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its CONTEXTO
        rule, ``False`` otherwise (including when the codepoint is not a
        recognised CONTEXTO codepoint).
    :raises IDNAError: If ``label`` exceeds the defensive input length limit.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    cp_value = ord(label[pos])

    if cp_value == 0x00B7:
        # U+00B7 must sit between two U+006C ("l") characters.
        return 0 < pos < len(label) - 1 and ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C

    if cp_value == 0x0375:
        # U+0375 must be followed by a Greek character.
        if pos < len(label) - 1 and len(label) > 1:
            return _is_script(label[pos + 1], "Greek")
        return False

    if cp_value in {0x05F3, 0x05F4}:
        # U+05F3 / U+05F4 must be preceded by a Hebrew character.
        if pos > 0:
            return _is_script(label[pos - 1], "Hebrew")
        return False

    if cp_value == 0x30FB:
        # U+30FB needs at least one Hiragana, Katakana or Han character
        # somewhere in the label (other U+30FB occurrences do not count).
        for cp in label:
            if cp == "\u30fb":
                continue
            if _is_script(cp, "Hiragana") or _is_script(cp, "Katakana") or _is_script(cp, "Han"):
                return True
        return False

    # The digit ranges U+0660..U+0669 and U+06F0..U+06F9 must not be mixed
    # within one label.
    if 0x660 <= cp_value <= 0x669:
        return not any(0x6F0 <= ord(cp) <= 0x06F9 for cp in label)

    if 0x6F0 <= cp_value <= 0x6F9:
        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

    return False

320

321

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Run the full set of IDNA 2008 validity checks on a single label.

    Applies, in order: NFC normalisation (:func:`check_nfc`), hyphen
    restrictions (:func:`check_hyphen_ok`), the no-leading-combiner rule
    (:func:`check_initial_combiner`), per-codepoint validity (PVALID,
    CONTEXTJ, CONTEXTO classes from :rfc:`5892`), and the Bidi Rule
    (:func:`check_bidi`).

    :param label: The label to validate. ``bytes`` or ``bytearray`` input
        is decoded as UTF-8 first.
    :raises IDNAError: If the label is empty, too long, fails a structural
        rule, or has a codepoint unknown to :mod:`unicodedata` next to a
        joiner.
    :raises InvalidCodepoint: If the label contains a codepoint that is not
        in the PVALID, CONTEXTJ or CONTEXTO classes.
    :raises InvalidCodepointContext: If a CONTEXTJ or CONTEXTO codepoint
        is not valid in its context.
    :raises IDNABidiError: If the Bidi Rule is violated.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    # Reject on domain length rather than label length to support some UTS 46
    # use cases, while still reducing processing of label contextual rules
    if not valid_string_length(label, trailing_dot=True):
        raise IDNAError("Label too long")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        if intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError as err:
                raise IDNAError(
                    f"Unknown codepoint adjacent to joiner {_unot(cp_value)} at position {pos + 1} in {label!r}"
                ) from err
            # Raised outside the try block on purpose: InvalidCodepointContext
            # is itself a ValueError (via IDNAError -> UnicodeError), so
            # raising it inside would be caught above and replaced by the
            # unrelated "Unknown codepoint" error.
            if not joiner_ok:
                raise InvalidCodepointContext(f"Joiner {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(f"Codepoint {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(cp_value)} at position {pos + 1} of {label!r} not allowed")

    check_bidi(label)

375

376

def alabel(label: str) -> bytes:
    """Convert a single U-label into its A-label form.

    The result is the ASCII-Compatible Encoding (ACE) form per :rfc:`5891`
    §4: the label is validated, Punycode-encoded, and prefixed with
    ``xn--``. Pure ASCII labels that are already valid IDNA labels are
    returned unchanged (as :class:`bytes`).

    :param label: The label to convert, as a Unicode string.
    :returns: The A-label as ASCII-encoded :class:`bytes`.
    :raises IDNAError: If the label is invalid or the resulting A-label
        exceeds 63 octets.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    try:
        label_bytes = label.encode("ascii")
    except UnicodeEncodeError:
        pass
    else:
        # Already ASCII: ulabel() is called only for its validation; its
        # return value is discarded and the ASCII bytes are returned as-is.
        ulabel(label_bytes)
        if not valid_label_length(label_bytes):
            raise IDNAError("Label too long")
        return label_bytes

    check_label(label)
    label_bytes = _alabel_prefix + _punycode(label)

    if not valid_label_length(label_bytes):
        raise IDNAError("Label too long")

    return label_bytes

409

410

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert a single A-label into its U-label form.

    Performs the inverse of :func:`alabel`: an ``xn--``-prefixed label is
    Punycode-decoded and validated. Labels that are already Unicode (or
    plain ASCII without the ACE prefix) are validated and returned as a
    Unicode string.

    :param label: The label to convert. ``bytes`` or ``bytearray`` input
        is treated as ASCII.
    :returns: The U-label as a Unicode string. ASCII input is returned
        lower-cased.
    :raises IDNAError: If the label is malformed or fails validation.
    """
    if len(label) > _max_input_length:
        raise IDNAError("Label too long")
    if not isinstance(label, (bytes, bytearray)):
        try:
            label_bytes = label.encode("ascii")
        except UnicodeEncodeError:
            # Non-ASCII text is already in U-label form; just validate it.
            check_label(label)
            return label
    else:
        label_bytes = bytes(label)

    # The ACE prefix is matched on the lower-cased form.
    label_bytes = label_bytes.lower()
    if label_bytes.startswith(_alabel_prefix):
        label_bytes = label_bytes[len(_alabel_prefix) :]
        if not label_bytes:
            raise IDNAError("Malformed A-label, no Punycode eligible content found")
        if label_bytes.endswith(b"-"):
            raise IDNAError("A-label must not end with a hyphen")
    else:
        # Plain ASCII label without the ACE prefix.
        check_label(label_bytes)
        return label_bytes.decode("ascii")

    try:
        label = label_bytes.decode("punycode")
    except UnicodeError as err:
        raise IDNAError("Invalid A-label") from err
    check_label(label)
    return label

452

453

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Apply the UTS #46 character mapping to a domain string.

    Implements the mapping table from `UTS #46 §4
    <https://www.unicode.org/reports/tr46/>`_: each character is kept,
    replaced, or rejected based on its status (``V``, ``M``, ``D``, ``3``,
    ``I``). The result is returned in Normalisation Form C.

    :param domain: The full domain name to remap.
    :param std3_rules: If ``True``, apply the stricter STD3 ASCII rules
        (status ``3`` codepoints raise instead of being kept or mapped).
    :param transitional: If ``True``, use transitional processing (status
        ``D`` codepoints are mapped instead of kept). Transitional
        processing has been removed from UTS #46 and this option is
        retained only for backwards compatibility.
    :returns: The remapped domain, in Normalisation Form C.
    :raises InvalidCodepoint: If the domain contains a disallowed
        codepoint under the chosen rules.
    :raises IDNAError: If ``domain`` exceeds the defensive input length limit.
    """
    if len(domain) > _max_input_length:
        raise IDNAError("Domain too long")
    # Imported lazily, only when UTS #46 remapping is actually requested.
    from .uts46data import uts46_replacements, uts46_starts, uts46_statuses

    # Collect output pieces and join once instead of repeated str +=.
    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        # Codepoints below 256 index the tables directly; higher ones use the
        # entry whose start is the greatest value <= code_point.
        i = code_point if code_point < 256 else bisect.bisect_right(uts46_starts, code_point) - 1
        status = chr(uts46_statuses[i])
        replacement: Optional[str] = uts46_replacements[i]

        # UTS #46 §4: V is always valid, D is deviation (kept unless transitional),
        # 3 is disallowed-STD3 (kept unmapped if std3_rules is off and no mapping).
        if status == "V" or (status == "D" and not transitional) or (status == "3" and not std3_rules and replacement is None):
            pieces.append(char)
        # M is mapped; 3-with-replacement and transitional D share the same
        # replacement output path.
        elif replacement is not None and (
            status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
        ):
            pieces.append(replacement)
        # I (ignored) codepoints are dropped; anything else is rejected.
        elif status != "I":
            raise InvalidCodepoint(f"Codepoint {_unot(code_point)} not allowed at position {pos + 1} in {domain!r}")

    return unicodedata.normalize("NFC", "".join(pieces))

508

509

def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Encode a Unicode domain name into its ASCII (A-label) form.

    Splits the input on label separators (only ``U+002E`` if ``strict`` is
    set; otherwise also IDEOGRAPHIC FULL STOP ``U+3002``, FULLWIDTH FULL
    STOP ``U+FF0E``, and HALFWIDTH IDEOGRAPHIC FULL STOP ``U+FF61``),
    encodes each label with :func:`alabel`, and rejoins them with ``.``.
    Optionally pre-processes the input through :func:`uts46_remap`.

    :param s: The domain name to encode.
    :param strict: If ``True``, only ``U+002E`` is recognised as a label
        separator.
    :param uts46: If ``True``, apply UTS #46 mapping before encoding.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :param transitional: Forwarded to :func:`uts46_remap` when ``uts46``
        is ``True``. Deprecated: emits a :class:`DeprecationWarning` and
        will be removed in a future version.
    :returns: The encoded domain as ASCII :class:`bytes`.
    :raises IDNAError: If the domain is empty, contains an invalid label,
        or exceeds the maximum domain length.
    """
    if transitional:
        warnings.warn(
            "Transitional processing has been removed from UTS #46. "
            "The transitional argument will be removed in a future version.",
            DeprecationWarning,
            stacklevel=2,
        )
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.") from err
    if len(s) > _max_input_length:
        raise IDNAError("Domain too long")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    # Reject inputs that exceed the maximum DNS domain length up-front
    # to avoid expensive computation on long inputs.
    if not valid_string_length(s, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # A final empty label means the input ended with a separator (root dot).
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        del labels[-1]

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded)
    if trailing_dot:
        # An empty final element makes the join emit the trailing ".".
        encoded_labels.append(b"")

    domain_bytes = b".".join(encoded_labels)
    # Re-check on the encoded form: Punycode can lengthen the domain.
    if not valid_string_length(domain_bytes, trailing_dot):
        raise IDNAError("Domain too long")
    return domain_bytes

580

581

def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode an A-label-encoded domain name back to Unicode.

    Splits the input on label separators (only ``U+002E`` if ``strict`` is
    set; otherwise also ``U+3002``, ``U+FF0E`` and ``U+FF61``), decodes each
    label with :func:`ulabel`, and rejoins them with ``.``. Optionally
    pre-processes the input through :func:`uts46_remap`.

    :param s: The domain name to decode.
    :param strict: If ``True``, only ``U+002E`` is recognised as a label
        separator.
    :param uts46: If ``True``, apply UTS #46 mapping before decoding.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :returns: The decoded domain as a Unicode string.
    :raises IDNAError: If the input is not valid ASCII, contains an
        invalid label, is empty, or exceeds the length limits.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("Invalid ASCII in A-label") from err
    if len(s) > _max_input_length:
        raise IDNAError("Domain too long")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    # Reject inputs that exceed the maximum DNS domain length up-front
    # to avoid expensive computation on long inputs.
    if not valid_string_length(s, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # A final empty label means the input ended with a separator (root dot).
    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded)
    if trailing_dot:
        # An empty final element makes the join emit the trailing ".".
        decoded_labels.append("")
    return ".".join(decoded_labels)