Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/idna/core.py: 16%

1import bisect

2import re

3import unicodedata

4import warnings

5from typing import Optional, Union

7from . import idnadata

8from .intranges import intranges_contain

# Canonical combining class value that valid_contextj() looks for on the
# codepoint immediately preceding a joiner.
_virama_combining_class = 9
# ACE prefix prepended to the Punycode form of a label by alabel() and
# stripped again by ulabel().
_alabel_prefix = b"xn--"
# Label separators accepted by encode()/decode() when ``strict`` is off:
# U+002E, U+3002, U+FF0E and U+FF61.
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")

# Bidi category sets from RFC 5893, hoisted out of the per-codepoint loop
# Rule 1: categories that make the first codepoint start an RTL label.
_bidi_rtl_first = frozenset({"R", "AL"})
# Presence of any of these categories makes check_bidi() apply the Bidi Rule.
_bidi_rtl_categories = frozenset({"R", "AL", "AN"})
# Rule 2: categories permitted anywhere in an RTL label.
_bidi_rtl_allowed = frozenset({"R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})
# Rule 3: categories an RTL label may end with (trailing NSM is skipped).
_bidi_rtl_valid_ending = frozenset({"R", "AL", "EN", "AN"})
# Rule 4: the two numeral categories that must not be mixed in an RTL label.
_bidi_rtl_numeric = frozenset({"AN", "EN"})
# Rule 5: categories permitted anywhere in an LTR label.
_bidi_ltr_allowed = frozenset({"L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})
# Rule 6: categories an LTR label may end with (trailing NSM is skipped).
_bidi_ltr_valid_ending = frozenset({"L", "EN"})
# NOTE: despite the ``_bidi_`` prefix, the next two sets hold *joining types*
# (stored as ordinals of their letter codes) and are used by valid_contextj(),
# not by check_bidi().
_bidi_joiner_l_or_d = frozenset({ord("L"), ord("D")})
_bidi_joiner_r_or_d = frozenset({ord("R"), ord("D")})

class IDNAError(UnicodeError):
    """Common ancestor of every exception this module raises for IDNA problems.

    It derives from :class:`UnicodeError`, so it is also caught by handlers
    written for ``UnicodeError`` or ``ValueError``.
    """

class IDNABidiError(IDNAError):
    """Raised when a label does not meet the bidirectional-text requirements."""

class InvalidCodepoint(IDNAError):
    """Raised when input contains a disallowed or unallocated codepoint."""

class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not permitted in the context where it appears."""

51def _combining_class(cp: int) -> int:

52 v = unicodedata.combining(chr(cp))

53 if v == 0 and not unicodedata.name(chr(cp)):

54 raise ValueError("Unknown character in unicodedata")

55 return v

def _is_script(cp: str, script: str) -> bool:
    """Tell whether the single character ``cp`` falls in ``script``'s ranges.

    :param cp: A one-character string.
    :param script: Key into ``idnadata.scripts``.
    :returns: The result of the range-membership lookup.
    """
    codepoint = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(codepoint, script_ranges)

62def _punycode(s: str) -> bytes:

63 return s.encode("punycode")

66def _unot(s: int) -> str:

67 return f"U+{s:04X}"

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Report whether a single label is within the 63-unit length limit.

    The length is whatever ``len()`` returns for the argument: characters for
    a :class:`str`, octets for :class:`bytes`. The 63 limit is the DNS label
    maximum from :rfc:`1035`.

    :param label: The label to measure.
    :returns: ``True`` when ``len(label)`` is at most 63, ``False`` otherwise.
    """
    too_long = len(label) > 63
    return not too_long

def valid_string_length(domain: Union[bytes, str], trailing_dot: bool) -> bool:
    """Report whether a whole domain name is within the permitted length.

    The limit is 253 units, or 254 when ``trailing_dot`` is true, so that the
    final ``.`` does not count against the name. Length is ``len(domain)``.

    :param domain: The full (possibly multi-label) domain name.
    :param trailing_dot: Whether to allow for a trailing ``.`` in the limit.
    :returns: ``True`` when the domain fits, ``False`` otherwise.
    """
    limit = 253
    if trailing_dot:
        limit = 254
    return len(domain) <= limit

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Check a single label against the Bidi Rule of :rfc:`5893`.

    The label is first scanned for any codepoint of bidirectional category
    ``R``, ``AL`` or ``AN``. If none is found and ``check_ltr`` is false the
    label is accepted without further checks. Otherwise the first codepoint
    fixes the label direction and every codepoint is checked against the
    rules for that direction.

    :param label: The label to validate, as a Unicode string.
    :param check_ltr: Apply the rules even when the label has no RTL
        codepoints.
    :returns: ``True`` when the label passes.
    :raises IDNABidiError: If a codepoint has no known bidirectional
        category, or if any of Bidi Rule conditions 1-6 is violated.
    """
    # Pass 1: every codepoint must have a known category; note any RTL one.
    has_rtl = False
    for idx, char in enumerate(label, 1):
        category = unicodedata.bidirectional(char)
        if not category:
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(f"Unknown directionality in label {repr(label)} at position {idx}")
        has_rtl = has_rtl or category in _bidi_rtl_categories
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1: the first codepoint decides the label direction.
    first_category = unicodedata.bidirectional(label[0])
    if first_category == "L":
        rtl = False
    elif first_category in _bidi_rtl_first:
        rtl = True
    else:
        raise IDNABidiError(f"First codepoint in label {repr(label)} must be directionality L, R or AL")

    # Pass 2: per-codepoint rules for the chosen direction.
    ends_validly = False
    numeral_category: Optional[str] = None
    for idx, char in enumerate(label, 1):
        category = unicodedata.bidirectional(char)

        if rtl:
            # Bidi rule 2
            if category not in _bidi_rtl_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {idx} in a right-to-left label")
            # Bidi rule 4: the first numeral category seen must be the only one.
            if category in _bidi_rtl_numeric:
                if not numeral_category:
                    numeral_category = category
                elif numeral_category != category:
                    raise IDNABidiError("Can not mix numeral types in a right-to-left label")
            ending_categories = _bidi_rtl_valid_ending
        else:
            # Bidi rule 5
            if category not in _bidi_ltr_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {idx} in a left-to-right label")
            ending_categories = _bidi_ltr_valid_ending

        # Bidi rules 3 and 6: NSM codepoints leave the verdict of the last
        # non-NSM codepoint untouched; anything else overwrites it.
        if category != "NSM":
            ends_validly = category in ending_categories

    if not ends_validly:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True

171

172

def check_initial_combiner(label: str) -> bool:
    """Ensure a label does not start with a combining mark.

    The first character's Unicode general category is inspected; any
    category beginning with ``M`` (Mark) is rejected, per :rfc:`5891`
    §4.2.3.2.

    :param label: The label to check; must be non-empty.
    :returns: ``True`` when the first character is not a mark.
    :raises IDNAError: If the first character is a combining mark.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True

186

187

def check_hyphen_ok(label: str) -> bool:
    """Enforce the hyphen placement restrictions on a label.

    Two conditions from :rfc:`5891` §4.2.3.1 are checked, in this order:
    the third and fourth characters must not both be hyphens, and the label
    must neither start nor end with a hyphen.

    :param label: The label to check; must be non-empty.
    :returns: ``True`` when both conditions hold.
    :raises IDNAError: If either condition is violated.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")

    first, last = label[0], label[-1]
    if "-" in (first, last):
        raise IDNAError("Label must not start or end with a hyphen")
    return True

204

205

def check_nfc(label: str) -> None:
    """Verify that a label is already in Unicode Normalization Form C.

    :param label: The label to check.
    :raises IDNAError: If normalising ``label`` to NFC changes it.
    """
    normalized = unicodedata.normalize("NFC", label)
    if normalized == label:
        return
    raise IDNAError("Label must be in Normalization Form C")

214

215

def valid_contextj(label: str, pos: int) -> bool:
    """Apply the CONTEXTJ rules of :rfc:`5892` Appendix A to one codepoint.

    Only ``U+200C`` (ZERO WIDTH NON-JOINER) and ``U+200D`` (ZERO WIDTH
    JOINER) are handled; any other codepoint yields ``False``.

    * ``U+200D`` is valid only when the preceding codepoint has the virama
      combining class.
    * ``U+200C`` is valid when the preceding codepoint has the virama
      combining class, or when - skipping codepoints of joining type ``T`` -
      the nearest codepoint before it has joining type ``L`` or ``D`` and the
      nearest codepoint after it has joining type ``R`` or ``D``.

    :param label: The label containing the codepoint.
    :param pos: Index of the codepoint within ``label``.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its rule.
    :raises ValueError: Propagated from the combining-class lookup of the
        preceding codepoint.
    """
    joiner = ord(label[pos])
    if joiner not in (0x200C, 0x200D):
        return False

    after_virama = pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class

    if joiner == 0x200D:
        return after_virama
    if after_virama:
        return True

    def _nearest_has_type(indices: range, wanted: frozenset) -> bool:
        # Walk ``indices``, ignoring joining type T; the first other codepoint
        # met decides the outcome.
        for i in indices:
            joining_type = idnadata.joining_types().get(ord(label[i]))
            if joining_type == ord("T"):
                continue
            return joining_type in wanted
        return False

    # ``and`` short-circuits, so the forward scan only runs if the backward
    # scan succeeded.
    return _nearest_has_type(range(pos - 1, -1, -1), _bidi_joiner_l_or_d) and _nearest_has_type(
        range(pos + 1, len(label)), _bidi_joiner_r_or_d
    )

268

269

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Apply the CONTEXTO rules of :rfc:`5892` Appendix A to one codepoint.

    Rules implemented, keyed on the codepoint at ``pos``:

    * ``U+00B7``: must sit between two ``U+006C`` (``l``) characters.
    * ``U+0375``: the following character must be in the Greek script.
    * ``U+05F3`` / ``U+05F4``: the preceding character must be in the Hebrew
      script.
    * ``U+30FB``: the label must contain at least one Hiragana, Katakana or
      Han character.
    * ``U+0660``-``U+0669``: the label must contain no codepoint in
      ``U+06F0``-``U+06F9``, and vice versa.

    :param label: The label containing the codepoint.
    :param pos: Index of the codepoint within ``label``.
    :param exception: Not consulted by this implementation.
    :returns: ``True`` if the codepoint satisfies its rule; ``False`` if it
        does not or if no rule is implemented for it.
    """
    cp_value = ord(label[pos])
    last_index = len(label) - 1

    if cp_value == 0x00B7:
        if not 0 < pos < last_index:
            return False
        return ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C

    if cp_value == 0x0375:
        if pos >= last_index or len(label) <= 1:
            return False
        return _is_script(label[pos + 1], "Greek")

    if cp_value in (0x05F3, 0x05F4):
        if pos <= 0:
            return False
        return _is_script(label[pos - 1], "Hebrew")

    if cp_value == 0x30FB:
        return any(
            _is_script(char, "Hiragana") or _is_script(char, "Katakana") or _is_script(char, "Han")
            for char in label
            if char != "\u30fb"
        )

    if 0x660 <= cp_value <= 0x669:
        for char in label:
            if 0x6F0 <= ord(char) <= 0x06F9:
                return False
        return True

    if 0x6F0 <= cp_value <= 0x6F9:
        for char in label:
            if 0x660 <= ord(char) <= 0x0669:
                return False
        return True

    return False

314

315

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Run the full set of IDNA 2008 validity checks on a single label.

    Applies, in order: NFC normalisation (:func:`check_nfc`), hyphen
    restrictions (:func:`check_hyphen_ok`), the no-leading-combiner rule
    (:func:`check_initial_combiner`), per-codepoint validity (PVALID,
    CONTEXTJ, CONTEXTO classes from :rfc:`5892`), and the Bidi Rule
    (:func:`check_bidi`).

    :param label: The label to validate. ``bytes`` or ``bytearray`` input
        is decoded as UTF-8 first.
    :raises IDNAError: If the label is empty, too long, fails a structural
        rule, or has a joiner next to a codepoint whose combining class
        cannot be determined.
    :raises InvalidCodepoint: If the label contains a codepoint that is in
        none of the PVALID, CONTEXTJ or CONTEXTO classes.
    :raises InvalidCodepointContext: If a CONTEXTJ or CONTEXTO codepoint
        is not valid in its context.
    :raises IDNABidiError: If the Bidi Rule is violated.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    # Reject on domain length rather than label length to support some UTS 46
    # use cases, while still bounding the work done by the contextual rules.
    if not valid_string_length(label, trailing_dot=True):
        raise IDNAError("Label too long")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Only the lookup itself is guarded. InvalidCodepointContext is an
            # IDNAError -> UnicodeError -> ValueError, so raising it inside the
            # ``try`` would be caught by ``except ValueError`` and replaced
            # with the unrelated "Unknown codepoint" error.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError as err:
                raise IDNAError(
                    f"Unknown codepoint adjacent to joiner {_unot(cp_value)} at position {pos + 1} in {repr(label)}"
                ) from err
            if not joiner_ok:
                raise InvalidCodepointContext(
                    f"Joiner {_unot(cp_value)} not allowed at position {pos + 1} in {repr(label)}"
                )
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(
                    f"Codepoint {_unot(cp_value)} not allowed at position {pos + 1} in {repr(label)}"
                )
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(cp_value)} at position {pos + 1} of {repr(label)} not allowed")

    check_bidi(label)

371

372

def alabel(label: str) -> bytes:
    """Produce the A-label (ASCII-compatible) form of a single label.

    A label that encodes cleanly as ASCII is validated through
    :func:`ulabel` and returned as its ASCII bytes. Any other label is
    validated with :func:`check_label`, Punycode-encoded and prefixed with
    ``xn--``. In both cases the result must pass
    :func:`valid_label_length`.

    :param label: The label to convert, as a Unicode string.
    :returns: The A-label as :class:`bytes`.
    :raises IDNAError: If the label is invalid or the result is too long.
    """
    try:
        ascii_label = label.encode("ascii")
        ulabel(ascii_label)
        fits = valid_label_length(ascii_label)
    except UnicodeEncodeError:
        # Not expressible in ASCII: fall through to the Punycode path.
        pass
    else:
        if fits:
            return ascii_label
        raise IDNAError("Label too long")

    check_label(label)
    encoded = _alabel_prefix + _punycode(label)
    if valid_label_length(encoded):
        return encoded
    raise IDNAError("Label too long")

402

403

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Produce the U-label (Unicode) form of a single label.

    * A :class:`str` that is not pure ASCII is validated and returned as is.
    * Otherwise the ASCII form is lower-cased. Without the ``xn--`` prefix it
      is validated and returned as a string; with the prefix, the remainder
      is Punycode-decoded, validated and returned.

    :param label: The label to convert. ``bytes``/``bytearray`` input is
        used directly as the ASCII form.
    :returns: The U-label as a Unicode string.
    :raises IDNAError: If the A-label is malformed or validation fails.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = bytes(label)
    else:
        try:
            raw = label.encode("ascii")
        except UnicodeEncodeError:
            # Already a non-ASCII Unicode label: just validate it.
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode("ascii")

    payload = raw[len(_alabel_prefix) :]
    if not payload:
        raise IDNAError("Malformed A-label, no Punycode eligible content found")
    if payload.endswith(b"-"):
        raise IDNAError("A-label must not end with a hyphen")

    try:
        decoded = payload.decode("punycode")
    except UnicodeError as err:
        raise IDNAError("Invalid A-label") from err
    check_label(decoded)
    return decoded

443

444

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Apply the UTS #46 character mapping to a domain string.

    Implements the mapping table from `UTS #46 §4
    <https://www.unicode.org/reports/tr46/>`_: each character is kept,
    replaced, dropped, or rejected based on its status (``V``, ``M``, ``D``,
    ``3``, ``I``). The result is returned in Normalisation Form C.

    :param domain: The full domain name to remap.
    :param std3_rules: If ``True``, status ``3`` codepoints are rejected
        instead of being kept or mapped.
    :param transitional: If ``True``, status ``D`` codepoints that have a
        replacement are mapped instead of kept. Transitional processing has
        been removed from UTS #46 and this option is retained only for
        backwards compatibility.
    :returns: The remapped domain, in Normalisation Form C.
    :raises InvalidCodepoint: If the domain contains a codepoint that is
        not kept, mapped or ignored under the chosen rules.
    """
    from .uts46data import uts46data

    # Collect output fragments and join once at the end: this function runs
    # before any length limit is enforced, so repeated ``str +=`` could
    # degrade to quadratic time on long inputs.
    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        # The first 256 rows are indexed directly by codepoint; beyond that
        # the row is located by bisection.
        uts46row = uts46data[code_point if code_point < 256 else bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
        status = uts46row[1]
        replacement: Optional[str] = None
        if len(uts46row) == 3:
            replacement = uts46row[2]  # ty: ignore[index-out-of-bounds]

        # UTS #46 §4: V is always valid, D is deviation (kept unless transitional),
        # 3 is disallowed-STD3 (kept unmapped if std3_rules is off and no mapping).
        keep_as_is = (
            status == "V" or (status == "D" and not transitional) or (status == "3" and not std3_rules and replacement is None)
        )
        # M is mapped, 3-with-replacement and transitional D fall through to the
        # same replacement output path.
        use_replacement = replacement is not None and (
            status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
        )

        if keep_as_is:
            pieces.append(char)
        elif use_replacement:
            assert replacement is not None  # narrowed by use_replacement
            pieces.append(replacement)
        elif status == "I":
            # Ignored codepoints contribute nothing to the output.
            continue
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(code_point)} not allowed at position {pos + 1} in {repr(domain)}")

    return unicodedata.normalize("NFC", "".join(pieces))

498

499

def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Convert a domain name to its ASCII (A-label) representation.

    The input is split into labels, each label is converted with
    :func:`alabel`, and the results are joined with ``.``. A trailing
    separator on the input is preserved as a trailing ``.`` on the output.

    :param s: The domain name to encode. Non-``str`` input is decoded as
        ASCII first.
    :param strict: If ``True``, only ``U+002E`` separates labels; otherwise
        ``U+3002``, ``U+FF0E`` and ``U+FF61`` are accepted as well.
    :param uts46: If ``True``, run the input through :func:`uts46_remap`
        before splitting.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :param transitional: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``. Deprecated: a true value emits a
        :class:`DeprecationWarning`.
    :returns: The encoded domain as :class:`bytes`.
    :raises IDNAError: If the input cannot be decoded as ASCII, is empty,
        contains an invalid or empty label, or is too long.
    """
    if transitional:
        warnings.warn(
            "Transitional processing has been removed from UTS #46. "
            "The transitional argument will be removed in a future version.",
            DeprecationWarning,
            stacklevel=2,
        )

    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.") from err

    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    # Cheap up-front length check so that over-long inputs are refused before
    # any per-label work is done.
    if not valid_string_length(s, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final element means the input ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        labels.pop()

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b"")

    domain = b".".join(encoded_labels)
    if not valid_string_length(domain, trailing_dot):
        raise IDNAError("Domain too long")
    return domain

568

569

def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Convert an A-label-encoded domain name back to Unicode.

    The input is split into labels, each label is converted with
    :func:`ulabel`, and the results are joined with ``.``. A trailing
    separator on the input is preserved as a trailing ``.`` on the output.

    :param s: The domain name to decode. Non-``str`` input is decoded as
        ASCII first.
    :param strict: If ``True``, only ``U+002E`` separates labels; otherwise
        ``U+3002``, ``U+FF0E`` and ``U+FF61`` are accepted as well.
    :param uts46: If ``True``, run the input through :func:`uts46_remap`
        (non-transitional) before splitting.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :returns: The decoded domain as a Unicode string.
    :raises IDNAError: If the input is not valid ASCII, is empty, is too
        long, or contains an invalid or empty label.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("Invalid ASCII in A-label") from err

    if uts46:
        s = uts46_remap(s, std3_rules, False)

    # Cheap up-front length check so that over-long inputs are refused before
    # any per-label work is done.
    if not valid_string_length(s, trailing_dot=True):
        raise IDNAError("Domain too long")

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final element means the input ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        labels.pop()

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append("")

    return ".".join(decoded_labels)