Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/idna/core.py: 14%

1import bisect

2import re

3import unicodedata

4import warnings

5from typing import Optional, Union

7from . import idnadata

8from .intranges import intranges_contain

# Canonical combining class that valid_contextj() requires of the codepoint
# preceding a joiner (the class assigned to virama characters).
_virama_combining_class = 9

# ASCII-Compatible Encoding prefix that marks an A-label.
_alabel_prefix = b"xn--"

# Label separators recognised in non-strict mode: FULL STOP, IDEOGRAPHIC FULL
# STOP, FULLWIDTH FULL STOP and HALFWIDTH IDEOGRAPHIC FULL STOP.
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")

# Bidi category sets from RFC 5893, hoisted out of the per-codepoint loop
_bidi_rtl_first = frozenset({"R", "AL"})
_bidi_rtl_categories = frozenset({"R", "AL", "AN"})
_bidi_rtl_allowed = frozenset({"R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})
_bidi_rtl_valid_ending = frozenset({"R", "AL", "EN", "AN"})
_bidi_rtl_numeric = frozenset({"AN", "EN"})
_bidi_ltr_allowed = frozenset({"L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"})
_bidi_ltr_valid_ending = frozenset({"L", "EN"})

# Joining-type codes (stored as ordinals) accepted before / after U+200C by
# valid_contextj().
_bidi_joiner_l_or_d = frozenset({ord("L"), ord("D")})
_bidi_joiner_r_or_d = frozenset({ord("R"), ord("D")})

class IDNAError(UnicodeError):
    """Base exception for all IDNA-encoding related problems"""

class IDNABidiError(IDNAError):
    """Exception when bidirectional requirements are not satisfied"""

class InvalidCodepoint(IDNAError):
    """Exception when a disallowed or unallocated codepoint is used"""

class InvalidCodepointContext(IDNAError):
    """Exception when the codepoint is not valid in the context it is used"""

def _combining_class(cp: int) -> int:
    """Return the canonical combining class of a codepoint.

    :param cp: The codepoint, as an integer.
    :returns: The combining class reported by :mod:`unicodedata`.
    :raises ValueError: If the combining class is ``0`` and the character
        has no name in :mod:`unicodedata`.
    """
    char = chr(cp)
    v = unicodedata.combining(char)
    # unicodedata.name() raises its own ValueError when no default is given,
    # which made the explicit raise below unreachable; pass a default so the
    # intended message is the one that surfaces.
    if v == 0 and not unicodedata.name(char, ""):
        raise ValueError("Unknown character in unicodedata")
    return v

def _is_script(cp: str, script: str) -> bool:
    """Tell whether a character falls in the ranges listed for a script.

    :param cp: A single character.
    :param script: Key into ``idnadata.scripts`` (e.g. ``"Greek"``).
    :returns: The result of ``intranges_contain`` for the character's
        codepoint against that script's ranges.
    """
    return intranges_contain(ord(cp), idnadata.scripts[script])

def _punycode(s: str) -> bytes:
    """Encode a string with Python's built-in ``punycode`` codec.

    :param s: The Unicode string to encode.
    :returns: The Punycode form, without any ``xn--`` prefix.
    """
    return s.encode("punycode")

def _unot(s: int) -> str:
    """Format a codepoint in ``U+XXXX`` notation.

    :param s: The codepoint, as an integer.
    :returns: ``U+`` followed by the uppercase hexadecimal value, zero-padded
        to at least four digits.
    """
    return f"U+{s:04X}"

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Check that a label does not exceed the maximum permitted length.

    Per :rfc:`1035` (and :rfc:`5891` §4.2.4) a DNS label must not exceed
    63 octets. The argument may be either a :class:`str` (a U-label, where
    length is measured in characters) or :class:`bytes` (an A-label, where
    length is measured in octets).

    :param label: The label to check.
    :returns: ``True`` if the label is within the length limit, otherwise
        ``False``.
    """
    return len(label) <= 63

def valid_string_length(domain: Union[bytes, str], trailing_dot: bool) -> bool:
    """Check that a full domain name does not exceed the maximum length.

    Per :rfc:`1035`, a domain name is limited to 253 octets when no trailing
    dot is present, or 254 octets when one is included.

    :param domain: The full (possibly multi-label) domain name.
    :param trailing_dot: ``True`` if ``domain`` includes a trailing ``.``.
    :returns: ``True`` if the domain is within the length limit, otherwise
        ``False``.
    """
    return len(domain) <= (254 if trailing_dot else 253)

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate the Bidi Rule from :rfc:`5893` for a single label.

    The Bidi Rule constrains how bidirectional characters (Hebrew, Arabic,
    etc.) may appear within a label. By default the check is only applied
    when the label contains at least one right-to-left character (Unicode
    bidirectional categories ``R``, ``AL``, or ``AN``); set ``check_ltr``
    to ``True`` to apply it to LTR-only labels as well.

    :param label: The label to validate, as a Unicode string.
    :param check_ltr: If ``True``, apply the rules even when the label
        contains no RTL characters.
    :returns: ``True`` if the label satisfies the Bidi Rule.
    :raises IDNABidiError: If any of Bidi Rule conditions 1-6 are violated,
        or if the directional category of a codepoint cannot be determined.
    """
    # Bidi rules should only be applied if string contains RTL characters
    bidi_label = False
    for idx, cp in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)
        if direction == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError(f"Unknown directionality in label {label!r} at position {idx}")
        if direction in _bidi_rtl_categories:
            bidi_label = True
    if not bidi_label and not check_ltr:
        return True

    # Bidi rule 1
    direction = unicodedata.bidirectional(label[0])
    if direction in _bidi_rtl_first:
        rtl = True
    elif direction == "L":
        rtl = False
    else:
        raise IDNABidiError(f"First codepoint in label {label!r} must be directionality L, R or AL")

    # valid_ending tracks whether the most recent non-NSM codepoint is an
    # allowed final category, so trailing NSM characters do not reset it.
    valid_ending = False
    number_type: Optional[str] = None
    for idx, cp in enumerate(label, 1):
        direction = unicodedata.bidirectional(cp)

        if rtl:
            # Bidi rule 2
            if direction not in _bidi_rtl_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {idx} in a right-to-left label")
            # Bidi rule 3
            if direction in _bidi_rtl_valid_ending:
                valid_ending = True
            elif direction != "NSM":
                valid_ending = False
            # Bidi rule 4
            if direction in _bidi_rtl_numeric:
                if not number_type:
                    number_type = direction
                elif number_type != direction:
                    raise IDNABidiError("Can not mix numeral types in a right-to-left label")
        else:
            # Bidi rule 5
            if direction not in _bidi_ltr_allowed:
                raise IDNABidiError(f"Invalid direction for codepoint at position {idx} in a left-to-right label")
            # Bidi rule 6
            if direction in _bidi_ltr_valid_ending:
                valid_ending = True
            elif direction != "NSM":
                valid_ending = False

    if not valid_ending:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True

162

163

def check_initial_combiner(label: str) -> bool:
    """Reject labels that begin with a combining mark.

    Per :rfc:`5891` §4.2.3.2 a label must not start with a character of
    Unicode general category ``M`` (Mark).

    An empty label has no first character and is accepted here; emptiness
    is not this function's concern.

    :param label: The label to check.
    :returns: ``True`` if the first character is not a combining mark.
    :raises IDNAError: If the label begins with a combining character.
    """
    # Guard on truthiness so an empty label does not raise IndexError.
    if label and unicodedata.category(label[0]).startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True

177

178

def check_hyphen_ok(label: str) -> bool:
    """Validate the hyphen restrictions for a label.

    Per :rfc:`5891` §4.2.3.1 a label must not start or end with a hyphen
    (``U+002D``), and must not have hyphens in both the third and fourth
    positions (the prefix reserved for A-labels).

    An empty label contains no hyphen and is accepted here; emptiness is
    not this function's concern.

    :param label: The label to check.
    :returns: ``True`` if the hyphen restrictions are satisfied.
    :raises IDNAError: If any of the hyphen restrictions are violated.
    """
    if label[2:4] == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    # startswith/endswith (rather than indexing) so an empty label does not
    # raise IndexError.
    if label.startswith("-") or label.endswith("-"):
        raise IDNAError("Label must not start or end with a hyphen")
    return True

195

196

def check_nfc(label: str) -> None:
    """Require that a label is in Unicode Normalization Form C.

    :param label: The label to check.
    :raises IDNAError: If ``label`` differs from its NFC normalisation.
    """
    if unicodedata.normalize("NFC", label) != label:
        raise IDNAError("Label must be in Normalization Form C")

205

206

def valid_contextj(label: str, pos: int) -> bool:
    """Validate the CONTEXTJ rules from :rfc:`5892` Appendix A.

    These rules govern the contextual use of the joiner codepoints
    ``U+200C`` (ZERO WIDTH NON-JOINER, Appendix A.1) and ``U+200D``
    (ZERO WIDTH JOINER, Appendix A.2) within a label.

    :param label: The label containing the codepoint.
    :param pos: Index of the joiner codepoint within ``label``.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its CONTEXTJ
        rule, ``False`` otherwise (including when the codepoint at
        ``pos`` is not a recognised joiner).
    :raises ValueError: If an adjacent codepoint has no Unicode name when
        determining its combining class.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x200C:
        # A preceding codepoint with the virama combining class is enough.
        if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
            return True

        # Otherwise scan backwards, skipping joining type "T", and require
        # the first other codepoint to have joining type "L" or "D".
        ok = False
        for i in range(pos - 1, -1, -1):
            joining_type = idnadata.joining_types().get(ord(label[i]))
            if joining_type == ord("T"):
                continue
            if joining_type in _bidi_joiner_l_or_d:
                ok = True
                break
            break

        if not ok:
            return False

        # Then scan forwards the same way, requiring joining type "R" or "D".
        ok = False
        for i in range(pos + 1, len(label)):
            joining_type = idnadata.joining_types().get(ord(label[i]))
            if joining_type == ord("T"):
                continue
            if joining_type in _bidi_joiner_r_or_d:
                ok = True
                break
            break
        return ok

    if cp_value == 0x200D:
        return pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class

    return False

256

257

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Validate the CONTEXTO rules from :rfc:`5892` Appendix A.

    Covers the contextual rules for codepoints such as MIDDLE DOT
    (``U+00B7``), Greek lower numeral sign, Hebrew punctuation, Katakana
    middle dot, and the Arabic-Indic / Extended Arabic-Indic digit ranges.

    :param label: The label containing the codepoint.
    :param pos: Index of the codepoint within ``label``.
    :param exception: Reserved for forward compatibility; currently unused.
    :returns: ``True`` if the codepoint at ``pos`` satisfies its CONTEXTO
        rule, ``False`` otherwise (including when the codepoint is not a
        recognised CONTEXTO codepoint).
    """
    cp_value = ord(label[pos])

    # U+00B7: only between two "l" (U+006C) characters.
    if cp_value == 0x00B7:
        return 0 < pos < len(label) - 1 and ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C

    # U+0375: the following character must be in the Greek script.
    if cp_value == 0x0375:
        if pos < len(label) - 1 and len(label) > 1:
            return _is_script(label[pos + 1], "Greek")
        return False

    # U+05F3 / U+05F4: the preceding character must be in the Hebrew script.
    if cp_value in {0x05F3, 0x05F4}:
        if pos > 0:
            return _is_script(label[pos - 1], "Hebrew")
        return False

    # U+30FB: some other character in the label must be Hiragana, Katakana
    # or Han.
    if cp_value == 0x30FB:
        for cp in label:
            if cp == "\u30fb":
                continue
            if _is_script(cp, "Hiragana") or _is_script(cp, "Katakana") or _is_script(cp, "Han"):
                return True
        return False

    # U+0660..U+0669 and U+06F0..U+06F9 must not appear in the same label.
    if 0x660 <= cp_value <= 0x669:
        return not any(0x6F0 <= ord(cp) <= 0x06F9 for cp in label)

    if 0x6F0 <= cp_value <= 0x6F9:
        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

    return False

302

303

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Run the full set of IDNA 2008 validity checks on a single label.

    Applies, in order: NFC normalisation (:func:`check_nfc`), hyphen
    restrictions (:func:`check_hyphen_ok`), the no-leading-combiner rule
    (:func:`check_initial_combiner`), per-codepoint validity (PVALID,
    CONTEXTJ, CONTEXTO classes from :rfc:`5892`), and the Bidi Rule
    (:func:`check_bidi`).

    :param label: The label to validate. ``bytes`` or ``bytearray`` input
        is decoded as UTF-8 first.
    :raises IDNAError: If the label is empty or fails a structural rule.
    :raises InvalidCodepoint: If the label contains a DISALLOWED or
        UNASSIGNED codepoint.
    :raises InvalidCodepointContext: If a CONTEXTJ or CONTEXTO codepoint
        is not valid in its context.
    :raises IDNABidiError: If the Bidi Rule is violated.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if not label:
        raise IDNAError("Empty Label")

    # Reject on domain length rather than label length to support some UTS 46
    # use cases, while still reducing processing of label contextual rules
    if not valid_string_length(label, trailing_dot=True):
        raise IDNAError("Label too long")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        if intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            try:
                if not valid_contextj(label, pos):
                    raise InvalidCodepointContext(f"Joiner {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
            except ValueError as err:
                # InvalidCodepointContext is itself a ValueError (via
                # UnicodeError), so it is also re-raised through this handler.
                raise IDNAError(
                    f"Unknown codepoint adjacent to joiner {_unot(cp_value)} at position {pos + 1} in {label!r}"
                ) from err
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(f"Codepoint {_unot(cp_value)} not allowed at position {pos + 1} in {label!r}")
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(cp_value)} at position {pos + 1} of {label!r} not allowed")

    check_bidi(label)

355

356

def alabel(label: str) -> bytes:
    """Convert a single U-label into its A-label form.

    The result is the ASCII-Compatible Encoding (ACE) form per :rfc:`5891`
    §4: the label is validated, Punycode-encoded, and prefixed with
    ``xn--``. Pure ASCII labels that are already valid IDNA labels are
    returned unchanged (as :class:`bytes`).

    :param label: The label to convert, as a Unicode string.
    :returns: The A-label as ASCII-encoded :class:`bytes`.
    :raises IDNAError: If the label is invalid or the resulting A-label
        exceeds 63 octets.
    """
    try:
        label_bytes = label.encode("ascii")
    except UnicodeEncodeError:
        pass
    else:
        # Already ASCII: ulabel() is called only for its validation; its
        # return value is discarded.
        ulabel(label_bytes)
        if not valid_label_length(label_bytes):
            raise IDNAError("Label too long")
        return label_bytes

    check_label(label)
    label_bytes = _alabel_prefix + _punycode(label)

    if not valid_label_length(label_bytes):
        raise IDNAError("Label too long")

    return label_bytes

387

388

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert a single A-label into its U-label form.

    Performs the inverse of :func:`alabel`: an ``xn--``-prefixed label is
    Punycode-decoded and validated. Labels that are already Unicode (or
    plain ASCII without the ACE prefix) are validated and returned as a
    Unicode string.

    :param label: The label to convert. ``bytes`` or ``bytearray`` input
        is treated as ASCII.
    :returns: The U-label as a Unicode string.
    :raises IDNAError: If the label is malformed or fails validation.
    """
    if not isinstance(label, (bytes, bytearray)):
        try:
            label_bytes = label.encode("ascii")
        except UnicodeEncodeError:
            # Non-ASCII text is already a U-label: validate and hand it back.
            check_label(label)
            return label
    else:
        label_bytes = bytes(label)

    label_bytes = label_bytes.lower()
    if label_bytes.startswith(_alabel_prefix):
        label_bytes = label_bytes[len(_alabel_prefix) :]
        if not label_bytes:
            raise IDNAError("Malformed A-label, no Punycode eligible content found")
        if label_bytes.endswith(b"-"):
            raise IDNAError("A-label must not end with a hyphen")
    else:
        # Plain ASCII label without the ACE prefix (returned lower-cased).
        check_label(label_bytes)
        return label_bytes.decode("ascii")

    try:
        decoded = label_bytes.decode("punycode")
    except UnicodeError as err:
        raise IDNAError("Invalid A-label") from err
    check_label(decoded)
    return decoded

428

429

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Apply the UTS #46 character mapping to a domain string.

    Implements the mapping table from `UTS #46 §4
    <https://www.unicode.org/reports/tr46/>`_: each character is kept,
    replaced, or rejected based on its status (``V``, ``M``, ``D``, ``3``,
    ``I``). The result is returned in Normalisation Form C.

    :param domain: The full domain name to remap.
    :param std3_rules: If ``True``, apply the stricter STD3 ASCII rules
        (status ``3`` codepoints raise instead of being kept or mapped).
    :param transitional: If ``True``, use transitional processing (status
        ``D`` codepoints are mapped instead of kept). Transitional
        processing has been removed from UTS #46 and this option is
        retained only for backwards compatibility.
    :returns: The remapped domain, in Normalisation Form C.
    :raises InvalidCodepoint: If the domain contains a disallowed
        codepoint under the chosen rules.
    """
    # Imported here rather than at module level, as in the original.
    from .uts46data import uts46data

    # Collect pieces and join once instead of repeated string concatenation.
    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        # Codepoints below 256 index the table directly; others are located
        # by bisecting for the row that starts at or before the codepoint.
        uts46row = uts46data[code_point if code_point < 256 else bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
        status = uts46row[1]
        replacement: Optional[str] = None
        if len(uts46row) == 3:
            replacement = uts46row[2]  # ty: ignore[index-out-of-bounds]

        # UTS #46 §4: V is always valid, D is deviation (kept unless transitional),
        # 3 is disallowed-STD3 (kept unmapped if std3_rules is off and no mapping).
        keep_as_is = (
            status == "V" or (status == "D" and not transitional) or (status == "3" and not std3_rules and replacement is None)
        )

        if keep_as_is:
            pieces.append(char)
        elif replacement is not None and (
            # M is mapped, 3-with-replacement and transitional D fall through
            # to the same replacement output path.
            status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
        ):
            pieces.append(replacement)
        elif status == "I":
            continue
        else:
            raise InvalidCodepoint(f"Codepoint {_unot(code_point)} not allowed at position {pos + 1} in {domain!r}")

    return unicodedata.normalize("NFC", "".join(pieces))

483

484

def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Encode a Unicode domain name into its ASCII (A-label) form.

    Splits the input on label separators (only ``U+002E`` if ``strict`` is
    set; otherwise also IDEOGRAPHIC FULL STOP ``U+3002``, FULLWIDTH FULL
    STOP ``U+FF0E``, and HALFWIDTH IDEOGRAPHIC FULL STOP ``U+FF61``),
    encodes each label with :func:`alabel`, and rejoins them with ``.``.
    Optionally pre-processes the input through :func:`uts46_remap`.

    :param s: The domain name to encode.
    :param strict: If ``True``, only ``U+002E`` is recognised as a label
        separator.
    :param uts46: If ``True``, apply UTS #46 mapping before encoding.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :param transitional: Forwarded to :func:`uts46_remap` when ``uts46``
        is ``True``. Deprecated: emits a :class:`DeprecationWarning` and
        will be removed in a future version.
    :returns: The encoded domain as ASCII :class:`bytes`.
    :raises IDNAError: If the domain is empty, contains an invalid label,
        or exceeds the maximum domain length.
    """
    if transitional:
        warnings.warn(
            "Transitional processing has been removed from UTS #46. "
            "The transitional argument will be removed in a future version.",
            DeprecationWarning,
            stacklevel=2,
        )
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.") from err
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    # Reject inputs that exceed the maximum DNS domain length up-front
    # to avoid expensive computation on long inputs.
    if not valid_string_length(s, trailing_dot=True):
        raise IDNAError("Domain too long")

    trailing_dot = False
    result = []
    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")
    if labels[-1] == "":
        # A final empty label means the input ended with a separator.
        del labels[-1]
        trailing_dot = True
    for label in labels:
        encoded_label = alabel(label)
        if encoded_label:
            result.append(encoded_label)
        else:
            raise IDNAError("Empty label")
    if trailing_dot:
        # Joining with a trailing empty element re-creates the final ".".
        result.append(b"")
    encoded = b".".join(result)
    if not valid_string_length(encoded, trailing_dot):
        raise IDNAError("Domain too long")
    return encoded

553

554

def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode an A-label-encoded domain name back to Unicode.

    Splits the input on label separators (see :func:`encode` for the
    rules), decodes each label with :func:`ulabel`, and rejoins them
    with ``.``. Optionally pre-processes the input through
    :func:`uts46_remap`.

    :param s: The domain name to decode.
    :param strict: If ``True``, only ``U+002E`` is recognised as a label
        separator.
    :param uts46: If ``True``, apply UTS #46 mapping before decoding.
    :param std3_rules: Forwarded to :func:`uts46_remap` when ``uts46`` is
        ``True``.
    :returns: The decoded domain as a Unicode string.
    :raises IDNAError: If the input is not valid ASCII, contains an
        invalid label, or is empty.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except (UnicodeDecodeError, TypeError) as err:
            raise IDNAError("Invalid ASCII in A-label") from err
    if uts46:
        s = uts46_remap(s, std3_rules, False)
    # Reject inputs that exceed the maximum DNS domain length up-front
    # to avoid expensive computation on long inputs.
    if not valid_string_length(s, trailing_dot=True):
        raise IDNAError("Domain too long")
    trailing_dot = False
    result = []
    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")
    if not labels[-1]:
        # A final empty label means the input ended with a separator.
        del labels[-1]
        trailing_dot = True
    for label in labels:
        decoded_label = ulabel(label)
        if decoded_label:
            result.append(decoded_label)
        else:
            raise IDNAError("Empty label")
    if trailing_dot:
        # Joining with a trailing empty element re-creates the final ".".
        result.append("")
    return ".".join(result)

605 return ".".join(result)