Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/idna/core.py: 12%

1import bisect

2import re

3import unicodedata

4from typing import Optional, Union

6from . import idnadata

7from .intranges import intranges_contain

9_virama_combining_class = 9

10_alabel_prefix = b"xn--"

11_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")

class IDNAError(UnicodeError):
    """Root of the exception hierarchy used for IDNA processing failures.

    It derives from ``UnicodeError``, so handlers written for generic Unicode
    (and therefore ``ValueError``) failures also receive IDNA errors.
    """

class IDNABidiError(IDNAError):
    """Raised when a label fails the bidirectional-text checks."""

class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""

class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not permitted in the position where it occurs."""

38def _combining_class(cp: int) -> int:

39 v = unicodedata.combining(chr(cp))

40 if v == 0:

41 if not unicodedata.name(chr(cp)):

42 raise ValueError("Unknown character in unicodedata")

43 return v

def _is_script(cp: str, script: str) -> bool:
    """Tell whether the single character *cp* falls in the ranges stored for *script*.

    Args:
        cp: A one-character string.
        script: Key into ``idnadata.scripts``.

    Returns:
        The result of ``intranges_contain`` for the character's codepoint.
    """
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)

50def _punycode(s: str) -> bytes:

51 return s.encode("punycode")

54def _unot(s: int) -> str:

55 return "U+{:04X}".format(s)

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when ``len(label)`` does not exceed 63, False otherwise."""
    return len(label) <= 63

def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when the whole name fits the length limit.

    Args:
        label: The complete encoded name.
        trailing_dot: Whether the name ends with a separator; this raises
            the limit from 253 to 254.

    Returns:
        True if ``len(label)`` is within the applicable limit, else False.
    """
    limit = 254 if trailing_dot else 253
    return len(label) <= limit

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate *label* against the numbered Bidi rules applied below.

    Args:
        label: The label to examine, one character at a time.
        check_ltr: When true, the rules are applied even if the label holds
            no character of bidirectional class R, AL or AN.

    Returns:
        True when the label passes (or when the rules do not apply).

    Raises:
        IDNABidiError: If a character has no known directionality or any of
            the rules is violated.
    """
    # First pass: every character must have a known bidirectional class, and
    # we note whether any right-to-left class occurs at all.
    contains_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError("Unknown directionality in label {} at position {}".format(repr(label), position))
        if bidi_class in ("R", "AL", "AN"):
            contains_rtl = True
    if not (contains_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character decides the label's direction.
    leading_class = unicodedata.bidirectional(label[0])
    if leading_class == "L":
        rtl = False
    elif leading_class in ("R", "AL"):
        rtl = True
    else:
        raise IDNABidiError("First codepoint in label {} must be directionality L, R or AL".format(repr(label)))

    ends_validly = False
    numeral_class: Optional[str] = None
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if not rtl:
            # Bidi rule 5
            if bidi_class not in ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"):
                raise IDNABidiError("Invalid direction for codepoint at position {} in a left-to-right label".format(position))
            # Bidi rule 6: NSM leaves the current ending state untouched.
            if bidi_class in ("L", "EN"):
                ends_validly = True
            elif bidi_class != "NSM":
                ends_validly = False
            continue

        # Bidi rule 2
        if bidi_class not in ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM"):
            raise IDNABidiError("Invalid direction for codepoint at position {} in a right-to-left label".format(position))
        # Bidi rule 3: NSM leaves the current ending state untouched.
        if bidi_class in ("R", "AL", "EN", "AN"):
            ends_validly = True
        elif bidi_class != "NSM":
            ends_validly = False
        # Bidi rule 4: the first numeral class seen must be the only one.
        if bidi_class in ("AN", "EN"):
            if not numeral_class:
                numeral_class = bidi_class
            elif numeral_class != bidi_class:
                raise IDNABidiError("Can not mix numeral types in a right-to-left label")

    if not ends_validly:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True

138

139

def check_initial_combiner(label: str) -> bool:
    """Reject labels whose first character is in a Unicode "M" (mark) category.

    Returns:
        True when the first character is not a mark.

    Raises:
        IDNAError: If the general category of ``label[0]`` starts with "M".
    """
    first_category = unicodedata.category(label[0])
    if first_category.startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True

144

145

def check_hyphen_ok(label: str) -> bool:
    """Check the hyphen placement restrictions on *label*.

    Returns:
        True when no restriction is violated.

    Raises:
        IDNAError: If the 3rd and 4th characters are both hyphens, or the
            label starts or ends with a hyphen.
    """
    if label.startswith("--", 2):
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    if "-" in (label[0], label[-1]):
        raise IDNAError("Label must not start or end with a hyphen")
    return True

152

153

def check_nfc(label: str) -> None:
    """Ensure *label* is unchanged by NFC normalization.

    Raises:
        IDNAError: If normalizing the label to NFC yields a different string.
    """
    normalized = unicodedata.normalize("NFC", label)
    if normalized == label:
        return None
    raise IDNAError("Label must be in Normalization Form C")

157

158

def valid_contextj(label: str, pos: int) -> bool:
    """Decide whether the joiner at ``label[pos]`` is acceptable in its context.

    Only U+200C (ZERO WIDTH NON-JOINER) and U+200D (ZERO WIDTH JOINER) can
    pass; any other character yields False.

    Args:
        label: The label being validated.
        pos: Index of the character to examine.

    Returns:
        True if the joiner is allowed at that position, False otherwise.

    Raises:
        ValueError: Propagated from ``_combining_class`` for the preceding
            character.
    """
    cp_value = ord(label[pos])
    if cp_value not in (0x200C, 0x200D):
        return False

    # Both joiners are accepted directly after a character whose combining
    # class equals the virama class.
    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True
    if cp_value == 0x200D:
        return False

    # U+200C only: scan left, skipping joining type "T", for "L" or "D".
    transparent = ord("T")
    joins_before = False
    for index in range(pos - 1, -1, -1):
        joining_type = idnadata.joining_types.get(ord(label[index]))
        if joining_type == transparent:
            continue
        joins_before = joining_type in (ord("L"), ord("D"))
        break
    if not joins_before:
        return False

    # Then scan right, again skipping "T", for "R" or "D".
    for index in range(pos + 1, len(label)):
        joining_type = idnadata.joining_types.get(ord(label[index]))
        if joining_type == transparent:
            continue
        return joining_type in (ord("R"), ord("D"))
    return False

201

202

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Decide whether the character at ``label[pos]`` is acceptable in its context.

    Each recognised codepoint (or range) has its own rule; anything else
    yields False.

    Args:
        label: The label being validated.
        pos: Index of the character to examine.
        exception: Accepted but not used by this function.

    Returns:
        True if the character satisfies its contextual rule, False otherwise.
    """
    cp_value = ord(label[pos])

    # U+00B7: must sit between two U+006C ("l") characters.
    if cp_value == 0x00B7:
        if not 0 < pos < len(label) - 1:
            return False
        return ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C

    # U+0375: the following character must be Greek.
    if cp_value == 0x0375:
        if pos < len(label) - 1 and len(label) > 1:
            return _is_script(label[pos + 1], "Greek")
        return False

    # U+05F3 / U+05F4: the preceding character must be Hebrew.
    if cp_value in (0x05F3, 0x05F4):
        if pos > 0:
            return _is_script(label[pos - 1], "Hebrew")
        return False

    # U+30FB: some other character in the label must be Hiragana, Katakana or Han.
    if cp_value == 0x30FB:
        return any(
            _is_script(char, script)
            for char in label
            if char != "\u30fb"
            for script in ("Hiragana", "Katakana", "Han")
        )

    # U+0660..U+0669: no character from U+06F0..U+06F9 may appear in the label.
    if 0x660 <= cp_value <= 0x669:
        return not any(0x6F0 <= ord(char) <= 0x06F9 for char in label)

    # U+06F0..U+06F9: no character from U+0660..U+0669 may appear in the label.
    if 0x6F0 <= cp_value <= 0x6F9:
        return not any(0x660 <= ord(char) <= 0x0669 for char in label)

    return False

243

244

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first problem found.

    Bytes input is decoded as UTF-8 first. The label must be non-empty, pass
    ``check_nfc``, ``check_hyphen_ok`` and ``check_initial_combiner``, consist
    only of PVALID codepoints or CONTEXTJ/CONTEXTO codepoints that satisfy
    their contextual rule, and finally pass ``check_bidi``.

    Args:
        label: The label to validate.

    Raises:
        IDNAError: For an empty label, a failure of one of the checks above,
            or an unknown character next to a joiner.
        InvalidCodepointContext: If a CONTEXTJ or CONTEXTO codepoint is not
            allowed at its position.
        InvalidCodepoint: If a codepoint belongs to none of the allowed
            classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Only the lookup is guarded. InvalidCodepointContext derives from
            # UnicodeError, which is itself a ValueError, so raising it inside
            # this ``try`` would be caught below and replaced by the unrelated
            # "Unknown codepoint" error.
            try:
                joiner_allowed = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError(
                    "Unknown codepoint adjacent to joiner {} at position {} in {}".format(
                        _unot(cp_value), pos + 1, repr(label)
                    )
                )
            if not joiner_allowed:
                raise InvalidCodepointContext(
                    "Joiner {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(
                    "Codepoint {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        else:
            raise InvalidCodepoint(
                "Codepoint {} at position {} of {} not allowed".format(_unot(cp_value), pos + 1, repr(label))
            )

    check_bidi(label)

282

283

def alabel(label: str) -> bytes:
    """Convert one label to its ASCII (A-label) byte form.

    A label that is already ASCII is validated through ``ulabel`` and
    returned as bytes. Otherwise it is validated with ``check_label`` and
    Punycode-encoded behind the ``xn--`` prefix.

    Args:
        label: The label to convert.

    Returns:
        The encoded label as bytes.

    Raises:
        IDNAError: If validation fails or the result is longer than
            ``valid_label_length`` permits.
    """
    try:
        ascii_form = label.encode("ascii")
        ulabel(ascii_form)
        if valid_label_length(ascii_form):
            return ascii_form
        raise IDNAError("Label too long")
    except UnicodeEncodeError:
        # Not pure ASCII: continue with the Punycode path below.
        pass

    check_label(label)
    encoded = _alabel_prefix + _punycode(label)
    if valid_label_length(encoded):
        return encoded
    raise IDNAError("Label too long")

301

302

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert one label to its Unicode (U-label) string form.

    A non-ASCII string is validated and returned unchanged. ASCII input is
    lowercased; without the ``xn--`` prefix it is validated and returned as
    text, with the prefix the remainder is Punycode-decoded and validated.

    Args:
        label: The label as text or bytes.

    Returns:
        The label as a string.

    Raises:
        IDNAError: If the A-label is malformed, cannot be decoded, or the
            label fails ``check_label``.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = label
    else:
        try:
            raw = label.encode("ascii")
        except UnicodeEncodeError:
            # Already a Unicode label; it only needs validating.
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode("ascii")

    payload = raw[len(_alabel_prefix) :]
    if not payload:
        raise IDNAError("Malformed A-label, no Punycode eligible content found")
    if payload.decode("ascii")[-1] == "-":
        raise IDNAError("A-label must not end with a hyphen")

    try:
        decoded = payload.decode("punycode")
    except UnicodeError:
        raise IDNAError("Invalid A-label")
    check_label(decoded)
    return decoded

330

331

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in ``uts46data``; depending on the row's
    status it is kept, replaced, dropped (status "I") or rejected.

    Args:
        domain: The text to remap.
        std3_rules: When false, status "3" rows are kept or replaced instead
            of rejected.
        transitional: When true, status "D" rows are replaced rather than
            kept.

    Returns:
        The remapped text, normalized to NFC.

    Raises:
        InvalidCodepoint: If a character's row does not permit it under the
            given options.
    """
    from .uts46data import uts46data

    # Collect the pieces and join once instead of growing a string with "+=".
    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            # Codepoints below 256 index the table directly; the rest are
            # found by bisection.
            uts46row = uts46data[code_point if code_point < 256 else bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
            replacement: Optional[str] = None
            if len(uts46row) == 3:
                replacement = uts46row[2]
            if (
                status == "V"
                or (status == "D" and not transitional)
                or (status == "3" and not std3_rules and replacement is None)
            ):
                pieces.append(char)
            elif replacement is not None and (
                status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
            ):
                pieces.append(replacement)
            elif status != "I":
                # Handled by the except clause below.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                "Codepoint {} not allowed at position {} in {}".format(_unot(code_point), pos + 1, repr(domain))
            )

    return unicodedata.normalize("NFC", "".join(pieces))

364

365

def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Encode a whole domain name to its ASCII byte form, label by label.

    Args:
        s: The domain; bytes input must be ASCII.
        strict: When true, split labels on "." only; otherwise on every
            separator matched by ``_unicode_dots_re``.
        uts46: When true, run ``uts46_remap`` on the input first.
        std3_rules: Passed through to ``uts46_remap``.
        transitional: Passed through to ``uts46_remap``.

    Returns:
        The encoded labels joined with ``b"."``, keeping a trailing dot if
        the input had one.

    Raises:
        IDNAError: For non-ASCII bytes input, an empty domain or label, a
            label that fails ``alabel``, or a result that is too long.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final piece means the input ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        del labels[-1]

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b"")

    joined = b".".join(encoded_labels)
    if not valid_string_length(joined, trailing_dot):
        raise IDNAError("Domain too long")
    return joined

403

404

def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode a whole domain name to its Unicode string form, label by label.

    Args:
        s: The domain; bytes input must be ASCII.
        strict: When true, split labels on "." only; otherwise on every
            separator matched by ``_unicode_dots_re``.
        uts46: When true, run ``uts46_remap`` (non-transitional) first.
        std3_rules: Passed through to ``uts46_remap``.

    Returns:
        The decoded labels joined with ".", keeping a trailing dot if the
        input had one.

    Raises:
        IDNAError: For non-ASCII bytes input, an empty domain or label, or a
            label that fails ``ulabel``.
    """
    try:
        if not isinstance(s, str):
            s = str(s, "ascii")
    except UnicodeDecodeError:
        raise IDNAError("Invalid ASCII in A-label")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final piece means the input ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append("")
    return ".".join(decoded_labels)