Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/idna/core.py: 12%

1import bisect

2import re

3import unicodedata

4import warnings

5from typing import Optional, Union

7from . import idnadata

8from .intranges import intranges_contain

# Canonical combining class value that valid_contextj() compares against the
# character preceding a joiner.
_virama_combining_class = 9
# Prefix that alabel() prepends to the Punycode form and ulabel() strips.
_alabel_prefix = b"xn--"
# Label separators accepted by encode()/decode() in non-strict mode:
# U+002E, U+3002, U+FF0E and U+FF61.
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")

class IDNAError(UnicodeError):
    """Root of the IDNA exception hierarchy; every IDNA-specific failure derives from it."""

class IDNABidiError(IDNAError):
    """Raised when a label violates the bidirectional-text requirements."""

class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""

class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is used in a context where it is not valid."""

39def _combining_class(cp: int) -> int:

40 v = unicodedata.combining(chr(cp))

41 if v == 0:

42 if not unicodedata.name(chr(cp)):

43 raise ValueError("Unknown character in unicodedata")

44 return v

def _is_script(cp: str, script: str) -> bool:
    """Tell whether character ``cp`` lies in the ranges stored under ``idnadata.scripts[script]``."""
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)

51def _punycode(s: str) -> bytes:

52 return s.encode("punycode")

55def _unot(s: int) -> str:

56 return "U+{:04X}".format(s)

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when ``label`` is at most 63 items long (bytes or characters, depending on its type)."""
    return len(label) <= 63

def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when ``label`` fits the overall length limit.

    The limit is 254 when ``trailing_dot`` is true and 253 otherwise.
    """
    limit = 254 if trailing_dot else 253
    return len(label) <= limit

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate the bidirectional-text constraints on ``label``.

    The label is first scanned for characters of bidirectional class R, AL or
    AN. If none is present and ``check_ltr`` is false, the label is accepted
    without further checks. Otherwise the first character fixes the label
    direction (R/AL: right-to-left, L: left-to-right), every character must
    belong to the classes permitted for that direction, and the label must end
    in a permitted class (NSM characters leave the ending state untouched).

    Args:
        label: The label to validate.
        check_ltr: Apply the rules even when the label has no R/AL/AN character.

    Returns:
        True when all checks pass.

    Raises:
        IDNABidiError: On an unknown directionality, a bad first character, a
            class not allowed for the label direction, mixed AN/EN numerals in
            a right-to-left label, or an illegal ending.
    """
    # Bidi rules should only be applied if string contains RTL characters
    rtl_classes_found = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError("Unknown directionality in label {} at position {}".format(repr(label), position))
        if bidi_class in ("R", "AL", "AN"):
            rtl_classes_found = True
    if not (rtl_classes_found or check_ltr):
        return True

    # Bidi rule 1
    first_class = unicodedata.bidirectional(label[0])
    if first_class == "L":
        rtl = False
    elif first_class in ("R", "AL"):
        rtl = True
    else:
        raise IDNABidiError("First codepoint in label {} must be directionality L, R or AL".format(repr(label)))

    if rtl:
        # Bidi rules 2 and 3
        allowed_classes = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        ending_classes = ("R", "AL", "EN", "AN")
    else:
        # Bidi rules 5 and 6
        allowed_classes = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        ending_classes = ("L", "EN")

    valid_ending = False
    number_type: Optional[str] = None
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if bidi_class not in allowed_classes:
            if rtl:
                raise IDNABidiError(
                    "Invalid direction for codepoint at position {} in a right-to-left label".format(position)
                )
            raise IDNABidiError(
                "Invalid direction for codepoint at position {} in a left-to-right label".format(position)
            )

        # NSM neither validates nor invalidates the ending seen so far.
        if bidi_class in ending_classes:
            valid_ending = True
        elif bidi_class != "NSM":
            valid_ending = False

        # Bidi rule 4: a right-to-left label may use AN or EN numerals, not both.
        if rtl and bidi_class in ("AN", "EN"):
            if number_type is None:
                number_type = bidi_class
            elif number_type != bidi_class:
                raise IDNABidiError("Can not mix numeral types in a right-to-left label")

    if not valid_ending:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True

139

140

def check_initial_combiner(label: str) -> bool:
    """Reject labels whose first character has a general category starting with ``M``.

    Returns:
        True when the first character is not in a mark category.

    Raises:
        IDNAError: If the label starts with such a character.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True

145

146

def check_hyphen_ok(label: str) -> bool:
    """Check the hyphen placement rules for ``label``.

    Returns:
        True when the label passes both checks.

    Raises:
        IDNAError: If positions 3 and 4 are both hyphens, or the label starts
            or ends with a hyphen.
    """
    if label[2:4] == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    first_char, last_char = label[0], label[-1]
    if "-" in (first_char, last_char):
        raise IDNAError("Label must not start or end with a hyphen")
    return True

153

154

def check_nfc(label: str) -> None:
    """Ensure ``label`` is unchanged by NFC normalization.

    Raises:
        IDNAError: If normalizing to NFC would alter the label.
    """
    normalized = unicodedata.normalize("NFC", label)
    if normalized == label:
        return
    raise IDNAError("Label must be in Normalization Form C")

158

159

def valid_contextj(label: str, pos: int) -> bool:
    """Decide whether the joiner at ``label[pos]`` is acceptable in its context.

    Only U+200C and U+200D are handled; any other codepoint yields False.
    Either joiner is accepted when the preceding character has the virama
    combining class. U+200C is additionally accepted when, skipping characters
    of joining type ``T``, the nearest character before it has joining type
    ``L`` or ``D`` and the nearest character after it has joining type ``R``
    or ``D``.

    Args:
        label: The label containing the joiner.
        pos: Index of the joiner within ``label``.

    Returns:
        True if the joiner is allowed at that position, False otherwise.

    Raises:
        ValueError: Propagated from ``_combining_class`` for the preceding
            character.
    """
    codepoint = ord(label[pos])
    if codepoint not in (0x200C, 0x200D):
        return False

    # Shared rule: a joiner directly after a virama is always fine.
    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True

    if codepoint == 0x200D:
        return False

    # U+200C only: walk backwards over type-T characters to the nearest other one.
    joined_before = False
    for idx in reversed(range(pos)):
        joining_type = idnadata.joining_types().get(ord(label[idx]))
        if joining_type == ord("T"):
            continue
        joined_before = joining_type in (ord("L"), ord("D"))
        break
    if not joined_before:
        return False

    # Then walk forwards the same way; the first non-T character decides.
    for idx in range(pos + 1, len(label)):
        joining_type = idnadata.joining_types().get(ord(label[idx]))
        if joining_type == ord("T"):
            continue
        return joining_type in (ord("R"), ord("D"))
    return False

202

203

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Decide whether the codepoint at ``label[pos]`` is acceptable in its context.

    Handled codepoints and their conditions:
      * U+00B7: must sit between two ``l`` characters.
      * U+0375: the following character must be in the Greek script.
      * U+05F3 / U+05F4: the preceding character must be in the Hebrew script.
      * U+30FB: some other character of the label must be Hiragana, Katakana or Han.
      * U+0660..U+0669: the label must contain no U+06F0..U+06F9 character.
      * U+06F0..U+06F9: the label must contain no U+0660..U+0669 character.
    Any other codepoint yields False.

    Args:
        label: The label containing the codepoint.
        pos: Index of the codepoint within ``label``.
        exception: Accepted but not used by this function.

    Returns:
        True if the codepoint is allowed at that position, False otherwise.
    """
    current = ord(label[pos])
    last_index = len(label) - 1

    if current == 0x00B7:
        return 0 < pos < last_index and label[pos - 1] == "l" and label[pos + 1] == "l"

    if current == 0x0375:
        if pos < last_index and len(label) > 1:
            return _is_script(label[pos + 1], "Greek")
        return False

    if current in (0x05F3, 0x05F4):
        if pos > 0:
            return _is_script(label[pos - 1], "Hebrew")
        return False

    if current == 0x30FB:
        # The middle dot itself is skipped; any other qualifying character is enough.
        return any(
            _is_script(char, "Hiragana") or _is_script(char, "Katakana") or _is_script(char, "Han")
            for char in label
            if char != "\u30fb"
        )

    if 0x660 <= current <= 0x669:
        return not any(0x6F0 <= ord(char) <= 0x06F9 for char in label)

    if 0x6F0 <= current <= 0x6F9:
        return not any(0x660 <= ord(char) <= 0x0669 for char in label)

    return False

244

245

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first violation found.

    Bytes input is decoded as UTF-8 first. The label must be non-empty, pass
    ``check_nfc``, ``check_hyphen_ok`` and ``check_initial_combiner``, consist
    only of codepoints in the PVALID class or of CONTEXTJ/CONTEXTO codepoints
    that are valid in their context, and finally pass ``check_bidi``.

    Args:
        label: The label to validate, as text or UTF-8 bytes.

    Raises:
        IDNAError: For an empty label, a failed structural check, or when a
            character next to a joiner is unknown to ``unicodedata``.
        InvalidCodepointContext: If a CONTEXTJ or CONTEXTO codepoint is not
            valid where it appears.
        InvalidCodepoint: If a codepoint belongs to none of the allowed classes.
        IDNABidiError: If the bidirectional checks fail.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Only the lookup sits inside the try: InvalidCodepointContext is
            # itself a ValueError (via UnicodeError), so raising it inside the
            # guarded region would turn it into the "Unknown codepoint" error.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError as err:
                raise IDNAError(
                    "Unknown codepoint adjacent to joiner {} at position {} in {}".format(
                        _unot(cp_value), pos + 1, repr(label)
                    )
                ) from err
            if not joiner_ok:
                raise InvalidCodepointContext(
                    "Joiner {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(
                    "Codepoint {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        else:
            raise InvalidCodepoint(
                "Codepoint {} at position {} of {} not allowed".format(_unot(cp_value), pos + 1, repr(label))
            )

    check_bidi(label)

283

284

def alabel(label: str) -> bytes:
    """Convert one label to its ASCII (A-label) byte form.

    A label that encodes as ASCII is validated through ``ulabel`` and returned
    as those bytes. Any other label is validated with ``check_label`` and
    returned as the ``xn--`` prefix followed by its Punycode encoding.

    Args:
        label: The label to convert.

    Returns:
        The label as ASCII bytes.

    Raises:
        IDNAError: If validation fails or the result exceeds the label length limit.
    """
    try:
        ascii_label = label.encode("ascii")
        ulabel(ascii_label)
        if valid_label_length(ascii_label):
            return ascii_label
        raise IDNAError("Label too long")
    except UnicodeEncodeError:
        # Not ASCII: fall through to the Punycode path below.
        pass

    check_label(label)
    encoded = _alabel_prefix + _punycode(label)
    if valid_label_length(encoded):
        return encoded
    raise IDNAError("Label too long")

302

303

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert one label to its Unicode (U-label) text form.

    Text that does not encode as ASCII is validated with ``check_label`` and
    returned unchanged. ASCII input is lower-cased; without the ``xn--``
    prefix it is validated and returned as text, and with the prefix the
    remainder is Punycode-decoded, validated and returned.

    Args:
        label: The label as text or bytes.

    Returns:
        The label as a ``str``.

    Raises:
        IDNAError: If the A-label has no payload, ends with a hyphen, cannot
            be Punycode-decoded, or fails validation.
    """
    if not isinstance(label, (bytes, bytearray)):
        try:
            label_bytes = label.encode("ascii")
        except UnicodeEncodeError:
            check_label(label)
            return label
    else:
        label_bytes = bytes(label)

    label_bytes = label_bytes.lower()
    if not label_bytes.startswith(_alabel_prefix):
        check_label(label_bytes)
        # NOTE(review): check_label() decodes bytes as UTF-8, so non-ASCII bytes
        # can pass validation and still fail here; confirm whether intended.
        return label_bytes.decode("ascii")

    label_bytes = label_bytes[len(_alabel_prefix) :]
    if not label_bytes:
        raise IDNAError("Malformed A-label, no Punycode eligible content found")
    # Checked on the bytes directly: decoding just to look at the last
    # character let a raw UnicodeDecodeError escape for non-ASCII bytes input.
    if label_bytes.endswith(b"-"):
        raise IDNAError("A-label must not end with a hyphen")

    try:
        label = label_bytes.decode("punycode")
    except UnicodeError as err:
        raise IDNAError("Invalid A-label") from err
    check_label(label)
    return label

331

332

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters of ``domain`` using the ``uts46data`` table.

    Each character is looked up in the table and, depending on the row's
    status code and the two flags, is kept, replaced by the row's replacement
    string, dropped (status ``"I"``), or rejected. The assembled text is
    returned in NFC form.

    Args:
        domain: The text to re-map.
        std3_rules: When false, status ``"3"`` rows are kept or replaced
            instead of rejected.
        transitional: When true, status ``"D"`` rows with a replacement are
            replaced instead of kept.

    Returns:
        The re-mapped, NFC-normalized string.

    Raises:
        InvalidCodepoint: If a character's row permits none of the above.
    """
    from .uts46data import uts46data

    pieces = []
    for index, char in enumerate(domain):
        code_point = ord(char)
        if code_point < 256:
            # The first 256 rows are addressed directly by code point.
            row = uts46data[code_point]
        else:
            # Presumably the table is sorted by starting code point, so this
            # finds the row covering code_point; verify against uts46data.
            row = uts46data[bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
        status = row[1]
        replacement: Optional[str] = row[2] if len(row) == 3 else None

        keep_as_is = (
            status == "V"
            or (status == "D" and not transitional)
            or (status == "3" and not std3_rules and replacement is None)
        )
        use_replacement = replacement is not None and (
            status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
        )

        if keep_as_is:
            pieces.append(char)
        elif use_replacement:
            pieces.append(replacement)
        elif status != "I":
            raise InvalidCodepoint(
                "Codepoint {} not allowed at position {} in {}".format(_unot(code_point), index + 1, repr(domain))
            )

    return unicodedata.normalize("NFC", "".join(pieces))

360

361

def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Encode a domain name to its ASCII byte form, label by label.

    Args:
        s: The domain; bytes input must be pure ASCII.
        strict: Split labels on ``"."`` only, instead of on every separator
            matched by ``_unicode_dots_re``.
        uts46: Run ``uts46_remap`` on the domain before splitting.
        std3_rules: Forwarded to ``uts46_remap``.
        transitional: Forwarded to ``uts46_remap``; a true value also emits a
            DeprecationWarning.

    Returns:
        The labels converted by ``alabel`` and joined with ``b"."``, with a
        trailing dot preserved when the input had one.

    Raises:
        IDNAError: For non-ASCII bytes input, an empty domain or label, an
            invalid label, or a result that is too long.
    """
    if transitional:
        warnings.warn(
            "Transitional processing has been removed from UTS #46. "
            "The transitional argument will be removed in a future version.",
            DeprecationWarning,
            stacklevel=2,
        )
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final label means the domain ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        del labels[-1]

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b"")

    domain = b".".join(encoded_labels)
    if not valid_string_length(domain, trailing_dot):
        raise IDNAError("Domain too long")
    return domain

406

407

def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode a domain name to its Unicode text form, label by label.

    Args:
        s: The domain; bytes input must be pure ASCII.
        strict: Split labels on ``"."`` only, instead of on every separator
            matched by ``_unicode_dots_re``.
        uts46: Run ``uts46_remap`` (non-transitional) on the domain first.
        std3_rules: Forwarded to ``uts46_remap``.

    Returns:
        The labels converted by ``ulabel`` and joined with ``"."``, with a
        trailing dot preserved when the input had one.

    Raises:
        IDNAError: For non-ASCII bytes input, an empty domain or label, or an
            invalid label.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("Invalid ASCII in A-label")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final label means the domain ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append("")
    return ".".join(decoded_labels)