Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/idna/core.py: 12%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

293 statements  

1import bisect 

2import re 

3import unicodedata 

4import warnings 

5from typing import Optional, Union 

6 

7from . import idnadata 

8from .intranges import intranges_contain 

9 

10_virama_combining_class = 9 

11_alabel_prefix = b"xn--" 

12_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]") 

13 

14 

class IDNAError(UnicodeError):
    """Base exception for all IDNA-encoding related problems.

    The other exception classes defined in this module derive from it, so
    catching ``IDNAError`` (or ``UnicodeError``) covers all of them.
    """

19 

20 

class IDNABidiError(IDNAError):
    """Exception when bidirectional requirements are not satisfied.

    Raised by ``check_bidi`` when a label breaks one of the Bidi rules or
    contains a character whose directionality is unknown.
    """

25 

26 

class InvalidCodepoint(IDNAError):
    """Exception when a disallowed or unallocated codepoint is used.

    Raised by ``check_label`` and ``uts46_remap`` for code points that none
    of their accepted categories cover.
    """

31 

32 

class InvalidCodepointContext(IDNAError):
    """Exception when the codepoint is not valid in the context it is used.

    Signals that a context-dependent code point appears somewhere its
    contextual rule does not permit.
    """

37 

38 

39def _combining_class(cp: int) -> int: 

40 v = unicodedata.combining(chr(cp)) 

41 if v == 0: 

42 if not unicodedata.name(chr(cp)): 

43 raise ValueError("Unknown character in unicodedata") 

44 return v 

45 

46 

def _is_script(cp: str, script: str) -> bool:
    """Report whether the single character *cp* falls in *script*'s ranges.

    *script* is used as a key into ``idnadata.scripts``; the membership test
    itself is delegated to ``intranges_contain``.
    """
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)

49 

50 

51def _punycode(s: str) -> bytes: 

52 return s.encode("punycode") 

53 

54 

55def _unot(s: int) -> str: 

56 return "U+{:04X}".format(s) 

57 

58 

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when *label* is at most 63 items long.

    ``len()`` is applied directly, so bytes are counted for ``bytes`` input
    and characters for ``str`` input.
    """
    return len(label) <= 63

63 

64 

def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when the whole name fits the overall length limit.

    The limit is 253, or 254 when *trailing_dot* is true (one extra item is
    allowed for the trailing separator).
    """
    limit = 254 if trailing_dot else 253
    return len(label) <= limit

69 

70 

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate *label* against the Bidi rules (numbered 1-6 below).

    Args:
        label: The label to check.
        check_ltr: When true, the rules are enforced even if the label has no
            right-to-left characters. Otherwise such labels are accepted
            immediately.

    Returns:
        True when the label passes (or the rules do not apply).

    Raises:
        IDNABidiError: if a character has no known directionality or one of
            the rules is violated.
    """
    # Gather each character's bidirectional class once; an empty class means
    # unicodedata has no directionality for that character.
    classes = []
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError("Unknown directionality in label {} at position {}".format(repr(label), position))
        classes.append(bidi_class)

    # Bidi rules should only be applied if string contains RTL characters
    has_rtl = any(bidi_class in ("R", "AL", "AN") for bidi_class in classes)
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character decides the direction of the label.
    first_class = unicodedata.bidirectional(label[0])
    if first_class in ("R", "AL"):
        rtl = True
    elif first_class == "L":
        rtl = False
    else:
        raise IDNABidiError("First codepoint in label {} must be directionality L, R or AL".format(repr(label)))

    # Tracks whether the last character that is not NSM has an acceptable
    # class for the end of the label.
    ends_validly = False

    if rtl:
        rtl_allowed = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        number_type: Optional[str] = None
        for position, bidi_class in enumerate(classes, 1):
            # Bidi rule 2
            if bidi_class not in rtl_allowed:
                raise IDNABidiError("Invalid direction for codepoint at position {} in a right-to-left label".format(position))
            # Bidi rule 3
            if bidi_class in ("R", "AL", "EN", "AN"):
                ends_validly = True
            elif bidi_class != "NSM":
                ends_validly = False
            # Bidi rule 4: only one of AN / EN may appear in the label.
            if bidi_class in ("AN", "EN"):
                if not number_type:
                    number_type = bidi_class
                elif number_type != bidi_class:
                    raise IDNABidiError("Can not mix numeral types in a right-to-left label")
    else:
        ltr_allowed = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        for position, bidi_class in enumerate(classes, 1):
            # Bidi rule 5
            if bidi_class not in ltr_allowed:
                raise IDNABidiError("Invalid direction for codepoint at position {} in a left-to-right label".format(position))
            # Bidi rule 6
            if bidi_class in ("L", "EN"):
                ends_validly = True
            elif bidi_class != "NSM":
                ends_validly = False

    if not ends_validly:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True

139 

140 

def check_initial_combiner(label: str) -> bool:
    """Reject labels whose first character is a combining mark.

    The first character's Unicode general category is inspected; any category
    starting with ``M`` is refused.

    Returns:
        True when the first character is acceptable.

    Raises:
        IDNAError: if the label starts with a mark character.
    """
    first_category = unicodedata.category(label[0])
    if first_category.startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True

145 

146 

def check_hyphen_ok(label: str) -> bool:
    """Enforce the hyphen placement restrictions on *label*.

    Returns:
        True when the hyphen placement is acceptable.

    Raises:
        IDNAError: if the 3rd and 4th characters are both hyphens, or the
            label starts or ends with a hyphen.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    if "-" in (label[0], label[-1]):
        raise IDNAError("Label must not start or end with a hyphen")
    return True

153 

154 

def check_nfc(label: str) -> None:
    """Ensure *label* is already in Unicode Normalization Form C.

    Raises:
        IDNAError: if normalising the label to NFC would change it.
    """
    if not unicodedata.is_normalized("NFC", label):
        raise IDNAError("Label must be in Normalization Form C")

158 

159 

def valid_contextj(label: str, pos: int) -> bool:
    """Check the contextual rule for the joiner at ``label[pos]``.

    Only U+200C (zero width non-joiner) and U+200D (zero width joiner) are
    handled; any other code point yields False.

    Both joiners are accepted when the preceding character has the virama
    combining class. U+200C is additionally accepted when, skipping
    characters of joining type T, the nearest character before it has joining
    type L or D and the nearest character after it has joining type R or D.

    Raises:
        ValueError: propagated from ``_combining_class`` when the preceding
            character is unknown to unicodedata.
    """
    cp_value = ord(label[pos])
    if cp_value not in (0x200C, 0x200D):
        return False

    # Shared by both joiners: directly after a virama is always fine.
    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True

    if cp_value == 0x200D:
        return False

    # U+200C: scan backwards for an L/D joining type, ignoring T.
    for i in reversed(range(pos)):
        joining_type = idnadata.joining_types().get(ord(label[i]))
        if joining_type == ord("T"):
            continue
        if joining_type not in [ord("L"), ord("D")]:
            return False
        break
    else:
        # Ran out of characters without finding a suitable one.
        return False

    # ...then forwards for an R/D joining type, again ignoring T.
    for i in range(pos + 1, len(label)):
        joining_type = idnadata.joining_types().get(ord(label[i]))
        if joining_type == ord("T"):
            continue
        return joining_type in [ord("R"), ord("D")]
    return False

202 

203 

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Check the contextual rule for the code point at ``label[pos]``.

    Rules implemented here:

    * U+00B7: must have ``l`` (U+006C) directly on both sides.
    * U+0375: the following character must be Greek.
    * U+05F3 / U+05F4: the preceding character must be Hebrew.
    * U+30FB: some other character in the label must be Hiragana, Katakana
      or Han.
    * U+0660..U+0669: the label must contain no U+06F0..U+06F9 digits.
    * U+06F0..U+06F9: the label must contain no U+0660..U+0669 digits.

    Any other code point yields False. The *exception* argument is accepted
    but not used by this function.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x00B7:
        return 0 < pos < len(label) - 1 and label[pos - 1] == "\u006c" and label[pos + 1] == "\u006c"

    if cp_value == 0x0375:
        has_following = pos < len(label) - 1 and len(label) > 1
        return has_following and _is_script(label[pos + 1], "Greek")

    if cp_value in (0x05F3, 0x05F4):
        return pos > 0 and _is_script(label[pos - 1], "Hebrew")

    if cp_value == 0x30FB:
        # The dot itself does not count towards the script requirement.
        return any(
            _is_script(char, "Hiragana") or _is_script(char, "Katakana") or _is_script(char, "Han")
            for char in label
            if char != "\u30fb"
        )

    if 0x660 <= cp_value <= 0x669:
        return not any(0x6F0 <= ord(char) <= 0x06F9 for char in label)

    if 0x6F0 <= cp_value <= 0x6F9:
        return not any(0x660 <= ord(char) <= 0x0669 for char in label)

    return False

244 

245 

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first problem found.

    Bytes input is decoded as UTF-8 first. The label must be non-empty, pass
    ``check_nfc``, ``check_hyphen_ok`` and ``check_initial_combiner``, consist
    only of PVALID code points or CONTEXTJ/CONTEXTO code points that satisfy
    their contextual rule, and finally pass ``check_bidi``.

    Raises:
        IDNAError: for an empty label, a failed structural check, or a joiner
            next to a character unknown to unicodedata.
        InvalidCodepointContext: when a CONTEXTJ or CONTEXTO code point is
            used where its contextual rule does not allow it.
        InvalidCodepoint: when a code point is in none of the accepted
            classes.
        IDNABidiError: propagated from ``check_bidi``.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if not label:
        raise IDNAError("Empty Label")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Only the rule evaluation belongs in the try block. The IDNA
            # exceptions derive from UnicodeError, which is a ValueError, so
            # raising InvalidCodepointContext inside it would be caught by
            # the handler below and reported with the wrong message.
            try:
                joiner_allowed = valid_contextj(label, pos)
            except ValueError as err:
                raise IDNAError(
                    "Unknown codepoint adjacent to joiner {} at position {} in {}".format(
                        _unot(cp_value), pos + 1, repr(label)
                    )
                ) from err
            if not joiner_allowed:
                raise InvalidCodepointContext(
                    "Joiner {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(
                    "Codepoint {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        else:
            raise InvalidCodepoint(
                "Codepoint {} at position {} of {} not allowed".format(_unot(cp_value), pos + 1, repr(label))
            )

    check_bidi(label)

283 

284 

def alabel(label: str) -> bytes:
    """Convert a single label to its ASCII (A-label) byte form.

    A label that encodes to ASCII is validated through ``ulabel`` and
    returned as those ASCII bytes. Any other label is validated with
    ``check_label`` and returned as the ``xn--`` prefix followed by its
    Punycode encoding.

    Raises:
        IDNAError: if validation fails or the result does not pass
            ``valid_label_length``.
    """
    try:
        ascii_label = label.encode("ascii")
        ulabel(ascii_label)
        if valid_label_length(ascii_label):
            return ascii_label
        raise IDNAError("Label too long")
    except UnicodeEncodeError:
        # Not representable in ASCII: fall through to the Punycode path.
        pass

    check_label(label)
    encoded_label = _alabel_prefix + _punycode(label)
    if valid_label_length(encoded_label):
        return encoded_label
    raise IDNAError("Label too long")

302 

303 

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert a single label to its Unicode (U-label) string form.

    * A ``str`` that cannot be encoded as ASCII is validated with
      ``check_label`` and returned unchanged.
    * ASCII input (``str`` or bytes) is lower-cased. Without the ``xn--``
      prefix it is validated and returned as a string.
    * With the prefix, the remainder is Punycode-decoded, validated and
      returned.

    Raises:
        IDNAError: if the A-label has nothing after the prefix, ends with a
            hyphen, cannot be Punycode-decoded, or validation fails.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = bytes(label)
    else:
        try:
            raw = label.encode("ascii")
        except UnicodeEncodeError:
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode("ascii")

    # Strip the prefix; what remains must be a usable Punycode payload.
    raw = raw[len(_alabel_prefix) :]
    if not raw:
        raise IDNAError("Malformed A-label, no Punycode eligible content found")
    if raw.decode("ascii")[-1] == "-":
        raise IDNAError("A-label must not end with a hyphen")

    try:
        decoded = raw.decode("punycode")
    except UnicodeError:
        raise IDNAError("Invalid A-label")
    check_label(decoded)
    return decoded

331 

332 

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in the ``uts46data`` table and, depending on
    its status and the two flags, is kept, replaced by its mapping, dropped
    (status ``I``) or rejected. The collected output is returned
    NFC-normalised.

    Raises:
        InvalidCodepoint: if a character is not permitted under the given
            flags.
    """
    from .uts46data import uts46data

    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        # Code points below 256 index the table directly; everything else is
        # located with a binary search.
        if code_point < 256:
            row_index = code_point
        else:
            row_index = bisect.bisect_left(uts46data, (code_point, "Z")) - 1
        uts46row = uts46data[row_index]
        status = uts46row[1]
        replacement: Optional[str] = uts46row[2] if len(uts46row) == 3 else None

        keep_as_is = (
            status == "V"
            or (status == "D" and not transitional)
            or (status == "3" and not std3_rules and replacement is None)
        )
        use_mapping = replacement is not None and (
            status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
        )

        if keep_as_is:
            pieces.append(char)
        elif use_mapping:
            pieces.append(replacement)
        elif status == "I":
            continue
        else:
            raise InvalidCodepoint(
                "Codepoint {} not allowed at position {} in {}".format(_unot(code_point), pos + 1, repr(domain))
            )

    return unicodedata.normalize("NFC", "".join(pieces))

360 

361 

def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Encode a domain name into its ASCII form, label by label.

    Args:
        s: The domain; non-``str`` input is decoded as ASCII first.
        strict: When true only ``.`` separates labels; otherwise every
            character matched by ``_unicode_dots_re`` does.
        uts46: When true the input is first passed through ``uts46_remap``.
        std3_rules: Forwarded to ``uts46_remap``.
        transitional: Forwarded to ``uts46_remap``; a true value emits a
            ``DeprecationWarning``.

    Returns:
        The encoded labels joined with ``.``; a trailing separator in the
        input is preserved.

    Raises:
        IDNAError: for non-ASCII byte input, an empty domain or label, an
            invalid label, or a result rejected by ``valid_string_length``.
    """
    if transitional:
        warnings.warn(
            "Transitional processing has been removed from UTS #46. "
            "The transitional argument will be removed in a future version.",
            DeprecationWarning,
            stacklevel=2,
        )
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final piece means the input ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        del labels[-1]

    encoded_labels = []
    for label in labels:
        encoded_label = alabel(label)
        if not encoded_label:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded_label)
    if trailing_dot:
        encoded_labels.append(b"")

    encoded_domain = b".".join(encoded_labels)
    if not valid_string_length(encoded_domain, trailing_dot):
        raise IDNAError("Domain too long")
    return encoded_domain

406 

407 

def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode a domain name into its Unicode form, label by label.

    Args:
        s: The domain; non-``str`` input is decoded as ASCII first.
        strict: When true only ``.`` separates labels; otherwise every
            character matched by ``_unicode_dots_re`` does.
        uts46: When true the input is first passed through ``uts46_remap``
            (always with transitional processing off).
        std3_rules: Forwarded to ``uts46_remap``.

    Returns:
        The decoded labels joined with ``.``; a trailing separator in the
        input is preserved.

    Raises:
        IDNAError: for non-ASCII byte input, an empty domain or label, or an
            invalid label.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("Invalid ASCII in A-label")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # An empty final piece means the input ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    decoded_labels = []
    for label in labels:
        decoded_label = ulabel(label)
        if not decoded_label:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded_label)
    if trailing_dot:
        decoded_labels.append("")
    return ".".join(decoded_labels)