Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.11/site-packages/idna/core.py: 12%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

292 statements  

1import bisect 

2import re 

3import unicodedata 

4from typing import Optional, Union 

5 

6from . import idnadata 

7from .intranges import intranges_contain 

8 

# Combining-class value that valid_contextj() compares the character preceding
# a joiner against.
_virama_combining_class = 9
# Byte prefix that alabel() prepends to Punycode output and ulabel() strips
# before Punycode decoding.
_alabel_prefix = b"xn--"
# Characters treated as label separators by encode()/decode() when not in
# strict mode: U+002E, U+3002, U+FF0E and U+FF61.
_unicode_dots_re = re.compile("[\u002e\u3002\uff0e\uff61]")

12 

13 

class IDNAError(UnicodeError):
    """Root of this module's exception hierarchy.

    All IDNA-related failures raised here are instances of this class, which
    itself derives from ``UnicodeError``.
    """

18 

19 

class IDNABidiError(IDNAError):
    """Raised when a label violates the bidirectional-text requirements."""

24 

25 

class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""

30 

31 

class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not permitted in the context where it appears."""

36 

37 

38def _combining_class(cp: int) -> int: 

39 v = unicodedata.combining(chr(cp)) 

40 if v == 0: 

41 if not unicodedata.name(chr(cp)): 

42 raise ValueError("Unknown character in unicodedata") 

43 return v 

44 

45 

def _is_script(cp: str, script: str) -> bool:
    """Tell whether the single character *cp* falls in the ranges stored for *script*.

    *script* is used as a key into ``idnadata.scripts``.
    """
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)

48 

49 

50def _punycode(s: str) -> bytes: 

51 return s.encode("punycode") 

52 

53 

54def _unot(s: int) -> str: 

55 return "U+{:04X}".format(s) 

56 

57 

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when *label* is at most 63 units long, False otherwise."""
    return not len(label) > 63

62 

63 

def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Check the overall length of an encoded domain.

    The limit is 254 when *trailing_dot* is true and 253 otherwise; the
    function returns True when ``len(label)`` does not exceed that limit.
    """
    if trailing_dot:
        limit = 254
    else:
        limit = 253
    return not len(label) > limit

68 

69 

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate *label* against the bidirectional rules implemented below.

    The rules are only enforced when the label contains at least one
    character of class R, AL or AN, or when *check_ltr* is true; otherwise
    the label is accepted immediately.

    Returns:
        True when the label passes.

    Raises:
        IDNABidiError: If a character has no known directionality, the first
            character is not L, R or AL, a character's class is not allowed
            for the label's direction, numeral types are mixed in a
            right-to-left label, or the label ends with a disallowed class.
    """
    # Collect every character's bidi class up front; an empty class means the
    # running interpreter's Unicode database does not know the character.
    classes = []
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == "":
            # String likely comes from a newer version of Unicode
            raise IDNABidiError("Unknown directionality in label {} at position {}".format(repr(label), position))
        classes.append(bidi_class)

    contains_rtl = any(bidi_class in ("R", "AL", "AN") for bidi_class in classes)
    if not (contains_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character decides the direction of the label.
    leading = unicodedata.bidirectional(label[0])
    if leading in ("R", "AL"):
        rtl = True
    elif leading == "L":
        rtl = False
    else:
        raise IDNABidiError("First codepoint in label {} must be directionality L, R or AL".format(repr(label)))

    if rtl:
        # Bidi rules 2 and 3
        permitted = ("R", "AL", "AN", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        closing = ("R", "AL", "EN", "AN")
        bad_direction = "Invalid direction for codepoint at position {} in a right-to-left label"
    else:
        # Bidi rules 5 and 6
        permitted = ("L", "EN", "ES", "CS", "ET", "ON", "BN", "NSM")
        closing = ("L", "EN")
        bad_direction = "Invalid direction for codepoint at position {} in a left-to-right label"

    valid_ending = False
    number_type: Optional[str] = None
    for position, bidi_class in enumerate(classes, 1):
        if bidi_class not in permitted:
            raise IDNABidiError(bad_direction.format(position))

        # NSM characters leave the "valid ending" state untouched.
        if bidi_class in closing:
            valid_ending = True
        elif bidi_class != "NSM":
            valid_ending = False

        # Bidi rule 4: a right-to-left label may not mix AN and EN numerals.
        if rtl and bidi_class in ("AN", "EN"):
            if not number_type:
                number_type = bidi_class
            elif number_type != bidi_class:
                raise IDNABidiError("Can not mix numeral types in a right-to-left label")

    if not valid_ending:
        raise IDNABidiError("Label ends with illegal codepoint directionality")

    return True

138 

139 

def check_initial_combiner(label: str) -> bool:
    """Reject labels whose first character is a combining mark.

    Returns True when the first character's general category does not start
    with ``M``; raises IDNAError otherwise.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category.startswith("M"):
        raise IDNAError("Label begins with an illegal combining character")
    return True

144 

145 

def check_hyphen_ok(label: str) -> bool:
    """Enforce the hyphen placement restrictions on *label*.

    Raises IDNAError when the 3rd and 4th characters are both hyphens, or
    when the label starts or ends with a hyphen; returns True otherwise.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == "--":
        raise IDNAError("Label has disallowed hyphens in 3rd and 4th position")
    if "-" in (label[0], label[-1]):
        raise IDNAError("Label must not start or end with a hyphen")
    return True

152 

153 

def check_nfc(label: str) -> None:
    """Raise IDNAError unless *label* is unchanged by NFC normalization."""
    normalized = unicodedata.normalize("NFC", label)
    if normalized == label:
        return
    raise IDNAError("Label must be in Normalization Form C")

157 

158 

def valid_contextj(label: str, pos: int) -> bool:
    """Decide whether the joiner at ``label[pos]`` is acceptable in its context.

    Only U+200C and U+200D are handled; any other character yields False.
    Either joiner is accepted when the preceding character has the virama
    combining class.  U+200C is additionally accepted when, skipping
    characters of joining type T, the nearest character before it has
    joining type L or D and the nearest one after it has joining type R or D.

    May propagate ``ValueError`` from ``_combining_class``.
    """
    joiner = ord(label[pos])
    if joiner not in (0x200C, 0x200D):
        return False

    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True
    if joiner == 0x200D:
        return False

    transparent = ord("T")

    def nearest_joining_type(indices):
        # Joining type of the first character that is not transparent, or
        # None when the indices run out (or the character has no entry).
        for index in indices:
            joining_type = idnadata.joining_types.get(ord(label[index]))
            if joining_type != transparent:
                return joining_type
        return None

    if nearest_joining_type(range(pos - 1, -1, -1)) not in (ord("L"), ord("D")):
        return False
    return nearest_joining_type(range(pos + 1, len(label))) in (ord("R"), ord("D"))

201 

202 

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Decide whether the character at ``label[pos]`` is acceptable in its context.

    Handled characters and the condition each must satisfy:

    * U+00B7: must sit between two U+006C characters.
    * U+0375: must be followed by a Greek-script character.
    * U+05F3 / U+05F4: must be preceded by a Hebrew-script character.
    * U+30FB: some other character of the label must be Hiragana, Katakana
      or Han.
    * U+0660..U+0669: the label must contain no U+06F0..U+06F9 character.
    * U+06F0..U+06F9: the label must contain no U+0660..U+0669 character.

    Any other character yields False.  The *exception* parameter is accepted
    but not used by this function.
    """
    code = ord(label[pos])

    if code == 0x00B7:
        if not 0 < pos < len(label) - 1:
            return False
        return ord(label[pos - 1]) == 0x006C and ord(label[pos + 1]) == 0x006C

    if code == 0x0375:
        if len(label) > 1 and pos < len(label) - 1:
            return _is_script(label[pos + 1], "Greek")
        return False

    if code in (0x05F3, 0x05F4):
        if pos > 0:
            return _is_script(label[pos - 1], "Hebrew")
        return False

    if code == 0x30FB:
        return any(
            _is_script(char, "Hiragana") or _is_script(char, "Katakana") or _is_script(char, "Han")
            for char in label
            if char != "\u30fb"
        )

    if 0x660 <= code <= 0x669:
        return not any(0x6F0 <= ord(char) <= 0x06F9 for char in label)

    if 0x6F0 <= code <= 0x6F9:
        return not any(0x660 <= ord(char) <= 0x0669 for char in label)

    return False

243 

244 

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first problem found.

    Bytes-like input is decoded as UTF-8 first.  The label must be non-empty,
    pass ``check_nfc``, ``check_hyphen_ok`` and ``check_initial_combiner``,
    consist only of codepoints classified PVALID, CONTEXTJ or CONTEXTO (the
    latter two subject to ``valid_contextj`` / ``valid_contexto``), and
    finally pass ``check_bidi``.

    Args:
        label: The label to validate.

    Raises:
        IDNAError: If the label is empty, a character adjacent to a joiner is
            unknown to ``unicodedata``, or one of the helper checks fails.
        InvalidCodepointContext: If a CONTEXTJ or CONTEXTO codepoint is not
            allowed at its position.
        InvalidCodepoint: If a codepoint belongs to none of the permitted
            classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode("utf-8")
    if len(label) == 0:
        raise IDNAError("Empty Label")

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for pos, cp in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes["PVALID"]):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTJ"]):
            # Only the lookup itself is guarded.  IDNAError derives from
            # UnicodeError and therefore from ValueError, so raising
            # InvalidCodepointContext inside this try block would be caught
            # by the handler below and reported with the wrong message.
            try:
                joiner_allowed = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError(
                    "Unknown codepoint adjacent to joiner {} at position {} in {}".format(
                        _unot(cp_value), pos + 1, repr(label)
                    )
                )
            if not joiner_allowed:
                raise InvalidCodepointContext(
                    "Joiner {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        elif intranges_contain(cp_value, idnadata.codepoint_classes["CONTEXTO"]):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext(
                    "Codepoint {} not allowed at position {} in {}".format(_unot(cp_value), pos + 1, repr(label))
                )
        else:
            raise InvalidCodepoint(
                "Codepoint {} at position {} of {} not allowed".format(_unot(cp_value), pos + 1, repr(label))
            )

    check_bidi(label)

282 

283 

def alabel(label: str) -> bytes:
    """Convert one label to its ASCII byte form.

    A label that is already pure ASCII is validated through ``ulabel`` and
    returned as its ASCII bytes.  Any other label is validated with
    ``check_label``, Punycode-encoded and prefixed with ``_alabel_prefix``.

    Raises:
        IDNAError: If the resulting bytes fail ``valid_label_length``, or if
            validation of the label fails.
    """
    try:
        ascii_form = label.encode("ascii")
    except UnicodeEncodeError:
        ascii_form = None

    if ascii_form is not None:
        # Already ASCII: validate it and hand it back unchanged.
        ulabel(ascii_form)
        if valid_label_length(ascii_form):
            return ascii_form
        raise IDNAError("Label too long")

    check_label(label)
    encoded = _alabel_prefix + _punycode(label)
    if valid_label_length(encoded):
        return encoded
    raise IDNAError("Label too long")

301 

302 

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert one label to its Unicode string form.

    * A ``str`` that cannot be encoded as ASCII is validated with
      ``check_label`` and returned unchanged.
    * Otherwise the ASCII bytes are lowercased.  Without the
      ``_alabel_prefix`` they are validated and returned decoded as ASCII.
    * With the prefix, the remainder is Punycode-decoded, validated and
      returned.

    Raises:
        IDNAError: If nothing follows the prefix, the remainder ends with a
            hyphen, Punycode decoding fails, or validation fails.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = label
    else:
        try:
            raw = label.encode("ascii")
        except UnicodeEncodeError:
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode("ascii")

    payload = raw[len(_alabel_prefix) :]
    if not payload:
        raise IDNAError("Malformed A-label, no Punycode eligible content found")
    if payload.decode("ascii")[-1] == "-":
        raise IDNAError("A-label must not end with a hyphen")

    try:
        decoded = payload.decode("punycode")
    except UnicodeError:
        raise IDNAError("Invalid A-label")
    check_label(decoded)
    return decoded

330 

331 

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in ``uts46data`` and, depending on the row's
    status and the two flags, is kept, substituted by the row's replacement,
    dropped (status ``"I"``) or rejected.  The result is returned in NFC.

    Raises:
        InvalidCodepoint: If a character is rejected or its lookup fails
            with ``IndexError``.
    """
    from .uts46data import uts46data

    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            # Code points below 256 index the table directly; the rest are
            # located by bisection.
            if code_point < 256:
                row_index = code_point
            else:
                row_index = bisect.bisect_left(uts46data, (code_point, "Z")) - 1
            row = uts46data[row_index]
            status = row[1]
            replacement: Optional[str] = row[2] if len(row) == 3 else None

            keep_as_is = (
                status == "V"
                or (status == "D" and not transitional)
                or (status == "3" and not std3_rules and replacement is None)
            )
            substitute = replacement is not None and (
                status == "M" or (status == "3" and not std3_rules) or (status == "D" and transitional)
            )

            if keep_as_is:
                pieces.append(char)
            elif substitute:
                pieces.append(replacement)
            elif status != "I":
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                "Codepoint {} not allowed at position {} in {}".format(_unot(code_point), pos + 1, repr(domain))
            )

    return unicodedata.normalize("NFC", "".join(pieces))

364 

365 

def encode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
    transitional: bool = False,
) -> bytes:
    """Encode a domain name label by label into its ASCII byte form.

    Args:
        s: The domain; bytes-like input is decoded as ASCII first.
        strict: When true, split labels on ``"."`` only; otherwise split on
            every separator matched by ``_unicode_dots_re``.
        uts46: When true, run ``uts46_remap`` on the domain before splitting.
        std3_rules: Forwarded to ``uts46_remap``.
        transitional: Forwarded to ``uts46_remap``.

    Returns:
        The ``alabel`` form of each label joined with ``b"."``; a trailing
        dot on the input is preserved.

    Raises:
        IDNAError: For non-ASCII bytes input, an empty domain, an empty
            label, or a result that fails ``valid_string_length``.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("should pass a unicode string to the function rather than a byte string.")
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # A final empty piece means the input ended with a separator.
    trailing_dot = labels[-1] == ""
    if trailing_dot:
        labels.pop()

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError("Empty label")
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b"")

    domain = b".".join(encoded_labels)
    if not valid_string_length(domain, trailing_dot):
        raise IDNAError("Domain too long")
    return domain

403 

404 

def decode(
    s: Union[str, bytes, bytearray],
    strict: bool = False,
    uts46: bool = False,
    std3_rules: bool = False,
) -> str:
    """Decode a domain name label by label into its Unicode string form.

    Args:
        s: The domain; bytes-like input is decoded as ASCII first.
        strict: When true, split labels on ``"."`` only; otherwise split on
            every separator matched by ``_unicode_dots_re``.
        uts46: When true, run ``uts46_remap`` (non-transitional) on the
            domain before splitting.
        std3_rules: Forwarded to ``uts46_remap``.

    Returns:
        The ``ulabel`` form of each label joined with ``"."``; a trailing
        dot on the input is preserved.

    Raises:
        IDNAError: For non-ASCII bytes input, an empty domain or an empty
            label.
    """
    if not isinstance(s, str):
        try:
            s = str(s, "ascii")
        except UnicodeDecodeError:
            raise IDNAError("Invalid ASCII in A-label")
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split(".") if strict else _unicode_dots_re.split(s)
    if not labels or labels == [""]:
        raise IDNAError("Empty domain")

    # A final empty piece means the input ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        labels.pop()

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError("Empty label")
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append("")
    return ".".join(decoded_labels)