Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/idna/core.py: 12%

294 statements  

« prev     ^ index     » next       coverage.py v7.3.2, created at 2023-12-08 06:05 +0000

1from . import idnadata 

2import bisect 

3import unicodedata 

4import re 

5from typing import Union, Optional 

6from .intranges import intranges_contain 

7 

8_virama_combining_class = 9 

9_alabel_prefix = b'xn--' 

10_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]') 

11 

class IDNAError(UnicodeError):
    """Root of the exception hierarchy for every IDNA-encoding failure."""

15 

16 

class IDNABidiError(IDNAError):
    """Raised when a label does not satisfy the bidirectional requirements."""

20 

21 

class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""

25 

26 

class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not valid in the context where it appears."""

30 

31 

32def _combining_class(cp: int) -> int: 

33 v = unicodedata.combining(chr(cp)) 

34 if v == 0: 

35 if not unicodedata.name(chr(cp)): 

36 raise ValueError('Unknown character in unicodedata') 

37 return v 

38 

def _is_script(cp: str, script: str) -> bool:
    """Report whether the character *cp* falls inside the ranges that
    ``idnadata.scripts`` stores under the key *script*."""
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)

41 

42def _punycode(s: str) -> bytes: 

43 return s.encode('punycode') 

44 

45def _unot(s: int) -> str: 

46 return 'U+{:04X}'.format(s) 

47 

48 

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when *label* holds at most 63 items (bytes or characters)."""
    return len(label) <= 63

53 

54 

def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when *label* does not exceed the length limit.

    The limit is 254 when *trailing_dot* is true and 253 otherwise.
    """
    limit = 254 if trailing_dot else 253
    return len(label) <= limit

59 

60 

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate *label* against the six Bidi rules.

    :param label: the label to inspect.
    :param check_ltr: when true, apply the rules even if the label holds no
        character of directionality R, AL or AN.
    :returns: True when the label passes (or the rules do not apply).
    :raises IDNABidiError: on unknown directionality or any rule violation.
    """
    # Bidi rules should only be applied if string contains RTL characters
    has_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            has_rtl = True
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1
    first_class = unicodedata.bidirectional(label[0])
    if first_class in ('R', 'AL'):
        rtl = True
    elif first_class == 'L':
        rtl = False
    else:
        raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))

    if rtl:
        # Bidi rules 2 and 3
        permitted = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        closing = ('R', 'AL', 'EN', 'AN')
        violation = 'Invalid direction for codepoint at position {} in a right-to-left label'
    else:
        # Bidi rules 5 and 6
        permitted = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        closing = ('L', 'EN')
        violation = 'Invalid direction for codepoint at position {} in a left-to-right label'

    valid_ending = False
    number_type = None  # becomes 'AN' or 'EN' once the first digit is seen
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)

        if bidi_class not in permitted:
            raise IDNABidiError(violation.format(position))

        # NSM characters leave the ending state untouched.
        if bidi_class in closing:
            valid_ending = True
        elif bidi_class != 'NSM':
            valid_ending = False

        # Bidi rule 4
        if rtl and bidi_class in ('AN', 'EN'):
            if not number_type:
                number_type = bidi_class
            elif number_type != bidi_class:
                raise IDNABidiError('Can not mix numeral types in a right-to-left label')

    if not valid_ending:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True

118 

119 

def check_initial_combiner(label: str) -> bool:
    """Return True unless the first character's general category starts
    with ``M``, in which case IDNAError is raised."""
    major_category = unicodedata.category(label[0])[0]
    if major_category == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True

124 

125 

def check_hyphen_ok(label: str) -> bool:
    """Return True when the hyphens in *label* are acceptably placed.

    :raises IDNAError: if the 3rd and 4th characters are both hyphens, or
        if the label starts or ends with a hyphen.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    if '-' in (label[0], label[-1]):
        raise IDNAError('Label must not start or end with a hyphen')
    return True

132 

133 

def check_nfc(label: str) -> None:
    """Raise IDNAError unless *label* is unchanged by NFC normalization."""
    normalized = unicodedata.normalize('NFC', label)
    if normalized != label:
        raise IDNAError('Label must be in Normalization Form C')

137 

138 

def valid_contextj(label: str, pos: int) -> bool:
    """Apply the CONTEXTJ rule for the joiner at ``label[pos]``.

    :param label: the label being validated.
    :param pos: zero-based index of the joiner inside *label*.
    :returns: True when the joiner is allowed at that position; False
        otherwise, including when the character is neither U+200C nor U+200D.
    :raises ValueError: propagated from ``_combining_class`` for a preceding
        character unknown to ``unicodedata``.
    """
    cp_value = ord(label[pos])

    if cp_value == 0x200c:
        # A joiner directly after a virama is always allowed.
        if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
            return True

        # Otherwise the joiner must sit between joining characters:
        #   (L|D) T* ZWNJ T* (R|D)
        # Only transparent (T) characters may be skipped; the first
        # non-transparent character on each side decides the outcome, so the
        # scan stops there instead of continuing through the whole label.
        ok = False
        for i in range(pos - 1, -1, -1):
            joining_type = idnadata.joining_types.get(ord(label[i]))
            if joining_type == ord('T'):
                continue
            ok = joining_type in (ord('L'), ord('D'))
            break

        if not ok:
            return False

        ok = False
        for i in range(pos + 1, len(label)):
            joining_type = idnadata.joining_types.get(ord(label[i]))
            if joining_type == ord('T'):
                continue
            ok = joining_type in (ord('R'), ord('D'))
            break
        return ok

    if cp_value == 0x200d:
        # U+200D is only allowed directly after a virama.
        if pos > 0:
            if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
                return True
        return False

    return False

180 

181 

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Apply the CONTEXTO rule for the character at ``label[pos]``.

    :param label: the label being validated.
    :param pos: zero-based index of the character to check.
    :param exception: accepted but not used by this function.
    :returns: True when the character is allowed in its context; False
        otherwise, including for characters no rule here covers.
    """
    cp_value = ord(label[pos])
    last_index = len(label) - 1

    # U+00B7 must sit between two U+006C characters.
    if cp_value == 0x00b7:
        return (0 < pos < last_index
                and ord(label[pos - 1]) == 0x006c
                and ord(label[pos + 1]) == 0x006c)

    # U+0375 must be followed by a Greek character.
    if cp_value == 0x0375:
        if pos < last_index and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    # U+05F3 and U+05F4 must be preceded by a Hebrew character.
    if cp_value in (0x05f3, 0x05f4):
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    # U+30FB needs a Hiragana, Katakana or Han character somewhere in the label.
    if cp_value == 0x30fb:
        return any(
            cp != '\u30fb' and (
                _is_script(cp, 'Hiragana')
                or _is_script(cp, 'Katakana')
                or _is_script(cp, 'Han')
            )
            for cp in label
        )

    # Digits U+0660..U+0669 and U+06F0..U+06F9 must not share a label.
    if 0x660 <= cp_value <= 0x669:
        return not any(0x6f0 <= ord(cp) <= 0x06f9 for cp in label)

    if 0x6f0 <= cp_value <= 0x6f9:
        return not any(0x660 <= ord(cp) <= 0x0669 for cp in label)

    return False

222 

223 

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first problem found.

    Bytes input is decoded as UTF-8 first. The label is then checked for
    NFC form, hyphen placement and an initial combining mark; every
    codepoint is checked against the PVALID / CONTEXTJ / CONTEXTO classes;
    finally the Bidi rules are applied.

    :param label: the label to validate.
    :raises IDNAError: for an empty label, for failures reported by the
        helper checks, or when a codepoint next to a joiner is unknown to
        ``unicodedata``.
    :raises InvalidCodepointContext: when a CONTEXTJ or CONTEXTO codepoint
        is not allowed at its position.
    :raises InvalidCodepoint: when a codepoint belongs to none of the
        permitted classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if len(label) == 0:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for (pos, cp) in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
            # Keep only the lookup inside the try. IDNAError derives from
            # UnicodeError, which is a ValueError, so raising
            # InvalidCodepointContext inside this try would be caught by the
            # handler below and reported with the wrong message.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label)))
            if not joiner_ok:
                raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label)))
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(cp_value), pos+1, repr(label)))
        else:
            raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(cp_value), pos+1, repr(label)))

    check_bidi(label)

253 

254 

def alabel(label: str) -> bytes:
    """Convert a single label to its A-label (ASCII bytes) form.

    A label that encodes as ASCII is validated through ``ulabel`` and
    returned as bytes. Any other label is validated with ``check_label``,
    Punycode-encoded and prefixed with ``_alabel_prefix``.

    :raises IDNAError: if the label is empty, fails validation, or the
        result is rejected by ``valid_label_length``.
    """
    try:
        ascii_form = label.encode('ascii')
        ulabel(ascii_form)
        if valid_label_length(ascii_form):
            return ascii_form
        raise IDNAError('Label too long')
    except UnicodeEncodeError:
        # Not pure ASCII: fall through to the Punycode path.
        pass

    if not label:
        raise IDNAError('No Input')

    label = str(label)
    check_label(label)
    encoded = _alabel_prefix + _punycode(label)

    if not valid_label_length(encoded):
        raise IDNAError('Label too long')

    return encoded

277 

278 

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert a single label to its U-label (text) form.

    A non-ASCII ``str`` is validated and returned unchanged. Otherwise the
    ASCII bytes are lower-cased; a label without ``_alabel_prefix`` is
    validated and returned as text, while a prefixed label has the prefix
    stripped, is Punycode-decoded and then validated.

    :raises IDNAError: for a malformed A-label or a failed validation.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = label
    else:
        try:
            raw = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode('ascii')

    raw = raw[len(_alabel_prefix):]
    if not raw:
        raise IDNAError('Malformed A-label, no Punycode eligible content found')
    if raw.decode('ascii')[-1] == '-':
        raise IDNAError('A-label must not end with a hyphen')

    try:
        label = raw.decode('punycode')
    except UnicodeError:
        raise IDNAError('Invalid A-label')
    check_label(label)
    return label

306 

307 

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map every character of *domain* using the UTS46 table.

    :param domain: the text to remap.
    :param std3_rules: when false, status ``'3'`` rows are kept or mapped
        instead of being rejected.
    :param transitional: when true, status ``'D'`` rows are mapped to their
        replacement instead of being kept.
    :returns: the remapped text, NFC-normalized.
    :raises InvalidCodepoint: for a character the table does not allow.
    """
    from .uts46data import uts46data
    pieces = []

    for index, char in enumerate(domain):
        code_point = ord(char)
        try:
            # Codepoints below 256 are indexed directly; the rest are found
            # by bisecting the table.
            if code_point < 256:
                row = uts46data[code_point]
            else:
                row = uts46data[bisect.bisect_left(uts46data, (code_point, 'Z')) - 1]
            status = row[1]
            replacement = row[2] if len(row) == 3 else None

            keep_as_is = (
                status == 'V'
                or (status == 'D' and not transitional)
                or (status == '3' and not std3_rules and replacement is None)
            )
            use_replacement = replacement is not None and (
                status == 'M'
                or (status == '3' and not std3_rules)
                or (status == 'D' and transitional)
            )

            if keep_as_is:
                pieces.append(char)
            elif use_replacement:
                pieces.append(replacement)
            elif status != 'I':
                # Anything else except status 'I' is rejected via the handler.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                'Codepoint {} not allowed at position {} in {}'.format(
                    _unot(code_point), index + 1, repr(domain)))

    return unicodedata.normalize('NFC', ''.join(pieces))

338 

339 

def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes:
    """Encode a domain name into its A-label (ASCII bytes) form.

    :param s: the domain; bytes input must be ASCII.
    :param strict: when true, split labels on ``'.'`` only rather than on
        every separator in ``_unicode_dots_re``.
    :param uts46: when true, run ``uts46_remap`` before splitting.
    :param std3_rules: forwarded to ``uts46_remap``.
    :param transitional: forwarded to ``uts46_remap``.
    :returns: the encoded labels joined with ``b'.'``; a trailing dot in the
        input is preserved.
    :raises IDNAError: for non-ASCII bytes input, an empty domain or label,
        an over-long result, or any per-label failure.
    """
    if not isinstance(s, str):
        try:
            s = str(s, 'ascii')
        except UnicodeDecodeError:
            raise IDNAError('should pass a unicode string to the function rather than a byte string.')
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the input ended with a separator.
    trailing_dot = labels[-1] == ''
    if trailing_dot:
        del labels[-1]

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError('Empty label')
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b'')

    domain = b'.'.join(encoded_labels)
    if not valid_string_length(domain, trailing_dot):
        raise IDNAError('Domain too long')
    return domain

371 

372 

def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str:
    """Decode a domain name into its U-label (text) form.

    :param s: the domain; bytes input must be ASCII.
    :param strict: when true, split labels on ``'.'`` only rather than on
        every separator in ``_unicode_dots_re``.
    :param uts46: when true, run ``uts46_remap`` (non-transitional) first.
    :param std3_rules: forwarded to ``uts46_remap``.
    :returns: the decoded labels joined with ``'.'``; a trailing dot in the
        input is preserved.
    :raises IDNAError: for non-ASCII bytes input, an empty domain or label,
        or any per-label failure.
    """
    try:
        if not isinstance(s, str):
            s = str(s, 'ascii')
    except UnicodeDecodeError:
        raise IDNAError('Invalid ASCII in A-label')
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the input ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError('Empty label')
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append('')
    return '.'.join(decoded_labels)