Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/idna/core.py: 12%

Shortcuts on this page

r m x   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

292 statements  

1from . import idnadata 

2import bisect 

3import unicodedata 

4import re 

5from typing import Union, Optional 

6from .intranges import intranges_contain 

7 

# Canonical combining class value that valid_contextj() compares against to
# detect a virama immediately before a joiner.
_virama_combining_class = 9
# Byte prefix that alabel() prepends to, and ulabel() strips from,
# Punycode-encoded labels.
_alabel_prefix = b'xn--'
# Label separators accepted by encode()/decode() when strict is False:
# U+002E, U+3002, U+FF0E and U+FF61.
_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')

11 

class IDNAError(UnicodeError):
    """Root of the exception hierarchy for every IDNA-encoding failure."""

15 

16 

class IDNABidiError(IDNAError):
    """Raised when a label does not satisfy the bidirectional requirements."""

20 

21 

class InvalidCodepoint(IDNAError):
    """Raised when a label uses a disallowed or unallocated codepoint."""

25 

26 

class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not valid in the context where it appears."""

30 

31 

32def _combining_class(cp: int) -> int: 

33 v = unicodedata.combining(chr(cp)) 

34 if v == 0: 

35 if not unicodedata.name(chr(cp)): 

36 raise ValueError('Unknown character in unicodedata') 

37 return v 

38 

def _is_script(cp: str, script: str) -> bool:
    """Tell whether the character *cp* falls in the ranges listed for *script*.

    *script* is used as a key into ``idnadata.scripts``.
    """
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)

41 

42def _punycode(s: str) -> bytes: 

43 return s.encode('punycode') 

44 

45def _unot(s: int) -> str: 

46 return 'U+{:04X}'.format(s) 

47 

48 

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when *label* is no longer than 63 items, False otherwise."""
    return len(label) <= 63

53 

54 

def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when *label* fits within the overall length limit.

    The limit is 253 items, or 254 when *trailing_dot* is true.
    """
    limit = 254 if trailing_dot else 253
    return len(label) <= limit

59 

60 

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate *label* against the six Bidi rules.

    The rules are applied only when the label contains a character with
    directionality R, AL or AN, or when *check_ltr* is true; otherwise the
    label is accepted as soon as every character is known to have a
    directionality.

    Returns:
        True when the label passes (or the rules do not apply).

    Raises:
        IDNABidiError: if a character has no known directionality or any
            of the rules is violated.
    """
    # First pass: every character needs a known directionality, and we
    # note whether any of them makes this a bidirectional label.
    has_rtl = False
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), position))
        if bidi_class in ('R', 'AL', 'AN'):
            has_rtl = True
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character fixes the direction of the label.
    first_class = unicodedata.bidirectional(label[0])
    if first_class == 'L':
        rtl = False
    elif first_class in ('R', 'AL'):
        rtl = True
    else:
        raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))

    if rtl:
        # Bidi rules 2 and 3
        permitted = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        terminal = ('R', 'AL', 'EN', 'AN')
        bad_direction = 'Invalid direction for codepoint at position {} in a right-to-left label'
    else:
        # Bidi rules 5 and 6
        permitted = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')
        terminal = ('L', 'EN')
        bad_direction = 'Invalid direction for codepoint at position {} in a left-to-right label'

    valid_ending = False
    number_type = None  # type: Optional[str]
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class not in permitted:
            raise IDNABidiError(bad_direction.format(position))

        # Trailing NSM characters do not change whether the ending is valid.
        if bidi_class in terminal:
            valid_ending = True
        elif bidi_class != 'NSM':
            valid_ending = False

        # Bidi rule 4: a right-to-left label may not mix AN and EN numerals.
        if rtl and bidi_class in ('AN', 'EN'):
            if not number_type:
                number_type = bidi_class
            elif number_type != bidi_class:
                raise IDNABidiError('Can not mix numeral types in a right-to-left label')

    if not valid_ending:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True

118 

119 

def check_initial_combiner(label: str) -> bool:
    """Reject a label whose first character is in a Mark (``M*``) category.

    Returns:
        True when the first character is acceptable.

    Raises:
        IDNAError: if the label starts with a combining character.
    """
    first_category = unicodedata.category(label[0])
    if first_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True

124 

125 

def check_hyphen_ok(label: str) -> bool:
    """Check the hyphen placement restrictions on *label*.

    Returns:
        True when the label passes both checks.

    Raises:
        IDNAError: if the 3rd and 4th characters are both hyphens, or the
            label starts or ends with a hyphen.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    if '-' in (label[0], label[-1]):
        raise IDNAError('Label must not start or end with a hyphen')
    return True

132 

133 

def check_nfc(label: str) -> None:
    """Ensure *label* is already in Normalization Form C.

    Raises:
        IDNAError: if NFC-normalizing the label would change it.
    """
    normalized = unicodedata.normalize('NFC', label)
    if label == normalized:
        return
    raise IDNAError('Label must be in Normalization Form C')

137 

138 

def valid_contextj(label: str, pos: int) -> bool:
    """Decide whether the joiner at ``label[pos]`` is allowed in its context.

    Only U+200C and U+200D are handled; any other code point yields False.
    Both are accepted when directly preceded by a character whose combining
    class equals ``_virama_combining_class``. U+200C is additionally
    accepted when, skipping characters of joining type T, the nearest
    character before it has joining type L or D and the nearest character
    after it has joining type R or D.

    Raises:
        ValueError: propagated from ``_combining_class`` for the character
            preceding the joiner.
    """
    code_point = ord(label[pos])
    if code_point != 0x200c and code_point != 0x200d:
        return False

    # Rule shared by both joiners: a virama immediately before is enough.
    if pos > 0 and _combining_class(ord(label[pos - 1])) == _virama_combining_class:
        return True
    if code_point == 0x200d:
        return False

    transparent = ord('T')

    def _nearest_joins(indices, accepted) -> bool:
        # Walk away from the joiner, skipping transparent characters; the
        # first non-transparent character decides the outcome.
        for index in indices:
            joining_type = idnadata.joining_types.get(ord(label[index]))
            if joining_type == transparent:
                continue
            return joining_type in accepted
        return False

    if not _nearest_joins(range(pos - 1, -1, -1), (ord('L'), ord('D'))):
        return False
    return _nearest_joins(range(pos + 1, len(label)), (ord('R'), ord('D')))

184 

185 

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Decide whether the code point at ``label[pos]`` is allowed in context.

    Handled code points and their conditions:

    * U+00B7: must sit between two U+006C characters.
    * U+0375: the following character must be in the Greek script.
    * U+05F3 / U+05F4: the preceding character must be in the Hebrew script.
    * U+30FB: some other character of the label must be Hiragana, Katakana
      or Han.
    * U+0660..U+0669: the label must contain no U+06F0..U+06F9, and vice
      versa.

    Any other code point yields False. The *exception* argument is accepted
    but not used.
    """
    code_point = ord(label[pos])

    if code_point == 0x00b7:
        if 0 < pos < len(label)-1:
            before, after = ord(label[pos - 1]), ord(label[pos + 1])
            if before == 0x006c and after == 0x006c:
                return True
        return False

    if code_point == 0x0375:
        if pos < len(label)-1 and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    if code_point in (0x05f3, 0x05f4):
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    if code_point == 0x30fb:
        return any(
            _is_script(char, script)
            for char in label
            if char != '\u30fb'
            for script in ('Hiragana', 'Katakana', 'Han')
        )

    if 0x660 <= code_point <= 0x669:
        return not any(0x6f0 <= ord(char) <= 0x06f9 for char in label)

    if 0x6f0 <= code_point <= 0x6f9:
        return not any(0x660 <= ord(char) <= 0x0669 for char in label)

    return False

226 

227 

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first problem found.

    Byte input is decoded as UTF-8 first. The label must be non-empty and
    pass ``check_nfc``, ``check_hyphen_ok`` and ``check_initial_combiner``.
    Each code point must then be PVALID, or CONTEXTJ/CONTEXTO and valid in
    its context. Finally the label must pass ``check_bidi``.

    Raises:
        IDNAError: for an empty label, a failed structural check, or when
            the character next to a joiner is unknown to ``unicodedata``.
        InvalidCodepointContext: for a CONTEXTJ/CONTEXTO code point used in
            a position where it is not allowed.
        InvalidCodepoint: for a code point in none of the permitted classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if len(label) == 0:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for (pos, cp) in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
            # Only the lookup belongs inside the try. InvalidCodepointContext
            # derives from UnicodeError, which is itself a ValueError, so
            # raising it inside the try would be caught by the handler below
            # and reported as the wrong error.
            try:
                joiner_ok = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label)))
            if not joiner_ok:
                raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label)))
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(cp_value), pos+1, repr(label)))
        else:
            raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(cp_value), pos+1, repr(label)))

    check_bidi(label)

257 

258 

def alabel(label: str) -> bytes:
    """Convert *label* to its ASCII (A-label) byte form.

    A label that is already pure ASCII is validated through ``ulabel`` and
    returned as bytes. Any other label is validated with ``check_label``
    and returned as ``_alabel_prefix`` followed by its Punycode encoding.

    Raises:
        IDNAError: if the resulting label is too long, or validation fails.
    """
    def _length_checked(encoded: bytes) -> bytes:
        # Both paths apply the same length limit to their result.
        if valid_label_length(encoded):
            return encoded
        raise IDNAError('Label too long')

    try:
        ascii_label = label.encode('ascii')
        ulabel(ascii_label)
        return _length_checked(ascii_label)
    except UnicodeEncodeError:
        # Not pure ASCII: fall through to the Punycode path.
        pass

    check_label(label)
    return _length_checked(_alabel_prefix + _punycode(label))

276 

277 

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert *label* to its Unicode (U-label) string form.

    A string that is not pure ASCII is validated and returned unchanged.
    ASCII input is lower-cased. If it lacks the ``_alabel_prefix`` it is
    validated and returned as text; otherwise the remainder is
    Punycode-decoded, validated and returned.

    Raises:
        IDNAError: if the A-label has no content after the prefix, ends
            with a hyphen, is not valid Punycode, or fails validation.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = label
    else:
        try:
            raw = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        check_label(raw)
        return raw.decode('ascii')

    payload = raw[len(_alabel_prefix):]
    if not payload:
        raise IDNAError('Malformed A-label, no Punycode eligible content found')
    if payload.decode('ascii')[-1] == '-':
        raise IDNAError('A-label must not end with a hyphen')

    try:
        decoded = payload.decode('punycode')
    except UnicodeError:
        raise IDNAError('Invalid A-label')
    check_label(decoded)
    return decoded

305 

306 

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in the ``uts46data`` table. Depending on
    the row's status and the *std3_rules* / *transitional* flags it is
    kept, replaced by the row's mapping, or dropped (status ``'I'``). The
    result is NFC-normalized.

    Raises:
        InvalidCodepoint: if a character has no table row or its status
            does not permit it under the given flags.
    """
    from .uts46data import uts46data

    def _not_allowed(code_point: int, pos: int) -> InvalidCodepoint:
        return InvalidCodepoint(
            'Codepoint {} not allowed at position {} in {}'.format(
                _unot(code_point), pos + 1, repr(domain)))

    # Collect pieces and join once, rather than growing a string with +=.
    pieces = []
    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            # Code points below 256 index the table directly; others are
            # located by bisecting for the row that starts at or before them.
            uts46row = uts46data[code_point if code_point < 256 else
                bisect.bisect_left(uts46data, (code_point, 'Z')) - 1]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None  # type: Optional[str]
        except IndexError:
            raise _not_allowed(code_point, pos)

        if (status == 'V' or
                (status == 'D' and not transitional) or
                (status == '3' and not std3_rules and replacement is None)):
            pieces.append(char)
        elif replacement is not None and (status == 'M' or
                (status == '3' and not std3_rules) or
                (status == 'D' and transitional)):
            pieces.append(replacement)
        elif status != 'I':
            raise _not_allowed(code_point, pos)

    return unicodedata.normalize('NFC', ''.join(pieces))

337 

338 

def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes:
    """Encode a domain name into its ASCII byte form, label by label.

    Non-string input is decoded as ASCII first. With *uts46* the domain is
    passed through ``uts46_remap``. Labels are split on ``'.'`` only when
    *strict* is true, otherwise on any separator in ``_unicode_dots_re``.
    A trailing separator is preserved in the result.

    Raises:
        IDNAError: for non-ASCII byte input, an empty domain or label, an
            over-long domain, or any label that fails to encode.
    """
    domain = s
    if not isinstance(domain, str):
        try:
            domain = str(domain, 'ascii')
        except UnicodeDecodeError:
            raise IDNAError('should pass a unicode string to the function rather than a byte string.')
    if uts46:
        domain = uts46_remap(domain, std3_rules, transitional)

    labels = domain.split('.') if strict else _unicode_dots_re.split(domain)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the domain ended with a separator.
    trailing_dot = labels[-1] == ''
    if trailing_dot:
        labels.pop()

    encoded_labels = []
    for label in labels:
        encoded_label = alabel(label)
        if not encoded_label:
            raise IDNAError('Empty label')
        encoded_labels.append(encoded_label)
    if trailing_dot:
        encoded_labels.append(b'')

    encoded_domain = b'.'.join(encoded_labels)
    if not valid_string_length(encoded_domain, trailing_dot):
        raise IDNAError('Domain too long')
    return encoded_domain

370 

371 

def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str:
    """Decode a domain name into its Unicode string form, label by label.

    Non-string input is decoded as ASCII first. With *uts46* the domain is
    passed through ``uts46_remap`` (non-transitional). Labels are split on
    ``'.'`` only when *strict* is true, otherwise on any separator in
    ``_unicode_dots_re``. A trailing separator is preserved in the result.

    Raises:
        IDNAError: for non-ASCII byte input, an empty domain or label, or
            any label that fails to decode.
    """
    domain = s
    try:
        if not isinstance(domain, str):
            domain = str(domain, 'ascii')
    except UnicodeDecodeError:
        raise IDNAError('Invalid ASCII in A-label')
    if uts46:
        domain = uts46_remap(domain, std3_rules, False)

    labels = domain.split('.') if strict else _unicode_dots_re.split(domain)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the domain ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        labels.pop()

    decoded_labels = []
    for label in labels:
        decoded_label = ulabel(label)
        if not decoded_label:
            raise IDNAError('Empty label')
        decoded_labels.append(decoded_label)
    if trailing_dot:
        decoded_labels.append('')
    return '.'.join(decoded_labels)