Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/idna/core.py: 12%

Shortcuts on this page

r m x toggle line displays

j k next/prev highlighted chunk

0 (zero) top of page

1 (one) first highlighted chunk

292 statements

1from . import idnadata

2import bisect

3import unicodedata

4import re

5from typing import Union, Optional

6from .intranges import intranges_contain

8_virama_combining_class = 9

9_alabel_prefix = b'xn--'

10_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')

class IDNAError(UnicodeError):
    """Root of the exception hierarchy for IDNA encoding/decoding failures."""

class IDNABidiError(IDNAError):
    """Raised when a label violates the bidirectional-text requirements."""

class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""

class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not valid in the context where it appears."""

32def _combining_class(cp: int) -> int:

33 v = unicodedata.combining(chr(cp))

34 if v == 0:

35 if not unicodedata.name(chr(cp)):

36 raise ValueError('Unknown character in unicodedata')

37 return v

def _is_script(cp: str, script: str) -> bool:
    """Tell whether character *cp* lies in the ranges of ``idnadata.scripts[script]``."""
    codepoint = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(codepoint, script_ranges)

42def _punycode(s: str) -> bytes:

43 return s.encode('punycode')

45def _unot(s: int) -> str:

46 return 'U+{:04X}'.format(s)

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return True when *label* has at most 63 elements, False otherwise."""
    return len(label) <= 63

def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return True when *label* fits the overall length limit.

    The limit is 254 elements when *trailing_dot* is true and 253 otherwise.
    """
    max_length = 254 if trailing_dot else 253
    return len(label) <= max_length

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate *label* against the six Bidi rules implemented below.

    Args:
        label: The label to check.
        check_ltr: When true, apply the rules even if the label contains no
            character of directionality R, AL or AN.

    Returns:
        True when the label passes (or when no check is required).

    Raises:
        IDNABidiError: If a character has no known directionality or one of
            the rules is violated.
    """
    # Resolve every character's directionality once, failing on the first
    # character unicodedata knows nothing about (the string likely comes from
    # a newer version of Unicode).
    directions = []
    for position, char in enumerate(label, 1):
        char_direction = unicodedata.bidirectional(char)
        if char_direction == '':
            raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), position))
        directions.append(char_direction)

    # Bidi rules should only be applied if string contains RTL characters
    has_rtl = any(d in ('R', 'AL', 'AN') for d in directions)
    if not (has_rtl or check_ltr):
        return True

    # Bidi rule 1
    first_direction = unicodedata.bidirectional(label[0])
    if first_direction in ('R', 'AL'):
        rtl = True
    elif first_direction == 'L':
        rtl = False
    else:
        raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))

    ends_validly = False
    if rtl:
        digit_kind = None  # type: Optional[str]
        for position, d in enumerate(directions, 1):
            # Bidi rule 2
            if d not in ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                raise IDNABidiError('Invalid direction for codepoint at position {} in a right-to-left label'.format(position))
            # Bidi rule 3: trailing NSM characters do not change the verdict
            if d in ('R', 'AL', 'EN', 'AN'):
                ends_validly = True
            elif d != 'NSM':
                ends_validly = False
            # Bidi rule 4
            if d in ('AN', 'EN'):
                if not digit_kind:
                    digit_kind = d
                elif digit_kind != d:
                    raise IDNABidiError('Can not mix numeral types in a right-to-left label')
    else:
        for position, d in enumerate(directions, 1):
            # Bidi rule 5
            if d not in ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM'):
                raise IDNABidiError('Invalid direction for codepoint at position {} in a left-to-right label'.format(position))
            # Bidi rule 6: trailing NSM characters do not change the verdict
            if d in ('L', 'EN'):
                ends_validly = True
            elif d != 'NSM':
                ends_validly = False

    if not ends_validly:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True

def check_initial_combiner(label: str) -> bool:
    """Reject labels whose first character is in a Unicode ``M*`` category.

    Returns True otherwise; raises IDNAError when the check fails.
    """
    leading_category = unicodedata.category(label[0])
    if leading_category[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True

def check_hyphen_ok(label: str) -> bool:
    """Check the hyphen placement rules for *label*.

    Raises IDNAError when positions 3 and 4 are both hyphens, or when the
    label starts or ends with a hyphen; returns True otherwise.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    if '-' in (label[0], label[-1]):
        raise IDNAError('Label must not start or end with a hyphen')
    return True

def check_nfc(label: str) -> None:
    """Raise IDNAError unless *label* equals its own NFC normalization."""
    normalized = unicodedata.normalize('NFC', label)
    if normalized != label:
        raise IDNAError('Label must be in Normalization Form C')

def valid_contextj(label: str, pos: int) -> bool:
    """Decide whether the joiner at ``label[pos]`` is allowed in its context.

    Only U+200C and U+200D are handled; any other codepoint yields False.

    Raises:
        ValueError: Propagated from ``_combining_class`` for the preceding
            codepoint.
    """
    zwnj, zwj = 0x200c, 0x200d
    joiner = ord(label[pos])
    if joiner != zwnj and joiner != zwj:
        return False

    # Either joiner is accepted directly after a virama-class codepoint.
    if pos > 0:
        preceding_class = _combining_class(ord(label[pos - 1]))
        if preceding_class == _virama_combining_class:
            return True

    if joiner == zwj:
        return False

    # U+200C only: scan left, skipping joining type T, for an L or D type.
    transparent = ord('T')
    before_ok = False
    for i in range(pos - 1, -1, -1):
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == transparent:
            continue
        before_ok = joining_type in (ord('L'), ord('D'))
        break
    if not before_ok:
        return False

    # ...then scan right, skipping joining type T, for an R or D type.
    for i in range(pos + 1, len(label)):
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == transparent:
            continue
        return joining_type in (ord('R'), ord('D'))
    return False

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Decide whether the codepoint at ``label[pos]`` is allowed in its context.

    Handles U+00B7, U+0375, U+05F3/U+05F4, U+30FB and the digit ranges
    U+0660-0669 and U+06F0-06F9; any other codepoint yields False.
    The *exception* argument is accepted but not used.
    """
    cp_value = ord(label[pos])
    last_index = len(label) - 1

    # U+00B7: must sit between two U+006C characters.
    if cp_value == 0x00b7:
        return (0 < pos < last_index
                and ord(label[pos - 1]) == 0x006c
                and ord(label[pos + 1]) == 0x006c)

    # U+0375: the following character must be Greek.
    if cp_value == 0x0375:
        if pos < last_index and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    # U+05F3 / U+05F4: the preceding character must be Hebrew.
    if cp_value in (0x05f3, 0x05f4):
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    # U+30FB: some other character must be Hiragana, Katakana or Han.
    if cp_value == 0x30fb:
        return any(
            _is_script(ch, 'Hiragana') or _is_script(ch, 'Katakana') or _is_script(ch, 'Han')
            for ch in label
            if ch != '\u30fb'
        )

    # The two digit ranges must not be mixed within one label.
    if 0x660 <= cp_value <= 0x669:
        return not any(0x6f0 <= ord(ch) <= 0x06f9 for ch in label)

    if 0x6f0 <= cp_value <= 0x6f9:
        return not any(0x660 <= ord(ch) <= 0x0669 for ch in label)

    return False

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first problem found.

    Bytes input is decoded as UTF-8 first.  The label is then checked with
    ``check_nfc``, ``check_hyphen_ok`` and ``check_initial_combiner``, every
    codepoint is classified against ``idnadata.codepoint_classes`` (PVALID,
    CONTEXTJ, CONTEXTO), and finally ``check_bidi`` is applied.

    Raises:
        IDNAError: For an empty label, for an unknown codepoint next to a
            joiner, or from one of the helper checks.
        InvalidCodepointContext: If a CONTEXTJ/CONTEXTO codepoint is not
            allowed at its position.
        InvalidCodepoint: If a codepoint belongs to none of the three classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if len(label) == 0:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for (pos, cp) in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
            # Only the lookup itself is guarded.  InvalidCodepointContext
            # derives from UnicodeError, which is a ValueError, so raising it
            # inside the try block would be caught below and replaced by the
            # misleading "Unknown codepoint adjacent to joiner" error.
            try:
                joiner_allowed = valid_contextj(label, pos)
            except ValueError:
                raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label)))
            if not joiner_allowed:
                raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label)))
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(cp_value), pos+1, repr(label)))
        else:
            raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(cp_value), pos+1, repr(label)))

    check_bidi(label)

def alabel(label: str) -> bytes:
    """Convert one label to its ASCII byte form.

    A label that is already ASCII is validated with ``ulabel`` and returned
    as bytes.  Any other label is validated with ``check_label`` and returned
    as ``_alabel_prefix`` followed by its Punycode encoding.

    Raises:
        IDNAError: If validation fails or the result exceeds the label
            length limit.
    """
    try:
        ascii_label = label.encode('ascii')
    except UnicodeEncodeError:
        # Not pure ASCII: fall through to the Punycode path below.
        pass
    else:
        ulabel(ascii_label)
        if not valid_label_length(ascii_label):
            raise IDNAError('Label too long')
        return ascii_label

    check_label(label)
    encoded = _alabel_prefix + _punycode(label)

    if not valid_label_length(encoded):
        raise IDNAError('Label too long')

    return encoded

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert one label to its Unicode string form.

    A non-ASCII ``str`` is validated with ``check_label`` and returned as is.
    Otherwise the label is lower-cased; if it starts with ``_alabel_prefix``
    the remainder is Punycode-decoded and validated, else the ASCII label
    itself is validated and returned.

    Raises:
        IDNAError: If the A-label is malformed or validation fails.
    """
    if isinstance(label, (bytes, bytearray)):
        raw = label
    else:
        try:
            raw = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label

    raw = raw.lower()
    if not raw.startswith(_alabel_prefix):
        # Plain ASCII label: validate and hand back as text.
        check_label(raw)
        return raw.decode('ascii')

    payload = raw[len(_alabel_prefix):]
    if not payload:
        raise IDNAError('Malformed A-label, no Punycode eligible content found')
    if payload.decode('ascii')[-1] == '-':
        raise IDNAError('A-label must not end with a hyphen')

    try:
        decoded = payload.decode('punycode')
    except UnicodeError:
        raise IDNAError('Invalid A-label')
    check_label(decoded)
    return decoded

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map each character of *domain* using the ``uts46data`` table.

    Depending on the row's status and the *std3_rules* / *transitional*
    flags, a character is kept, replaced by the row's mapping, or dropped
    (status ``'I'``).  The result is NFC-normalized before being returned.

    Raises:
        InvalidCodepoint: If a character is not allowed.
    """
    from .uts46data import uts46data
    pieces = []

    for pos, char in enumerate(domain):
        code_point = ord(char)
        try:
            # Codepoints below 256 index the table directly; the rest are
            # located by bisecting it.
            if code_point < 256:
                row_index = code_point
            else:
                row_index = bisect.bisect_left(uts46data, (code_point, 'Z')) - 1
            uts46row = uts46data[row_index]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None  # type: Optional[str]

            keep_as_is = (
                status == 'V'
                or (status == 'D' and not transitional)
                or (status == '3' and not std3_rules and replacement is None)
            )
            use_mapping = replacement is not None and (
                status == 'M'
                or (status == '3' and not std3_rules)
                or (status == 'D' and transitional)
            )

            if keep_as_is:
                pieces.append(char)
            elif use_mapping:
                pieces.append(replacement)
            elif status != 'I':
                # Routed through the handler below, like a failed lookup.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                'Codepoint {} not allowed at position {} in {}'.format(
                _unot(code_point), pos + 1, repr(domain)))

    return unicodedata.normalize('NFC', ''.join(pieces))

def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes:
    """Encode a domain name into its ASCII byte form, label by label.

    Args:
        s: The domain; non-``str`` input is first decoded as ASCII.
        strict: Split labels on ``'.'`` only; otherwise split on every
            separator matched by ``_unicode_dots_re``.
        uts46: Run ``uts46_remap`` on the input first.
        std3_rules: Passed to ``uts46_remap``.
        transitional: Passed to ``uts46_remap``.

    Returns:
        The labels converted by ``alabel`` and joined with ``b'.'``; a
        trailing dot in the input is preserved.

    Raises:
        IDNAError: For non-ASCII byte input, an empty domain or label, an
            invalid label, or an over-long result.
    """
    if not isinstance(s, str):
        try:
            s = str(s, 'ascii')
        except UnicodeDecodeError:
            raise IDNAError('should pass a unicode string to the function rather than a byte string.')
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the input ended with a separator.
    trailing_dot = False
    if labels[-1] == '':
        del labels[-1]
        trailing_dot = True

    result = []
    for label in labels:
        encoded_label = alabel(label)
        if not encoded_label:
            raise IDNAError('Empty label')
        result.append(encoded_label)
    if trailing_dot:
        result.append(b'')

    encoded_domain = b'.'.join(result)
    if not valid_string_length(encoded_domain, trailing_dot):
        raise IDNAError('Domain too long')
    return encoded_domain

def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str:
    """Decode a domain name into its Unicode string form, label by label.

    Args:
        s: The domain; non-``str`` input is first decoded as ASCII.
        strict: Split labels on ``'.'`` only; otherwise split on every
            separator matched by ``_unicode_dots_re``.
        uts46: Run ``uts46_remap`` (non-transitional) on the input first.
        std3_rules: Passed to ``uts46_remap``.

    Returns:
        The labels converted by ``ulabel`` and joined with ``'.'``; a
        trailing dot in the input is preserved.

    Raises:
        IDNAError: For non-ASCII byte input, an empty domain or label, or an
            invalid label.
    """
    if not isinstance(s, str):
        try:
            s = str(s, 'ascii')
        except UnicodeDecodeError:
            raise IDNAError('Invalid ASCII in A-label')
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the input ended with a separator.
    trailing_dot = False
    if not labels[-1]:
        del labels[-1]
        trailing_dot = True

    result = []
    for label in labels:
        decoded_label = ulabel(label)
        if not decoded_label:
            raise IDNAError('Empty label')
        result.append(decoded_label)
    if trailing_dot:
        result.append('')
    return '.'.join(result)