Coverage for /pythoncovmergedfiles/medio/medio/usr/local/lib/python3.8/site-packages/idna/core.py: 12%

1from . import idnadata

2import bisect

3import unicodedata

4import re

5from typing import Union, Optional

6from .intranges import intranges_contain

# Canonical combining class value that the joiner context checks compare against.
_virama_combining_class = 9

# ASCII prefix prepended when building an A-label and stripped when decoding one.
_alabel_prefix = b'xn--'

# Matches any single one of the four code points accepted as a label separator.
_unicode_dots_re = re.compile('[\u002e\u3002\uff0e\uff61]')

class IDNAError(UnicodeError):
    """Root of the exception hierarchy raised for IDNA processing failures."""

class IDNABidiError(IDNAError):
    """Raised when a label violates the bidirectional text requirements."""

class InvalidCodepoint(IDNAError):
    """Raised when a label contains a disallowed or unallocated codepoint."""

class InvalidCodepointContext(IDNAError):
    """Raised when a codepoint is not permitted in the context where it appears."""

def _combining_class(cp: int) -> int:
    """Return the canonical combining class of code point *cp*.

    A class of 0 is only trusted when ``unicodedata`` also has a name for the
    character; otherwise a ``ValueError`` is raised (``unicodedata.name``
    itself raises ``ValueError`` when no name exists and no default is given).
    """
    char = chr(cp)
    combining = unicodedata.combining(char)
    if combining == 0 and not unicodedata.name(char):
        raise ValueError('Unknown character in unicodedata')
    return combining

def _is_script(cp: str, script: str) -> bool:
    """Tell whether the single character *cp* lies in the ranges stored under
    ``idnadata.scripts[script]``."""
    code_point = ord(cp)
    script_ranges = idnadata.scripts[script]
    return intranges_contain(code_point, script_ranges)

def _punycode(s: str) -> bytes:
    """Encode *s* with the ``punycode`` codec and return the resulting bytes."""
    encoded = s.encode('punycode')
    return encoded

def _unot(s: int) -> str:
    """Format integer *s* as ``U+`` followed by at least four uppercase hex digits."""
    return 'U+' + format(s, '04X')

def valid_label_length(label: Union[bytes, str]) -> bool:
    """Return ``True`` when *label* has at most 63 elements, ``False`` otherwise."""
    return len(label) <= 63

def valid_string_length(label: Union[bytes, str], trailing_dot: bool) -> bool:
    """Return ``True`` when *label* fits the overall length limit.

    The limit is 254 when *trailing_dot* is truthy and 253 otherwise.
    """
    limit = 254 if trailing_dot else 253
    return len(label) <= limit

def check_bidi(label: str, check_ltr: bool = False) -> bool:
    """Validate *label* against the Bidi rules (numbered 1-6 in the comments).

    The rules are only enforced when the label contains a character whose
    bidirectional class is R, AL or AN, or when *check_ltr* is true.

    Returns ``True`` when the label passes (or the rules do not apply) and
    raises ``IDNABidiError`` on any violation, including a character for
    which ``unicodedata`` reports no bidirectional class.
    """
    # Classify every character once; the rule checks below reuse the results.
    directions = []
    for position, char in enumerate(label, 1):
        bidi_class = unicodedata.bidirectional(char)
        if bidi_class == '':
            # String likely comes from a newer version of Unicode
            raise IDNABidiError('Unknown directionality in label {} at position {}'.format(repr(label), position))
        directions.append(bidi_class)

    # Bidi rules should only be applied if string contains RTL characters
    contains_rtl = any(d in ('R', 'AL', 'AN') for d in directions)
    if not (contains_rtl or check_ltr):
        return True

    # Bidi rule 1: the first character decides the direction of the label.
    first = unicodedata.bidirectional(label[0])
    if first == 'L':
        rtl = False
    elif first in ('R', 'AL'):
        rtl = True
    else:
        raise IDNABidiError('First codepoint in label {} must be directionality L, R or AL'.format(repr(label)))

    if rtl:
        permitted = ('R', 'AL', 'AN', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')  # rule 2
        enders = ('R', 'AL', 'EN', 'AN')  # rule 3
    else:
        permitted = ('L', 'EN', 'ES', 'CS', 'ET', 'ON', 'BN', 'NSM')  # rule 5
        enders = ('L', 'EN')  # rule 6

    ends_ok = False
    numeral_kind = None  # type: Optional[str]
    for position, bidi_class in enumerate(directions, 1):
        if bidi_class not in permitted:
            if rtl:
                raise IDNABidiError('Invalid direction for codepoint at position {} in a right-to-left label'.format(position))
            raise IDNABidiError('Invalid direction for codepoint at position {} in a left-to-right label'.format(position))

        # Trailing NSM characters leave the verdict of the preceding one intact.
        if bidi_class in enders:
            ends_ok = True
        elif bidi_class != 'NSM':
            ends_ok = False

        # Bidi rule 4: a right-to-left label may use only one kind of numeral.
        if rtl and bidi_class in ('AN', 'EN'):
            if not numeral_kind:
                numeral_kind = bidi_class
            elif numeral_kind != bidi_class:
                raise IDNABidiError('Can not mix numeral types in a right-to-left label')

    if not ends_ok:
        raise IDNABidiError('Label ends with illegal codepoint directionality')

    return True

118

119

def check_initial_combiner(label: str) -> bool:
    """Return ``True`` unless the first character of *label* is in a Unicode
    general category starting with ``M``, in which case ``IDNAError`` is raised."""
    first_category = unicodedata.category(label[0])
    if first_category.startswith('M'):
        raise IDNAError('Label begins with an illegal combining character')
    return True

124

125

def check_hyphen_ok(label: str) -> bool:
    """Return ``True`` when the hyphens in *label* are acceptably placed.

    Raises ``IDNAError`` when the 3rd and 4th characters are both hyphens, or
    when the label starts or ends with a hyphen. The checks run in that order.
    """
    third_and_fourth = label[2:4]
    if third_and_fourth == '--':
        raise IDNAError('Label has disallowed hyphens in 3rd and 4th position')
    if '-' in (label[0], label[-1]):
        raise IDNAError('Label must not start or end with a hyphen')
    return True

132

133

def check_nfc(label: str) -> None:
    """Raise ``IDNAError`` unless *label* is unchanged by NFC normalization."""
    normalized = unicodedata.normalize('NFC', label)
    if label != normalized:
        raise IDNAError('Label must be in Normalization Form C')

137

138

def valid_contextj(label: str, pos: int) -> bool:
    """Check the contextual rule for the joiner character at ``label[pos]``.

    Only U+200C (ZERO WIDTH NON-JOINER) and U+200D (ZERO WIDTH JOINER) can be
    accepted; any other code point yields ``False``.

    Both joiners are accepted when the preceding character has the virama
    combining class. U+200C is additionally accepted when its neighbourhood
    matches::

        (Joining_Type L or D)(Joining_Type T)* U+200C (Joining_Type T)*(Joining_Type R or D)

    i.e. only transparent (T) characters may sit between the joiner and the
    joining characters on either side. Each scan stops at the first character
    that is not transparent: scanning past it would accept labels that do not
    match the pattern above, and would also make every scan walk the whole
    label.

    :param label: the label being validated.
    :param pos: index of the joiner within *label*.
    :returns: ``True`` if the joiner is allowed at that position.
    :raises ValueError: propagated from ``_combining_class`` when the
        preceding character is unknown to ``unicodedata``.
    """
    cp_value = ord(label[pos])

    if cp_value not in (0x200c, 0x200d):
        return False

    # Shared by both joiners: a preceding virama always makes the joiner valid.
    if pos > 0:
        if _combining_class(ord(label[pos - 1])) == _virama_combining_class:
            return True

    if cp_value == 0x200d:
        return False

    # U+200C: walk left over transparent characters to the nearest other one.
    for i in range(pos - 1, -1, -1):
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == ord('T'):
            continue
        if joining_type not in (ord('L'), ord('D')):
            return False
        break
    else:
        # Nothing but transparent characters (or nothing at all) to the left.
        return False

    # Same walk to the right; the first non-transparent character decides.
    for i in range(pos + 1, len(label)):
        joining_type = idnadata.joining_types.get(ord(label[i]))
        if joining_type == ord('T'):
            continue
        return joining_type in (ord('R'), ord('D'))

    return False

def valid_contexto(label: str, pos: int, exception: bool = False) -> bool:
    """Check the contextual rule for the character at ``label[pos]``.

    Rules applied, keyed on the code point at *pos*:

    * U+00B7: must sit directly between two ``l`` (U+006C) characters.
    * U+0375: the following character must be in the Greek script.
    * U+05F3 / U+05F4: the preceding character must be in the Hebrew script.
    * U+30FB: some other character of the label must be Hiragana, Katakana
      or Han.
    * U+0660..U+0669: the label must contain no character in U+06F0..U+06F9.
    * U+06F0..U+06F9: the label must contain no character in U+0660..U+0669.

    Any other code point yields ``False``. The *exception* argument is
    accepted but not used.
    """
    code_point = ord(label[pos])
    last_index = len(label) - 1

    if code_point == 0x00b7:
        if not 0 < pos < last_index:
            return False
        return label[pos - 1] == '\u006c' and label[pos + 1] == '\u006c'

    if code_point == 0x0375:
        if pos < last_index and len(label) > 1:
            return _is_script(label[pos + 1], 'Greek')
        return False

    if code_point in (0x05f3, 0x05f4):
        if pos > 0:
            return _is_script(label[pos - 1], 'Hebrew')
        return False

    if code_point == 0x30fb:
        return any(
            char != '\u30fb' and (
                _is_script(char, 'Hiragana')
                or _is_script(char, 'Katakana')
                or _is_script(char, 'Han')
            )
            for char in label
        )

    if 0x660 <= code_point <= 0x669:
        return not any(0x6f0 <= ord(char) <= 0x06f9 for char in label)

    if 0x6f0 <= code_point <= 0x6f9:
        return not any(0x660 <= ord(char) <= 0x0669 for char in label)

    return False

222

223

def check_label(label: Union[str, bytes, bytearray]) -> None:
    """Validate a single label, raising on the first problem found.

    ``bytes``/``bytearray`` input is decoded as UTF-8 first. The label must be
    non-empty and pass ``check_nfc``, ``check_hyphen_ok`` and
    ``check_initial_combiner``. Each code point must then be in the PVALID
    class, or be a CONTEXTJ / CONTEXTO code point that satisfies its
    contextual rule. Finally ``check_bidi`` is applied.

    :raises IDNAError: empty label, a failed basic check, or a joiner whose
        neighbouring code point is unknown to ``unicodedata``.
    :raises InvalidCodepointContext: a CONTEXTJ / CONTEXTO code point whose
        contextual rule is not satisfied.
    :raises InvalidCodepoint: a code point in none of the three classes.
    """
    if isinstance(label, (bytes, bytearray)):
        label = label.decode('utf-8')
    if len(label) == 0:
        raise IDNAError('Empty Label')

    check_nfc(label)
    check_hyphen_ok(label)
    check_initial_combiner(label)

    for (pos, cp) in enumerate(label):
        cp_value = ord(cp)
        if intranges_contain(cp_value, idnadata.codepoint_classes['PVALID']):
            continue
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTJ']):
            # Only the rule evaluation belongs inside the try: the IDNA
            # exceptions derive from UnicodeError, which is itself a
            # ValueError, so raising InvalidCodepointContext within the try
            # would be caught by the handler below and reported with the
            # wrong type and message.
            try:
                joiner_allowed = valid_contextj(label, pos)
            except ValueError as err:
                raise IDNAError('Unknown codepoint adjacent to joiner {} at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label))) from err
            if not joiner_allowed:
                raise InvalidCodepointContext('Joiner {} not allowed at position {} in {}'.format(
                    _unot(cp_value), pos+1, repr(label)))
        elif intranges_contain(cp_value, idnadata.codepoint_classes['CONTEXTO']):
            if not valid_contexto(label, pos):
                raise InvalidCodepointContext('Codepoint {} not allowed at position {} in {}'.format(_unot(cp_value), pos+1, repr(label)))
        else:
            raise InvalidCodepoint('Codepoint {} at position {} of {} not allowed'.format(_unot(cp_value), pos+1, repr(label)))

    check_bidi(label)

253

254

def alabel(label: str) -> bytes:
    """Convert a single label to its A-label (ASCII) form.

    A label that is already pure ASCII is validated through ``ulabel`` and
    returned as ASCII bytes. Any other label is validated with
    ``check_label``, Punycode-encoded and given the ``xn--`` prefix.

    :raises IDNAError: if the label is invalid or the result is too long.
    """
    try:
        ascii_form = label.encode('ascii')
        ulabel(ascii_form)
        if valid_label_length(ascii_form):
            return ascii_form
        raise IDNAError('Label too long')
    except UnicodeEncodeError:
        # Not pure ASCII: fall through to the Punycode path below.
        pass

    if not label:
        raise IDNAError('No Input')

    label = str(label)
    check_label(label)
    prefixed = _alabel_prefix + _punycode(label)

    if valid_label_length(prefixed):
        return prefixed
    raise IDNAError('Label too long')

277

278

def ulabel(label: Union[str, bytes, bytearray]) -> str:
    """Convert a single label to its U-label (Unicode) form.

    * A ``str`` that is not pure ASCII is validated with ``check_label`` and
      returned unchanged.
    * ASCII input (``str`` or bytes-like) is lower-cased. Without the ``xn--``
      prefix it is validated and returned as a ``str``. With the prefix, the
      remainder is Punycode-decoded, validated and returned.

    :raises IDNAError: if the label is invalid, the A-label has no payload,
        the payload ends with a hyphen, or it cannot be Punycode-decoded.
    """
    if not isinstance(label, (bytes, bytearray)):
        try:
            label_bytes = label.encode('ascii')
        except UnicodeEncodeError:
            check_label(label)
            return label
    else:
        label_bytes = label

    label_bytes = label_bytes.lower()
    if not label_bytes.startswith(_alabel_prefix):
        check_label(label_bytes)
        return label_bytes.decode('ascii')

    label_bytes = label_bytes[len(_alabel_prefix):]
    if not label_bytes:
        raise IDNAError('Malformed A-label, no Punycode eligible content found')
    # Compare at the bytes level: decoding to ASCII just to inspect the last
    # character would leak a raw UnicodeDecodeError for bytes input carrying
    # non-ASCII data. Such input now reaches the Punycode decode below and is
    # reported as an IDNAError like every other malformed A-label.
    if label_bytes.endswith(b'-'):
        raise IDNAError('A-label must not end with a hyphen')

    try:
        label = label_bytes.decode('punycode')
    except UnicodeError as err:
        raise IDNAError('Invalid A-label') from err
    check_label(label)
    return label

306

307

def uts46_remap(domain: str, std3_rules: bool = True, transitional: bool = False) -> str:
    """Re-map the characters in the string according to UTS46 processing.

    Each character is looked up in the ``uts46data`` table and, depending on
    the row's status code, kept, replaced by the row's mapping, dropped
    (status ``'I'``) or rejected. The assembled result is NFC-normalized.

    :raises InvalidCodepoint: for a character whose status does not allow it
        under the given *std3_rules* / *transitional* flags.
    """
    from .uts46data import uts46data

    pieces = []
    for index, char in enumerate(domain):
        code_point = ord(char)
        try:
            # The first 256 rows are indexed directly; beyond that the row is
            # located by bisection.
            if code_point < 256:
                row_index = code_point
            else:
                row_index = bisect.bisect_left(uts46data, (code_point, 'Z')) - 1
            row = uts46data[row_index]
            status = row[1]
            replacement = row[2] if len(row) == 3 else None  # type: Optional[str]

            keep_as_is = (
                status == 'V'
                or (status == 'D' and not transitional)
                or (status == '3' and not std3_rules and replacement is None)
            )
            use_mapping = replacement is not None and (
                status == 'M'
                or (status == '3' and not std3_rules)
                or (status == 'D' and transitional)
            )

            if keep_as_is:
                pieces.append(char)
            elif use_mapping:
                pieces.append(replacement)
            elif status != 'I':
                # Funnel disallowed statuses into the same handler as a
                # failed table lookup.
                raise IndexError()
        except IndexError:
            raise InvalidCodepoint(
                'Codepoint {} not allowed at position {} in {}'.format(
                    _unot(code_point), index + 1, repr(domain)))

    return unicodedata.normalize('NFC', ''.join(pieces))

338

339

def encode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False, transitional: bool = False) -> bytes:
    """Encode a domain name to its ASCII form, label by label.

    :param s: the domain; bytes-like input must be ASCII-decodable.
    :param strict: split labels on ``.`` only instead of on every separator
        matched by ``_unicode_dots_re``.
    :param uts46: run ``uts46_remap`` over the input first.
    :param std3_rules: forwarded to ``uts46_remap``.
    :param transitional: forwarded to ``uts46_remap``.
    :returns: the encoded labels joined with ``b'.'``; a trailing dot in the
        input is preserved.
    :raises IDNAError: for undecodable bytes, an empty domain or label, an
        invalid label, or a result that is too long.
    """
    if not isinstance(s, str):
        try:
            s = str(s, 'ascii')
        except UnicodeDecodeError:
            raise IDNAError('should pass a unicode string to the function rather than a byte string.')
    if uts46:
        s = uts46_remap(s, std3_rules, transitional)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the input ended with a separator.
    trailing_dot = labels[-1] == ''
    if trailing_dot:
        del labels[-1]

    encoded_labels = []
    for label in labels:
        encoded = alabel(label)
        if not encoded:
            raise IDNAError('Empty label')
        encoded_labels.append(encoded)
    if trailing_dot:
        encoded_labels.append(b'')

    domain = b'.'.join(encoded_labels)
    if not valid_string_length(domain, trailing_dot):
        raise IDNAError('Domain too long')
    return domain

371

372

def decode(s: Union[str, bytes, bytearray], strict: bool = False, uts46: bool = False, std3_rules: bool = False) -> str:
    """Decode a domain name to its Unicode form, label by label.

    :param s: the domain; bytes-like input must be ASCII-decodable.
    :param strict: split labels on ``.`` only instead of on every separator
        matched by ``_unicode_dots_re``.
    :param uts46: run ``uts46_remap`` (non-transitional) over the input first.
    :param std3_rules: forwarded to ``uts46_remap``.
    :returns: the decoded labels joined with ``'.'``; a trailing dot in the
        input is preserved.
    :raises IDNAError: for undecodable bytes, an empty domain or label, or an
        invalid label.
    """
    try:
        if not isinstance(s, str):
            s = str(s, 'ascii')
    except UnicodeDecodeError:
        raise IDNAError('Invalid ASCII in A-label')
    if uts46:
        s = uts46_remap(s, std3_rules, False)

    labels = s.split('.') if strict else _unicode_dots_re.split(s)
    if not labels or labels == ['']:
        raise IDNAError('Empty domain')

    # An empty final label means the input ended with a separator.
    trailing_dot = not labels[-1]
    if trailing_dot:
        del labels[-1]

    decoded_labels = []
    for label in labels:
        decoded = ulabel(label)
        if not decoded:
            raise IDNAError('Empty label')
        decoded_labels.append(decoded)
    if trailing_dot:
        decoded_labels.append('')

    return '.'.join(decoded_labels)

400 return '.'.join(result)