/rust/registry/src/index.crates.io-1949cf8c6b5b557f/idna-1.1.0/src/punycode.rs
Line | Count | Source |
1 | | // Copyright 2013 The rust-url developers. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
4 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
5 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
6 | | // option. This file may not be copied, modified, or distributed |
7 | | // except according to those terms. |
8 | | |
9 | | //! Punycode ([RFC 3492](http://tools.ietf.org/html/rfc3492)) implementation. |
10 | | //! |
11 | | //! Since Punycode fundamentally works on Unicode code points, |
12 | | //! `encode` and `decode` take and return slices and vectors of `char`. |
13 | | //! `encode_str` and `decode_to_string` provide convenience wrappers |
14 | | //! that convert from and to Rust’s UTF-8 based `str` and `String` types. |
15 | | |
16 | | use alloc::{string::String, vec::Vec}; |
17 | | use core::char; |
18 | | use core::fmt::Write; |
19 | | use core::marker::PhantomData; |
20 | | |
// Bootstring parameters for Punycode

// Number base of the variable-length integers: digit values run from 0 to BASE - 1.
const BASE: u32 = 36;
// Lower and upper clamp for the per-digit threshold `t`.
const T_MIN: u32 = 1;
const T_MAX: u32 = 26;
// Skew and damping factors used by `adapt` when recomputing the bias.
const SKEW: u32 = 38;
const DAMP: u32 = 700;
// Bias in effect before the first delta has been processed.
const INITIAL_BIAS: u32 = 72;
// Code point the encoder/decoder state starts from; everything below it is ASCII.
const INITIAL_N: u32 = 0x80;
29 | | |
30 | | #[inline] |
31 | 0 | fn adapt(mut delta: u32, num_points: u32, first_time: bool) -> u32 { |
32 | 0 | delta /= if first_time { DAMP } else { 2 }; |
33 | 0 | delta += delta / num_points; |
34 | 0 | let mut k = 0; |
35 | 0 | while delta > ((BASE - T_MIN) * T_MAX) / 2 { |
36 | 0 | delta /= BASE - T_MIN; |
37 | 0 | k += BASE; |
38 | 0 | } |
39 | 0 | k + (((BASE - T_MIN + 1) * delta) / (delta + SKEW)) |
40 | 0 | } |
41 | | |
42 | | /// Convert Punycode to an Unicode `String`. |
43 | | /// |
44 | | /// Return None on malformed input or overflow. |
45 | | /// Overflow can only happen on inputs that take more than |
46 | | /// 63 encoded bytes, the DNS limit on domain name labels. |
47 | | #[inline] |
48 | 0 | pub fn decode_to_string(input: &str) -> Option<String> { |
49 | | Some( |
50 | 0 | Decoder::default() |
51 | 0 | .decode::<u8, ExternalCaller>(input.as_bytes()) |
52 | 0 | .ok()? |
53 | 0 | .collect(), |
54 | | ) |
55 | 0 | } |
56 | | |
57 | | /// Convert Punycode to Unicode. |
58 | | /// |
59 | | /// Return None on malformed input or overflow. |
60 | | /// Overflow can only happen on inputs that take more than |
61 | | /// 63 encoded bytes, the DNS limit on domain name labels. |
62 | 0 | pub fn decode(input: &str) -> Option<Vec<char>> { |
63 | | Some( |
64 | 0 | Decoder::default() |
65 | 0 | .decode::<u8, ExternalCaller>(input.as_bytes()) |
66 | 0 | .ok()? |
67 | 0 | .collect(), |
68 | | ) |
69 | 0 | } |
70 | | |
/// Marker for internal vs. external caller to retain old API behavior
/// while tweaking behavior for internal callers.
///
/// External callers need overflow checks when encoding, but internal
/// callers don't, because `PUNYCODE_ENCODE_MAX_INPUT_LENGTH` is set
/// to 1000, and per RFC 3492 section 6.4, the integer variable does
/// not need to be able to represent values larger than
/// (char::MAX - INITIAL_N) * (PUNYCODE_ENCODE_MAX_INPUT_LENGTH + 1),
/// which is less than u32::MAX.
///
/// External callers need to handle upper-case ASCII when decoding,
/// but internal callers don't, because the internal code calls the
/// decoder only with lower-case inputs.
pub(crate) trait PunycodeCaller {
    /// `true` for callers coming through the public API, `false` for
    /// crate-internal callers. The encoder and decoder branch on this
    /// constant to pick the checks described above.
    const EXTERNAL_CALLER: bool;
}
87 | | |
/// Marker type selecting the crate-internal behavior of the codec
/// (`EXTERNAL_CALLER == false`).
pub(crate) struct InternalCaller;

impl PunycodeCaller for InternalCaller {
    const EXTERNAL_CALLER: bool = false;
}
93 | | |
/// Marker type selecting the public-API behavior of the codec
/// (`EXTERNAL_CALLER == true`); used by the `pub` functions in this module.
struct ExternalCaller;

impl PunycodeCaller for ExternalCaller {
    const EXTERNAL_CALLER: bool = true;
}
99 | | |
/// A unit of Punycode input that the decoder can consume.
///
/// Implemented in this module for `u8` and for `char`.
pub(crate) trait PunycodeCodeUnit {
    /// Whether this unit is the `-` delimiter that separates the basic
    /// code points from the encoded deltas.
    fn is_delimiter(&self) -> bool;
    /// Whether this unit is in the ASCII range.
    fn is_ascii(&self) -> bool;
    /// The Punycode digit value of this unit (letters map to 0..=25,
    /// decimal digits to 26..=35), or `None` if it is not a digit.
    fn digit(&self) -> Option<u32>;
    /// This unit as a `char`, unchanged.
    fn char(&self) -> char;
    /// This unit as a `char` for the internal-caller path; the `u8`
    /// implementation lower-cases ASCII, the `char` implementation
    /// returns the value as-is.
    fn char_ascii_lower_case(&self) -> char;
}
107 | | |
108 | | impl PunycodeCodeUnit for u8 { |
109 | 0 | fn is_delimiter(&self) -> bool { |
110 | 0 | *self == b'-' |
111 | 0 | } |
112 | 0 | fn is_ascii(&self) -> bool { |
113 | 0 | *self < 0x80 |
114 | 0 | } |
115 | 0 | fn digit(&self) -> Option<u32> { |
116 | 0 | let byte = *self; |
117 | 0 | Some(match byte { |
118 | 0 | byte @ b'0'..=b'9' => byte - b'0' + 26, |
119 | 0 | byte @ b'A'..=b'Z' => byte - b'A', |
120 | 0 | byte @ b'a'..=b'z' => byte - b'a', |
121 | 0 | _ => return None, |
122 | | } as u32) |
123 | 0 | } |
124 | 0 | fn char(&self) -> char { |
125 | 0 | char::from(*self) |
126 | 0 | } |
127 | 0 | fn char_ascii_lower_case(&self) -> char { |
128 | 0 | char::from(self.to_ascii_lowercase()) |
129 | 0 | } |
130 | | } |
131 | | |
132 | | impl PunycodeCodeUnit for char { |
133 | 0 | fn is_delimiter(&self) -> bool { |
134 | 0 | *self == '-' |
135 | 0 | } |
136 | 0 | fn is_ascii(&self) -> bool { |
137 | 0 | debug_assert!(false); // Unused |
138 | 0 | true |
139 | 0 | } |
140 | 0 | fn digit(&self) -> Option<u32> { |
141 | 0 | let byte = *self; |
142 | 0 | Some(match byte { |
143 | 0 | byte @ '0'..='9' => u32::from(byte) - u32::from('0') + 26, |
144 | | // byte @ 'A'..='Z' => u32::from(byte) - u32::from('A'), // XXX not needed if no public input |
145 | 0 | byte @ 'a'..='z' => u32::from(byte) - u32::from('a'), |
146 | 0 | _ => return None, |
147 | | }) |
148 | 0 | } |
149 | 0 | fn char(&self) -> char { |
150 | 0 | debug_assert!(false); // Unused |
151 | 0 | *self |
152 | 0 | } |
153 | 0 | fn char_ascii_lower_case(&self) -> char { |
154 | | // No need to actually lower-case! |
155 | 0 | *self |
156 | 0 | } |
157 | | } |
158 | | |
/// Reusable Punycode decoder state.
///
/// `decode` clears `insertions` on entry, so a single `Decoder` can be
/// used for several inputs one after another.
#[derive(Default)]
pub(crate) struct Decoder {
    // `(output index, code point)` pairs for the decoded non-basic code
    // points; the backing `SmallVec` has inline room for 59 entries.
    insertions: smallvec::SmallVec<[(usize, char); 59]>,
}
163 | | |
impl Decoder {
    /// Decode Punycode `input` and return an iterator over the decoded `char`s.
    ///
    /// The input is split at its last delimiter into the basic code points
    /// (yielded as-is by the returned iterator) and the encoded deltas. The
    /// deltas are decoded into `(position, char)` insertions stored in `self`;
    /// the returned `Decode` borrows them and merges them with the basic code
    /// points on the fly.
    ///
    /// Returns `Err(())` when:
    /// - the caller is external and a basic code point is not ASCII,
    /// - a unit in the delta part is not a Punycode digit,
    /// - the input ends in the middle of a delta,
    /// - the 32-bit arithmetic overflows, or
    /// - a decoded value is not a valid `char`.
    pub(crate) fn decode<'a, T: PunycodeCodeUnit + Copy, C: PunycodeCaller>(
        &'a mut self,
        input: &'a [T],
    ) -> Result<Decode<'a, T, C>, ()> {
        // Drop whatever a previous call left behind.
        self.insertions.clear();
        // Handle "basic" (ASCII) code points.
        // They are encoded as-is before the last delimiter, if any.
        let (base, input) = if let Some(position) = input.iter().rposition(|c| c.is_delimiter()) {
            (
                &input[..position],
                if position > 0 {
                    &input[position + 1..]
                } else {
                    // A delimiter at index 0 is left in the delta part, where it
                    // is rejected below because it is not a digit.
                    input
                },
            )
        } else {
            (&input[..0], input)
        };

        if C::EXTERNAL_CALLER && !base.iter().all(|c| c.is_ascii()) {
            return Err(());
        }

        let base_len = base.len();
        // NOTE(review): `as u32` truncates if `base_len` exceeds `u32::MAX`;
        // presumably inputs are never that long -- confirm against callers.
        let mut length = base_len as u32;
        let mut code_point = INITIAL_N;
        let mut bias = INITIAL_BIAS;
        let mut i = 0u32;
        let mut iter = input.iter();
        loop {
            let previous_i = i;
            let mut weight = 1;
            let mut k = BASE;
            // Running out of input between two deltas is the normal way to finish.
            let mut byte = match iter.next() {
                None => break,
                Some(byte) => byte,
            };

            // Decode a generalized variable-length integer into delta,
            // which gets added to i.
            loop {
                let digit = if let Some(digit) = byte.digit() {
                    digit
                } else {
                    return Err(());
                };
                let product = digit.checked_mul(weight).ok_or(())?;
                i = i.checked_add(product).ok_or(())?;
                // Threshold for this digit position, clamped to T_MIN..=T_MAX.
                let t = if k <= bias {
                    T_MIN
                } else if k >= bias + T_MAX {
                    T_MAX
                } else {
                    k - bias
                };
                // A digit below the threshold terminates the integer.
                if digit < t {
                    break;
                }
                weight = weight.checked_mul(BASE - t).ok_or(())?;
                k += BASE;
                byte = match iter.next() {
                    None => return Err(()), // End of input before the end of this delta
                    Some(byte) => byte,
                };
            }

            bias = adapt(i - previous_i, length + 1, previous_i == 0);

            // i was supposed to wrap around from length+1 to 0,
            // incrementing code_point each time.
            code_point = code_point.checked_add(i / (length + 1)).ok_or(())?;
            i %= length + 1;
            let c = match char::from_u32(code_point) {
                Some(c) => c,
                None => return Err(()),
            };

            // Move earlier insertions farther out in the string
            for (idx, _) in &mut self.insertions {
                if *idx >= i as usize {
                    *idx += 1;
                }
            }
            self.insertions.push((i as usize, c));
            length += 1;
            i += 1;
        }

        // The iterator consumes insertions front to back, so order them by position.
        self.insertions.sort_by_key(|(i, _)| *i);
        Ok(Decode {
            base: base.iter(),
            insertions: &self.insertions,
            inserted: 0,
            position: 0,
            len: base_len + self.insertions.len(),
            phantom: PhantomData::<C>,
        })
    }
}
266 | | |
/// Iterator over the decoded code points produced by `Decoder::decode`.
///
/// Yields the basic code points from the input interleaved with the
/// decoded insertions at their recorded output positions.
pub(crate) struct Decode<'a, T, C>
where
    T: PunycodeCodeUnit + Copy,
    C: PunycodeCaller,
{
    // Remaining basic code points, in input order.
    base: core::slice::Iter<'a, T>,
    // Decoded `(output position, char)` pairs, sorted by position.
    pub(crate) insertions: &'a [(usize, char)],
    // Number of entries of `insertions` already yielded.
    inserted: usize,
    // Output index of the next item to yield.
    position: usize,
    // Total number of items this iterator yields (basic + inserted).
    len: usize,
    // Carries the caller marker `C` without storing a value.
    phantom: PhantomData<C>,
}
279 | | |
280 | | impl<T: PunycodeCodeUnit + Copy, C: PunycodeCaller> Iterator for Decode<'_, T, C> { |
281 | | type Item = char; |
282 | | |
283 | 0 | fn next(&mut self) -> Option<Self::Item> { |
284 | | loop { |
285 | 0 | match self.insertions.get(self.inserted) { |
286 | 0 | Some((pos, c)) if *pos == self.position => { |
287 | 0 | self.inserted += 1; |
288 | 0 | self.position += 1; |
289 | 0 | return Some(*c); |
290 | | } |
291 | 0 | _ => {} |
292 | | } |
293 | 0 | if let Some(c) = self.base.next() { |
294 | 0 | self.position += 1; |
295 | 0 | return Some(if C::EXTERNAL_CALLER { |
296 | 0 | c.char() |
297 | | } else { |
298 | 0 | c.char_ascii_lower_case() |
299 | | }); |
300 | 0 | } else if self.inserted >= self.insertions.len() { |
301 | 0 | return None; |
302 | 0 | } |
303 | | } |
304 | 0 | } Unexecuted instantiation: <idna::punycode::Decode<char, idna::punycode::InternalCaller> as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <idna::punycode::Decode<u8, idna::punycode::ExternalCaller> as core::iter::traits::iterator::Iterator>::next Unexecuted instantiation: <idna::punycode::Decode<u8, idna::punycode::InternalCaller> as core::iter::traits::iterator::Iterator>::next |
305 | | |
306 | 0 | fn size_hint(&self) -> (usize, Option<usize>) { |
307 | 0 | let len = self.len - self.position; |
308 | 0 | (len, Some(len)) |
309 | 0 | } Unexecuted instantiation: <idna::punycode::Decode<char, idna::punycode::InternalCaller> as core::iter::traits::iterator::Iterator>::size_hint Unexecuted instantiation: <idna::punycode::Decode<u8, idna::punycode::ExternalCaller> as core::iter::traits::iterator::Iterator>::size_hint Unexecuted instantiation: <idna::punycode::Decode<u8, idna::punycode::InternalCaller> as core::iter::traits::iterator::Iterator>::size_hint |
310 | | } |
311 | | |
312 | | impl<T: PunycodeCodeUnit + Copy, C: PunycodeCaller> ExactSizeIterator for Decode<'_, T, C> { |
313 | 0 | fn len(&self) -> usize { |
314 | 0 | self.len - self.position |
315 | 0 | } |
316 | | } |
317 | | |
318 | | /// Convert an Unicode `str` to Punycode. |
319 | | /// |
320 | | /// This is a convenience wrapper around `encode`. |
321 | | #[inline] |
322 | 0 | pub fn encode_str(input: &str) -> Option<String> { |
323 | 0 | if input.len() > u32::MAX as usize { |
324 | 0 | return None; |
325 | 0 | } |
326 | 0 | let mut buf = String::with_capacity(input.len()); |
327 | 0 | encode_into::<_, _, ExternalCaller>(input.chars(), &mut buf) |
328 | 0 | .ok() |
329 | 0 | .map(|()| buf) |
330 | 0 | } |
331 | | |
332 | | /// Convert Unicode to Punycode. |
333 | | /// |
334 | | /// Return None on overflow, which can only happen on inputs that would take more than |
335 | | /// 63 encoded bytes, the DNS limit on domain name labels. |
336 | 0 | pub fn encode(input: &[char]) -> Option<String> { |
337 | 0 | if input.len() > u32::MAX as usize { |
338 | 0 | return None; |
339 | 0 | } |
340 | 0 | let mut buf = String::with_capacity(input.len()); |
341 | 0 | encode_into::<_, _, ExternalCaller>(input.iter().copied(), &mut buf) |
342 | 0 | .ok() |
343 | 0 | .map(|()| buf) |
344 | 0 | } |
345 | | |
/// Reasons `encode_into` can fail.
///
/// Derives `Debug` so that a `Result` carrying this error can be
/// `unwrap`ped, `expect`ed or printed, and `PartialEq`/`Eq` so that
/// variants can be compared directly.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum PunycodeEncodeError {
    /// A 32-bit counter used by the encoder overflowed.
    Overflow,
    /// The output sink returned a `core::fmt::Error`.
    Sink,
}

/// Lets `?` turn a failed write to the sink into `PunycodeEncodeError::Sink`.
impl From<core::fmt::Error> for PunycodeEncodeError {
    fn from(_: core::fmt::Error) -> Self {
        Self::Sink
    }
}
356 | | |
/// Encode the code points produced by `input` as Punycode and write the
/// result to `output`.
///
/// `input` is iterated several times (hence the `Clone` bound): once to
/// count the code points and copy the ASCII ones through, and then once or
/// twice per distinct non-ASCII code point value.
///
/// `C` selects whether the delta arithmetic in the main loop is
/// overflow-checked (external callers) or relies on a single up-front
/// check (internal callers).
///
/// Returns `PunycodeEncodeError::Overflow` if a 32-bit counter overflows
/// and `PunycodeEncodeError::Sink` if writing to `output` fails. ASCII code
/// points are written during the counting pass, so `output` may already
/// contain data when an error is returned.
pub(crate) fn encode_into<I, W, C>(input: I, output: &mut W) -> Result<(), PunycodeEncodeError>
where
    I: Iterator<Item = char> + Clone,
    W: Write + ?Sized,
    C: PunycodeCaller,
{
    // Handle "basic" (ASCII) code points. They are encoded as-is.
    let (mut input_length, mut basic_length) = (0u32, 0);
    for c in input.clone() {
        input_length = input_length
            .checked_add(1)
            .ok_or(PunycodeEncodeError::Overflow)?;
        if c.is_ascii() {
            output.write_char(c)?;
            basic_length += 1;
        }
    }

    if !C::EXTERNAL_CALLER {
        // We should never get an overflow here with the internal caller being
        // length-limited, but let's check anyway once here trusting the math
        // from RFC 3492 section 6.4 and then omit the overflow checks in the
        // loop below.
        let len_plus_one = input_length
            .checked_add(1)
            .ok_or(PunycodeEncodeError::Overflow)?;
        // Only the success of the multiplication matters; the product is discarded.
        len_plus_one
            .checked_mul(u32::from(char::MAX) - INITIAL_N)
            .ok_or(PunycodeEncodeError::Overflow)?;
    }

    // The delimiter is only emitted when there is at least one basic code point.
    if basic_length > 0 {
        output.write_char('-')?;
    }
    let mut code_point = INITIAL_N;
    let mut delta = 0u32;
    let mut bias = INITIAL_BIAS;
    let mut processed = basic_length;
    while processed < input_length {
        // All code points < code_point have been handled already.
        // Find the next larger one.
        // `processed < input_length` means at least one code point
        // >= code_point is still pending, so `min()` is never `None`.
        let min_code_point = input
            .clone()
            .map(|c| c as u32)
            .filter(|&c| c >= code_point)
            .min()
            .unwrap();
        // Increase delta to advance the decoder's <code_point,i> state to <min_code_point,0>
        if C::EXTERNAL_CALLER {
            let product = (min_code_point - code_point)
                .checked_mul(processed + 1)
                .ok_or(PunycodeEncodeError::Overflow)?;
            delta = delta
                .checked_add(product)
                .ok_or(PunycodeEncodeError::Overflow)?;
        } else {
            delta += (min_code_point - code_point) * (processed + 1);
        }
        code_point = min_code_point;
        for c in input.clone() {
            let c = c as u32;
            // Every already-handled code point the decoder would step over
            // counts towards the delta.
            if c < code_point {
                if C::EXTERNAL_CALLER {
                    delta = delta.checked_add(1).ok_or(PunycodeEncodeError::Overflow)?;
                } else {
                    delta += 1;
                }
            }
            if c == code_point {
                // Represent delta as a generalized variable-length integer:
                let mut q = delta;
                let mut k = BASE;
                loop {
                    // Threshold for this digit position, clamped to T_MIN..=T_MAX.
                    let t = if k <= bias {
                        T_MIN
                    } else if k >= bias + T_MAX {
                        T_MAX
                    } else {
                        k - bias
                    };
                    if q < t {
                        break;
                    }
                    let value = t + ((q - t) % (BASE - t));
                    output.write_char(value_to_digit(value))?;
                    q = (q - t) / (BASE - t);
                    k += BASE;
                }
                // Final digit: below the threshold, which terminates the integer.
                output.write_char(value_to_digit(q))?;
                bias = adapt(delta, processed + 1, processed == basic_length);
                delta = 0;
                processed += 1;
            }
        }
        delta += 1;
        code_point += 1;
    }
    Ok(())
}
456 | | |
/// Map a Punycode digit value to its ASCII character.
///
/// Values 0..=25 map to `a`..=`z` and values 26..=35 map to `0`..=`9`.
///
/// # Panics
///
/// Panics if `value` is 36 or greater. The encoder only passes values
/// below the number base, so reaching that arm indicates a bug.
#[inline]
fn value_to_digit(value: u32) -> char {
    match value {
        // The range patterns guarantee the narrowing casts are lossless.
        0..=25 => char::from(b'a' + value as u8), // a..z
        26..=35 => char::from(b'0' + (value - 26) as u8), // 0..9
        _ => unreachable!("Punycode digit value out of range: {}", value),
    }
}
465 | | |
// Feeding one code point more than a `u32` can count must fail, and since
// the input is entirely non-ASCII nothing may have been written by then.
#[test]
#[ignore = "slow"]
#[cfg(target_pointer_width = "64")]
fn huge_encode() {
    let too_many = u32::MAX as usize + 1;
    let input = core::iter::repeat('ß').take(too_many);
    let mut buf = String::new();
    let outcome = encode_into::<_, _, ExternalCaller>(input, &mut buf);
    assert!(outcome.is_err());
    assert_eq!(buf.len(), 0);
}