/rust/registry/src/index.crates.io-1949cf8c6b5b557f/idna-1.1.0/src/uts46.rs
Line | Count | Source |
1 | | // Copyright The rust-url developers. |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or |
4 | | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license |
5 | | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your |
6 | | // option. This file may not be copied, modified, or distributed |
7 | | // except according to those terms. |
8 | | |
9 | | //! This module provides the lower-level API for UTS 46. |
10 | | //! |
11 | | //! [`Uts46::process`] is the core that the other convenience |
12 | | //! methods build on. |
13 | | //! |
14 | | //! UTS 46 flags map to this API as follows: |
15 | | //! |
16 | | //! * _CheckHyphens_ - _true_: [`Hyphens::Check`], _false_: [`Hyphens::Allow`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents. |
17 | | //! * _CheckBidi_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_. |
18 | | //! * _CheckJoiners_ - Always _true_; cannot be configured, since this flag is _true_ even when WHATWG URL Standard _beStrict_ is _false_. |
19 | | //! * _UseSTD3ASCIIRules_ - _true_: [`AsciiDenyList::STD3`], _false_: [`AsciiDenyList::EMPTY`]; however, the check the WHATWG URL Standard performs right after the UTS 46 invocation corresponds to [`AsciiDenyList::URL`]. |
20 | | //! * _Transitional_Processing_ - Always _false_ but could be implemented as a preprocessing step. This flag is deprecated and for Web purposes the transition is over in the sense that all of Firefox, Safari, and Chrome set this flag to _false_. |
21 | | //! * _VerifyDnsLength_ - _true_: [`DnsLength::Verify`], _false_: [`DnsLength::Ignore`]; the WHATWG URL Standard sets this to _false_ for normal (non-conformance-checker) user agents. |
22 | | //! * _IgnoreInvalidPunycode_ - Always _false_; cannot be configured. (Not yet covered by the WHATWG URL Standard, but 2 out of 3 major browsers clearly behave as if this was _false_). |
23 | | |
24 | | use crate::punycode::Decoder; |
25 | | use crate::punycode::InternalCaller; |
26 | | use alloc::borrow::Cow; |
27 | | use alloc::string::String; |
28 | | use core::fmt::Write; |
29 | | use idna_adapter::*; |
30 | | use smallvec::SmallVec; |
31 | | use utf8_iter::Utf8CharsEx; |
32 | | |
33 | | /// ICU4C-compatible constraint. |
34 | | /// https://unicode-org.atlassian.net/browse/ICU-13727 |
35 | | const PUNYCODE_DECODE_MAX_INPUT_LENGTH: usize = 2000; |
36 | | |
37 | | /// ICU4C-compatible constraint. (Note: ICU4C measures |
38 | | /// UTF-16 and we measure UTF-32. This means that we |
39 | | /// allow longer non-BMP inputs. For this implementation, |
40 | | /// the denial-of-service scaling does not depend on BMP vs. |
41 | | /// non-BMP: only the scalar values matter.) |
42 | | /// |
43 | | /// https://unicode-org.atlassian.net/browse/ICU-13727 |
44 | | const PUNYCODE_ENCODE_MAX_INPUT_LENGTH: usize = 1000; |
45 | | |
46 | | /// For keeping track of what kind of numerals have been |
47 | | /// seen in an RTL label. |
48 | | #[derive(Debug, PartialEq, Eq)] |
49 | | enum RtlNumeralState { |
50 | | Undecided, |
51 | | European, |
52 | | Arabic, |
53 | | } |
54 | | |
55 | | /// Computes the mask for upper-case ASCII. |
56 | 0 | const fn upper_case_mask() -> u128 { |
57 | 0 | let mut accu = 0u128; |
58 | 0 | let mut b = 0u8; |
59 | 0 | while b < 128 { |
60 | 0 | if (b >= b'A') && (b <= b'Z') { |
61 | 0 | accu |= 1u128 << b; |
62 | 0 | } |
63 | 0 | b += 1; |
64 | | } |
65 | 0 | accu |
66 | 0 | } |
67 | | |
68 | | /// Bit set for upper-case ASCII. |
69 | | const UPPER_CASE_MASK: u128 = upper_case_mask(); |
70 | | |
71 | | /// Computes the mask for glyphless ASCII. |
72 | 0 | const fn glyphless_mask() -> u128 { |
73 | 0 | let mut accu = 0u128; |
74 | 0 | let mut b = 0u8; |
75 | 0 | while b < 128 { |
76 | 0 | if (b <= b' ') || (b == 0x7F) { |
77 | 0 | accu |= 1u128 << b; |
78 | 0 | } |
79 | 0 | b += 1; |
80 | | } |
81 | 0 | accu |
82 | 0 | } |
83 | | |
84 | | /// Bit set for glyphless ASCII. |
85 | | const GLYPHLESS_MASK: u128 = glyphless_mask(); |
86 | | |
87 | | /// The mask for the ASCII dot. |
88 | | const DOT_MASK: u128 = 1 << b'.'; |
89 | | |
90 | | /// Computes the ASCII deny list for STD3 ASCII rules. |
91 | 0 | const fn ldh_mask() -> u128 { |
92 | 0 | let mut accu = 0u128; |
93 | 0 | let mut b = 0u8; |
94 | 0 | while b < 128 { |
95 | 0 | if !((b >= b'a' && b <= b'z') || (b >= b'0' && b <= b'9') || b == b'-' || b == b'.') { |
96 | 0 | accu |= 1u128 << b; |
97 | 0 | } |
98 | 0 | b += 1; |
99 | | } |
100 | 0 | accu |
101 | 0 | } |
102 | | |
103 | | const PUNYCODE_PREFIX: u32 = |
104 | | ((b'-' as u32) << 24) | ((b'-' as u32) << 16) | ((b'N' as u32) << 8) | b'X' as u32; |
105 | | |
106 | | const PUNYCODE_PREFIX_MASK: u32 = (0xFF << 24) | (0xFF << 16) | (0xDF << 8) | 0xDF; |
107 | | |
108 | 0 | fn write_punycode_label<W: Write + ?Sized>( |
109 | 0 | label: &[char], |
110 | 0 | sink: &mut W, |
111 | 0 | ) -> Result<(), ProcessingError> { |
112 | 0 | sink.write_str("xn--")?; |
113 | 0 | crate::punycode::encode_into::<_, _, InternalCaller>(label.iter().copied(), sink)?; |
114 | 0 | Ok(()) |
115 | 0 | } |
116 | | |
117 | | #[inline(always)] |
118 | 0 | fn has_punycode_prefix(slice: &[u8]) -> bool { |
119 | 0 | if slice.len() < 4 { |
120 | 0 | return false; |
121 | 0 | } |
122 | | // Sadly, the optimizer doesn't figure out that more idiomatic code |
123 | | // should compile to masking on 32-bit value. |
124 | 0 | let a = slice[0]; |
125 | 0 | let b = slice[1]; |
126 | 0 | let c = slice[2]; |
127 | 0 | let d = slice[3]; |
128 | 0 | let u = (u32::from(d) << 24) | (u32::from(c) << 16) | (u32::from(b) << 8) | u32::from(a); |
129 | 0 | (u & PUNYCODE_PREFIX_MASK) == PUNYCODE_PREFIX |
130 | 0 | } |
131 | | |
132 | | #[inline(always)] |
133 | 0 | fn in_inclusive_range8(u: u8, start: u8, end: u8) -> bool { |
134 | 0 | u.wrapping_sub(start) <= (end - start) |
135 | 0 | } |
136 | | |
137 | | #[inline(always)] |
138 | 0 | fn in_inclusive_range_char(c: char, start: char, end: char) -> bool { |
139 | 0 | u32::from(c).wrapping_sub(u32::from(start)) <= (u32::from(end) - u32::from(start)) |
140 | 0 | } |
141 | | |
142 | | #[inline(always)] |
143 | 0 | fn is_passthrough_ascii_label(label: &[u8]) -> bool { |
144 | | // XXX if we aren't performing _CheckHyphens_, this could |
145 | | // check for "xn--" and pass through YouTube CDN node names. |
146 | 0 | if label.len() >= 4 && label[2] == b'-' && label[3] == b'-' { |
147 | 0 | return false; |
148 | 0 | } |
149 | 0 | if let Some((&first, tail)) = label.split_first() { |
150 | | // We need to check the first and last character |
151 | | // more strictly in case this turns out to be a |
152 | | // label in a bidi domain name. This has the side |
153 | | // effect that this function only accepts labels |
154 | | // that also conform to the STD3 rules. |
155 | | // |
156 | | // XXX: If we are in the fail-fast mode (i.e. we don't need |
157 | | // to be able to overwrite anything with U+FFFD), we could |
158 | | // merely record that we've seen a digit here and error out |
159 | | // if we later discover that the domain name is a bidi |
160 | | // domain name. |
161 | 0 | if !in_inclusive_range8(first, b'a', b'z') { |
162 | 0 | return false; |
163 | 0 | } |
164 | 0 | for &b in tail { |
165 | | // If we used LDH_MASK, we'd have to check |
166 | | // the bytes for the ASCII range anyhow. |
167 | 0 | if in_inclusive_range8(b, b'a', b'z') { |
168 | 0 | continue; |
169 | 0 | } |
170 | 0 | if in_inclusive_range8(b, b'0', b'9') { |
171 | 0 | continue; |
172 | 0 | } |
173 | 0 | if b == b'-' { |
174 | 0 | continue; |
175 | 0 | } |
176 | 0 | return false; |
177 | | } |
178 | 0 | label.last() != Some(&b'-') |
179 | | } else { |
180 | | // empty |
181 | 0 | true |
182 | | } |
183 | 0 | } |
184 | | |
185 | | #[inline(always)] |
186 | 0 | fn split_ascii_fast_path_prefix(label: &[u8]) -> (&[u8], &[u8]) { |
187 | 0 | if let Some(pos) = label.iter().position(|b| !b.is_ascii()) { |
188 | 0 | if pos == 0 { |
189 | | // First is non-ASCII |
190 | 0 | (&[], label) |
191 | | } else { |
192 | | // Leave one ASCII character in the suffix |
193 | | // in case it's a letter that a combining |
194 | | // character combines with. |
195 | 0 | let (head, tail) = label.split_at(pos - 1); |
196 | 0 | (head, tail) |
197 | | } |
198 | | } else { |
199 | | // All ASCII |
200 | 0 | (label, &[]) |
201 | | } |
202 | 0 | } |
203 | | |
204 | | // Input known to be lower-case, but may contain non-ASCII. |
205 | | #[inline(always)] |
206 | 0 | fn apply_ascii_deny_list_to_lower_cased_unicode(c: char, deny_list: u128) -> char { |
207 | 0 | if let Some(shifted) = 1u128.checked_shl(u32::from(c)) { |
208 | 0 | if (deny_list & shifted) == 0 { |
209 | 0 | c |
210 | | } else { |
211 | 0 | '\u{FFFD}' |
212 | | } |
213 | | } else { |
214 | 0 | c |
215 | | } |
216 | 0 | } |
217 | | |
218 | | // Input known to be ASCII, but may contain upper case ASCII. |
219 | | #[inline(always)] |
220 | 0 | fn apply_ascii_deny_list_to_potentially_upper_case_ascii(b: u8, deny_list: u128) -> char { |
221 | 0 | if (deny_list & (1u128 << b)) == 0 { |
222 | 0 | return char::from(b); |
223 | 0 | } |
224 | 0 | if in_inclusive_range8(b, b'A', b'Z') { |
225 | 0 | return char::from(b + 0x20); |
226 | 0 | } |
227 | 0 | '\u{FFFD}' |
228 | 0 | } |
229 | | |
230 | | #[inline(always)] |
231 | 0 | fn is_ascii(label: &[char]) -> bool { |
232 | 0 | for c in label.iter() { |
233 | 0 | if !c.is_ascii() { |
234 | 0 | return false; |
235 | 0 | } |
236 | | } |
237 | 0 | true |
238 | 0 | } |
239 | | |
240 | | #[derive(PartialEq, Eq, Copy, Clone)] |
241 | | enum PunycodeClassification { |
242 | | Ascii, |
243 | | Unicode, |
244 | | Error, |
245 | | } |
246 | | |
247 | | #[inline(always)] |
248 | 0 | fn classify_for_punycode(label: &[char]) -> PunycodeClassification { |
249 | 0 | let mut iter = label.iter().copied(); |
250 | | loop { |
251 | 0 | if let Some(c) = iter.next() { |
252 | 0 | if c.is_ascii() { |
253 | 0 | continue; |
254 | 0 | } |
255 | 0 | if c == '\u{FFFD}' { |
256 | 0 | return PunycodeClassification::Error; |
257 | 0 | } |
258 | 0 | for c in iter { |
259 | 0 | if c == '\u{FFFD}' { |
260 | 0 | return PunycodeClassification::Error; |
261 | 0 | } |
262 | | } |
263 | 0 | return PunycodeClassification::Unicode; |
264 | 0 | } |
265 | 0 | return PunycodeClassification::Ascii; |
266 | | } |
267 | 0 | } |
268 | | |
269 | | /// The ASCII deny list to be applied. |
270 | | #[derive(PartialEq, Eq, Copy, Clone)] |
271 | | #[repr(transparent)] |
272 | | pub struct AsciiDenyList { |
273 | | bits: u128, |
274 | | } |
275 | | |
276 | | impl AsciiDenyList { |
277 | | /// Computes (preferably at compile time) an ASCII deny list. |
278 | | /// |
279 | | /// Setting `deny_glyphless` to `true` denies U+0020 SPACE and below |
280 | | /// as well as U+007F DELETE for convenience without having to list |
281 | | /// these characters in the `deny_list` string. |
282 | | /// |
283 | | /// `deny_list` is the list of ASCII characters to deny. This |
284 | | /// list must not contain any of: |
285 | | /// * Letters |
286 | | /// * Digits |
287 | | /// * Hyphen |
288 | | /// * Dot (period / full-stop) |
289 | | /// * Non-ASCII |
290 | | /// |
291 | | /// # Panics |
292 | | /// |
293 | | /// If the deny list contains characters listed as prohibited above. |
294 | 0 | pub const fn new(deny_glyphless: bool, deny_list: &str) -> Self { |
295 | 0 | let mut bits = UPPER_CASE_MASK; |
296 | 0 | if deny_glyphless { |
297 | 0 | bits |= GLYPHLESS_MASK; |
298 | 0 | } |
299 | 0 | let mut i = 0; |
300 | 0 | let bytes = deny_list.as_bytes(); |
301 | 0 | while i < bytes.len() { |
302 | 0 | let b = bytes[i]; |
303 | 0 | assert!(b < 0x80, "ASCII deny list must be ASCII."); |
304 | | // assert_ne not yet available in const context. |
305 | 0 | assert!(b != b'.', "ASCII deny list must not contain the dot."); |
306 | 0 | assert!(b != b'-', "ASCII deny list must not contain the hyphen."); |
307 | 0 | assert!( |
308 | 0 | !((b >= b'0') && (b <= b'9')), |
309 | | "ASCII deny list must not contain digits." |
310 | | ); |
311 | 0 | assert!( |
312 | 0 | !((b >= b'a') && (b <= b'z')), |
313 | | "ASCII deny list must not contain letters." |
314 | | ); |
315 | 0 | assert!( |
316 | 0 | !((b >= b'A') && (b <= b'Z')), |
317 | | "ASCII deny list must not contain letters." |
318 | | ); |
319 | 0 | bits |= 1u128 << b; |
320 | 0 | i += 1; |
321 | | } |
322 | 0 | Self { bits } |
323 | 0 | } |
324 | | |
325 | | /// No ASCII deny list. This corresponds to _UseSTD3ASCIIRules=false_. |
326 | | /// |
327 | | /// Equivalent to `AsciiDenyList::new(false, "")`. |
328 | | /// |
329 | | /// Note: Not denying the space and control characters can result in |
330 | | /// strange behavior. Without a deny list provided to the UTS 46 |
331 | | /// operation, the caller is expected to perform filtering afterwards, |
332 | | /// but it's more efficient to use `AsciiDenyList` than post-processing, |
333 | | /// because the internals of this crate can optimize away checks in |
334 | | /// certain cases. |
335 | | pub const EMPTY: Self = Self::new(false, ""); |
336 | | |
337 | | /// The STD3 deny list. This corresponds to _UseSTD3ASCIIRules=true_. |
338 | | /// |
339 | | /// Note that this deny list rejects the underscore, which occurs in |
340 | | /// pseudo-hosts used by various TXT record-based protocols, and also |
341 | | /// characters that may occur in non-DNS naming, such as NetBIOS. |
342 | | pub const STD3: Self = Self { bits: ldh_mask() }; |
343 | | |
344 | | /// [Forbidden domain code point](https://url.spec.whatwg.org/#forbidden-domain-code-point) from the WHATWG URL Standard. |
345 | | /// |
346 | | /// Equivalent to `AsciiDenyList::new(true, "%#/:<>?@[\\]^|")`. |
347 | | /// |
348 | | /// Note that this deny list rejects IPv6 addresses, so (as in URL |
349 | | /// parsing) you need to check for IPv6 addresses first and not |
350 | | /// put them through UTS 46 processing. |
351 | | pub const URL: Self = Self::new(true, "%#/:<>?@[\\]^|"); |
352 | | } |
353 | | |
354 | | /// The _CheckHyphens_ mode. |
355 | | #[derive(PartialEq, Eq, Copy, Clone)] |
356 | | #[non_exhaustive] // non_exhaustive in case a middle mode that prohibits only first and last position needs to be added |
357 | | pub enum Hyphens { |
358 | | /// _CheckHyphens=false_: Do not place positional restrictions on hyphens. |
359 | | /// |
360 | | /// This mode is used by the WHATWG URL Standard for normal User Agent processing |
361 | | /// (i.e. not conformance checking). |
362 | | Allow, |
363 | | |
364 | | /// Prohibit hyphens in the first and last position in the label but allow in |
365 | | /// the third and fourth position. |
366 | | /// |
367 | | /// Note that this mode rejects real-world names, including some GitHub user pages. |
368 | | CheckFirstLast, |
369 | | |
370 | | /// _CheckHyphens=true_: Prohibit hyphens in the first, third, fourth, |
371 | | /// and last position in the label. |
372 | | /// |
373 | | /// Note that this mode rejects real-world names, including YouTube CDN nodes |
374 | | /// and some GitHub user pages. |
375 | | Check, |
376 | | } |
377 | | |
378 | | /// The UTS 46 _VerifyDNSLength_ flag. |
379 | | #[derive(PartialEq, Eq, Copy, Clone)] |
380 | | #[non_exhaustive] |
381 | | pub enum DnsLength { |
382 | | /// _VerifyDNSLength=false_. (Possibly relevant for allowing non-DNS naming systems.) |
383 | | Ignore, |
384 | | /// _VerifyDNSLength=true_ with the exception that the trailing root label dot is |
385 | | /// allowed. |
386 | | VerifyAllowRootDot, |
387 | | /// _VerifyDNSLength=true_. (The trailing root label dot is not allowed.) |
388 | | Verify, |
389 | | } |
390 | | |
391 | | /// Policy for customizing behavior in case of an error. |
392 | | #[derive(PartialEq, Eq, Copy, Clone)] |
393 | | #[non_exhaustive] |
394 | | pub enum ErrorPolicy { |
395 | | /// Return as early as possible without producing output in case of error. |
396 | | FailFast, |
397 | | /// In case of error, mark errors with the REPLACEMENT CHARACTER. (The output |
398 | | /// containing REPLACEMENT CHARACTERs may be shown to the user to illustrate |
399 | | /// what was wrong but must not be used for naming in a network protocol.) |
400 | | MarkErrors, |
401 | | } |
402 | | |
403 | | /// The success outcome of [`Uts46::process`] |
404 | | #[derive(PartialEq, Eq, Copy, Clone, Debug)] |
405 | | pub enum ProcessingSuccess { |
406 | | /// There were no errors. The caller must consider the input to be the output. |
407 | | /// |
408 | | /// This asserts that the input can be safely passed to [`core::str::from_utf8_unchecked`]. |
409 | | /// |
410 | | /// (Distinct from `WroteToSink` in order to allow `Cow` behavior to be implemented on top of |
411 | | /// [`Uts46::process`].) |
412 | | Passthrough, |
413 | | |
414 | | /// There were no errors. The caller must consider what was written to the sink to be the output. |
415 | | /// |
416 | | /// (Distinct from `Passthrough` in order to allow `Cow` behavior to be implemented on top of |
417 | | /// [`Uts46::process`].) |
418 | | WroteToSink, |
419 | | } |
420 | | |
421 | | /// The failure outcome of [`Uts46::process`] |
422 | | #[derive(PartialEq, Eq, Copy, Clone, Debug)] |
423 | | pub enum ProcessingError { |
424 | | /// There was a validity error according to the chosen options. |
425 | | /// |
426 | | /// In case of `Operation::ToAscii`, there is no output. Otherwise, output was written to the |
427 | | /// sink and the output contains at least one U+FFFD REPLACEMENT CHARACTER to denote an error. |
428 | | ValidityError, |
429 | | |
430 | | /// The sink emitted [`core::fmt::Error`]. The partial output written to the sink must not |
431 | | /// be used. |
432 | | SinkError, |
433 | | } |
434 | | |
435 | | impl From<core::fmt::Error> for ProcessingError { |
436 | 0 | fn from(_: core::fmt::Error) -> Self { |
437 | 0 | Self::SinkError |
438 | 0 | } |
439 | | } |
440 | | |
441 | | impl From<crate::punycode::PunycodeEncodeError> for ProcessingError { |
442 | 0 | fn from(_: crate::punycode::PunycodeEncodeError) -> Self { |
443 | 0 | unreachable!( |
444 | | "Punycode overflows should not be possible due to PUNYCODE_ENCODE_MAX_INPUT_LENGTH" |
445 | | ); |
446 | | } |
447 | | } |
448 | | |
449 | | #[derive(Debug, Clone, Copy)] |
450 | | enum AlreadyAsciiLabel<'a> { |
451 | | MixedCaseAscii(&'a [u8]), |
452 | | MixedCasePunycode(&'a [u8]), |
453 | | Other, |
454 | | } |
455 | | |
456 | | /// Performs the _VerifyDNSLength_ check on the output of the _ToASCII_ operation. |
457 | | /// |
458 | | /// If the second argument is `true`, the trailing root label dot is allowed. |
459 | | /// |
460 | | /// # Panics |
461 | | /// |
462 | | /// Panics in debug mode if the argument isn't ASCII. |
463 | 0 | pub fn verify_dns_length(domain_name: &str, allow_trailing_dot: bool) -> bool { |
464 | 0 | let bytes = domain_name.as_bytes(); |
465 | 0 | debug_assert!(bytes.is_ascii()); |
466 | 0 | let domain_name_without_trailing_dot = if let Some(without) = bytes.strip_suffix(b".") { |
467 | 0 | if !allow_trailing_dot { |
468 | 0 | return false; |
469 | 0 | } |
470 | 0 | without |
471 | | } else { |
472 | 0 | bytes |
473 | | }; |
474 | 0 | if domain_name_without_trailing_dot.len() > 253 { |
475 | 0 | return false; |
476 | 0 | } |
477 | 0 | for label in domain_name_without_trailing_dot.split(|b| *b == b'.') { |
478 | 0 | if label.is_empty() { |
479 | 0 | return false; |
480 | 0 | } |
481 | 0 | if label.len() > 63 { |
482 | 0 | return false; |
483 | 0 | } |
484 | | } |
485 | 0 | true |
486 | 0 | } |
487 | | |
488 | | /// An implementation of UTS #46. |
489 | | pub struct Uts46 { |
490 | | data: idna_adapter::Adapter, |
491 | | } |
492 | | |
493 | | #[cfg(feature = "compiled_data")] |
494 | | impl Default for Uts46 { |
495 | 0 | fn default() -> Self { |
496 | 0 | Self::new() |
497 | 0 | } |
498 | | } |
499 | | |
500 | | impl Uts46 { |
501 | | /// Constructor using data compiled into the binary. |
502 | | #[cfg(feature = "compiled_data")] |
503 | 0 | pub const fn new() -> Self { |
504 | 0 | Self { |
505 | 0 | data: idna_adapter::Adapter::new(), |
506 | 0 | } |
507 | 0 | } |
508 | | |
509 | | // XXX Should there be an `icu_provider` feature for enabling |
510 | | // a constructor for run-time data loading? |
511 | | |
512 | | /// Performs the [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) operation |
513 | | /// from UTS #46 with the options indicated. |
514 | | /// |
515 | | /// # Arguments |
516 | | /// |
517 | | /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by |
518 | | /// this method and input that is not well-formed UTF-8 is treated as an error. If you |
519 | | /// already have a `&str`, call `.as_bytes()` on it.) |
520 | | /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46 |
521 | | /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point |
522 | | /// processing is handled via this argument. Most callers are probably the best off |
523 | | /// by using [`AsciiDenyList::URL`] here. |
524 | | /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best |
525 | | /// off by using [`Hyphens::Allow`] here. |
526 | | /// * `dns_length` - The UTS 46 _VerifyDNSLength_ flag. |
527 | 0 | pub fn to_ascii<'a>( |
528 | 0 | &self, |
529 | 0 | domain_name: &'a [u8], |
530 | 0 | ascii_deny_list: AsciiDenyList, |
531 | 0 | hyphens: Hyphens, |
532 | 0 | dns_length: DnsLength, |
533 | 0 | ) -> Result<Cow<'a, str>, crate::Errors> { |
534 | 0 | self.to_ascii_from_cow( |
535 | 0 | Cow::Borrowed(domain_name), |
536 | 0 | ascii_deny_list, |
537 | 0 | hyphens, |
538 | 0 | dns_length, |
539 | | ) |
540 | 0 | } |
541 | | |
542 | 0 | pub(crate) fn to_ascii_from_cow<'a>( |
543 | 0 | &self, |
544 | 0 | domain_name: Cow<'a, [u8]>, |
545 | 0 | ascii_deny_list: AsciiDenyList, |
546 | 0 | hyphens: Hyphens, |
547 | 0 | dns_length: DnsLength, |
548 | 0 | ) -> Result<Cow<'a, str>, crate::Errors> { |
549 | 0 | let mut s = String::new(); |
550 | 0 | match self.process( |
551 | 0 | &domain_name, |
552 | 0 | ascii_deny_list, |
553 | 0 | hyphens, |
554 | 0 | ErrorPolicy::FailFast, |
555 | | |_, _, _| false, |
556 | 0 | &mut s, |
557 | 0 | None, |
558 | | ) { |
559 | | Ok(ProcessingSuccess::Passthrough) => { |
560 | | // SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII. |
561 | 0 | let cow = match domain_name { |
562 | 0 | Cow::Borrowed(v) => Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(v) }), |
563 | 0 | Cow::Owned(v) => Cow::Owned(unsafe { String::from_utf8_unchecked(v) }), |
564 | | }; |
565 | 0 | if dns_length != DnsLength::Ignore |
566 | 0 | && !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot) |
567 | | { |
568 | 0 | Err(crate::Errors::default()) |
569 | | } else { |
570 | 0 | Ok(cow) |
571 | | } |
572 | | } |
573 | | Ok(ProcessingSuccess::WroteToSink) => { |
574 | 0 | let cow: Cow<'_, str> = Cow::Owned(s); |
575 | 0 | if dns_length != DnsLength::Ignore |
576 | 0 | && !verify_dns_length(&cow, dns_length == DnsLength::VerifyAllowRootDot) |
577 | | { |
578 | 0 | Err(crate::Errors::default()) |
579 | | } else { |
580 | 0 | Ok(cow) |
581 | | } |
582 | | } |
583 | 0 | Err(ProcessingError::ValidityError) => Err(crate::Errors::default()), |
584 | 0 | Err(ProcessingError::SinkError) => unreachable!(), |
585 | | } |
586 | 0 | } |
587 | | |
588 | | /// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation |
589 | | /// from UTS #46 according to the options given. When there |
590 | | /// are errors, there is still output, which may be rendered to the user, even though |
591 | | /// the output must not be used in networking protocols. Errors are denoted |
592 | | /// by U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item of the |
593 | | /// return tuple is `Err`, the first item of the return tuple is guaranteed to contain |
594 | | /// at least one U+FFFD.) |
595 | | /// |
596 | | /// Most applications probably shouldn't use this method and should be using |
597 | | /// [`Uts46::to_user_interface`] instead. |
598 | | /// |
599 | | /// # Arguments |
600 | | /// |
601 | | /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by |
602 | | /// this method and input that is not well-formed UTF-8 is treated as an error. If you |
603 | | /// already have a `&str`, call `.as_bytes()` on it.) |
604 | | /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46 |
605 | | /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point |
606 | | /// processing is handled via this argument. Most callers are probably the best off |
607 | | /// by using [`AsciiDenyList::URL`] here. |
608 | | /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best |
609 | | /// off by using [`Hyphens::Allow`] here. |
610 | 0 | pub fn to_unicode<'a>( |
611 | 0 | &self, |
612 | 0 | domain_name: &'a [u8], |
613 | 0 | ascii_deny_list: AsciiDenyList, |
614 | 0 | hyphens: Hyphens, |
615 | 0 | ) -> (Cow<'a, str>, Result<(), crate::Errors>) { |
616 | 0 | self.to_user_interface(domain_name, ascii_deny_list, hyphens, |_, _, _| true) |
617 | 0 | } |
618 | | |
619 | | /// Performs the [ToUnicode](https://www.unicode.org/reports/tr46/#ToUnicode) operation |
620 | | /// from UTS #46 according to options given with some |
621 | | /// error-free Unicode labels output according to |
622 | | /// [ToASCII](https://www.unicode.org/reports/tr46/#ToASCII) instead as decided by |
623 | | /// application policy implemented via the `output_as_unicode` closure. The purpose |
624 | | /// is to convert user-visible domains to the Unicode form in general but to render |
625 | | /// potentially misleading labels as Punycode. |
626 | | /// |
627 | | /// This is an imperfect security mechanism, because [the Punycode form itself may |
628 | | /// resemble a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing). |
629 | | /// However, since this mechanism is common practice, this API provides support for |
630 | | /// the mechanism. |
631 | | /// |
632 | | /// ASCII labels always pass through as ASCII and labels with errors always pass through |
633 | | /// as Unicode. For non-erroneous labels that contain at least one non-ASCII character |
634 | | /// (implies non-empty), `output_as_unicode` is called with the Unicode form of the label, |
635 | | /// the TLD (potentially empty), and a flag indicating whether the domain name as a whole |
636 | | /// is a bidi domain name. If the return value is `true`, the label passes through as |
637 | | /// Unicode. If the return value is `false`, the label is converted to Punycode. |
638 | | /// |
639 | | /// When there are errors, there is still output, which may be rendered to the user, even though |
640 | | /// the output must not be used in networking protocols. Errors are denoted by |
641 | | /// U+FFFD REPLACEMENT CHARACTERs in the output. (That is, if the second item |
642 | | /// of the return tuple is `Err`, the first item of the return tuple is guaranteed to contain |
643 | | /// at least one U+FFFD.) Labels that contain errors are not converted to Punycode. |
644 | | /// |
645 | | /// # Arguments |
646 | | /// |
647 | | /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by |
648 | | /// this method and input that is not well-formed UTF-8 is treated as an error. If you |
649 | | /// already have a `&str`, call `.as_bytes()` on it.) |
650 | | /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46 |
651 | | /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point |
652 | | /// processing is handled via this argument. Most callers are probably the best off |
653 | | /// by using [`AsciiDenyList::URL`] here. |
654 | | /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best |
655 | | /// off by using [`Hyphens::Allow`] here. |
656 | | /// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode |
657 | | /// (as opposed to Punycode). The first argument is the label for which a decision is |
658 | | /// needed (always non-empty slice). The second argument is the TLD (potentially empty). |
659 | | /// The third argument is `true` iff the domain name as a whole is a bidi domain name. |
660 | | /// Only non-erroneous labels that contain at least one non-ASCII character are passed |
661 | | /// to the closure as the first argument. The second and third argument values are |
662 | | /// guaranteed to remain the same during a single call to `process`, and the closure |
663 | | /// may cache computations derived from the second and third argument (hence the |
664 | | /// `FnMut` type). |
665 | 0 | pub fn to_user_interface<'a, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>( |
666 | 0 | &self, |
667 | 0 | domain_name: &'a [u8], |
668 | 0 | ascii_deny_list: AsciiDenyList, |
669 | 0 | hyphens: Hyphens, |
670 | 0 | output_as_unicode: OutputUnicode, |
671 | 0 | ) -> (Cow<'a, str>, Result<(), crate::Errors>) { |
672 | 0 | let mut s = String::new(); |
673 | 0 | match self.process( |
674 | 0 | domain_name, |
675 | 0 | ascii_deny_list, |
676 | 0 | hyphens, |
677 | 0 | ErrorPolicy::MarkErrors, |
678 | 0 | output_as_unicode, |
679 | 0 | &mut s, |
680 | 0 | None, |
681 | | ) { |
682 | | // SAFETY: `ProcessingSuccess::Passthrough` asserts that `domain_name` is ASCII. |
683 | 0 | Ok(ProcessingSuccess::Passthrough) => ( |
684 | 0 | Cow::Borrowed(unsafe { core::str::from_utf8_unchecked(domain_name) }), |
685 | 0 | Ok(()), |
686 | 0 | ), |
687 | 0 | Ok(ProcessingSuccess::WroteToSink) => (Cow::Owned(s), Ok(())), |
688 | 0 | Err(ProcessingError::ValidityError) => (Cow::Owned(s), Err(crate::Errors::default())), |
689 | 0 | Err(ProcessingError::SinkError) => unreachable!(), |
690 | | } |
691 | 0 | } |
692 | | |
693 | | /// The lower-level function that [`Uts46::to_ascii`], [`Uts46::to_unicode`], and |
694 | | /// [`Uts46::to_user_interface`] are built on to allow support for output types other |
695 | | /// than `Cow<'a, str>` (e.g. string types in a non-Rust programming language). |
696 | | /// |
697 | | /// # Arguments |
698 | | /// |
699 | | /// * `domain_name` - The input domain name as UTF-8 bytes. (The UTF-8ness is checked by |
700 | | /// this method and input that is not well-formed UTF-8 is treated as an error. If you |
701 | | /// already have a `&str`, call `.as_bytes()` on it.) |
702 | | /// * `ascii_deny_list` - What ASCII deny list, if any, to apply. The UTS 46 |
703 | | /// _UseSTD3ASCIIRules_ flag or the WHATWG URL Standard forbidden domain code point |
704 | | /// processing is handled via this argument. Most callers are probably the best off |
705 | | /// by using [`AsciiDenyList::URL`] here. |
706 | | /// * `hyphens` - The UTS 46 _CheckHyphens_ flag. Most callers are probably the best |
707 | | /// off by using [`Hyphens::Allow`] here. |
708 | | /// * `error_policy` - Whether to fail fast or to produce output that may be rendered |
709 | | /// for the user to examine in case of errors. |
710 | | /// * `output_as_unicode` - A closure for deciding if a label should be output as Unicode |
711 | | /// (as opposed to Punycode). The first argument is the label for which a decision is |
712 | | /// needed (always non-empty slice). The second argument is the TLD (potentially empty). |
713 | | /// The third argument is `true` iff the domain name as a whole is a bidi domain name. |
714 | | /// Only non-erroneous labels that contain at least one non-ASCII character are passed |
715 | | /// to the closure as the first argument. The second and third argument values are |
716 | | /// guaranteed to remain the same during a single call to `process`, and the closure |
717 | | /// may cache computations derived from the second and third argument (hence the |
718 | | /// `FnMut` type). To perform the _ToASCII_ operation, `|_, _, _| false` must be |
719 | | /// passed as the closure. To perform the _ToUnicode_ operation, `|_, _, _| true` must |
720 | | /// be passed as the closure. A more complex closure may be used to prepare a domain |
721 | | /// name for display in a user interface so that labels are converted to the Unicode |
722 | | /// form in general but potentially misleading labels are converted to the Punycode |
723 | | /// form. |
724 | | /// * `sink` - The object that receives the output (in the non-passthrough case). |
725 | | /// * `ascii_sink` - A second sink that receives the _ToASCII_ form only if there |
726 | | /// were no errors and `sink` received at least one character of non-ASCII output. |
727 | | /// The purpose of this argument is to enable a user interface display form of the |
728 | | /// domain and the _ToASCII_ form of the domain to be computed efficiently together. |
729 | | /// This argument is useless when `output_as_unicode` always returns `false`, in |
730 | | /// which case the _ToASCII_ form ends up in `sink` already. If `ascii_sink` receives |
731 | | /// no output and the return value is `Ok(ProcessingSuccess::WroteToSink)`, use the |
732 | | /// output received by `sink` also as the _ToASCII_ result. |
733 | | /// |
734 | | /// # Return value |
735 | | /// |
736 | | /// * `Ok(ProcessingSuccess::Passthrough)` - The caller must treat |
737 | | /// `unsafe { core::str::from_utf8_unchecked(domain_name) }` as the output. (This |
738 | | /// return value asserts that calling `core::str::from_utf8_unchecked(domain_name)` |
739 | | /// is safe.) |
740 | | /// * `Ok(ProcessingSuccess::WroteToSink)` - The caller must treat what was written |
741 | | /// to `sink` as the output. If another sink was passed as `ascii_sink` but it did |
742 | | /// not receive output, the caller must treat what was written to `sink` also as |
743 | | /// the _ToASCII_ output. Otherwise, if `ascii_sink` received output, the caller |
744 | | /// must treat what was written to `ascii_sink` as the _ToASCII_ output. |
745 | | /// * `Err(ProcessingError::ValidityError)` - The input was in error and must |
746 | | /// not be used for DNS lookup or otherwise in a network protocol. If `error_policy` |
747 | | /// was `ErrorPolicy::MarkErrors`, the output written to `sink` may be displayed |
748 | | /// to the user as an illustration of where the error was or the errors were. |
749 | | /// * `Err(ProcessingError::SinkError)` - Either `sink` or `ascii_sink` returned |
750 | | /// [`core::fmt::Error`]. The partial output written to `sink` or `ascii_sink` must not |
751 | | /// be used. If `W` never returns [`core::fmt::Error`], this method never returns |
752 | | /// `Err(ProcessingError::SinkError)`. |
753 | | /// |
754 | | /// # Safety-usable invariant |
755 | | /// |
756 | | /// If the return value is `Ok(ProcessingSuccess::Passthrough)`, `domain_name` is |
757 | | /// ASCII and `core::str::from_utf8_unchecked(domain_name)` is safe. (Note: |
758 | | /// Other return values do _not_ imply that `domain_name` wasn't ASCII!) |
759 | | /// |
760 | | /// # Security considerations |
761 | | /// |
762 | | /// Showing labels whose Unicode form might mislead the user as Punycode instead is |
763 | | /// an imperfect security mechanism, because [the Punycode form itself may resemble |
764 | | /// a user-recognizable name](https://www.unicode.org/reports/tr36/#TablePunycodeSpoofing). |
765 | | /// However, since this mechanism is common practice, this API provides support for the |
766 | | /// mechanism. |
767 | | /// |
768 | | /// Punycode processing is quadratic, so to avoid denial of service, this method imposes |
769 | | /// length limits on Punycode treating especially long inputs as being in error. These |
770 | | /// limits are well higher than the DNS length limits and are not more restrictive than |
771 | | /// the limits imposed by ICU4C. |
772 | | #[allow(clippy::too_many_arguments)] |
773 | 0 | pub fn process<W: Write + ?Sized, OutputUnicode: FnMut(&[char], &[char], bool) -> bool>( |
774 | 0 | &self, |
775 | 0 | domain_name: &[u8], |
776 | 0 | ascii_deny_list: AsciiDenyList, |
777 | 0 | hyphens: Hyphens, |
778 | 0 | error_policy: ErrorPolicy, |
779 | 0 | mut output_as_unicode: OutputUnicode, |
780 | 0 | sink: &mut W, |
781 | 0 | ascii_sink: Option<&mut W>, |
782 | 0 | ) -> Result<ProcessingSuccess, ProcessingError> { |
783 | 0 | let fail_fast = error_policy == ErrorPolicy::FailFast; |
784 | 0 | let mut domain_buffer = SmallVec::<[char; 253]>::new(); |
785 | 0 | let mut already_punycode = SmallVec::<[AlreadyAsciiLabel; 8]>::new(); |
786 | | // `process_inner` could be pasted inline here, but it's out of line in order |
787 | | // to avoid duplicating that code when monomorphizing over `W` and `OutputUnicode`. |
788 | 0 | let (passthrough_up_to, is_bidi, had_errors) = self.process_inner( |
789 | 0 | domain_name, |
790 | 0 | ascii_deny_list, |
791 | 0 | hyphens, |
792 | 0 | fail_fast, |
793 | 0 | &mut domain_buffer, |
794 | 0 | &mut already_punycode, |
795 | 0 | ); |
796 | 0 | if passthrough_up_to == domain_name.len() { |
797 | 0 | debug_assert!(!had_errors); |
798 | 0 | return Ok(ProcessingSuccess::Passthrough); |
799 | 0 | } |
800 | | // Checked only after passthrough as a micro optimization. |
801 | 0 | if fail_fast && had_errors { |
802 | 0 | return Err(ProcessingError::ValidityError); |
803 | 0 | } |
804 | 0 | debug_assert_eq!(had_errors, domain_buffer.contains(&'\u{FFFD}')); |
805 | 0 | let without_dot = if let Some(without_dot) = domain_buffer.strip_suffix(&['.']) { |
806 | 0 | without_dot |
807 | | } else { |
808 | 0 | &domain_buffer[..] |
809 | | }; |
810 | | // unwrap is OK, because we always have at least one label |
811 | 0 | let tld = without_dot.rsplit(|c| *c == '.').next().unwrap(); Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_unicode::{closure#0}>::{closure#0}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_ascii_from_cow::{closure#0}>::{closure#0}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_unicode::{closure#0}>::{closure#0}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_ascii::{closure#0}>::{closure#0} |
812 | 0 | let mut had_unicode_output = false; |
813 | 0 | let mut seen_label = false; |
814 | 0 | let mut already_punycode_iter = already_punycode.iter(); |
815 | 0 | let mut passthrough_up_to_extended = passthrough_up_to; |
816 | 0 | let mut flushed_prefix = false; |
817 | 0 | for label in domain_buffer.split(|c| *c == '.') {Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_unicode::{closure#0}>::{closure#1}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_ascii_from_cow::{closure#0}>::{closure#1}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_unicode::{closure#0}>::{closure#1}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_ascii::{closure#0}>::{closure#1} |
818 | | // Unwrap is OK, because there are supposed to be as many items in |
819 | | // `already_punycode` as there are labels. |
820 | 0 | let input_punycode = *already_punycode_iter.next().unwrap(); |
821 | 0 | if seen_label { |
822 | 0 | if flushed_prefix { |
823 | 0 | sink.write_char('.')?; |
824 | | } else { |
825 | 0 | debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.'); |
826 | 0 | passthrough_up_to_extended += 1; |
827 | 0 | if passthrough_up_to_extended == domain_name.len() { |
828 | 0 | debug_assert!(!had_errors); |
829 | 0 | return Ok(ProcessingSuccess::Passthrough); |
830 | 0 | } |
831 | | } |
832 | 0 | } |
833 | 0 | seen_label = true; |
834 | | |
835 | 0 | if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode { |
836 | 0 | if let Some(first_upper_case) = |
837 | 0 | mixed_case.iter().position(|c| c.is_ascii_uppercase()) Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_unicode::{closure#0}>::{closure#2}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_ascii_from_cow::{closure#0}>::{closure#2}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_unicode::{closure#0}>::{closure#2}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_ascii::{closure#0}>::{closure#2} |
838 | | { |
839 | 0 | let (head, tail) = mixed_case.split_at(first_upper_case); |
840 | 0 | let slice_to_write = if flushed_prefix { |
841 | 0 | head |
842 | | } else { |
843 | 0 | flushed_prefix = true; |
844 | 0 | passthrough_up_to_extended += head.len(); |
845 | 0 | debug_assert_ne!(passthrough_up_to_extended, domain_name.len()); |
846 | 0 | &domain_name[..passthrough_up_to_extended] |
847 | | }; |
848 | | // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII. |
849 | 0 | sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?; |
850 | 0 | for c in tail.iter() { |
851 | 0 | sink.write_char(char::from(c.to_ascii_lowercase()))?; |
852 | | } |
853 | 0 | } else if flushed_prefix { |
854 | | // SAFETY: `mixed_case` is known to be ASCII. |
855 | 0 | sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?; |
856 | | } else { |
857 | 0 | passthrough_up_to_extended += mixed_case.len(); |
858 | 0 | if passthrough_up_to_extended == domain_name.len() { |
859 | 0 | debug_assert!(!had_errors); |
860 | 0 | return Ok(ProcessingSuccess::Passthrough); |
861 | 0 | } |
862 | | } |
863 | 0 | continue; |
864 | 0 | } |
865 | | |
866 | 0 | let potentially_punycode = if fail_fast { |
867 | 0 | debug_assert!(classify_for_punycode(label) != PunycodeClassification::Error); |
868 | 0 | !is_ascii(label) |
869 | | } else { |
870 | 0 | classify_for_punycode(label) == PunycodeClassification::Unicode |
871 | | }; |
872 | 0 | let passthrough = if potentially_punycode { |
873 | 0 | let unicode = output_as_unicode(label, tld, is_bidi); |
874 | 0 | had_unicode_output |= unicode; |
875 | 0 | unicode |
876 | | } else { |
877 | 0 | true |
878 | | }; |
879 | 0 | if passthrough { |
880 | 0 | if !flushed_prefix { |
881 | 0 | flushed_prefix = true; |
882 | | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
883 | 0 | sink.write_str(unsafe { |
884 | 0 | core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended]) |
885 | 0 | })?; |
886 | 0 | } |
887 | 0 | for c in label.iter().copied() { |
888 | 0 | sink.write_char(c)?; |
889 | | } |
890 | 0 | } else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode { |
891 | 0 | if let Some(first_upper_case) = |
892 | 0 | mixed_case.iter().position(|c| c.is_ascii_uppercase()) Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_unicode::{closure#0}>::{closure#3}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_ascii_from_cow::{closure#0}>::{closure#3}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_unicode::{closure#0}>::{closure#3}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_ascii::{closure#0}>::{closure#3} |
893 | | { |
894 | 0 | let (head, tail) = mixed_case.split_at(first_upper_case); |
895 | 0 | let slice_to_write = if flushed_prefix { |
896 | 0 | head |
897 | | } else { |
898 | 0 | flushed_prefix = true; |
899 | 0 | passthrough_up_to_extended += head.len(); |
900 | 0 | debug_assert_ne!(passthrough_up_to_extended, domain_name.len()); |
901 | 0 | &domain_name[..passthrough_up_to_extended] |
902 | | }; |
903 | | // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII. |
904 | 0 | sink.write_str(unsafe { core::str::from_utf8_unchecked(slice_to_write) })?; |
905 | 0 | for c in tail.iter() { |
906 | 0 | sink.write_char(char::from(c.to_ascii_lowercase()))?; |
907 | | } |
908 | 0 | } else if flushed_prefix { |
909 | | // SAFETY: `mixed_case` is known to be ASCII. |
910 | 0 | sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?; |
911 | | } else { |
912 | 0 | passthrough_up_to_extended += mixed_case.len(); |
913 | 0 | if passthrough_up_to_extended == domain_name.len() { |
914 | 0 | debug_assert!(!had_errors); |
915 | 0 | return Ok(ProcessingSuccess::Passthrough); |
916 | 0 | } |
917 | | } |
918 | | } else { |
919 | 0 | if !flushed_prefix { |
920 | 0 | flushed_prefix = true; |
921 | | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
922 | 0 | sink.write_str(unsafe { |
923 | 0 | core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended]) |
924 | 0 | })?; |
925 | 0 | } |
926 | 0 | write_punycode_label(label, sink)?; |
927 | | } |
928 | | } |
929 | | |
930 | 0 | if had_errors { |
931 | 0 | return Err(ProcessingError::ValidityError); |
932 | 0 | } |
933 | | |
934 | 0 | if had_unicode_output { |
935 | 0 | if let Some(sink) = ascii_sink { |
936 | 0 | let mut seen_label = false; |
937 | 0 | let mut already_punycode_iter = already_punycode.iter(); |
938 | 0 | let mut passthrough_up_to_extended = passthrough_up_to; |
939 | 0 | let mut flushed_prefix = false; |
940 | 0 | for label in domain_buffer.split(|c| *c == '.') {Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_unicode::{closure#0}>::{closure#4}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_ascii_from_cow::{closure#0}>::{closure#4}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_unicode::{closure#0}>::{closure#4}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_ascii::{closure#0}>::{closure#4} |
941 | | // Unwrap is OK, because there are supposed to be as many items in |
942 | | // `already_punycode` as there are labels. |
943 | 0 | let input_punycode = *already_punycode_iter.next().unwrap(); |
944 | 0 | if seen_label { |
945 | 0 | if flushed_prefix { |
946 | 0 | sink.write_char('.')?; |
947 | | } else { |
948 | 0 | debug_assert_eq!(domain_name[passthrough_up_to_extended], b'.'); |
949 | 0 | passthrough_up_to_extended += 1; |
950 | | } |
951 | 0 | } |
952 | 0 | seen_label = true; |
953 | | |
954 | 0 | if let AlreadyAsciiLabel::MixedCaseAscii(mixed_case) = input_punycode { |
955 | 0 | if let Some(first_upper_case) = |
956 | 0 | mixed_case.iter().position(|c| c.is_ascii_uppercase()) Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_unicode::{closure#0}>::{closure#5}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_ascii_from_cow::{closure#0}>::{closure#5}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_unicode::{closure#0}>::{closure#5}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_ascii::{closure#0}>::{closure#5} |
957 | | { |
958 | 0 | let (head, tail) = mixed_case.split_at(first_upper_case); |
959 | 0 | let slice_to_write = if flushed_prefix { |
960 | 0 | head |
961 | | } else { |
962 | 0 | flushed_prefix = true; |
963 | 0 | passthrough_up_to_extended += head.len(); |
964 | 0 | debug_assert_ne!(passthrough_up_to_extended, domain_name.len()); |
965 | 0 | &domain_name[..passthrough_up_to_extended] |
966 | | }; |
967 | | // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII. |
968 | 0 | sink.write_str(unsafe { |
969 | 0 | core::str::from_utf8_unchecked(slice_to_write) |
970 | 0 | })?; |
971 | 0 | for c in tail.iter() { |
972 | 0 | sink.write_char(char::from(c.to_ascii_lowercase()))?; |
973 | | } |
974 | 0 | } else if flushed_prefix { |
975 | | // SAFETY: `mixed_case` is known to be ASCII. |
976 | 0 | sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?; |
977 | 0 | } else { |
978 | 0 | passthrough_up_to_extended += mixed_case.len(); |
979 | 0 | } |
980 | 0 | continue; |
981 | 0 | } |
982 | | |
983 | 0 | if is_ascii(label) { |
984 | 0 | if !flushed_prefix { |
985 | 0 | flushed_prefix = true; |
986 | | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
987 | 0 | sink.write_str(unsafe { |
988 | 0 | core::str::from_utf8_unchecked( |
989 | 0 | &domain_name[..passthrough_up_to_extended], |
990 | 0 | ) |
991 | 0 | })?; |
992 | 0 | } |
993 | 0 | for c in label.iter().copied() { |
994 | 0 | sink.write_char(c)?; |
995 | | } |
996 | 0 | } else if let AlreadyAsciiLabel::MixedCasePunycode(mixed_case) = input_punycode |
997 | | { |
998 | 0 | if let Some(first_upper_case) = |
999 | 0 | mixed_case.iter().position(|c| c.is_ascii_uppercase()) Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_unicode::{closure#0}>::{closure#6}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_ascii_from_cow::{closure#0}>::{closure#6}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_unicode::{closure#0}>::{closure#6}Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_ascii::{closure#0}>::{closure#6} |
1000 | | { |
1001 | 0 | let (head, tail) = mixed_case.split_at(first_upper_case); |
1002 | 0 | let slice_to_write = if flushed_prefix { |
1003 | 0 | head |
1004 | | } else { |
1005 | 0 | flushed_prefix = true; |
1006 | 0 | passthrough_up_to_extended += head.len(); |
1007 | 0 | debug_assert_ne!(passthrough_up_to_extended, domain_name.len()); |
1008 | 0 | &domain_name[..passthrough_up_to_extended] |
1009 | | }; |
1010 | | // SAFETY: `mixed_case` and `domain_name` up to `passthrough_up_to_extended` are known to be ASCII. |
1011 | 0 | sink.write_str(unsafe { |
1012 | 0 | core::str::from_utf8_unchecked(slice_to_write) |
1013 | 0 | })?; |
1014 | 0 | for c in tail.iter() { |
1015 | 0 | sink.write_char(char::from(c.to_ascii_lowercase()))?; |
1016 | | } |
1017 | 0 | } else if flushed_prefix { |
1018 | | // SAFETY: `mixed_case` is known to be ASCII. |
1019 | 0 | sink.write_str(unsafe { core::str::from_utf8_unchecked(mixed_case) })?; |
1020 | 0 | } else { |
1021 | 0 | passthrough_up_to_extended += mixed_case.len(); |
1022 | 0 | } |
1023 | | } else { |
1024 | 0 | if !flushed_prefix { |
1025 | 0 | flushed_prefix = true; |
1026 | | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
1027 | 0 | sink.write_str(unsafe { |
1028 | 0 | core::str::from_utf8_unchecked( |
1029 | 0 | &domain_name[..passthrough_up_to_extended], |
1030 | 0 | ) |
1031 | 0 | })?; |
1032 | 0 | } |
1033 | 0 | write_punycode_label(label, sink)?; |
1034 | | } |
1035 | | } |
1036 | 0 | if !flushed_prefix { |
1037 | | // SAFETY: `domain_name` up to `passthrough_up_to_extended` is known to be ASCII. |
1038 | 0 | sink.write_str(unsafe { |
1039 | 0 | core::str::from_utf8_unchecked(&domain_name[..passthrough_up_to_extended]) |
1040 | 0 | })?; |
1041 | 0 | } |
1042 | 0 | } |
1043 | 0 | } |
1044 | 0 | Ok(ProcessingSuccess::WroteToSink) |
1045 | 0 | } Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_unicode::{closure#0}>Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::uts46::Uts46>::to_ascii_from_cow::{closure#0}>Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_unicode::{closure#0}>Unexecuted instantiation: <idna::uts46::Uts46>::process::<alloc::string::String, <idna::deprecated::Idna>::to_ascii::{closure#0}> |
1046 | | |
1047 | | /// The part of `process` that doesn't need to be generic over the sink. |
1048 | | #[inline(always)] |
1049 | 0 | fn process_inner<'a>( |
1050 | 0 | &self, |
1051 | 0 | domain_name: &'a [u8], |
1052 | 0 | ascii_deny_list: AsciiDenyList, |
1053 | 0 | hyphens: Hyphens, |
1054 | 0 | fail_fast: bool, |
1055 | 0 | domain_buffer: &mut SmallVec<[char; 253]>, |
1056 | 0 | already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; 8]>, |
1057 | 0 | ) -> (usize, bool, bool) { |
1058 | | // Sadly, this even faster-path ASCII tier is needed to avoid regressing |
1059 | | // performance. |
1060 | 0 | let mut iter = domain_name.iter(); |
1061 | 0 | let mut most_recent_label_start = iter.clone(); |
1062 | | loop { |
1063 | 0 | if let Some(&b) = iter.next() { |
1064 | 0 | if in_inclusive_range8(b, b'a', b'z') { |
1065 | 0 | continue; |
1066 | 0 | } |
1067 | 0 | if b == b'.' { |
1068 | 0 | most_recent_label_start = iter.clone(); |
1069 | 0 | continue; |
1070 | 0 | } |
1071 | 0 | return self.process_innermost( |
1072 | 0 | domain_name, |
1073 | 0 | ascii_deny_list, |
1074 | 0 | hyphens, |
1075 | 0 | fail_fast, |
1076 | 0 | domain_buffer, |
1077 | 0 | already_punycode, |
1078 | 0 | most_recent_label_start.as_slice(), |
1079 | | ); |
1080 | | } else { |
1081 | | // Success! The whole input passes through on the fastest path! |
1082 | 0 | return (domain_name.len(), false, false); |
1083 | | } |
1084 | | } |
1085 | 0 | } |
1086 | | |
1087 | | /// The part of `process` that doesn't need to be generic over the sink and |
1088 | | /// can avoid monomorphizing in the interest of code size. |
1089 | | /// Separating this into a different stack frame compared to `process_inner` |
1090 | | /// improves performance in the ICU4X case. |
1091 | | #[allow(clippy::too_many_arguments)] |
1092 | | #[inline(never)] |
1093 | 0 | fn process_innermost<'a>( |
1094 | 0 | &self, |
1095 | 0 | domain_name: &'a [u8], |
1096 | 0 | ascii_deny_list: AsciiDenyList, |
1097 | 0 | hyphens: Hyphens, |
1098 | 0 | fail_fast: bool, |
1099 | 0 | domain_buffer: &mut SmallVec<[char; 253]>, |
1100 | 0 | already_punycode: &mut SmallVec<[AlreadyAsciiLabel<'a>; 8]>, |
1101 | 0 | tail: &'a [u8], |
1102 | 0 | ) -> (usize, bool, bool) { |
1103 | 0 | let deny_list = ascii_deny_list.bits; |
1104 | 0 | let deny_list_deny_dot = deny_list | DOT_MASK; |
1105 | | |
1106 | 0 | let mut had_errors = false; |
1107 | | |
1108 | 0 | let mut passthrough_up_to = domain_name.len() - tail.len(); // Index into `domain_name` |
1109 | | // 253 ASCII characters is the max length for a valid domain name |
1110 | | // (excluding the root dot). |
1111 | | let mut current_label_start; // Index into `domain_buffer` |
1112 | 0 | let mut seen_label = false; |
1113 | 0 | let mut in_prefix = true; |
1114 | 0 | for label in tail.split(|b| *b == b'.') { |
1115 | | // We check for passthrough only for the prefix. That is, if we |
1116 | | // haven't moved on and started filling `domain_buffer`. Keeping |
1117 | | // this stuff in one loop where the first items keep being skipped |
1118 | | // once they have been skipped at least once instead of working |
1119 | | // this into a fancier loop structure in order to make sure that |
1120 | | // no item from the iterator is lost or processed twice. |
1121 | | // Furthermore, after the passthrough fails, restarting the |
1122 | | // normalization process after each pre-existing ASCII dot also |
1123 | | // provides an opportunity for the processing to get back onto |
1124 | | // an ASCII fast path that bypasses the normalizer for ASCII |
1125 | | // after a pre-existing ASCII dot (pre-existing in the sense |
1126 | | // of not coming from e.g. normalizing an ideographic dot). |
1127 | 0 | if in_prefix && is_passthrough_ascii_label(label) { |
1128 | 0 | if seen_label { |
1129 | 0 | debug_assert_eq!(domain_name[passthrough_up_to], b'.'); |
1130 | 0 | passthrough_up_to += 1; |
1131 | 0 | } |
1132 | 0 | seen_label = true; |
1133 | | |
1134 | 0 | passthrough_up_to += label.len(); |
1135 | 0 | continue; |
1136 | 0 | } |
1137 | 0 | if seen_label { |
1138 | 0 | if in_prefix { |
1139 | 0 | debug_assert_eq!(domain_name[passthrough_up_to], b'.'); |
1140 | 0 | passthrough_up_to += 1; |
1141 | 0 | } else { |
1142 | 0 | domain_buffer.push('.'); |
1143 | 0 | } |
1144 | 0 | } |
1145 | 0 | seen_label = true; |
1146 | 0 | in_prefix = false; |
1147 | 0 | current_label_start = domain_buffer.len(); |
1148 | 0 | if !label.is_empty() { |
1149 | 0 | let (ascii, non_ascii) = split_ascii_fast_path_prefix(label); |
1150 | 0 | let non_punycode_ascii_label = if non_ascii.is_empty() { |
1151 | 0 | if has_punycode_prefix(ascii) { |
1152 | 0 | if (ascii.last() != Some(&b'-')) |
1153 | 0 | && (ascii.len() - 4 <= PUNYCODE_DECODE_MAX_INPUT_LENGTH) |
1154 | | { |
1155 | 0 | if let Ok(decode) = |
1156 | 0 | Decoder::default().decode::<u8, InternalCaller>(&ascii[4..]) |
1157 | | { |
1158 | | // 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4 |
1159 | | // characters. |
1160 | 0 | let mut label_buffer = SmallVec::<[char; 59]>::new(); |
1161 | 0 | label_buffer.extend(decode); |
1162 | | |
1163 | 0 | if self.after_punycode_decode( |
1164 | 0 | domain_buffer, |
1165 | 0 | current_label_start, |
1166 | 0 | &label_buffer, |
1167 | 0 | deny_list_deny_dot, |
1168 | 0 | fail_fast, |
1169 | 0 | &mut had_errors, |
1170 | | ) { |
1171 | 0 | return (0, false, true); |
1172 | 0 | } |
1173 | | |
1174 | 0 | if self.check_label( |
1175 | 0 | hyphens, |
1176 | 0 | &mut domain_buffer[current_label_start..], |
1177 | 0 | fail_fast, |
1178 | 0 | &mut had_errors, |
1179 | | true, |
1180 | | true, |
1181 | | ) { |
1182 | 0 | return (0, false, true); |
1183 | 0 | } |
1184 | | } else { |
1185 | | // Punycode failed |
1186 | 0 | if fail_fast { |
1187 | 0 | return (0, false, true); |
1188 | 0 | } |
1189 | 0 | had_errors = true; |
1190 | 0 | domain_buffer.push('\u{FFFD}'); |
1191 | 0 | let mut iter = ascii.iter(); |
1192 | | // Discard the first character that we replaced. |
1193 | 0 | let _ = iter.next(); |
1194 | 0 | domain_buffer.extend(iter.map(|c| { |
1195 | | // Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does |
1196 | | // not matter. |
1197 | 0 | apply_ascii_deny_list_to_potentially_upper_case_ascii( |
1198 | 0 | *c, deny_list, |
1199 | | ) |
1200 | 0 | })); |
1201 | | }; |
1202 | | // If there were errors, we won't be trying to use this |
1203 | | // anyway later, so it's fine to put it here unconditionally. |
1204 | 0 | already_punycode.push(AlreadyAsciiLabel::MixedCasePunycode(label)); |
1205 | 0 | continue; |
1206 | 0 | } else if fail_fast { |
1207 | 0 | return (0, false, true); |
1208 | 0 | } |
1209 | | // Else fall through to the complex path and rediscover error |
1210 | | // there. |
1211 | 0 | false |
1212 | | } else { |
1213 | 0 | true |
1214 | | } |
1215 | | } else { |
1216 | 0 | false |
1217 | | }; |
1218 | 0 | for c in ascii.iter().map(|c| { |
1219 | | // Can't have dot here, so `deny_list` vs `deny_list_deny_dot` does |
1220 | | // not matter. |
1221 | 0 | apply_ascii_deny_list_to_potentially_upper_case_ascii(*c, deny_list) |
1222 | 0 | }) { |
1223 | 0 | if c == '\u{FFFD}' { |
1224 | 0 | if fail_fast { |
1225 | 0 | return (0, false, true); |
1226 | 0 | } |
1227 | 0 | had_errors = true; |
1228 | 0 | } |
1229 | 0 | domain_buffer.push(c); |
1230 | | } |
1231 | 0 | if non_punycode_ascii_label { |
1232 | 0 | if hyphens != Hyphens::Allow |
1233 | 0 | && check_hyphens( |
1234 | 0 | &mut domain_buffer[current_label_start..], |
1235 | 0 | hyphens == Hyphens::CheckFirstLast, |
1236 | 0 | fail_fast, |
1237 | 0 | &mut had_errors, |
1238 | | ) |
1239 | | { |
1240 | 0 | return (0, false, true); |
1241 | 0 | } |
1242 | 0 | already_punycode.push(if had_errors { |
1243 | 0 | AlreadyAsciiLabel::Other |
1244 | | } else { |
1245 | 0 | AlreadyAsciiLabel::MixedCaseAscii(label) |
1246 | | }); |
1247 | 0 | continue; |
1248 | 0 | } |
1249 | 0 | already_punycode.push(AlreadyAsciiLabel::Other); |
1250 | 0 | let mut first_needs_combining_mark_check = ascii.is_empty(); |
1251 | 0 | let mut needs_contextj_check = !non_ascii.is_empty(); |
1252 | 0 | let mut mapping = self |
1253 | 0 | .data |
1254 | 0 | .map_normalize(non_ascii.chars()) |
1255 | 0 | .map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list)); |
1256 | | loop { |
1257 | 0 | let n = mapping.next(); |
1258 | 0 | match n { |
1259 | | None | Some('.') => { |
1260 | 0 | if domain_buffer[current_label_start..] |
1261 | 0 | .starts_with(&['x', 'n', '-', '-']) |
1262 | | { |
1263 | 0 | let mut punycode_precondition_failed = false; |
1264 | 0 | for c in domain_buffer[current_label_start + 4..].iter_mut() { |
1265 | 0 | if !c.is_ascii() { |
1266 | 0 | if fail_fast { |
1267 | 0 | return (0, false, true); |
1268 | 0 | } |
1269 | 0 | had_errors = true; |
1270 | 0 | *c = '\u{FFFD}'; |
1271 | 0 | punycode_precondition_failed = true; |
1272 | 0 | } |
1273 | | } |
1274 | | |
1275 | 0 | if let Some(last) = domain_buffer.last_mut() { |
1276 | 0 | if *last == '-' { |
1277 | | // Either there's nothing after the "xn--" prefix |
1278 | | // and we got the last hyphen of "xn--", or there |
1279 | | // are no Punycode digits after the last delimiter |
1280 | | // which would result in Punycode decode outputting |
1281 | | // ASCII only. |
1282 | 0 | if fail_fast { |
1283 | 0 | return (0, false, true); |
1284 | 0 | } |
1285 | 0 | had_errors = true; |
1286 | 0 | *last = '\u{FFFD}'; |
1287 | 0 | punycode_precondition_failed = true; |
1288 | 0 | } |
1289 | | } else { |
1290 | 0 | unreachable!(); |
1291 | | } |
1292 | | |
1293 | | // Reject excessively long input |
1294 | | // https://github.com/whatwg/url/issues/824 |
1295 | | // https://unicode-org.atlassian.net/browse/ICU-13727 |
1296 | 0 | if domain_buffer.len() - current_label_start - 4 |
1297 | 0 | > PUNYCODE_DECODE_MAX_INPUT_LENGTH |
1298 | | { |
1299 | 0 | if fail_fast { |
1300 | 0 | return (0, false, true); |
1301 | 0 | } |
1302 | 0 | had_errors = true; |
1303 | 0 | domain_buffer[current_label_start |
1304 | 0 | + 4 |
1305 | 0 | + PUNYCODE_DECODE_MAX_INPUT_LENGTH] = '\u{FFFD}'; |
1306 | 0 | punycode_precondition_failed = true; |
1307 | 0 | } |
1308 | | |
1309 | 0 | if !punycode_precondition_failed { |
1310 | 0 | if let Ok(decode) = Decoder::default() |
1311 | 0 | .decode::<char, InternalCaller>( |
1312 | 0 | &domain_buffer[current_label_start + 4..], |
1313 | 0 | ) |
1314 | | { |
1315 | 0 | first_needs_combining_mark_check = true; |
1316 | 0 | needs_contextj_check = true; |
1317 | | // 63 ASCII characters is the max length for a valid DNS label and xn-- takes 4 |
1318 | | // characters. |
1319 | 0 | let mut label_buffer = SmallVec::<[char; 59]>::new(); |
1320 | 0 | label_buffer.extend(decode); |
1321 | | |
1322 | 0 | domain_buffer.truncate(current_label_start); |
1323 | 0 | if self.after_punycode_decode( |
1324 | 0 | domain_buffer, |
1325 | 0 | current_label_start, |
1326 | 0 | &label_buffer, |
1327 | 0 | deny_list_deny_dot, |
1328 | 0 | fail_fast, |
1329 | 0 | &mut had_errors, |
1330 | | ) { |
1331 | 0 | return (0, false, true); |
1332 | 0 | } |
1333 | | } else { |
1334 | | // Punycode failed |
1335 | 0 | if fail_fast { |
1336 | 0 | return (0, false, true); |
1337 | 0 | } |
1338 | 0 | had_errors = true; |
1339 | 0 | domain_buffer[current_label_start] = '\u{FFFD}'; |
1340 | 0 | needs_contextj_check = false; // ASCII label |
1341 | 0 | first_needs_combining_mark_check = false; |
1342 | | }; |
1343 | 0 | } else { |
1344 | 0 | first_needs_combining_mark_check = false; |
1345 | 0 | needs_contextj_check = false; // Non-ASCII already turned to U+FFFD. |
1346 | 0 | } |
1347 | 0 | } |
1348 | 0 | if self.check_label( |
1349 | 0 | hyphens, |
1350 | 0 | &mut domain_buffer[current_label_start..], |
1351 | 0 | fail_fast, |
1352 | 0 | &mut had_errors, |
1353 | 0 | first_needs_combining_mark_check, |
1354 | 0 | needs_contextj_check, |
1355 | | ) { |
1356 | 0 | return (0, false, true); |
1357 | 0 | } |
1358 | | |
1359 | 0 | if n.is_none() { |
1360 | 0 | break; |
1361 | 0 | } |
1362 | 0 | domain_buffer.push('.'); |
1363 | 0 | current_label_start = domain_buffer.len(); |
1364 | 0 | first_needs_combining_mark_check = true; |
1365 | 0 | needs_contextj_check = true; |
1366 | 0 | already_punycode.push(AlreadyAsciiLabel::Other); |
1367 | | } |
1368 | 0 | Some(c) => { |
1369 | 0 | if c == '\u{FFFD}' { |
1370 | 0 | if fail_fast { |
1371 | 0 | return (0, false, true); |
1372 | 0 | } |
1373 | 0 | had_errors = true; |
1374 | 0 | } |
1375 | 0 | domain_buffer.push(c); |
1376 | | } |
1377 | | } |
1378 | | } |
1379 | 0 | } else { |
1380 | 0 | // Empty label |
1381 | 0 | already_punycode.push(AlreadyAsciiLabel::MixedCaseAscii(label)); |
1382 | 0 | } |
1383 | | } |
1384 | | |
1385 | 0 | let is_bidi = self.is_bidi(domain_buffer); |
1386 | 0 | if is_bidi { |
1387 | 0 | for label in domain_buffer.split_mut(|c| *c == '.') { |
1388 | 0 | if let Some((first, tail)) = label.split_first_mut() { |
1389 | 0 | let first_bc = self.data.bidi_class(*first); |
1390 | 0 | if !FIRST_BC_MASK.intersects(first_bc.to_mask()) { |
1391 | | // Neither RTL label nor LTR label |
1392 | 0 | if fail_fast { |
1393 | 0 | return (0, false, true); |
1394 | 0 | } |
1395 | 0 | had_errors = true; |
1396 | 0 | *first = '\u{FFFD}'; |
1397 | 0 | continue; |
1398 | 0 | } |
1399 | 0 | let is_ltr = first_bc.is_ltr(); |
1400 | | // Trim NSM |
1401 | 0 | let mut middle = tail; |
1402 | | #[allow(clippy::while_let_loop)] |
1403 | | loop { |
1404 | 0 | if let Some((last, prior)) = middle.split_last_mut() { |
1405 | 0 | let last_bc = self.data.bidi_class(*last); |
1406 | 0 | if last_bc.is_nonspacing_mark() { |
1407 | 0 | middle = prior; |
1408 | 0 | continue; |
1409 | 0 | } |
1410 | 0 | let last_mask = if is_ltr { LAST_LTR_MASK } else { LAST_RTL_MASK }; |
1411 | 0 | if !last_mask.intersects(last_bc.to_mask()) { |
1412 | 0 | if fail_fast { |
1413 | 0 | return (0, false, true); |
1414 | 0 | } |
1415 | 0 | had_errors = true; |
1416 | 0 | *last = '\u{FFFD}'; |
1417 | 0 | } |
1418 | 0 | if is_ltr { |
1419 | 0 | for c in prior.iter_mut() { |
1420 | 0 | let bc = self.data.bidi_class(*c); |
1421 | 0 | if !MIDDLE_LTR_MASK.intersects(bc.to_mask()) { |
1422 | 0 | if fail_fast { |
1423 | 0 | return (0, false, true); |
1424 | 0 | } |
1425 | 0 | had_errors = true; |
1426 | 0 | *c = '\u{FFFD}'; |
1427 | 0 | } |
1428 | | } |
1429 | | } else { |
1430 | 0 | let mut numeral_state = RtlNumeralState::Undecided; |
1431 | 0 | for c in prior.iter_mut() { |
1432 | 0 | let bc = self.data.bidi_class(*c); |
1433 | 0 | if !MIDDLE_RTL_MASK.intersects(bc.to_mask()) { |
1434 | 0 | if fail_fast { |
1435 | 0 | return (0, false, true); |
1436 | 0 | } |
1437 | 0 | had_errors = true; |
1438 | 0 | *c = '\u{FFFD}'; |
1439 | | } else { |
1440 | 0 | match numeral_state { |
1441 | | RtlNumeralState::Undecided => { |
1442 | 0 | if bc.is_european_number() { |
1443 | 0 | numeral_state = RtlNumeralState::European; |
1444 | 0 | } else if bc.is_arabic_number() { |
1445 | 0 | numeral_state = RtlNumeralState::Arabic; |
1446 | 0 | } |
1447 | | } |
1448 | | RtlNumeralState::European => { |
1449 | 0 | if bc.is_arabic_number() { |
1450 | 0 | if fail_fast { |
1451 | 0 | return (0, false, true); |
1452 | 0 | } |
1453 | 0 | had_errors = true; |
1454 | 0 | *c = '\u{FFFD}'; |
1455 | 0 | } |
1456 | | } |
1457 | | RtlNumeralState::Arabic => { |
1458 | 0 | if bc.is_european_number() { |
1459 | 0 | if fail_fast { |
1460 | 0 | return (0, false, true); |
1461 | 0 | } |
1462 | 0 | had_errors = true; |
1463 | 0 | *c = '\u{FFFD}'; |
1464 | 0 | } |
1465 | | } |
1466 | | } |
1467 | | } |
1468 | | } |
1469 | 0 | if (numeral_state == RtlNumeralState::European |
1470 | 0 | && last_bc.is_arabic_number()) |
1471 | 0 | || (numeral_state == RtlNumeralState::Arabic |
1472 | 0 | && last_bc.is_european_number()) |
1473 | | { |
1474 | 0 | if fail_fast { |
1475 | 0 | return (0, false, true); |
1476 | 0 | } |
1477 | 0 | had_errors = true; |
1478 | 0 | *last = '\u{FFFD}'; |
1479 | 0 | } |
1480 | | } |
1481 | 0 | break; |
1482 | | } else { |
1483 | | // One-character label or label where |
1484 | | // everything after the first character |
1485 | | // is just non-spacing marks. |
1486 | 0 | break; |
1487 | | } |
1488 | | } |
1489 | 0 | } |
1490 | | } |
1491 | 0 | } |
1492 | | |
1493 | 0 | (passthrough_up_to, is_bidi, had_errors) |
1494 | 0 | } |
1495 | | |
1496 | | #[inline(never)] |
1497 | 0 | fn after_punycode_decode( |
1498 | 0 | &self, |
1499 | 0 | domain_buffer: &mut SmallVec<[char; 253]>, |
1500 | 0 | current_label_start: usize, |
1501 | 0 | label_buffer: &[char], |
1502 | 0 | deny_list_deny_dot: u128, |
1503 | 0 | fail_fast: bool, |
1504 | 0 | had_errors: &mut bool, |
1505 | 0 | ) -> bool { |
1506 | 0 | for c in self |
1507 | 0 | .data |
1508 | 0 | .normalize_validate(label_buffer.iter().copied()) |
1509 | 0 | .map(|c| apply_ascii_deny_list_to_lower_cased_unicode(c, deny_list_deny_dot)) |
1510 | | { |
1511 | 0 | if c == '\u{FFFD}' { |
1512 | 0 | if fail_fast { |
1513 | 0 | return true; |
1514 | 0 | } |
1515 | 0 | *had_errors = true; |
1516 | 0 | } |
1517 | 0 | domain_buffer.push(c); |
1518 | | } |
1519 | 0 | let normalized = &mut domain_buffer[current_label_start..]; |
1520 | | if let Err(()) = |
1521 | 0 | normalized |
1522 | 0 | .iter_mut() |
1523 | 0 | .zip(label_buffer.iter()) |
1524 | 0 | .try_for_each(|(norm_c, decoded_c)| { |
1525 | 0 | if *norm_c == *decoded_c { |
1526 | 0 | Ok(()) |
1527 | | } else { |
1528 | | // Mark the first difference |
1529 | 0 | *norm_c = '\u{FFFD}'; |
1530 | 0 | Err(()) |
1531 | | } |
1532 | 0 | }) |
1533 | | { |
1534 | 0 | if fail_fast { |
1535 | 0 | return true; |
1536 | 0 | } |
1537 | 0 | *had_errors = true; |
1538 | 0 | } |
1539 | 0 | false |
1540 | 0 | } |
1541 | | |
1542 | | #[inline(never)] |
1543 | 0 | fn check_label( |
1544 | 0 | &self, |
1545 | 0 | hyphens: Hyphens, |
1546 | 0 | mut_label: &mut [char], |
1547 | 0 | fail_fast: bool, |
1548 | 0 | had_errors: &mut bool, |
1549 | 0 | first_needs_combining_mark_check: bool, |
1550 | 0 | needs_contextj_check: bool, |
1551 | 0 | ) -> bool { |
1552 | 0 | if hyphens != Hyphens::Allow |
1553 | 0 | && check_hyphens( |
1554 | 0 | mut_label, |
1555 | 0 | hyphens == Hyphens::CheckFirstLast, |
1556 | 0 | fail_fast, |
1557 | 0 | had_errors, |
1558 | | ) |
1559 | | { |
1560 | 0 | return true; |
1561 | 0 | } |
1562 | 0 | if first_needs_combining_mark_check { |
1563 | 0 | if let Some(first) = mut_label.first_mut() { |
1564 | 0 | if self.data.is_mark(*first) { |
1565 | 0 | if fail_fast { |
1566 | 0 | return true; |
1567 | 0 | } |
1568 | 0 | *had_errors = true; |
1569 | 0 | *first = '\u{FFFD}'; |
1570 | 0 | } |
1571 | 0 | } |
1572 | 0 | } |
1573 | 0 | if needs_contextj_check { |
1574 | | // ContextJ |
1575 | 0 | for i in 0..mut_label.len() { |
1576 | 0 | let c = mut_label[i]; |
1577 | 0 | if !in_inclusive_range_char(c, '\u{200C}', '\u{200D}') { |
1578 | 0 | continue; |
1579 | 0 | } |
1580 | 0 | let (head, joiner_and_tail) = mut_label.split_at_mut(i); |
1581 | | |
1582 | 0 | if let Some((joiner, tail)) = joiner_and_tail.split_first_mut() { |
1583 | 0 | if let Some(previous) = head.last() { |
1584 | 0 | if self.data.is_virama(*previous) { |
1585 | 0 | continue; |
1586 | 0 | } |
1587 | | } else { |
1588 | | // No preceding character |
1589 | 0 | if fail_fast { |
1590 | 0 | return true; |
1591 | 0 | } |
1592 | 0 | *had_errors = true; |
1593 | 0 | *joiner = '\u{FFFD}'; |
1594 | 0 | continue; |
1595 | | } |
1596 | 0 | if c == '\u{200D}' { |
1597 | | // ZWJ only has the virama rule |
1598 | 0 | if fail_fast { |
1599 | 0 | return true; |
1600 | 0 | } |
1601 | 0 | *had_errors = true; |
1602 | 0 | *joiner = '\u{FFFD}'; |
1603 | 0 | continue; |
1604 | 0 | } |
1605 | 0 | debug_assert_eq!(c, '\u{200C}'); |
1606 | 0 | if !self.has_appropriately_joining_char( |
1607 | 0 | head.iter().rev().copied(), |
1608 | 0 | LEFT_OR_DUAL_JOINING_MASK, |
1609 | 0 | ) || !self.has_appropriately_joining_char( |
1610 | 0 | tail.iter().copied(), |
1611 | 0 | RIGHT_OR_DUAL_JOINING_MASK, |
1612 | 0 | ) { |
1613 | 0 | if fail_fast { |
1614 | 0 | return true; |
1615 | 0 | } |
1616 | 0 | *had_errors = true; |
1617 | 0 | *joiner = '\u{FFFD}'; |
1618 | 0 | } |
1619 | | } else { |
1620 | 0 | debug_assert!(false); |
1621 | | } |
1622 | | } |
1623 | 0 | } |
1624 | | |
1625 | 0 | if !is_ascii(mut_label) && mut_label.len() > PUNYCODE_ENCODE_MAX_INPUT_LENGTH { |
1626 | | // Limit quadratic behavior |
1627 | | // https://github.com/whatwg/url/issues/824 |
1628 | | // https://unicode-org.atlassian.net/browse/ICU-13727 |
1629 | 0 | if fail_fast { |
1630 | 0 | return true; |
1631 | 0 | } |
1632 | 0 | *had_errors = true; |
1633 | 0 | mut_label[PUNYCODE_ENCODE_MAX_INPUT_LENGTH] = '\u{FFFD}'; |
1634 | 0 | } |
1635 | 0 | false |
1636 | 0 | } |
1637 | | |
1638 | | #[inline(always)] |
1639 | 0 | fn has_appropriately_joining_char<I: Iterator<Item = char>>( |
1640 | 0 | &self, |
1641 | 0 | iter: I, |
1642 | 0 | required_mask: JoiningTypeMask, |
1643 | 0 | ) -> bool { |
1644 | 0 | for c in iter { |
1645 | 0 | let jt = self.data.joining_type(c); |
1646 | 0 | if jt.to_mask().intersects(required_mask) { |
1647 | 0 | return true; |
1648 | 0 | } |
1649 | 0 | if jt.is_transparent() { |
1650 | 0 | continue; |
1651 | 0 | } |
1652 | 0 | return false; |
1653 | | } |
1654 | 0 | false |
1655 | 0 | } Unexecuted instantiation: <idna::uts46::Uts46>::has_appropriately_joining_char::<core::iter::adapters::copied::Copied<core::iter::adapters::rev::Rev<core::slice::iter::Iter<char>>>> Unexecuted instantiation: <idna::uts46::Uts46>::has_appropriately_joining_char::<core::iter::adapters::copied::Copied<core::slice::iter::Iter<char>>> |
1656 | | |
1657 | | #[inline(always)] |
1658 | 0 | fn is_bidi(&self, buffer: &[char]) -> bool { |
1659 | 0 | for &c in buffer { |
1660 | 0 | if c < '\u{0590}' { |
1661 | | // Below Hebrew |
1662 | 0 | continue; |
1663 | 0 | } |
1664 | 0 | if in_inclusive_range_char(c, '\u{0900}', '\u{FB1C}') { |
1665 | 0 | debug_assert_ne!(c, '\u{200F}'); // disallowed |
1666 | 0 | continue; |
1667 | 0 | } |
1668 | 0 | if in_inclusive_range_char(c, '\u{1F000}', '\u{3FFFF}') { |
1669 | 0 | continue; |
1670 | 0 | } |
1671 | 0 | if in_inclusive_range_char(c, '\u{FF00}', '\u{107FF}') { |
1672 | 0 | continue; |
1673 | 0 | } |
1674 | 0 | if in_inclusive_range_char(c, '\u{11000}', '\u{1E7FF}') { |
1675 | 0 | continue; |
1676 | 0 | } |
1677 | 0 | if RTL_MASK.intersects(self.data.bidi_class(c).to_mask()) { |
1678 | 0 | return true; |
1679 | 0 | } |
1680 | | } |
1681 | 0 | false |
1682 | 0 | } |
1683 | | } |
1684 | | |
/// Checks `mut_label` for hyphens in forbidden positions, overwriting each
/// offending hyphen with U+FFFD.
///
/// A hyphen in the first or last position is always an error. Unless
/// `allow_third_fourth` is `true`, a label with hyphens in both the third and
/// fourth positions is an error too, and both are overwritten.
///
/// Returns `true` only when `fail_fast` is set and an error was found; in
/// that case the label is left unmodified at the failing position. Otherwise
/// errors are recorded by setting `*had_errors` and `false` is returned.
fn check_hyphens(
    mut_label: &mut [char],
    allow_third_fourth: bool,
    fail_fast: bool,
    had_errors: &mut bool,
) -> bool {
    if mut_label.is_empty() {
        // Nothing to inspect.
        return false;
    }

    // First position, then last. For a one-character label both indices
    // coincide; the second pass then sees the already replaced character.
    let last_index = mut_label.len() - 1;
    for edge in [0, last_index] {
        if mut_label[edge] == '-' {
            if fail_fast {
                return true;
            }
            *had_errors = true;
            mut_label[edge] = '\u{FFFD}';
        }
    }

    if allow_third_fourth {
        return false;
    }

    // `get(2..4)` is `None` for labels shorter than four characters.
    if matches!(mut_label.get(2..4), Some(['-', '-'])) {
        if fail_fast {
            return true;
        }
        *had_errors = true;
        mut_label[2..4].fill('\u{FFFD}');
    }
    false
}