/rust/registry/src/index.crates.io-1949cf8c6b5b557f/iri-string-0.7.9/src/parser/trusted.rs
Line | Count | Source |
1 | | //! Fast parsers for trusted (already validated) input. |
2 | | //! |
3 | | //! Using these parsers on input that has not been validated will silently produce wrong results. |
4 | | |
5 | | pub(crate) mod authority; |
6 | | |
7 | | use core::cmp::Ordering; |
8 | | use core::num::NonZeroUsize; |
9 | | |
10 | | use crate::components::{RiReferenceComponents, Splitter}; |
11 | | use crate::format::eq_str_display; |
12 | | use crate::normalize::{is_pct_case_normalized, NormalizedAsciiOnlyHost, NormalizednessCheckMode}; |
13 | | use crate::parser::str::{find_split2, find_split3, find_split4_hole, find_split_hole}; |
14 | | use crate::spec::Spec; |
15 | | use crate::types::RiReferenceStr; |
16 | | |
17 | | /// Eats a `scheme` and the colon following it, and returns the rest and the scheme. |
18 | | /// |
19 | | /// Returns `(rest, scheme)`, where `rest` is the part after the colon. |
20 | | /// |
21 | | /// This should be called at the head of an absolute IRI/URI; it panics if no `:` is found. |
22 | | #[must_use] |
23 | 0 | fn scheme_colon(i: &str) -> (&str, &str) { |
24 | 0 | let (scheme, rest) = |
25 | 0 | find_split_hole(i, b':').expect("[precondition] absolute IRIs must have `scheme` part"); |
26 | 0 | (rest, scheme) |
27 | 0 | } |
28 | | |
29 | | /// Eats a `scheme` and the colon following it if available, and returns `(rest, scheme)`. |
30 | | /// |
31 | | /// This should be called at the head of an `IRI-reference` or similar. Returns `(i, None)` with the input untouched unless the delimiter reported by the search for `:`, `/`, `?`, `#` is a colon. |
32 | | #[must_use] |
33 | 0 | fn scheme_colon_opt(i: &str) -> (&str, Option<&str>) { |
34 | 0 | match find_split4_hole(i, b':', b'/', b'?', b'#') { |
35 | 0 | Some((scheme, b':', rest)) => (rest, Some(scheme)), |
36 | 0 | _ => (i, None), |
37 | | } |
38 | 0 | } |
39 | | |
40 | | /// Eats a double slash and the following authority if available, and returns `(rest, authority)`. |
41 | | /// |
42 | | /// This should be called at the head of an `IRI-reference`, or on the rest returned by `scheme_colon`. If the input does not start with `//`, it is returned unchanged together with `None`. |
43 | | #[must_use] |
44 | 0 | fn slash_slash_authority_opt(i: &str) -> (&str, Option<&str>) { |
45 | 0 | let s = match i.strip_prefix("//") { |
46 | 0 | Some(rest) => rest, |
47 | 0 | None => return (i, None), |
48 | | }; |
49 | | // `i` might match `path-abempty` (which can start with `//`), but that is not |
50 | | // allowed as `relative-part`, so there is no need to care about the `path-abempty` rule here. |
51 | | // A slash, question mark, or hash character won't appear in `authority`; with none of them, everything after `//` is the authority. |
52 | 0 | match find_split3(s, b'/', b'?', b'#') { |
53 | 0 | Some((authority, rest)) => (rest, Some(authority)), |
54 | 0 | None => ("", Some(s)), |
55 | | } |
56 | 0 | } |
57 | | |
58 | | /// Eats a string until the query or fragment, and returns `(rest, before_query)`; `rest` is empty when neither `?` nor `#` is found (callers expect a non-empty `rest` to start with its `?` or `#`). |
59 | | #[must_use] |
60 | 0 | fn until_query(i: &str) -> (&str, &str) { |
61 | | // Neither `?` nor `#` appears before the query part of a valid IRI. |
62 | 0 | match find_split2(i, b'?', b'#') { |
63 | 0 | Some((before_query, rest)) => (rest, before_query), |
64 | 0 | None => ("", i), |
65 | | } |
66 | 0 | } |
67 | | |
68 | | /// Decomposes query and fragment, if available, returning `(query, fragment)` without their leading `?` / `#`. |
69 | | /// |
70 | | /// The string must start with `?` or `#`, or be empty; any other first byte is only caught by a debug assertion and is treated as `#`. |
71 | | #[must_use] |
72 | 0 | fn decompose_query_and_fragment(i: &str) -> (Option<&str>, Option<&str>) { |
73 | 0 | match i.as_bytes().first().copied() { |
74 | 0 | None => (None, None), |
75 | | Some(b'?') => { |
76 | 0 | let rest = &i[1..]; |
77 | 0 | match find_split_hole(rest, b'#') { |
78 | 0 | Some((query, fragment)) => (Some(query), Some(fragment)), |
79 | 0 | None => (Some(rest), None), |
80 | | } |
81 | | } |
82 | 0 | Some(c) => { |
83 | 0 | debug_assert_eq!(c, b'#'); |
84 | 0 | (None, Some(&i[1..])) |
85 | | } |
86 | | } |
87 | 0 | } |
88 | | |
89 | | /// Decomposes the given valid `IRI-reference` into the reference itself plus a `Splitter` holding its component boundaries. |
90 | | #[must_use] |
91 | 0 | pub(crate) fn decompose_iri_reference<S: Spec>( |
92 | 0 | i: &RiReferenceStr<S>, |
93 | 0 | ) -> RiReferenceComponents<'_, S> { |
94 | | /// Inner function working on a plain `&str`, to avoid unnecessary monomorphizations on `S`. |
95 | 0 | fn decompose(i: &str) -> Splitter { |
96 | 0 | let len = i.len(); |
97 | | |
98 | 0 | let (i, scheme_end) = { |
99 | 0 | let (i, scheme) = scheme_colon_opt(i); |
100 | 0 | let end = scheme.and_then(|s| NonZeroUsize::new(s.len())); |
101 | 0 | (i, end) |
102 | | }; |
103 | 0 | let (i, authority_end) = { |
104 | | // `len - i.len()` is the number of bytes consumed so far; 2: "//".len() |
105 | 0 | let start = len - i.len() + 2; |
106 | | // `authority` does not contain the two slashes of `://`. |
107 | 0 | let (i, authority) = slash_slash_authority_opt(i); |
108 | 0 | let end = authority.and_then(|s| NonZeroUsize::new(start + s.len())); |
109 | 0 | (i, end) |
110 | | }; |
111 | 0 | let (i, _path) = until_query(i); |
112 | | |
113 | 0 | let (query_start, fragment_start) = { |
114 | | // Offset just after the first `?` or `#`. This could theoretically be zero if `len` is `usize::MAX` and |
115 | | // `i` has neither a query nor a fragment. However, this is |
116 | | // practically impossible. |
117 | 0 | let after_first_prefix = NonZeroUsize::new((len - i.len()).wrapping_add(1)); |
118 | | |
119 | 0 | let (query, fragment) = decompose_query_and_fragment(i); |
120 | 0 | match (query.is_some(), fragment) { |
121 | 0 | (true, Some(fragment)) => { |
122 | 0 | (after_first_prefix, NonZeroUsize::new(len - fragment.len())) |
123 | | } |
124 | 0 | (true, None) => (after_first_prefix, None), |
125 | 0 | (false, Some(_fragment)) => (None, after_first_prefix), |
126 | 0 | (false, None) => (None, None), |
127 | | } |
128 | | }; |
129 | | |
130 | 0 | Splitter::new(scheme_end, authority_end, query_start, fragment_start) |
131 | 0 | } |
132 | | |
133 | 0 | RiReferenceComponents { |
134 | 0 | iri: i, |
135 | 0 | splitter: decompose(i.as_str()), |
136 | 0 | } |
137 | 0 | } Unexecuted instantiation: iri_string::parser::trusted::decompose_iri_reference::<iri_string::spec::UriSpec> Unexecuted instantiation: iri_string::parser::trusted::decompose_iri_reference::<_> |
138 | | |
139 | | /// Extracts the `scheme` part from an IRI reference, or returns `None` if it has no scheme. |
140 | | /// |
141 | | /// # Precondition |
142 | | /// |
143 | | /// The given string must be a valid IRI reference; this is not checked. |
144 | | #[inline] |
145 | | #[must_use] |
146 | 0 | pub(crate) fn extract_scheme(i: &str) -> Option<&str> { |
147 | 0 | scheme_colon_opt(i).1 |
148 | 0 | } |
149 | | |
150 | | /// Extracts the `scheme` part from an absolute IRI. |
151 | | /// |
152 | | /// # Precondition |
153 | | /// |
154 | | /// The given string must be a valid absolute IRI; this panics if the string contains no `:`. |
155 | | #[inline] |
156 | | #[must_use] |
157 | 0 | pub(crate) fn extract_scheme_absolute(i: &str) -> &str { |
158 | 0 | scheme_colon(i).1 |
159 | 0 | } |
160 | | |
161 | | /// Extracts the `authority` part from an IRI reference, or returns `None` if no `//` follows the optional scheme. |
162 | | /// |
163 | | /// # Precondition |
164 | | /// |
165 | | /// The given string must be a valid IRI reference; this is not checked. |
166 | | #[inline] |
167 | | #[must_use] |
168 | 0 | pub(crate) fn extract_authority(i: &str) -> Option<&str> { |
169 | 0 | let (i, _scheme) = scheme_colon_opt(i); |
170 | 0 | slash_slash_authority_opt(i).1 |
171 | 0 | } |
172 | | |
173 | | /// Extracts the `authority` part from an absolute IRI, or returns `None` if no `//` follows the scheme. |
174 | | /// |
175 | | /// # Precondition |
176 | | /// |
177 | | /// The given string must be a valid absolute IRI; this panics if the string contains no `:`. |
178 | | #[inline] |
179 | | #[must_use] |
180 | 0 | pub(crate) fn extract_authority_absolute(i: &str) -> Option<&str> { |
181 | 0 | let (i, _scheme) = scheme_colon(i); |
182 | 0 | slash_slash_authority_opt(i).1 |
183 | 0 | } |
184 | | |
185 | | /// Extracts the `authority` part from a relative IRI, or returns `None` if the string does not start with `//`. |
186 | | /// |
187 | | /// # Precondition |
188 | | /// |
189 | | /// The given string must be a valid relative IRI; no scheme is skipped and nothing is checked. |
190 | | #[inline] |
191 | | #[must_use] |
192 | 0 | pub(crate) fn extract_authority_relative(i: &str) -> Option<&str> { |
193 | 0 | slash_slash_authority_opt(i).1 |
194 | 0 | } |
195 | | |
196 | | /// Extracts the `path` part from an IRI reference: what follows the optional scheme and authority, up to the query or fragment. |
197 | | /// |
198 | | /// # Precondition |
199 | | /// |
200 | | /// The given string must be a valid IRI reference; this is not checked. |
201 | | #[inline] |
202 | | #[must_use] |
203 | 0 | pub(crate) fn extract_path(i: &str) -> &str { |
204 | 0 | let (i, _scheme) = scheme_colon_opt(i); |
205 | 0 | let (i, _authority) = slash_slash_authority_opt(i); |
206 | 0 | until_query(i).1 |
207 | 0 | } |
208 | | |
209 | | /// Extracts the `path` part from an absolute IRI: what follows the scheme and optional authority, up to the query or fragment. |
210 | | /// |
211 | | /// # Precondition |
212 | | /// |
213 | | /// The given string must be a valid absolute IRI; this panics if the string contains no `:`. |
214 | | #[inline] |
215 | | #[must_use] |
216 | 0 | pub(crate) fn extract_path_absolute(i: &str) -> &str { |
217 | 0 | let (i, _scheme) = scheme_colon(i); |
218 | 0 | let (i, _authority) = slash_slash_authority_opt(i); |
219 | 0 | until_query(i).1 |
220 | 0 | } |
221 | | |
222 | | /// Extracts the `path` part from a relative IRI: what follows the optional authority, up to the query or fragment. |
223 | | /// |
224 | | /// # Precondition |
225 | | /// |
226 | | /// The given string must be a valid relative IRI; no scheme is skipped and nothing is checked. |
227 | | #[inline] |
228 | | #[must_use] |
229 | 0 | pub(crate) fn extract_path_relative(i: &str) -> &str { |
230 | 0 | let (i, _authority) = slash_slash_authority_opt(i); |
231 | 0 | until_query(i).1 |
232 | 0 | } |
233 | | |
234 | | /// Extracts the `query` part (without its leading `?`) from an IRI reference, or returns `None` if there is no query. |
235 | | /// |
236 | | /// # Precondition |
237 | | /// |
238 | | /// The given string must be a valid IRI reference; this is not checked. |
239 | | #[inline] |
240 | | #[must_use] |
241 | 0 | pub(crate) fn extract_query(i: &str) -> Option<&str> { |
242 | 0 | let (i, _before_query) = until_query(i); |
243 | 0 | decompose_query_and_fragment(i).0 |
244 | 0 | } |
245 | | |
246 | | /// Extracts the `query` part (without its leading `?`) from an `absolute-IRI` string, or returns `None` if there is no query. |
247 | | /// |
248 | | /// # Precondition |
249 | | /// |
250 | | /// The given string must be a valid `absolute-IRI` string, i.e. one without a fragment; this is only checked by a debug assertion. |
251 | | #[must_use] |
252 | 0 | pub(crate) fn extract_query_absolute_iri(i: &str) -> Option<&str> { |
253 | 0 | let (i, _before_query) = until_query(i); |
254 | 0 | if i.is_empty() { |
255 | 0 | None |
256 | | } else { |
257 | 0 | debug_assert_eq!( |
258 | 0 | i.as_bytes().first(), |
259 | | Some(&b'?'), |
260 | 0 | "`absolute-IRI` string must not have `fragment part" |
261 | | ); |
262 | 0 | Some(&i[1..]) |
263 | | } |
264 | 0 | } |
265 | | |
266 | | /// Splits an IRI string into the prefix and the fragment part, returning `(prefix, fragment)`. |
267 | | /// |
268 | | /// The leading `#` character is dropped if the fragment part exists; otherwise the whole input is returned with `None`. |
269 | | /// |
270 | | /// # Precondition |
271 | | /// |
272 | | /// The given string must be a valid IRI reference; this is not checked. |
273 | | #[inline] |
274 | | #[must_use] |
275 | 0 | pub(crate) fn split_fragment(iri: &str) -> (&str, Option<&str>) { |
276 | | // It is completely OK to look for the first `#` character in a valid IRI to get the fragment part, |
277 | | // because the spec says that there are no `#` characters before the fragment part. |
278 | | // |
279 | | // > ``` |
280 | | // > scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) |
281 | | // > ``` |
282 | | // > |
283 | | // > --- [RFC 3986, section 3.1. Scheme](https://tools.ietf.org/html/rfc3986#section-3.1) |
284 | | // |
285 | | // > The authority component is preceded by a double slash ("//") and is terminated by the |
286 | | // > next slash ("/"), question mark ("?"), or number sign ("#") character, or by the end |
287 | | // > of the URI. |
288 | | // > |
289 | | // > --- [RFC 3986, section 3.2. Authority](https://tools.ietf.org/html/rfc3986#section-3.2) |
290 | | // |
291 | | // > The path is terminated by the first question mark ("?") or number sign ("#") |
292 | | // > character, or by the end of the URI. |
293 | | // > |
294 | | // > --- [RFC 3986, section 3.3. Path](https://tools.ietf.org/html/rfc3986#section-3.3) |
295 | | // |
296 | | // > The query component is indicated by the first question mark ("?") character and |
297 | | // > terminated by a number sign ("#") character or by the end of the URI. |
298 | | // > |
299 | | // > --- [RFC 3986, section 3.4. Query](https://tools.ietf.org/html/rfc3986#section-3.4) |
300 | 0 | match find_split_hole(iri, b'#') { |
301 | 0 | Some((prefix, fragment)) => (prefix, Some(fragment)), |
302 | 0 | None => (iri, None), |
303 | | } |
304 | 0 | } |
305 | | |
306 | | /// Returns the fragment part of the given IRI, or `None` if it has no fragment. |
307 | | /// |
308 | | /// The leading `#` character of the fragment is dropped. |
309 | | #[inline] |
310 | | #[must_use] |
311 | 0 | pub(crate) fn extract_fragment(iri: &str) -> Option<&str> { |
312 | 0 | split_fragment(iri).1 |
313 | 0 | } |
314 | | |
315 | | /// Returns `true` if the string is normalized. |
316 | | /// |
317 | | /// If this function returns `true`, normalization input and output will be identical. |
318 | | /// |
319 | | /// In this function, "normalized" means that none of the normalizations below |
320 | | /// would change the input: |
321 | | /// |
322 | | /// * syntax-based normalization, |
323 | | /// * case normalization, |
324 | | /// * percent-encoding normalization, and |
325 | | /// * path segment normalization. |
326 | | /// |
327 | | /// Note that scheme-based normalization is not considered. The input must have a scheme: this panics if the string contains no `:`. |
328 | | #[must_use] |
329 | 0 | pub(crate) fn is_normalized<S: Spec>(i: &str, mode: NormalizednessCheckMode) -> bool { |
330 | 0 | let (i, scheme) = scheme_colon(i); |
331 | 0 | let (after_authority, authority) = slash_slash_authority_opt(i); |
332 | 0 | let (_after_path, path) = until_query(after_authority); |
333 | | |
334 | | // Syntax-based normalization: uppercase chars in `scheme` should be |
335 | | // converted to lowercase. |
336 | 0 | if scheme.bytes().any(|b| b.is_ascii_uppercase()) { |
337 | 0 | return false; |
338 | 0 | } |
339 | | |
340 | | // Case normalization: ASCII alphabets in US-ASCII only `host` should be |
341 | | // normalized to lowercase. |
342 | | // Case normalization: ASCII alphabets in percent-encoding triplet should be |
343 | | // normalized to uppercase. |
344 | | // Percent-encoding normalization: unreserved characters should be decoded |
345 | | // in `userinfo`, `host`, `path`, `query`, and `fragments`. |
346 | | // Path segment normalization: the path should not have dot segments (`.` |
347 | | // and/or `..`). |
348 | | // |
349 | | // Note that `authority` can have percent-encoded `userinfo`. |
350 | 0 | if let Some(authority) = authority { |
351 | 0 | let authority_components = authority::decompose_authority(authority); |
352 | | |
353 | | // Check `host`. |
354 | 0 | let host = authority_components.host(); |
355 | 0 | let host_is_normalized = if is_ascii_only_host(host) { |
356 | 0 | eq_str_display(host, &NormalizedAsciiOnlyHost::new(host)) |
357 | | } else { |
358 | | // If the host is not ASCII-only, conversion to lowercase is not performed. |
359 | 0 | is_pct_case_normalized::<S>(host) |
360 | | }; |
361 | 0 | if !host_is_normalized { |
362 | 0 | return false; |
363 | 0 | } |
364 | | |
365 | | // Check percent encodings in `userinfo`. |
366 | 0 | if let Some(userinfo) = authority_components.userinfo() { |
367 | 0 | if !is_pct_case_normalized::<S>(userinfo) { |
368 | 0 | return false; |
369 | 0 | } |
370 | 0 | } |
371 | 0 | } |
372 | | |
373 | | // Check `path`. |
374 | | // |
375 | | // Syntax-based normalization: Dot segments might be removed. |
376 | | // Note that we don't have to care about `%2e` and `%2E` since `.` is unreserved |
377 | | // and they will be decoded if not normalized. |
378 | | // Also note that WHATWG serialization will use `/.//` as a path prefix if |
379 | | // the path is absolute and won't modify the path if the path is relative. |
380 | | // |
381 | | // Percent-encoding normalization: unreserved characters should be decoded |
382 | | // in `path`, `query`, and `fragments`. |
383 | 0 | let path_span_no_dot_segments = if authority.is_some() { |
384 | 0 | Some(path) |
385 | | } else { |
386 | 0 | match mode { |
387 | 0 | NormalizednessCheckMode::Default => Some(path.strip_prefix("/.//").unwrap_or(path)), |
388 | 0 | NormalizednessCheckMode::Rfc3986 => Some(path), |
389 | | NormalizednessCheckMode::PreserveAuthoritylessRelativePath => { |
390 | 0 | if path.starts_with('/') { |
391 | | // Absolute. A leading `/.//` prefix is exempt from the dot-segment check. |
392 | 0 | Some(path.strip_prefix("/.//").unwrap_or(path)) |
393 | | } else { |
394 | | // Relative. Treat the path as "opaque". No span to check. |
395 | 0 | None |
396 | | } |
397 | | } |
398 | | } |
399 | | }; |
400 | 0 | if let Some(path_span_no_dot_segments) = path_span_no_dot_segments { |
401 | 0 | if path_span_no_dot_segments |
402 | 0 | .split('/') |
403 | 0 | .any(|segment| matches!(segment, "." | "..")) |
404 | | { |
405 | 0 | return false; |
406 | 0 | } |
407 | 0 | } |
408 | 0 | is_pct_case_normalized::<S>(after_authority) |
409 | 0 | } |
410 | | |
411 | | /// Decodes two hexdigits into a byte; `upper` supplies the high four bits and `lower` the low four bits. |
412 | | /// |
413 | | /// # Preconditions |
414 | | /// |
415 | | /// The parameters `upper` and `lower` should each be an ASCII hexadecimal digit (`0-9`, `A-F`, or `a-f`); no validation is performed. |
416 | | #[must_use] |
417 | 0 | pub(super) fn hexdigits_to_byte([upper, lower]: [u8; 2]) -> u8 { |
418 | 0 | let i_upper = match (upper & 0xf0).cmp(&0x40) { |
419 | 0 | Ordering::Less => upper - b'0', |
420 | 0 | Ordering::Equal => upper - (b'A' - 10), |
421 | 0 | Ordering::Greater => upper - (b'a' - 10), |
422 | | }; |
423 | 0 | let i_lower = match (lower & 0xf0).cmp(&0x40) { |
424 | 0 | Ordering::Less => lower - b'0', |
425 | 0 | Ordering::Equal => lower - (b'A' - 10), |
426 | 0 | Ordering::Greater => lower - (b'a' - 10), |
427 | | }; |
428 | 0 | (i_upper << 4) + i_lower |
429 | 0 | } |
430 | | |
431 | | /// Converts the first two hexdigit bytes in the buffer into a byte, returning `(byte, rest)` where `rest` is the string after those two bytes. |
432 | | /// |
433 | | /// # Panics |
434 | | /// |
435 | | /// Panics if the string has fewer than two bytes; the two bytes are assumed (not checked) to be hexdigits. |
436 | | #[must_use] |
437 | 0 | pub(crate) fn take_xdigits2(s: &str) -> (u8, &str) { |
438 | 0 | let mut bytes = s.bytes(); |
439 | 0 | let upper_xdigit = bytes |
440 | 0 | .next() |
441 | 0 | .expect("[validity] at least two bytes should follow the `%` in a valid IRI reference"); |
442 | 0 | let lower_xdigit = bytes |
443 | 0 | .next() |
444 | 0 | .expect("[validity] at least two bytes should follow the `%` in a valid IRI reference"); |
445 | 0 | let v = hexdigits_to_byte([upper_xdigit, lower_xdigit]); |
446 | 0 | (v, &s[2..]) |
447 | 0 | } |
448 | | |
449 | | /// Returns true if the given `host`/`ihost` string consists of only US-ASCII characters, counting the bytes denoted by percent-encoded triplets as well. |
450 | | /// |
451 | | /// # Precondition |
452 | | /// |
453 | | /// The given string should be a valid `host` or `host ":" port` string; in particular every `%` must be followed by two hexdigits. |
454 | | #[must_use] |
455 | 0 | pub(crate) fn is_ascii_only_host(mut host: &str) -> bool { |
456 | 0 | while let Some((i, c)) = host |
457 | 0 | .char_indices() |
458 | 0 | .find(|(_i, c)| !c.is_ascii() || *c == '%') |
459 | | { |
460 | 0 | if c != '%' { |
461 | | // Non-ASCII character found. |
462 | 0 | debug_assert!(!c.is_ascii()); |
463 | 0 | return false; |
464 | 0 | } |
465 | | // Percent-encoded character found: decode it and continue scanning after the triplet. |
466 | 0 | let after_pct = &host[(i + 1)..]; |
467 | 0 | let (byte, rest) = take_xdigits2(after_pct); |
468 | 0 | if !byte.is_ascii() { |
469 | 0 | return false; |
470 | 0 | } |
471 | 0 | host = rest; |
472 | | } |
473 | | |
474 | | // Neither non-ASCII characters nor percent-encoded non-ASCII bytes found. |
475 | 0 | true |
476 | 0 | } |