/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/util/interpolate.rs
Line | Count | Source (jump to first uncovered line) |
1 | | /*! |
2 | | Provides routines for interpolating capture group references. |
3 | | |
4 | | That is, if a replacement string contains references like `$foo` or `${foo1}`, |
5 | | then they are replaced with the corresponding capture values for the groups |
6 | | named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}` |
7 | | is supported as well, with `1` corresponding to a capture group index and not |
8 | | a name. |
9 | | |
10 | | This module provides the free functions [`string`] and [`bytes`], which |
11 | | interpolate Rust Unicode strings and byte strings, respectively. |
12 | | |
13 | | # Format |
14 | | |
15 | | These routines support two different kinds of capture references: unbraced and |
16 | | braced. |
17 | | |
18 | | For the unbraced format, the format supported is `$ref` where `ref` can be |
19 | | any sequence of characters in the class `[0-9A-Za-z_]`. `ref` is always the |
20 | | longest possible parse. So for example, `$1a` corresponds to the capture group |
21 | | named `1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, |
22 | | then it is treated as a capture group index itself and not a name. |
23 | | |
24 | | For the braced format, the format supported is `${ref}` where `ref` can be any |
25 | | sequence of bytes except for `}`. If no closing brace occurs, then it is not |
26 | | considered a capture reference. As with the unbraced format, if `ref` matches |
27 | | `^[0-9]+$`, then it is treated as a capture group index and not a name. |
28 | | |
29 | | The braced format is useful for exerting precise control over the name of the |
30 | | capture reference. For example, `${1}a` corresponds to the capture group |
31 | | reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above) |
32 | | corresponds to the capture group reference `1a`. The braced format is also |
33 | | useful for expressing capture group names that use characters not supported by |
34 | | the unbraced format. For example, `${foo[bar].baz}` refers to the capture group |
35 | | named `foo[bar].baz`. |
36 | | |
37 | | If a capture group reference is found and it does not refer to a valid capture |
38 | | group, then it will be replaced with the empty string. |
39 | | |
40 | | To write a literal `$`, use `$$`. |
41 | | |
42 | | To be clear, and as exhibited via the type signatures in the routines in this |
43 | | module, it is impossible for a replacement string to be invalid. A replacement |
44 | | string may not have the intended semantics, but the interpolation procedure |
45 | | itself can never fail. |
46 | | */ |
47 | | |
48 | | use alloc::{string::String, vec::Vec}; |
49 | | |
50 | | use crate::util::memchr::memchr; |
51 | | |
52 | | /// Accepts a replacement string and interpolates capture references with their |
53 | | /// corresponding values. |
54 | | /// |
55 | | /// `append` should be a function that appends the string value of a capture |
56 | | /// group at a particular index to the string given. If the capture group |
57 | | /// index is invalid, then nothing should be appended. |
58 | | /// |
59 | | /// `name_to_index` should be a function that maps a capture group name to a |
60 | | /// capture group index. If the given name doesn't exist, then `None` should |
61 | | /// be returned. |
62 | | /// |
63 | | /// Finally, `dst` is where the final interpolated contents should be written. |
64 | | /// If `replacement` contains no capture group references, then `dst` will be |
65 | | /// equivalent to `replacement`. |
66 | | /// |
67 | | /// See the [module documentation](self) for details about the format |
68 | | /// supported. |
69 | | /// |
70 | | /// # Example |
71 | | /// |
72 | | /// ``` |
73 | | /// use regex_automata::util::interpolate; |
74 | | /// |
75 | | /// let mut dst = String::new(); |
76 | | /// interpolate::string( |
77 | | /// "foo $bar baz", |
78 | | /// |index, dst| { |
79 | | /// if index == 0 { |
80 | | /// dst.push_str("BAR"); |
81 | | /// } |
82 | | /// }, |
83 | | /// |name| { |
84 | | /// if name == "bar" { |
85 | | /// Some(0) |
86 | | /// } else { |
87 | | /// None |
88 | | /// } |
89 | | /// }, |
90 | | /// &mut dst, |
91 | | /// ); |
92 | | /// assert_eq!("foo BAR baz", dst); |
93 | | /// ``` |
94 | 0 | pub fn string( |
95 | 0 | mut replacement: &str, |
96 | 0 | mut append: impl FnMut(usize, &mut String), |
97 | 0 | mut name_to_index: impl FnMut(&str) -> Option<usize>, |
98 | 0 | dst: &mut String, |
99 | 0 | ) { |
100 | 0 | while !replacement.is_empty() { |
101 | 0 | match memchr(b'$', replacement.as_bytes()) { |
102 | 0 | None => break, |
103 | 0 | Some(i) => { |
104 | 0 | dst.push_str(&replacement[..i]); |
105 | 0 | replacement = &replacement[i..]; |
106 | 0 | } |
107 | 0 | } |
108 | 0 | // Handle escaping of '$'. |
109 | 0 | if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') { |
110 | 0 | dst.push_str("$"); |
111 | 0 | replacement = &replacement[2..]; |
112 | 0 | continue; |
113 | 0 | } |
114 | 0 | debug_assert!(!replacement.is_empty()); |
115 | 0 | let cap_ref = match find_cap_ref(replacement.as_bytes()) { |
116 | 0 | Some(cap_ref) => cap_ref, |
117 | | None => { |
118 | 0 | dst.push_str("$"); |
119 | 0 | replacement = &replacement[1..]; |
120 | 0 | continue; |
121 | | } |
122 | | }; |
123 | 0 | replacement = &replacement[cap_ref.end..]; |
124 | 0 | match cap_ref.cap { |
125 | 0 | Ref::Number(i) => append(i, dst), |
126 | 0 | Ref::Named(name) => { |
127 | 0 | if let Some(i) = name_to_index(name) { |
128 | 0 | append(i, dst); |
129 | 0 | } |
130 | | } |
131 | | } |
132 | | } |
133 | 0 | dst.push_str(replacement); |
134 | 0 | } |
135 | | |
136 | | /// Accepts a replacement byte string and interpolates capture references with |
137 | | /// their corresponding values. |
138 | | /// |
139 | | /// `append` should be a function that appends the byte string value of a |
140 | | /// capture group at a particular index to the byte string given. If the |
141 | | /// capture group index is invalid, then nothing should be appended. |
142 | | /// |
143 | | /// `name_to_index` should be a function that maps a capture group name to a |
144 | | /// capture group index. If the given name doesn't exist, then `None` should |
145 | | /// be returned. |
146 | | /// |
147 | | /// Finally, `dst` is where the final interpolated contents should be written. |
148 | | /// If `replacement` contains no capture group references, then `dst` will be |
149 | | /// equivalent to `replacement`. |
150 | | /// |
151 | | /// See the [module documentation](self) for details about the format |
152 | | /// supported. |
153 | | /// |
154 | | /// # Example |
155 | | /// |
156 | | /// ``` |
157 | | /// use regex_automata::util::interpolate; |
158 | | /// |
159 | | /// let mut dst = vec![]; |
160 | | /// interpolate::bytes( |
161 | | /// b"foo $bar baz", |
162 | | /// |index, dst| { |
163 | | /// if index == 0 { |
164 | | /// dst.extend_from_slice(b"BAR"); |
165 | | /// } |
166 | | /// }, |
167 | | /// |name| { |
168 | | /// if name == "bar" { |
169 | | /// Some(0) |
170 | | /// } else { |
171 | | /// None |
172 | | /// } |
173 | | /// }, |
174 | | /// &mut dst, |
175 | | /// ); |
176 | | /// assert_eq!(&b"foo BAR baz"[..], dst); |
177 | | /// ``` |
178 | 0 | pub fn bytes( |
179 | 0 | mut replacement: &[u8], |
180 | 0 | mut append: impl FnMut(usize, &mut Vec<u8>), |
181 | 0 | mut name_to_index: impl FnMut(&str) -> Option<usize>, |
182 | 0 | dst: &mut Vec<u8>, |
183 | 0 | ) { |
184 | 0 | while !replacement.is_empty() { |
185 | 0 | match memchr(b'$', replacement) { |
186 | 0 | None => break, |
187 | 0 | Some(i) => { |
188 | 0 | dst.extend_from_slice(&replacement[..i]); |
189 | 0 | replacement = &replacement[i..]; |
190 | 0 | } |
191 | 0 | } |
192 | 0 | // Handle escaping of '$'. |
193 | 0 | if replacement.get(1).map_or(false, |&b| b == b'$') { |
194 | 0 | dst.push(b'$'); |
195 | 0 | replacement = &replacement[2..]; |
196 | 0 | continue; |
197 | 0 | } |
198 | 0 | debug_assert!(!replacement.is_empty()); |
199 | 0 | let cap_ref = match find_cap_ref(replacement) { |
200 | 0 | Some(cap_ref) => cap_ref, |
201 | | None => { |
202 | 0 | dst.push(b'$'); |
203 | 0 | replacement = &replacement[1..]; |
204 | 0 | continue; |
205 | | } |
206 | | }; |
207 | 0 | replacement = &replacement[cap_ref.end..]; |
208 | 0 | match cap_ref.cap { |
209 | 0 | Ref::Number(i) => append(i, dst), |
210 | 0 | Ref::Named(name) => { |
211 | 0 | if let Some(i) = name_to_index(name) { |
212 | 0 | append(i, dst); |
213 | 0 | } |
214 | | } |
215 | | } |
216 | | } |
217 | 0 | dst.extend_from_slice(replacement); |
218 | 0 | } |
219 | | |
/// `CaptureRef` represents a reference to a capture group inside some text.
/// The reference is either a capture group name or a number.
///
/// It is also tagged with the position in the text following the
/// capture reference, so that a caller can resume scanning from there.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
struct CaptureRef<'a> {
    /// The capture group being referred to, by name or by index.
    cap: Ref<'a>,
    /// The offset, in bytes from the start of the text that was parsed, of
    /// the first byte after the reference.
    end: usize,
}
230 | | |
/// A reference to a capture group in some text.
///
/// e.g., `$2`, `$foo`, `${foo}`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
enum Ref<'a> {
    /// A reference by capture group name. The name borrows from the text
    /// that was parsed.
    Named(&'a str),
    /// A reference by capture group index.
    Number(usize),
}
239 | | |
240 | | impl<'a> From<&'a str> for Ref<'a> { |
241 | 0 | fn from(x: &'a str) -> Ref<'a> { |
242 | 0 | Ref::Named(x) |
243 | 0 | } |
244 | | } |
245 | | |
246 | | impl From<usize> for Ref<'static> { |
247 | 0 | fn from(x: usize) -> Ref<'static> { |
248 | 0 | Ref::Number(x) |
249 | 0 | } |
250 | | } |
251 | | |
252 | | /// Parses a possible reference to a capture group name in the given text, |
253 | | /// starting at the beginning of `replacement`. |
254 | | /// |
255 | | /// If no such valid reference could be found, None is returned. |
256 | | /// |
257 | | /// Note that this returns a "possible" reference because this routine doesn't |
258 | | /// know whether the reference is to a valid group or not. If it winds up not |
259 | | /// being a valid reference, then it should be replaced with the empty string. |
260 | 0 | fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> { |
261 | 0 | let mut i = 0; |
262 | 0 | let rep: &[u8] = replacement; |
263 | 0 | if rep.len() <= 1 || rep[0] != b'$' { |
264 | 0 | return None; |
265 | 0 | } |
266 | 0 | i += 1; |
267 | 0 | if rep[i] == b'{' { |
268 | 0 | return find_cap_ref_braced(rep, i + 1); |
269 | 0 | } |
270 | 0 | let mut cap_end = i; |
271 | 0 | while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) { |
272 | 0 | cap_end += 1; |
273 | 0 | } |
274 | 0 | if cap_end == i { |
275 | 0 | return None; |
276 | 0 | } |
277 | 0 | // We just verified that the range 0..cap_end is valid ASCII, so it must |
278 | 0 | // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8 |
279 | 0 | // check via an unchecked conversion or by parsing the number straight from |
280 | 0 | // &[u8]. |
281 | 0 | let cap = core::str::from_utf8(&rep[i..cap_end]) |
282 | 0 | .expect("valid UTF-8 capture name"); |
283 | 0 | Some(CaptureRef { |
284 | 0 | cap: match cap.parse::<usize>() { |
285 | 0 | Ok(i) => Ref::Number(i), |
286 | 0 | Err(_) => Ref::Named(cap), |
287 | | }, |
288 | 0 | end: cap_end, |
289 | | }) |
290 | 0 | } |
291 | | |
292 | | /// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening |
293 | | /// brace has been found at `i-1` in `rep`. This then looks for a closing |
294 | | /// brace and returns the capture reference within the brace. |
295 | 0 | fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> { |
296 | 0 | assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]); |
297 | 0 | let start = i; |
298 | 0 | while rep.get(i).map_or(false, |&b| b != b'}') { |
299 | 0 | i += 1; |
300 | 0 | } |
301 | 0 | if !rep.get(i).map_or(false, |&b| b == b'}') { |
302 | 0 | return None; |
303 | 0 | } |
304 | | // When looking at braced names, we don't put any restrictions on the name, |
305 | | // so it's possible it could be invalid UTF-8. But a capture group name |
306 | | // can never be invalid UTF-8, so if we have invalid UTF-8, then we can |
307 | | // safely return None. |
308 | 0 | let cap = match core::str::from_utf8(&rep[start..i]) { |
309 | 0 | Err(_) => return None, |
310 | 0 | Ok(cap) => cap, |
311 | 0 | }; |
312 | 0 | Some(CaptureRef { |
313 | 0 | cap: match cap.parse::<usize>() { |
314 | 0 | Ok(i) => Ref::Number(i), |
315 | 0 | Err(_) => Ref::Named(cap), |
316 | | }, |
317 | 0 | end: i + 1, |
318 | | }) |
319 | 0 | } |
320 | | |
/// Reports whether `b` may appear in an unbraced capture reference, i.e.
/// whether it is an ASCII letter, an ASCII digit or an underscore.
fn is_valid_cap_letter(b: u8) -> bool {
    b == b'_' || b.is_ascii_alphanumeric()
}
329 | | |
/// Unit tests for capture reference parsing and for interpolation through
/// both the `string` and `bytes` entry points.
#[cfg(test)]
mod tests {
    use alloc::{string::String, vec, vec::Vec};

    use super::{find_cap_ref, CaptureRef};

    // Generates a test asserting the result of `find_cap_ref` on `$text`.
    // With two arguments the expected result is `None`; with three, it is
    // `Some($capref)`.
    macro_rules! find {
        ($name:ident, $text:expr) => {
            #[test]
            fn $name() {
                assert_eq!(None, find_cap_ref($text.as_bytes()));
            }
        };
        ($name:ident, $text:expr, $capref:expr) => {
            #[test]
            fn $name() {
                assert_eq!(Some($capref), find_cap_ref($text.as_bytes()));
            }
        };
    }

    // Shorthand for building a `CaptureRef`. A string becomes a named
    // reference and a `usize` becomes an index, via the `From` impls on
    // `Ref`.
    macro_rules! c {
        ($name_or_number:expr, $pos:expr) => {
            CaptureRef { cap: $name_or_number.into(), end: $pos }
        };
    }

    find!(find_cap_ref1, "$foo", c!("foo", 4));
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
    find!(find_cap_ref3, "$0", c!(0, 2));
    find!(find_cap_ref4, "$5", c!(5, 2));
    find!(find_cap_ref5, "$10", c!(10, 3));
    // See https://github.com/rust-lang/regex/pull/585
    // for more on characters following numbers
    find!(find_cap_ref6, "$42a", c!("42a", 4));
    find!(find_cap_ref7, "${42}a", c!(42, 5));
    find!(find_cap_ref8, "${42");
    find!(find_cap_ref9, "${42 ");
    find!(find_cap_ref10, " $0 ");
    find!(find_cap_ref11, "$");
    find!(find_cap_ref12, " ");
    find!(find_cap_ref13, "");
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
    find!(find_cap_ref18, "${#}", c!("#", 4));
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
    find!(find_cap_ref26, "${名字}", c!("名字", 9));

    /// Runs `super::string` on `replacement`, resolving names through
    /// `name_to_index` and indices through `caps`. An index with no entry in
    /// `caps` appends nothing.
    fn interpolate_string(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so that the lookup below can binary search by name.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = String::new();
        super::string(
            replacement,
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.push_str(s);
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        dst
    }

    /// Same as `interpolate_string`, but goes through `super::bytes` and
    /// converts the output back into a `String`.
    fn interpolate_bytes(
        mut name_to_index: Vec<(&'static str, usize)>,
        caps: Vec<&'static str>,
        replacement: &str,
    ) -> String {
        // Sorted so that the lookup below can binary search by name.
        name_to_index.sort_by_key(|x| x.0);

        let mut dst = vec![];
        super::bytes(
            replacement.as_bytes(),
            |i, dst| {
                if let Some(&s) = caps.get(i) {
                    dst.extend_from_slice(s.as_bytes());
                }
            },
            |name| -> Option<usize> {
                name_to_index
                    .binary_search_by_key(&name, |x| x.0)
                    .ok()
                    .map(|i| name_to_index[i].1)
            },
            &mut dst,
        );
        String::from_utf8(dst).unwrap()
    }

    // Generates a test that checks the same interpolation through both the
    // `string` and the `bytes` entry points.
    macro_rules! interp {
        ($name:ident, $map:expr, $caps:expr, $hay:expr, $expected:expr $(,)*) => {
            #[test]
            fn $name() {
                assert_eq!(
                    $expected,
                    interpolate_string($map, $caps, $hay),
                    "interpolate::string failed",
                );
                assert_eq!(
                    $expected,
                    interpolate_bytes($map, $caps, $hay),
                    "interpolate::bytes failed",
                );
            }
        };
    }

    interp!(
        interp1,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo test",
        "test xxx test",
    );

    interp!(
        interp2,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$footest",
        "test",
    );

    interp!(
        interp3,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${foo}test",
        "testxxxtest",
    );

    interp!(
        interp4,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test$2test",
        "test",
    );

    interp!(
        interp5,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test${2}test",
        "testxxxtest",
    );

    interp!(
        interp6,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $$foo test",
        "test $foo test",
    );

    interp!(
        interp7,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "test $foo",
        "test xxx",
    );

    interp!(
        interp8,
        vec![("foo", 2)],
        vec!["", "", "xxx"],
        "$foo test",
        "xxx test",
    );

    interp!(
        interp9,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $bar$foo",
        "test yyyxxx",
    );

    interp!(
        interp10,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test $ test",
        "test $ test",
    );

    // In the next four cases the braced reference names a group that does
    // not exist, so it is replaced with the empty string. The space on each
    // side of the reference is literal text and survives, which leaves two
    // consecutive spaces in the output.
    interp!(
        interp11,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${} test",
        "test  test",
    );

    interp!(
        interp12,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${ } test",
        "test  test",
    );

    interp!(
        interp13,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a b} test",
        "test  test",
    );

    interp!(
        interp14,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${a} test",
        "test  test",
    );

    // This is a funny case where a braced reference is never closed, but
    // within the unclosed braced reference, there is an unbraced reference.
    // In this case, the braced reference is just treated literally and the
    // unbraced reference is found.
    interp!(
        interp15,
        vec![("bar", 1), ("foo", 2)],
        vec!["", "yyy", "xxx"],
        "test ${wat $bar ok",
        "test ${wat yyy ok",
    );
}