Coverage Report

Created: 2025-02-25 06:39

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.4.9/src/util/interpolate.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
Provides routines for interpolating capture group references.
3
4
That is, if a replacement string contains references like `$foo` or `${foo1}`,
5
then they are replaced with the corresponding capture values for the groups
6
named `foo` and `foo1`, respectively. Similarly, syntax like `$1` and `${1}`
7
is supported as well, with `1` corresponding to a capture group index and not
8
a name.
9
10
This module provides the free functions [`string`] and [`bytes`], which
11
interpolate Rust Unicode strings and byte strings, respectively.
12
13
# Format
14
15
These routines support two different kinds of capture references: unbraced and
16
braced.
17
18
For the unbraced format, the format supported is `$ref` where `ref` can be
19
any character in the class `[0-9A-Za-z_]`. `ref` is always the longest
20
possible parse. So for example, `$1a` corresponds to the capture group named
21
`1a` and not the capture group at index `1`. If `ref` matches `^[0-9]+$`, then
22
it is treated as a capture group index itself and not a name.
23
24
For the braced format, the format supported is `${ref}` where `ref` can be any
25
sequence of bytes except for `}`. If no closing brace occurs, then it is not
26
considered a capture reference. As with the unbraced format, if `ref` matches
27
`^[0-9]+$`, then it is treated as a capture group index and not a name.
28
29
The braced format is useful for exerting precise control over the name of the
30
capture reference. For example, `${1}a` corresponds to the capture group
31
reference `1` followed by the letter `a`, whereas `$1a` (as mentioned above)
32
corresponds to the capture group reference `1a`. The braced format is also
33
useful for expressing capture group names that use characters not supported by
34
the unbraced format. For example, `${foo[bar].baz}` refers to the capture group
35
named `foo[bar].baz`.
36
37
If a capture group reference is found and it does not refer to a valid capture
38
group, then it will be replaced with the empty string.
39
40
To write a literal `$`, use `$$`.
41
42
To be clear, and as exhibited via the type signatures in the routines in this
43
module, it is impossible for a replacement string to be invalid. A replacement
44
string may not have the intended semantics, but the interpolation procedure
45
itself can never fail.
46
*/
47
48
use alloc::{string::String, vec::Vec};
49
50
use crate::util::memchr::memchr;
51
52
/// Accepts a replacement string and interpolates capture references with their
53
/// corresponding values.
54
///
55
/// `append` should be a function that appends the string value of a capture
56
/// group at a particular index to the string given. If the capture group
57
/// index is invalid, then nothing should be appended.
58
///
59
/// `name_to_index` should be a function that maps a capture group name to a
60
/// capture group index. If the given name doesn't exist, then `None` should
61
/// be returned.
62
///
63
/// Finally, `dst` is where the final interpolated contents should be written.
64
/// If `replacement` contains no capture group references, then `dst` will be
65
/// equivalent to `replacement`.
66
///
67
/// See the [module documentation](self) for details about the format
68
/// supported.
69
///
70
/// # Example
71
///
72
/// ```
73
/// use regex_automata::util::interpolate;
74
///
75
/// let mut dst = String::new();
76
/// interpolate::string(
77
///     "foo $bar baz",
78
///     |index, dst| {
79
///         if index == 0 {
80
///             dst.push_str("BAR");
81
///         }
82
///     },
83
///     |name| {
84
///         if name == "bar" {
85
///             Some(0)
86
///         } else {
87
///             None
88
///         }
89
///     },
90
///     &mut dst,
91
/// );
92
/// assert_eq!("foo BAR baz", dst);
93
/// ```
94
0
pub fn string(
95
0
    mut replacement: &str,
96
0
    mut append: impl FnMut(usize, &mut String),
97
0
    mut name_to_index: impl FnMut(&str) -> Option<usize>,
98
0
    dst: &mut String,
99
0
) {
100
0
    while !replacement.is_empty() {
101
0
        match memchr(b'$', replacement.as_bytes()) {
102
0
            None => break,
103
0
            Some(i) => {
104
0
                dst.push_str(&replacement[..i]);
105
0
                replacement = &replacement[i..];
106
0
            }
107
0
        }
108
0
        // Handle escaping of '$'.
109
0
        if replacement.as_bytes().get(1).map_or(false, |&b| b == b'$') {
110
0
            dst.push_str("$");
111
0
            replacement = &replacement[2..];
112
0
            continue;
113
0
        }
114
0
        debug_assert!(!replacement.is_empty());
115
0
        let cap_ref = match find_cap_ref(replacement.as_bytes()) {
116
0
            Some(cap_ref) => cap_ref,
117
            None => {
118
0
                dst.push_str("$");
119
0
                replacement = &replacement[1..];
120
0
                continue;
121
            }
122
        };
123
0
        replacement = &replacement[cap_ref.end..];
124
0
        match cap_ref.cap {
125
0
            Ref::Number(i) => append(i, dst),
126
0
            Ref::Named(name) => {
127
0
                if let Some(i) = name_to_index(name) {
128
0
                    append(i, dst);
129
0
                }
130
            }
131
        }
132
    }
133
0
    dst.push_str(replacement);
134
0
}
135
136
/// Accepts a replacement byte string and interpolates capture references with
137
/// their corresponding values.
138
///
139
/// `append` should be a function that appends the byte string value of a
140
/// capture group at a particular index to the byte string given. If the
141
/// capture group index is invalid, then nothing should be appended.
142
///
143
/// `name_to_index` should be a function that maps a capture group name to a
144
/// capture group index. If the given name doesn't exist, then `None` should
145
/// be returned.
146
///
147
/// Finally, `dst` is where the final interpolated contents should be written.
148
/// If `replacement` contains no capture group references, then `dst` will be
149
/// equivalent to `replacement`.
150
///
151
/// See the [module documentation](self) for details about the format
152
/// supported.
153
///
154
/// # Example
155
///
156
/// ```
157
/// use regex_automata::util::interpolate;
158
///
159
/// let mut dst = vec![];
160
/// interpolate::bytes(
161
///     b"foo $bar baz",
162
///     |index, dst| {
163
///         if index == 0 {
164
///             dst.extend_from_slice(b"BAR");
165
///         }
166
///     },
167
///     |name| {
168
///         if name == "bar" {
169
///             Some(0)
170
///         } else {
171
///             None
172
///         }
173
///     },
174
///     &mut dst,
175
/// );
176
/// assert_eq!(&b"foo BAR baz"[..], dst);
177
/// ```
178
0
pub fn bytes(
179
0
    mut replacement: &[u8],
180
0
    mut append: impl FnMut(usize, &mut Vec<u8>),
181
0
    mut name_to_index: impl FnMut(&str) -> Option<usize>,
182
0
    dst: &mut Vec<u8>,
183
0
) {
184
0
    while !replacement.is_empty() {
185
0
        match memchr(b'$', replacement) {
186
0
            None => break,
187
0
            Some(i) => {
188
0
                dst.extend_from_slice(&replacement[..i]);
189
0
                replacement = &replacement[i..];
190
0
            }
191
0
        }
192
0
        // Handle escaping of '$'.
193
0
        if replacement.get(1).map_or(false, |&b| b == b'$') {
194
0
            dst.push(b'$');
195
0
            replacement = &replacement[2..];
196
0
            continue;
197
0
        }
198
0
        debug_assert!(!replacement.is_empty());
199
0
        let cap_ref = match find_cap_ref(replacement) {
200
0
            Some(cap_ref) => cap_ref,
201
            None => {
202
0
                dst.push(b'$');
203
0
                replacement = &replacement[1..];
204
0
                continue;
205
            }
206
        };
207
0
        replacement = &replacement[cap_ref.end..];
208
0
        match cap_ref.cap {
209
0
            Ref::Number(i) => append(i, dst),
210
0
            Ref::Named(name) => {
211
0
                if let Some(i) = name_to_index(name) {
212
0
                    append(i, dst);
213
0
                }
214
            }
215
        }
216
    }
217
0
    dst.extend_from_slice(replacement);
218
0
}
219
220
/// `CaptureRef` represents a reference to a capture group inside some text.
221
/// The reference is either a capture group name or a number.
222
///
223
/// It is also tagged with the position in the text following the
224
/// capture reference.
225
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
226
struct CaptureRef<'a> {
227
    /// The parsed reference itself: either a group name or a group index.
    cap: Ref<'a>,
228
    /// Byte offset, relative to the start of the text that was parsed, of the
    /// first byte after the reference. Callers resume scanning from here.
    end: usize,
229
}
230
231
/// A reference to a capture group in some text.
232
///
233
/// e.g., `$2`, `$foo`, `${foo}`.
234
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
235
enum Ref<'a> {
236
    /// A reference by group name, borrowed from the replacement text.
    Named(&'a str),
237
    /// A reference by group index.
    Number(usize),
238
}
239
240
impl<'a> From<&'a str> for Ref<'a> {
241
0
    fn from(x: &'a str) -> Ref<'a> {
242
0
        Ref::Named(x)
243
0
    }
244
}
245
246
impl From<usize> for Ref<'static> {
247
0
    fn from(x: usize) -> Ref<'static> {
248
0
        Ref::Number(x)
249
0
    }
250
}
251
252
/// Parses a possible reference to a capture group name in the given text,
253
/// starting at the beginning of `replacement`.
254
///
255
/// If no such valid reference could be found, None is returned.
256
///
257
/// Note that this returns a "possible" reference because this routine doesn't
258
/// know whether the reference is to a valid group or not. If it winds up not
259
/// being a valid reference, then it should be replaced with the empty string.
260
0
fn find_cap_ref(replacement: &[u8]) -> Option<CaptureRef<'_>> {
261
0
    let mut i = 0;
262
0
    let rep: &[u8] = replacement;
263
0
    if rep.len() <= 1 || rep[0] != b'$' {
264
0
        return None;
265
0
    }
266
0
    i += 1;
267
0
    if rep[i] == b'{' {
268
0
        return find_cap_ref_braced(rep, i + 1);
269
0
    }
270
0
    let mut cap_end = i;
271
0
    while rep.get(cap_end).copied().map_or(false, is_valid_cap_letter) {
272
0
        cap_end += 1;
273
0
    }
274
0
    if cap_end == i {
275
0
        return None;
276
0
    }
277
0
    // We just verified that the range 0..cap_end is valid ASCII, so it must
278
0
    // therefore be valid UTF-8. If we really cared, we could avoid this UTF-8
279
0
    // check via an unchecked conversion or by parsing the number straight from
280
0
    // &[u8].
281
0
    let cap = core::str::from_utf8(&rep[i..cap_end])
282
0
        .expect("valid UTF-8 capture name");
283
0
    Some(CaptureRef {
284
0
        cap: match cap.parse::<usize>() {
285
0
            Ok(i) => Ref::Number(i),
286
0
            Err(_) => Ref::Named(cap),
287
        },
288
0
        end: cap_end,
289
    })
290
0
}
291
292
/// Looks for a braced reference, e.g., `${foo1}`. This assumes that an opening
293
/// brace has been found at `i-1` in `rep`. This then looks for a closing
294
/// brace and returns the capture reference within the brace.
295
0
fn find_cap_ref_braced(rep: &[u8], mut i: usize) -> Option<CaptureRef<'_>> {
296
0
    assert_eq!(b'{', rep[i.checked_sub(1).unwrap()]);
297
0
    let start = i;
298
0
    while rep.get(i).map_or(false, |&b| b != b'}') {
299
0
        i += 1;
300
0
    }
301
0
    if !rep.get(i).map_or(false, |&b| b == b'}') {
302
0
        return None;
303
0
    }
304
    // When looking at braced names, we don't put any restrictions on the name,
305
    // so it's possible it could be invalid UTF-8. But a capture group name
306
    // can never be invalid UTF-8, so if we have invalid UTF-8, then we can
307
    // safely return None.
308
0
    let cap = match core::str::from_utf8(&rep[start..i]) {
309
0
        Err(_) => return None,
310
0
        Ok(cap) => cap,
311
0
    };
312
0
    Some(CaptureRef {
313
0
        cap: match cap.parse::<usize>() {
314
0
            Ok(i) => Ref::Number(i),
315
0
            Err(_) => Ref::Named(cap),
316
        },
317
0
        end: i + 1,
318
    })
319
0
}
320
321
/// Returns true if and only if the given byte is allowed in a capture name
/// written in non-brace form, i.e. it is in the class `[0-9A-Za-z_]`.
fn is_valid_cap_letter(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
329
330
#[cfg(test)]
331
mod tests {
332
    use alloc::{string::String, vec, vec::Vec};
333
334
    use super::{find_cap_ref, CaptureRef};
335
336
    // Generates a `#[test]` checking `find_cap_ref` on the given text. The
    // two-argument form expects `None`; the three-argument form expects
    // `Some` of the given capture reference.
    macro_rules! find {
        ($test:ident, $input:expr) => {
            #[test]
            fn $test() {
                assert_eq!(None, find_cap_ref($input.as_bytes()));
            }
        };
        ($test:ident, $input:expr, $expected:expr) => {
            #[test]
            fn $test() {
                assert_eq!(Some($expected), find_cap_ref($input.as_bytes()));
            }
        };
    }
350
351
    // Shorthand for an expected `CaptureRef`: the first argument is a name
    // (`&str`) or an index (`usize`), converted via `Into<Ref>`; the second
    // is the end offset.
    macro_rules! c {
        ($reference:expr, $end:expr) => {
            CaptureRef { cap: $reference.into(), end: $end }
        };
    }
356
357
    find!(find_cap_ref1, "$foo", c!("foo", 4));
358
    find!(find_cap_ref2, "${foo}", c!("foo", 6));
359
    find!(find_cap_ref3, "$0", c!(0, 2));
360
    find!(find_cap_ref4, "$5", c!(5, 2));
361
    find!(find_cap_ref5, "$10", c!(10, 3));
362
    // See https://github.com/rust-lang/regex/pull/585
363
    // for more on characters following numbers
364
    find!(find_cap_ref6, "$42a", c!("42a", 4));
365
    find!(find_cap_ref7, "${42}a", c!(42, 5));
366
    find!(find_cap_ref8, "${42");
367
    find!(find_cap_ref9, "${42 ");
368
    find!(find_cap_ref10, " $0 ");
369
    find!(find_cap_ref11, "$");
370
    find!(find_cap_ref12, " ");
371
    find!(find_cap_ref13, "");
372
    find!(find_cap_ref14, "$1-$2", c!(1, 2));
373
    find!(find_cap_ref15, "$1_$2", c!("1_", 3));
374
    find!(find_cap_ref16, "$x-$y", c!("x", 2));
375
    find!(find_cap_ref17, "$x_$y", c!("x_", 3));
376
    find!(find_cap_ref18, "${#}", c!("#", 4));
377
    find!(find_cap_ref19, "${Z[}", c!("Z[", 5));
378
    find!(find_cap_ref20, "${¾}", c!("¾", 5));
379
    find!(find_cap_ref21, "${¾a}", c!("¾a", 6));
380
    find!(find_cap_ref22, "${a¾}", c!("a¾", 6));
381
    find!(find_cap_ref23, "${☃}", c!("☃", 6));
382
    find!(find_cap_ref24, "${a☃}", c!("a☃", 7));
383
    find!(find_cap_ref25, "${☃a}", c!("☃a", 7));
384
    find!(find_cap_ref26, "${名字}", c!("名字", 9));
385
386
    fn interpolate_string(
387
        mut name_to_index: Vec<(&'static str, usize)>,
388
        caps: Vec<&'static str>,
389
        replacement: &str,
390
    ) -> String {
391
        name_to_index.sort_by_key(|x| x.0);
392
393
        let mut dst = String::new();
394
        super::string(
395
            replacement,
396
            |i, dst| {
397
                if let Some(&s) = caps.get(i) {
398
                    dst.push_str(s);
399
                }
400
            },
401
            |name| -> Option<usize> {
402
                name_to_index
403
                    .binary_search_by_key(&name, |x| x.0)
404
                    .ok()
405
                    .map(|i| name_to_index[i].1)
406
            },
407
            &mut dst,
408
        );
409
        dst
410
    }
411
412
    fn interpolate_bytes(
413
        mut name_to_index: Vec<(&'static str, usize)>,
414
        caps: Vec<&'static str>,
415
        replacement: &str,
416
    ) -> String {
417
        name_to_index.sort_by_key(|x| x.0);
418
419
        let mut dst = vec![];
420
        super::bytes(
421
            replacement.as_bytes(),
422
            |i, dst| {
423
                if let Some(&s) = caps.get(i) {
424
                    dst.extend_from_slice(s.as_bytes());
425
                }
426
            },
427
            |name| -> Option<usize> {
428
                name_to_index
429
                    .binary_search_by_key(&name, |x| x.0)
430
                    .ok()
431
                    .map(|i| name_to_index[i].1)
432
            },
433
            &mut dst,
434
        );
435
        String::from_utf8(dst).unwrap()
436
    }
437
438
    // Generates a `#[test]` that interpolates `$replacement` through both
    // the string and the byte-string routines and checks each result against
    // `$want`.
    macro_rules! interp {
        (
            $test:ident,
            $names:expr,
            $groups:expr,
            $replacement:expr,
            $want:expr $(,)*
        ) => {
            #[test]
            fn $test() {
                assert_eq!(
                    $want,
                    interpolate_string($names, $groups, $replacement),
                    "interpolate::string failed",
                );
                assert_eq!(
                    $want,
                    interpolate_bytes($names, $groups, $replacement),
                    "interpolate::bytes failed",
                );
            }
        };
    }
455
456
    interp!(
457
        interp1,
458
        vec![("foo", 2)],
459
        vec!["", "", "xxx"],
460
        "test $foo test",
461
        "test xxx test",
462
    );
463
464
    interp!(
465
        interp2,
466
        vec![("foo", 2)],
467
        vec!["", "", "xxx"],
468
        "test$footest",
469
        "test",
470
    );
471
472
    interp!(
473
        interp3,
474
        vec![("foo", 2)],
475
        vec!["", "", "xxx"],
476
        "test${foo}test",
477
        "testxxxtest",
478
    );
479
480
    interp!(
481
        interp4,
482
        vec![("foo", 2)],
483
        vec!["", "", "xxx"],
484
        "test$2test",
485
        "test",
486
    );
487
488
    interp!(
489
        interp5,
490
        vec![("foo", 2)],
491
        vec!["", "", "xxx"],
492
        "test${2}test",
493
        "testxxxtest",
494
    );
495
496
    interp!(
497
        interp6,
498
        vec![("foo", 2)],
499
        vec!["", "", "xxx"],
500
        "test $$foo test",
501
        "test $foo test",
502
    );
503
504
    interp!(
505
        interp7,
506
        vec![("foo", 2)],
507
        vec!["", "", "xxx"],
508
        "test $foo",
509
        "test xxx",
510
    );
511
512
    interp!(
513
        interp8,
514
        vec![("foo", 2)],
515
        vec!["", "", "xxx"],
516
        "$foo test",
517
        "xxx test",
518
    );
519
520
    interp!(
521
        interp9,
522
        vec![("bar", 1), ("foo", 2)],
523
        vec!["", "yyy", "xxx"],
524
        "test $bar$foo",
525
        "test yyyxxx",
526
    );
527
528
    interp!(
529
        interp10,
530
        vec![("bar", 1), ("foo", 2)],
531
        vec!["", "yyy", "xxx"],
532
        "test $ test",
533
        "test $ test",
534
    );
535
536
    interp!(
537
        interp11,
538
        vec![("bar", 1), ("foo", 2)],
539
        vec!["", "yyy", "xxx"],
540
        "test ${} test",
541
        "test  test",
542
    );
543
544
    interp!(
545
        interp12,
546
        vec![("bar", 1), ("foo", 2)],
547
        vec!["", "yyy", "xxx"],
548
        "test ${ } test",
549
        "test  test",
550
    );
551
552
    interp!(
553
        interp13,
554
        vec![("bar", 1), ("foo", 2)],
555
        vec!["", "yyy", "xxx"],
556
        "test ${a b} test",
557
        "test  test",
558
    );
559
560
    interp!(
561
        interp14,
562
        vec![("bar", 1), ("foo", 2)],
563
        vec!["", "yyy", "xxx"],
564
        "test ${a} test",
565
        "test  test",
566
    );
567
568
    // This is a funny case where a braced reference is never closed, but
569
    // within the unclosed braced reference, there is an unbraced reference.
570
    // In this case, the braced reference is just treated literally and the
571
    // unbraced reference is found.
572
    interp!(
573
        interp15,
574
        vec![("bar", 1), ("foo", 2)],
575
        vec!["", "yyy", "xxx"],
576
        "test ${wat $bar ok",
577
        "test ${wat yyy ok",
578
    );
579
}