Coverage Report

Created: 2025-08-29 06:18

/rust/registry/src/index.crates.io-6f17d22bba15001f/regex-automata-0.2.0/src/dfa/sparse.rs
Line
Count
Source (jump to first uncovered line)
1
/*!
2
Types and routines specific to sparse DFAs.
3
4
This module is the home of [`sparse::DFA`](DFA).
5
6
Unlike the [`dense`](super::dense) module, this module does not contain a
7
builder or configuration specific for sparse DFAs. Instead, the intended
8
way to build a sparse DFA is either by using a default configuration with
9
its constructor [`sparse::DFA::new`](DFA::new), or by first configuring the
10
construction of a dense DFA with [`dense::Builder`](super::dense::Builder)
11
and then calling [`dense::DFA::to_sparse`](super::dense::DFA::to_sparse). For
12
example, this configures a sparse DFA to do an overlapping search:
13
14
```
15
use regex_automata::{
16
    dfa::{Automaton, OverlappingState, dense},
17
    HalfMatch, MatchKind,
18
};
19
20
let dense_re = dense::Builder::new()
21
    .configure(dense::Config::new().match_kind(MatchKind::All))
22
    .build(r"Samwise|Sam")?;
23
let sparse_re = dense_re.to_sparse()?;
24
25
// Setup our haystack and initial start state.
26
let haystack = b"Samwise";
27
let mut state = OverlappingState::start();
28
29
// First, 'Sam' will match.
30
let end1 = sparse_re.find_overlapping_fwd_at(
31
    None, None, haystack, 0, haystack.len(), &mut state,
32
)?;
33
assert_eq!(end1, Some(HalfMatch::must(0, 3)));
34
35
// And now 'Samwise' will match.
36
let end2 = sparse_re.find_overlapping_fwd_at(
37
    None, None, haystack, 3, haystack.len(), &mut state,
38
)?;
39
assert_eq!(end2, Some(HalfMatch::must(0, 7)));
40
# Ok::<(), Box<dyn std::error::Error>>(())
41
```
42
*/
43
44
#[cfg(feature = "alloc")]
45
use core::iter;
46
use core::{
47
    convert::{TryFrom, TryInto},
48
    fmt,
49
    mem::size_of,
50
};
51
52
#[cfg(feature = "alloc")]
53
use alloc::{collections::BTreeSet, vec, vec::Vec};
54
55
#[cfg(feature = "alloc")]
56
use crate::dfa::{dense, error::Error};
57
use crate::{
58
    dfa::{
59
        automaton::{fmt_state_indicator, Automaton},
60
        special::Special,
61
        DEAD,
62
    },
63
    util::{
64
        alphabet::ByteClasses,
65
        bytes::{self, DeserializeError, Endian, SerializeError},
66
        id::{PatternID, StateID},
67
        start::Start,
68
        DebugByte,
69
    },
70
};
71
72
const LABEL: &str = "rust-regex-automata-dfa-sparse";
73
const VERSION: u32 = 2;
74
75
/// A sparse deterministic finite automaton (DFA) with variable sized states.
76
///
77
/// In contrast to a [dense::DFA](crate::dfa::dense::DFA), a sparse DFA uses
78
/// a more space efficient representation for its transitions. Consequently,
79
/// sparse DFAs may use much less memory than dense DFAs, but this comes at a
80
/// price. In particular, reading the more space efficient transitions takes
81
/// more work, and consequently, searching using a sparse DFA is typically
82
/// slower than a dense DFA.
83
///
84
/// A sparse DFA can be built using the default configuration via the
85
/// [`DFA::new`] constructor. Otherwise, one can configure various aspects
86
/// of a dense DFA via [`dense::Builder`](crate::dfa::dense::Builder),
87
/// and then convert a dense DFA to a sparse DFA using
88
/// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse).
89
///
90
/// In general, a sparse DFA supports all the same search operations as a dense
91
/// DFA.
92
///
93
/// Making the choice between a dense and sparse DFA depends on your specific
94
/// work load. If you can sacrifice a bit of search time performance, then a
95
/// sparse DFA might be the best choice. In particular, while sparse DFAs are
96
/// probably always slower than dense DFAs, you may find that they are easily
97
/// fast enough for your purposes!
98
///
99
/// # Type parameters
100
///
101
/// A `DFA` has one type parameter, `T`, which is used to represent the parts
102
/// of a sparse DFA. `T` is typically a `Vec<u8>` or a `&[u8]`.
103
///
104
/// # The `Automaton` trait
105
///
106
/// This type implements the [`Automaton`] trait, which means it can be used
107
/// for searching. For example:
108
///
109
/// ```
110
/// use regex_automata::{
111
///     dfa::{Automaton, sparse::DFA},
112
///     HalfMatch,
113
/// };
114
///
115
/// let dfa = DFA::new("foo[0-9]+")?;
116
/// let expected = HalfMatch::must(0, 8);
117
/// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
118
/// # Ok::<(), Box<dyn std::error::Error>>(())
119
/// ```
120
#[derive(Clone)]
121
pub struct DFA<T> {
122
    // When compared to a dense DFA, a sparse DFA *looks* a lot simpler
123
    // representation-wise. In reality, it is perhaps more complicated. Namely,
124
    // in a dense DFA, all information needs to be very cheaply accessible
125
    // using only state IDs. In a sparse DFA however, each state uses a
126
    // variable amount of space because each state encodes more information
127
    // than just its transitions. Each state also includes an accelerator if
128
    // one exists, along with the matching pattern IDs if the state is a match
129
    // state.
130
    //
131
    // That is, a lot of the complexity is pushed down into how each state
132
    // itself is represented.
133
    trans: Transitions<T>,
134
    starts: StartTable<T>,
135
    special: Special,
136
}
137
138
#[cfg(feature = "alloc")]
139
impl DFA<Vec<u8>> {
140
    /// Parse the given regular expression using a default configuration and
141
    /// return the corresponding sparse DFA.
142
    ///
143
    /// If you want a non-default configuration, then use
144
    /// the [`dense::Builder`](crate::dfa::dense::Builder)
145
    /// to set your own configuration, and then call
146
    /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
147
    /// a sparse DFA.
148
    ///
149
    /// # Example
150
    ///
151
    /// ```
152
    /// use regex_automata::{
153
    ///     dfa::{Automaton, sparse},
154
    ///     HalfMatch,
155
    /// };
156
    ///
157
    /// let dfa = sparse::DFA::new("foo[0-9]+bar")?;
158
    ///
159
    /// let expected = HalfMatch::must(0, 11);
160
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
161
    /// # Ok::<(), Box<dyn std::error::Error>>(())
162
    /// ```
163
    pub fn new(pattern: &str) -> Result<DFA<Vec<u8>>, Error> {
164
        dense::Builder::new()
165
            .build(pattern)
166
            .and_then(|dense| dense.to_sparse())
167
    }
168
169
    /// Parse the given regular expressions using a default configuration and
170
    /// return the corresponding multi-DFA.
171
    ///
172
    /// If you want a non-default configuration, then use
173
    /// the [`dense::Builder`](crate::dfa::dense::Builder)
174
    /// to set your own configuration, and then call
175
    /// [`dense::DFA::to_sparse`](crate::dfa::dense::DFA::to_sparse) to create
176
    /// a sparse DFA.
177
    ///
178
    /// # Example
179
    ///
180
    /// ```
181
    /// use regex_automata::{
182
    ///     dfa::{Automaton, sparse},
183
    ///     HalfMatch,
184
    /// };
185
    ///
186
    /// let dfa = sparse::DFA::new_many(&["[0-9]+", "[a-z]+"])?;
187
    /// let expected = HalfMatch::must(1, 3);
188
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345bar")?);
189
    /// # Ok::<(), Box<dyn std::error::Error>>(())
190
    /// ```
191
    pub fn new_many<P: AsRef<str>>(
192
        patterns: &[P],
193
    ) -> Result<DFA<Vec<u8>>, Error> {
194
        dense::Builder::new()
195
            .build_many(patterns)
196
            .and_then(|dense| dense.to_sparse())
197
    }
198
}
199
200
#[cfg(feature = "alloc")]
201
impl DFA<Vec<u8>> {
202
    /// Create a new DFA that matches every input.
203
    ///
204
    /// # Example
205
    ///
206
    /// ```
207
    /// use regex_automata::{
208
    ///     dfa::{Automaton, sparse},
209
    ///     HalfMatch,
210
    /// };
211
    ///
212
    /// let dfa = sparse::DFA::always_match()?;
213
    ///
214
    /// let expected = HalfMatch::must(0, 0);
215
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"")?);
216
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo")?);
217
    /// # Ok::<(), Box<dyn std::error::Error>>(())
218
    /// ```
219
    pub fn always_match() -> Result<DFA<Vec<u8>>, Error> {
220
        dense::DFA::always_match()?.to_sparse()
221
    }
222
223
    /// Create a new sparse DFA that never matches any input.
224
    ///
225
    /// # Example
226
    ///
227
    /// ```
228
    /// use regex_automata::dfa::{Automaton, sparse};
229
    ///
230
    /// let dfa = sparse::DFA::never_match()?;
231
    /// assert_eq!(None, dfa.find_leftmost_fwd(b"")?);
232
    /// assert_eq!(None, dfa.find_leftmost_fwd(b"foo")?);
233
    /// # Ok::<(), Box<dyn std::error::Error>>(())
234
    /// ```
235
    pub fn never_match() -> Result<DFA<Vec<u8>>, Error> {
236
        dense::DFA::never_match()?.to_sparse()
237
    }
238
239
    /// The implementation for constructing a sparse DFA from a dense DFA.
240
    pub(crate) fn from_dense<T: AsRef<[u32]>>(
241
        dfa: &dense::DFA<T>,
242
    ) -> Result<DFA<Vec<u8>>, Error> {
243
        // In order to build the transition table, we need to be able to write
244
        // state identifiers for each of the "next" transitions in each state.
245
        // Our state identifiers correspond to the byte offset in the
246
        // transition table at which the state is encoded. Therefore, we do not
247
        // actually know what the state identifiers are until we've allocated
248
        // exactly as much space as we need for each state. Thus, construction
249
        // of the transition table happens in two passes.
250
        //
251
        // In the first pass, we fill out the shell of each state, which
252
        // includes the transition count, the input byte ranges and zero-filled
253
        // space for the transitions and accelerators, if present. In this
254
        // first pass, we also build up a map from the state identifier index
255
        // of the dense DFA to the state identifier in this sparse DFA.
256
        //
257
        // In the second pass, we fill in the transitions based on the map
258
        // built in the first pass.
259
260
        // The capacity given here reflects a minimum. (Well, the true minimum
261
        // is likely even bigger, but hopefully this saves a few reallocs.)
262
        let mut sparse = Vec::with_capacity(StateID::SIZE * dfa.state_count());
263
        // This maps state indices from the dense DFA to StateIDs in the sparse
264
        // DFA. We build out this map on the first pass, and then use it in the
265
        // second pass to back-fill our transitions.
266
        let mut remap: Vec<StateID> = vec![DEAD; dfa.state_count()];
267
        for state in dfa.states() {
268
            let pos = sparse.len();
269
270
            remap[dfa.to_index(state.id())] =
271
                StateID::new(pos).map_err(|_| Error::too_many_states())?;
272
            // zero-filled space for the transition count
273
            sparse.push(0);
274
            sparse.push(0);
275
276
            let mut transition_count = 0;
277
            for (unit1, unit2, _) in state.sparse_transitions() {
278
                match (unit1.as_u8(), unit2.as_u8()) {
279
                    (Some(b1), Some(b2)) => {
280
                        transition_count += 1;
281
                        sparse.push(b1);
282
                        sparse.push(b2);
283
                    }
284
                    (None, None) => {}
285
                    (Some(_), None) | (None, Some(_)) => {
286
                        // can never occur because sparse_transitions never
287
                        // groups EOI with any other transition.
288
                        unreachable!()
289
                    }
290
                }
291
            }
292
            // Add dummy EOI transition. This is never actually read while
293
            // searching, but having space equivalent to the total number
294
            // of transitions is convenient. Otherwise, we'd need to track
295
            // a different number of transitions for the byte ranges as for
296
            // the 'next' states.
297
            //
298
            // N.B. The loop above is not guaranteed to yield the EOI
299
            // transition, since it may point to a DEAD state. By putting
300
            // it here, we always write the EOI transition, and thus
301
            // guarantee that our transition count is >0. Why do we always
302
            // need the EOI transition? Because in order to implement
303
            // Automaton::next_eoi_state, this lets us just ask for the last
304
            // transition. There are probably other/better ways to do this.
305
            transition_count += 1;
306
            sparse.push(0);
307
            sparse.push(0);
308
309
            // Check some assumptions about transition count.
310
            assert_ne!(
311
                transition_count, 0,
312
                "transition count should be non-zero",
313
            );
314
            assert!(
315
                transition_count <= 257,
316
                "expected transition count {} to be <= 257",
317
                transition_count,
318
            );
319
320
            // Fill in the transition count.
321
            // Since transition count is always <= 257, we use the most
322
            // significant bit to indicate whether this is a match state or
323
            // not.
324
            let ntrans = if dfa.is_match_state(state.id()) {
325
                transition_count | (1 << 15)
326
            } else {
327
                transition_count
328
            };
329
            bytes::NE::write_u16(ntrans, &mut sparse[pos..]);
330
331
            // zero-fill the actual transitions.
332
            // Unwraps are OK since transition_count <= 257 and our minimum
333
            // support usize size is 16-bits.
334
            let zeros = usize::try_from(transition_count)
335
                .unwrap()
336
                .checked_mul(StateID::SIZE)
337
                .unwrap();
338
            sparse.extend(iter::repeat(0).take(zeros));
339
340
            // If this is a match state, write the pattern IDs matched by this
341
            // state.
342
            if dfa.is_match_state(state.id()) {
343
                let plen = dfa.match_pattern_len(state.id());
344
                // Write the actual pattern IDs with a u32 length prefix.
345
                // First, zero-fill space.
346
                let mut pos = sparse.len();
347
                // Unwraps are OK since it's guaranteed that plen <=
348
                // PatternID::LIMIT, which is in turn guaranteed to fit into a
349
                // u32.
350
                let zeros = size_of::<u32>()
351
                    .checked_mul(plen)
352
                    .unwrap()
353
                    .checked_add(size_of::<u32>())
354
                    .unwrap();
355
                sparse.extend(iter::repeat(0).take(zeros));
356
357
                // Now write the length prefix.
358
                bytes::NE::write_u32(
359
                    // Will never fail since u32::MAX is invalid pattern ID.
360
                    // Thus, the number of pattern IDs is representable by a
361
                    // u32.
362
                    plen.try_into().expect("pattern ID count fits in u32"),
363
                    &mut sparse[pos..],
364
                );
365
                pos += size_of::<u32>();
366
367
                // Now write the pattern IDs.
368
                for &pid in dfa.pattern_id_slice(state.id()) {
369
                    pos += bytes::write_pattern_id::<bytes::NE>(
370
                        pid,
371
                        &mut sparse[pos..],
372
                    );
373
                }
374
            }
375
376
            // And now add the accelerator, if one exists. An accelerator is
377
            // at most 4 bytes and at least 1 byte. The first byte is the
378
            // length, N. N bytes follow the length. The set of bytes that
379
            // follow correspond (exhaustively) to the bytes that must be seen
380
            // to leave this state.
381
            let accel = dfa.accelerator(state.id());
382
            sparse.push(accel.len().try_into().unwrap());
383
            sparse.extend_from_slice(accel);
384
        }
385
386
        let mut new = DFA {
387
            trans: Transitions {
388
                sparse,
389
                classes: dfa.byte_classes().clone(),
390
                count: dfa.state_count(),
391
                patterns: dfa.pattern_count(),
392
            },
393
            starts: StartTable::from_dense_dfa(dfa, &remap)?,
394
            special: dfa.special().remap(|id| remap[dfa.to_index(id)]),
395
        };
396
        // And here's our second pass. Iterate over all of the dense states
397
        // again, and update the transitions in each of the states in the
398
        // sparse DFA.
399
        for old_state in dfa.states() {
400
            let new_id = remap[dfa.to_index(old_state.id())];
401
            let mut new_state = new.trans.state_mut(new_id);
402
            let sparse = old_state.sparse_transitions();
403
            for (i, (_, _, next)) in sparse.enumerate() {
404
                let next = remap[dfa.to_index(next)];
405
                new_state.set_next_at(i, next);
406
            }
407
        }
408
        trace!(
409
            "created sparse DFA, memory usage: {} (dense memory usage: {})",
410
            new.memory_usage(),
411
            dfa.memory_usage(),
412
        );
413
        Ok(new)
414
    }
415
}
416
417
impl<T: AsRef<[u8]>> DFA<T> {
418
    /// Cheaply return a borrowed version of this sparse DFA. Specifically, the
419
    /// DFA returned always uses `&[u8]` for its transitions.
420
0
    pub fn as_ref<'a>(&'a self) -> DFA<&'a [u8]> {
421
0
        DFA {
422
0
            trans: self.trans.as_ref(),
423
0
            starts: self.starts.as_ref(),
424
0
            special: self.special,
425
0
        }
426
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<&[u8]>>::as_ref
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<_>>::as_ref
427
428
    /// Return an owned version of this sparse DFA. Specifically, the DFA
429
    /// returned always uses `Vec<u8>` for its transitions.
430
    ///
431
    /// Effectively, this returns a sparse DFA whose transitions live on the
432
    /// heap.
433
    #[cfg(feature = "alloc")]
434
    pub fn to_owned(&self) -> DFA<Vec<u8>> {
435
        DFA {
436
            trans: self.trans.to_owned(),
437
            starts: self.starts.to_owned(),
438
            special: self.special,
439
        }
440
    }
441
442
    /// Returns the memory usage, in bytes, of this DFA.
443
    ///
444
    /// The memory usage is computed based on the number of bytes used to
445
    /// represent this DFA.
446
    ///
447
    /// This does **not** include the stack size used up by this DFA. To
448
    /// compute that, use `std::mem::size_of::<sparse::DFA>()`.
449
0
    pub fn memory_usage(&self) -> usize {
450
0
        self.trans.memory_usage() + self.starts.memory_usage()
451
0
    }
452
453
    /// Returns true only if this DFA has starting states for each pattern.
454
    ///
455
    /// When a DFA has starting states for each pattern, then a search with the
456
    /// DFA can be configured to only look for anchored matches of a specific
457
    /// pattern. Specifically, APIs like [`Automaton::find_earliest_fwd_at`]
458
    /// can accept a non-None `pattern_id` if and only if this method returns
459
    /// true. Otherwise, calling `find_earliest_fwd_at` will panic.
460
    ///
461
    /// Note that if the DFA is empty, this always returns false.
462
0
    pub fn has_starts_for_each_pattern(&self) -> bool {
463
0
        self.starts.patterns > 0
464
0
    }
465
}
466
467
/// Routines for converting a sparse DFA to other representations, such as raw
468
/// bytes suitable for persistent storage.
469
impl<T: AsRef<[u8]>> DFA<T> {
470
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in little endian
471
    /// format.
472
    ///
473
    /// The written bytes are guaranteed to be deserialized correctly and
474
    /// without errors in a semver compatible release of this crate by a
475
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
476
    /// deserialization APIs has been satisfied):
477
    ///
478
    /// * [`DFA::from_bytes`]
479
    /// * [`DFA::from_bytes_unchecked`]
480
    ///
481
    /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
482
    /// serialization methods, this does not add any initial padding to the
483
    /// returned bytes. Padding isn't required for sparse DFAs since they have
484
    /// no alignment requirements.
485
    ///
486
    /// # Example
487
    ///
488
    /// This example shows how to serialize and deserialize a DFA:
489
    ///
490
    /// ```
491
    /// use regex_automata::{
492
    ///     dfa::{Automaton, sparse::DFA},
493
    ///     HalfMatch,
494
    /// };
495
    ///
496
    /// // Compile our original DFA.
497
    /// let original_dfa = DFA::new("foo[0-9]+")?;
498
    ///
499
    /// // N.B. We use native endianness here to make the example work, but
500
    /// // using to_bytes_little_endian would work on a little endian target.
501
    /// let buf = original_dfa.to_bytes_native_endian();
502
    /// // Even if buf has initial padding, DFA::from_bytes will automatically
503
    /// // ignore it.
504
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
505
    ///
506
    /// let expected = HalfMatch::must(0, 8);
507
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
508
    /// # Ok::<(), Box<dyn std::error::Error>>(())
509
    /// ```
510
    #[cfg(feature = "alloc")]
511
    pub fn to_bytes_little_endian(&self) -> Vec<u8> {
512
        self.to_bytes::<bytes::LE>()
513
    }
514
515
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in big endian
516
    /// format.
517
    ///
518
    /// The written bytes are guaranteed to be deserialized correctly and
519
    /// without errors in a semver compatible release of this crate by a
520
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
521
    /// deserialization APIs has been satisfied):
522
    ///
523
    /// * [`DFA::from_bytes`]
524
    /// * [`DFA::from_bytes_unchecked`]
525
    ///
526
    /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
527
    /// serialization methods, this does not add any initial padding to the
528
    /// returned bytes. Padding isn't required for sparse DFAs since they have
529
    /// no alignment requirements.
530
    ///
531
    /// # Example
532
    ///
533
    /// This example shows how to serialize and deserialize a DFA:
534
    ///
535
    /// ```
536
    /// use regex_automata::{
537
    ///     dfa::{Automaton, sparse::DFA},
538
    ///     HalfMatch,
539
    /// };
540
    ///
541
    /// // Compile our original DFA.
542
    /// let original_dfa = DFA::new("foo[0-9]+")?;
543
    ///
544
    /// // N.B. We use native endianness here to make the example work, but
545
    /// // using to_bytes_big_endian would work on a big endian target.
546
    /// let buf = original_dfa.to_bytes_native_endian();
547
    /// // Even if buf has initial padding, DFA::from_bytes will automatically
548
    /// // ignore it.
549
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
550
    ///
551
    /// let expected = HalfMatch::must(0, 8);
552
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
553
    /// # Ok::<(), Box<dyn std::error::Error>>(())
554
    /// ```
555
    #[cfg(feature = "alloc")]
556
    pub fn to_bytes_big_endian(&self) -> Vec<u8> {
557
        self.to_bytes::<bytes::BE>()
558
    }
559
560
    /// Serialize this DFA as raw bytes to a `Vec<u8>` in native endian
561
    /// format.
562
    ///
563
    /// The written bytes are guaranteed to be deserialized correctly and
564
    /// without errors in a semver compatible release of this crate by a
565
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
566
    /// deserialization APIs has been satisfied):
567
    ///
568
    /// * [`DFA::from_bytes`]
569
    /// * [`DFA::from_bytes_unchecked`]
570
    ///
571
    /// Note that unlike a [`dense::DFA`](crate::dfa::dense::DFA)'s
572
    /// serialization methods, this does not add any initial padding to the
573
    /// returned bytes. Padding isn't required for sparse DFAs since they have
574
    /// no alignment requirements.
575
    ///
576
    /// Generally speaking, native endian format should only be used when
577
    /// you know that the target you're compiling the DFA for matches the
578
    /// endianness of the target on which you're compiling DFA. For example,
579
    /// if serialization and deserialization happen in the same process or on
580
    /// the same machine. Otherwise, when serializing a DFA for use in a
581
    /// portable environment, you'll almost certainly want to serialize _both_
582
    /// a little endian and a big endian version and then load the correct one
583
    /// based on the target's configuration.
584
    ///
585
    /// # Example
586
    ///
587
    /// This example shows how to serialize and deserialize a DFA:
588
    ///
589
    /// ```
590
    /// use regex_automata::{
591
    ///     dfa::{Automaton, sparse::DFA},
592
    ///     HalfMatch,
593
    /// };
594
    ///
595
    /// // Compile our original DFA.
596
    /// let original_dfa = DFA::new("foo[0-9]+")?;
597
    ///
598
    /// let buf = original_dfa.to_bytes_native_endian();
599
    /// // Even if buf has initial padding, DFA::from_bytes will automatically
600
    /// // ignore it.
601
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf)?.0;
602
    ///
603
    /// let expected = HalfMatch::must(0, 8);
604
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
605
    /// # Ok::<(), Box<dyn std::error::Error>>(())
606
    /// ```
607
    #[cfg(feature = "alloc")]
608
    pub fn to_bytes_native_endian(&self) -> Vec<u8> {
609
        self.to_bytes::<bytes::NE>()
610
    }
611
612
    /// The implementation of the public `to_bytes` serialization methods,
613
    /// which is generic over endianness.
614
    #[cfg(feature = "alloc")]
615
    fn to_bytes<E: Endian>(&self) -> Vec<u8> {
616
        let mut buf = vec![0; self.write_to_len()];
617
        // This should always succeed since the only possible serialization
618
        // error is providing a buffer that's too small, but we've ensured that
619
        // `buf` is big enough here.
620
        self.write_to::<E>(&mut buf).unwrap();
621
        buf
622
    }
623
624
    /// Serialize this DFA as raw bytes to the given slice, in little endian
625
    /// format. Upon success, the total number of bytes written to `dst` is
626
    /// returned.
627
    ///
628
    /// The written bytes are guaranteed to be deserialized correctly and
629
    /// without errors in a semver compatible release of this crate by a
630
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
631
    /// deserialization APIs has been satisfied):
632
    ///
633
    /// * [`DFA::from_bytes`]
634
    /// * [`DFA::from_bytes_unchecked`]
635
    ///
636
    /// # Errors
637
    ///
638
    /// This returns an error if the given destination slice is not big enough
639
    /// to contain the full serialized DFA. If an error occurs, then nothing
640
    /// is written to `dst`.
641
    ///
642
    /// # Example
643
    ///
644
    /// This example shows how to serialize and deserialize a DFA without
645
    /// dynamic memory allocation.
646
    ///
647
    /// ```
648
    /// use regex_automata::{
649
    ///     dfa::{Automaton, sparse::DFA},
650
    ///     HalfMatch,
651
    /// };
652
    ///
653
    /// // Compile our original DFA.
654
    /// let original_dfa = DFA::new("foo[0-9]+")?;
655
    ///
656
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
657
    /// let mut buf = [0u8; 4 * (1<<10)];
658
    /// // N.B. We use native endianness here to make the example work, but
659
    /// // using write_to_little_endian would work on a little endian target.
660
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
661
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
662
    ///
663
    /// let expected = HalfMatch::must(0, 8);
664
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
665
    /// # Ok::<(), Box<dyn std::error::Error>>(())
666
    /// ```
667
0
    pub fn write_to_little_endian(
668
0
        &self,
669
0
        dst: &mut [u8],
670
0
    ) -> Result<usize, SerializeError> {
671
0
        self.write_to::<bytes::LE>(dst)
672
0
    }
673
674
    /// Serialize this DFA as raw bytes to the given slice, in big endian
675
    /// format. Upon success, the total number of bytes written to `dst` is
676
    /// returned.
677
    ///
678
    /// The written bytes are guaranteed to be deserialized correctly and
679
    /// without errors in a semver compatible release of this crate by a
680
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
681
    /// deserialization APIs has been satisfied):
682
    ///
683
    /// * [`DFA::from_bytes`]
684
    /// * [`DFA::from_bytes_unchecked`]
685
    ///
686
    /// # Errors
687
    ///
688
    /// This returns an error if the given destination slice is not big enough
689
    /// to contain the full serialized DFA. If an error occurs, then nothing
690
    /// is written to `dst`.
691
    ///
692
    /// # Example
693
    ///
694
    /// This example shows how to serialize and deserialize a DFA without
695
    /// dynamic memory allocation.
696
    ///
697
    /// ```
698
    /// use regex_automata::{
699
    ///     dfa::{Automaton, sparse::DFA},
700
    ///     HalfMatch,
701
    /// };
702
    ///
703
    /// // Compile our original DFA.
704
    /// let original_dfa = DFA::new("foo[0-9]+")?;
705
    ///
706
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
707
    /// let mut buf = [0u8; 4 * (1<<10)];
708
    /// // N.B. We use native endianness here to make the example work, but
709
    /// // using write_to_big_endian would work on a big endian target.
710
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
711
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
712
    ///
713
    /// let expected = HalfMatch::must(0, 8);
714
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
715
    /// # Ok::<(), Box<dyn std::error::Error>>(())
716
    /// ```
717
0
    pub fn write_to_big_endian(
718
0
        &self,
719
0
        dst: &mut [u8],
720
0
    ) -> Result<usize, SerializeError> {
721
0
        self.write_to::<bytes::BE>(dst)
722
0
    }
723
724
    /// Serialize this DFA as raw bytes to the given slice, in native endian
725
    /// format. Upon success, the total number of bytes written to `dst` is
726
    /// returned.
727
    ///
728
    /// The written bytes are guaranteed to be deserialized correctly and
729
    /// without errors in a semver compatible release of this crate by a
730
    /// `DFA`'s deserialization APIs (assuming all other criteria for the
731
    /// deserialization APIs has been satisfied):
732
    ///
733
    /// * [`DFA::from_bytes`]
734
    /// * [`DFA::from_bytes_unchecked`]
735
    ///
736
    /// Generally speaking, native endian format should only be used when
737
    /// you know that the target you're compiling the DFA for matches the
738
    /// endianness of the target on which you're compiling DFA. For example,
739
    /// if serialization and deserialization happen in the same process or on
740
    /// the same machine. Otherwise, when serializing a DFA for use in a
741
    /// portable environment, you'll almost certainly want to serialize _both_
742
    /// a little endian and a big endian version and then load the correct one
743
    /// based on the target's configuration.
744
    ///
745
    /// # Errors
746
    ///
747
    /// This returns an error if the given destination slice is not big enough
748
    /// to contain the full serialized DFA. If an error occurs, then nothing
749
    /// is written to `dst`.
750
    ///
751
    /// # Example
752
    ///
753
    /// This example shows how to serialize and deserialize a DFA without
754
    /// dynamic memory allocation.
755
    ///
756
    /// ```
757
    /// use regex_automata::{
758
    ///     dfa::{Automaton, sparse::DFA},
759
    ///     HalfMatch,
760
    /// };
761
    ///
762
    /// // Compile our original DFA.
763
    /// let original_dfa = DFA::new("foo[0-9]+")?;
764
    ///
765
    /// // Create a 4KB buffer on the stack to store our serialized DFA.
766
    /// let mut buf = [0u8; 4 * (1<<10)];
767
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
768
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
769
    ///
770
    /// let expected = HalfMatch::must(0, 8);
771
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
772
    /// # Ok::<(), Box<dyn std::error::Error>>(())
773
    /// ```
774
0
    pub fn write_to_native_endian(
775
0
        &self,
776
0
        dst: &mut [u8],
777
0
    ) -> Result<usize, SerializeError> {
778
0
        self.write_to::<bytes::NE>(dst)
779
0
    }
780
781
    /// The implementation of the public `write_to` serialization methods,
782
    /// which is generic over endianness.
783
0
    fn write_to<E: Endian>(
784
0
        &self,
785
0
        dst: &mut [u8],
786
0
    ) -> Result<usize, SerializeError> {
787
0
        let mut nw = 0;
788
0
        nw += bytes::write_label(LABEL, &mut dst[nw..])?;
789
0
        nw += bytes::write_endianness_check::<E>(&mut dst[nw..])?;
790
0
        nw += bytes::write_version::<E>(VERSION, &mut dst[nw..])?;
791
0
        nw += {
792
0
            // Currently unused, intended for future flexibility
793
0
            E::write_u32(0, &mut dst[nw..]);
794
0
            size_of::<u32>()
795
0
        };
796
0
        nw += self.trans.write_to::<E>(&mut dst[nw..])?;
797
0
        nw += self.starts.write_to::<E>(&mut dst[nw..])?;
798
0
        nw += self.special.write_to::<E>(&mut dst[nw..])?;
799
0
        Ok(nw)
800
0
    }
801
802
    /// Return the total number of bytes required to serialize this DFA.
803
    ///
804
    /// This is useful for determining the size of the buffer required to pass
805
    /// to one of the serialization routines:
806
    ///
807
    /// * [`DFA::write_to_little_endian`]
808
    /// * [`DFA::write_to_big_endian`]
809
    /// * [`DFA::write_to_native_endian`]
810
    ///
811
    /// Passing a buffer smaller than the size returned by this method will
812
    /// result in a serialization error.
813
    ///
814
    /// # Example
815
    ///
816
    /// This example shows how to dynamically allocate enough room to serialize
817
    /// a sparse DFA.
818
    ///
819
    /// ```
820
    /// use regex_automata::{
821
    ///     dfa::{Automaton, sparse::DFA},
822
    ///     HalfMatch,
823
    /// };
824
    ///
825
    /// // Compile our original DFA.
826
    /// let original_dfa = DFA::new("foo[0-9]+")?;
827
    ///
828
    /// let mut buf = vec![0; original_dfa.write_to_len()];
829
    /// let written = original_dfa.write_to_native_endian(&mut buf)?;
830
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&buf[..written])?.0;
831
    ///
832
    /// let expected = HalfMatch::must(0, 8);
833
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
834
    /// # Ok::<(), Box<dyn std::error::Error>>(())
835
    /// ```
836
0
    pub fn write_to_len(&self) -> usize {
837
0
        bytes::write_label_len(LABEL)
838
0
        + bytes::write_endianness_check_len()
839
0
        + bytes::write_version_len()
840
0
        + size_of::<u32>() // unused, intended for future flexibility
841
0
        + self.trans.write_to_len()
842
0
        + self.starts.write_to_len()
843
0
        + self.special.write_to_len()
844
0
    }
845
}
846
847
impl<'a> DFA<&'a [u8]> {
848
    /// Safely deserialize a sparse DFA with a specific state identifier
849
    /// representation. Upon success, this returns both the deserialized DFA
850
    /// and the number of bytes read from the given slice. Namely, the contents
851
    /// of the slice beyond the DFA are not read.
852
    ///
853
    /// Deserializing a DFA using this routine will never allocate heap memory.
854
    /// For safety purposes, the DFA's transitions will be verified such that
855
    /// every transition points to a valid state. If this verification is too
856
    /// costly, then a [`DFA::from_bytes_unchecked`] API is provided, which
857
    /// will always execute in constant time.
858
    ///
859
    /// The bytes given must be generated by one of the serialization APIs
860
    /// of a `DFA` using a semver compatible release of this crate. Those
861
    /// include:
862
    ///
863
    /// * [`DFA::to_bytes_little_endian`]
864
    /// * [`DFA::to_bytes_big_endian`]
865
    /// * [`DFA::to_bytes_native_endian`]
866
    /// * [`DFA::write_to_little_endian`]
867
    /// * [`DFA::write_to_big_endian`]
868
    /// * [`DFA::write_to_native_endian`]
869
    ///
870
    /// The `to_bytes` methods allocate and return a `Vec<u8>` for you. The
871
    /// `write_to` methods do not allocate and write to an existing slice
872
    /// (which may be on the stack). Since deserialization always uses the
873
    /// native endianness of the target platform, the serialization API you use
874
    /// should match the endianness of the target platform. (It's often a good
875
    /// idea to generate serialized DFAs for both forms of endianness and then
876
    /// load the correct one based on endianness.)
877
    ///
878
    /// # Errors
879
    ///
880
    /// Generally speaking, it's easier to state the conditions in which an
881
    /// error is _not_ returned. All of the following must be true:
882
    ///
883
    /// * The bytes given must be produced by one of the serialization APIs
884
    ///   on this DFA, as mentioned above.
885
    /// * The endianness of the target platform matches the endianness used to
886
    ///   serialized the provided DFA.
887
    ///
888
    /// If any of the above are not true, then an error will be returned.
889
    ///
890
    /// Note that unlike deserializing a
891
    /// [`dense::DFA`](crate::dfa::dense::DFA), deserializing a sparse DFA has
892
    /// no alignment requirements. That is, an alignment of `1` is valid.
893
    ///
894
    /// # Panics
895
    ///
896
    /// This routine will never panic for any input.
897
    ///
898
    /// # Example
899
    ///
900
    /// This example shows how to serialize a DFA to raw bytes, deserialize it
901
    /// and then use it for searching.
902
    ///
903
    /// ```
904
    /// use regex_automata::{
905
    ///     dfa::{Automaton, sparse::DFA},
906
    ///     HalfMatch,
907
    /// };
908
    ///
909
    /// let initial = DFA::new("foo[0-9]+")?;
910
    /// let bytes = initial.to_bytes_native_endian();
911
    /// let dfa: DFA<&[u8]> = DFA::from_bytes(&bytes)?.0;
912
    ///
913
    /// let expected = HalfMatch::must(0, 8);
914
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
915
    /// # Ok::<(), Box<dyn std::error::Error>>(())
916
    /// ```
917
    ///
918
    /// # Example: loading a DFA from static memory
919
    ///
920
    /// One use case this library supports is the ability to serialize a
921
    /// DFA to disk and then use `include_bytes!` to store it in a compiled
922
    /// Rust program. Those bytes can then be cheaply deserialized into a
923
    /// `DFA` structure at runtime and used for searching without having to
924
    /// re-compile the DFA (which can be quite costly).
925
    ///
926
    /// We can show this in two parts. The first part is serializing the DFA to
927
    /// a file:
928
    ///
929
    /// ```no_run
930
    /// use regex_automata::dfa::{Automaton, sparse::DFA};
931
    ///
932
    /// let dfa = DFA::new("foo[0-9]+")?;
933
    ///
934
    /// // Write a big endian serialized version of this DFA to a file.
935
    /// let bytes = dfa.to_bytes_big_endian();
936
    /// std::fs::write("foo.bigendian.dfa", &bytes)?;
937
    ///
938
    /// // Do it again, but this time for little endian.
939
    /// let bytes = dfa.to_bytes_little_endian();
940
    /// std::fs::write("foo.littleendian.dfa", &bytes)?;
941
    /// # Ok::<(), Box<dyn std::error::Error>>(())
942
    /// ```
943
    ///
944
    /// And now the second part is embedding the DFA into the compiled program
945
    /// and deserializing it at runtime on first use. We use conditional
946
    /// compilation to choose the correct endianness. As mentioned above, we
947
    /// do not need to employ any special tricks to ensure a proper alignment,
948
    /// since a sparse DFA has no alignment requirements.
949
    ///
950
    /// ```no_run
951
    /// use regex_automata::{
952
    ///     dfa::{Automaton, sparse},
953
    ///     HalfMatch,
954
    /// };
955
    ///
956
    /// type DFA = sparse::DFA<&'static [u8]>;
957
    ///
958
    /// fn get_foo() -> &'static DFA {
959
    ///     use std::cell::Cell;
960
    ///     use std::mem::MaybeUninit;
961
    ///     use std::sync::Once;
962
    ///
963
    ///     # const _: &str = stringify! {
964
    ///     #[cfg(target_endian = "big")]
965
    ///     static BYTES: &[u8] = include_bytes!("foo.bigendian.dfa");
966
    ///     #[cfg(target_endian = "little")]
967
    ///     static BYTES: &[u8] = include_bytes!("foo.littleendian.dfa");
968
    ///     # };
969
    ///     # static BYTES: &[u8] = b"";
970
    ///
971
    ///     struct Lazy(Cell<MaybeUninit<DFA>>);
972
    ///     // SAFETY: This is safe because DFA impls Sync.
973
    ///     unsafe impl Sync for Lazy {}
974
    ///
975
    ///     static INIT: Once = Once::new();
976
    ///     static DFA: Lazy = Lazy(Cell::new(MaybeUninit::uninit()));
977
    ///
978
    ///     INIT.call_once(|| {
979
    ///         let (dfa, _) = DFA::from_bytes(BYTES)
980
    ///             .expect("serialized DFA should be valid");
981
    ///         // SAFETY: This is guaranteed to only execute once, and all
982
    ///         // we do with the pointer is write the DFA to it.
983
    ///         unsafe {
984
    ///             (*DFA.0.as_ptr()).as_mut_ptr().write(dfa);
985
    ///         }
986
    ///     });
987
    ///     // SAFETY: DFA is guaranteed to by initialized via INIT and is
988
    ///     // stored in static memory.
989
    ///     unsafe {
990
    ///         let dfa = (*DFA.0.as_ptr()).as_ptr();
991
    ///         std::mem::transmute::<*const DFA, &'static DFA>(dfa)
992
    ///     }
993
    /// }
994
    ///
995
    /// let dfa = get_foo();
996
    /// let expected = HalfMatch::must(0, 8);
997
    /// assert_eq!(Ok(Some(expected)), dfa.find_leftmost_fwd(b"foo12345"));
998
    /// ```
999
    ///
1000
    /// Alternatively, consider using
1001
    /// [`lazy_static`](https://crates.io/crates/lazy_static)
1002
    /// or
1003
    /// [`once_cell`](https://crates.io/crates/once_cell),
1004
    /// which will guarantee safety for you.
1005
0
    pub fn from_bytes(
1006
0
        slice: &'a [u8],
1007
0
    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
1008
        // SAFETY: This is safe because we validate both the sparse transitions
1009
        // (by trying to decode every state) and start state ID list below. If
1010
        // either validation fails, then we return an error.
1011
0
        let (dfa, nread) = unsafe { DFA::from_bytes_unchecked(slice)? };
1012
0
        dfa.trans.validate()?;
1013
0
        dfa.starts.validate(&dfa.trans)?;
1014
        // N.B. dfa.special doesn't have a way to do unchecked deserialization,
1015
        // so it has already been validated.
1016
0
        Ok((dfa, nread))
1017
0
    }
1018
1019
    /// Deserialize a DFA with a specific state identifier representation in
1020
    /// constant time by omitting the verification of the validity of the
1021
    /// sparse transitions.
1022
    ///
1023
    /// This is just like [`DFA::from_bytes`], except it can potentially return
1024
    /// a DFA that exhibits undefined behavior if its transitions contains
1025
    /// invalid state identifiers.
1026
    ///
1027
    /// This routine is useful if you need to deserialize a DFA cheaply and
1028
    /// cannot afford the transition validation performed by `from_bytes`.
1029
    ///
1030
    /// # Safety
1031
    ///
1032
    /// This routine is unsafe because it permits callers to provide
1033
    /// arbitrary transitions with possibly incorrect state identifiers. While
1034
    /// the various serialization routines will never return an incorrect
1035
    /// DFA, there is no guarantee that the bytes provided here
1036
    /// are correct. While `from_bytes_unchecked` will still do several forms
1037
    /// of basic validation, this routine does not check that the transitions
1038
    /// themselves are correct. Given an incorrect transition table, it is
1039
    /// possible for the search routines to access out-of-bounds memory because
1040
    /// of explicit bounds check elision.
1041
    ///
1042
    /// # Example
1043
    ///
1044
    /// ```
1045
    /// use regex_automata::{
1046
    ///     dfa::{Automaton, sparse::DFA},
1047
    ///     HalfMatch,
1048
    /// };
1049
    ///
1050
    /// let initial = DFA::new("foo[0-9]+")?;
1051
    /// let bytes = initial.to_bytes_native_endian();
1052
    /// // SAFETY: This is guaranteed to be safe since the bytes given come
1053
    /// // directly from a compatible serialization routine.
1054
    /// let dfa: DFA<&[u8]> = unsafe { DFA::from_bytes_unchecked(&bytes)?.0 };
1055
    ///
1056
    /// let expected = HalfMatch::must(0, 8);
1057
    /// assert_eq!(Some(expected), dfa.find_leftmost_fwd(b"foo12345")?);
1058
    /// # Ok::<(), Box<dyn std::error::Error>>(())
1059
    /// ```
1060
0
    pub unsafe fn from_bytes_unchecked(
1061
0
        slice: &'a [u8],
1062
0
    ) -> Result<(DFA<&'a [u8]>, usize), DeserializeError> {
1063
0
        let mut nr = 0;
1064
0
1065
0
        nr += bytes::read_label(&slice[nr..], LABEL)?;
1066
0
        nr += bytes::read_endianness_check(&slice[nr..])?;
1067
0
        nr += bytes::read_version(&slice[nr..], VERSION)?;
1068
1069
0
        let _unused = bytes::try_read_u32(&slice[nr..], "unused space")?;
1070
0
        nr += size_of::<u32>();
1071
1072
0
        let (trans, nread) = Transitions::from_bytes_unchecked(&slice[nr..])?;
1073
0
        nr += nread;
1074
1075
0
        let (starts, nread) = StartTable::from_bytes_unchecked(&slice[nr..])?;
1076
0
        nr += nread;
1077
1078
0
        let (special, nread) = Special::from_bytes(&slice[nr..])?;
1079
0
        nr += nread;
1080
0
        if special.max.as_usize() >= trans.sparse().len() {
1081
0
            return Err(DeserializeError::generic(
1082
0
                "max should not be greater than or equal to sparse bytes",
1083
0
            ));
1084
0
        }
1085
0
1086
0
        Ok((DFA { trans, starts, special }, nr))
1087
0
    }
1088
}
1089
1090
impl<T: AsRef<[u8]>> fmt::Debug for DFA<T> {
1091
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1092
0
        writeln!(f, "sparse::DFA(")?;
1093
0
        for state in self.trans.states() {
1094
0
            fmt_state_indicator(f, self, state.id())?;
1095
0
            writeln!(f, "{:06?}: {:?}", state.id(), state)?;
1096
        }
1097
0
        writeln!(f, "")?;
1098
0
        for (i, (start_id, sty, pid)) in self.starts.iter().enumerate() {
1099
0
            if i % self.starts.stride == 0 {
1100
0
                match pid {
1101
0
                    None => writeln!(f, "START-GROUP(ALL)")?,
1102
0
                    Some(pid) => {
1103
0
                        writeln!(f, "START_GROUP(pattern: {:?})", pid)?
1104
                    }
1105
                }
1106
0
            }
1107
0
            writeln!(f, "  {:?} => {:06?}", sty, start_id.as_usize())?;
1108
        }
1109
0
        writeln!(f, "state count: {:?}", self.trans.count)?;
1110
0
        writeln!(f, ")")?;
1111
0
        Ok(())
1112
0
    }
1113
}
1114
1115
unsafe impl<T: AsRef<[u8]>> Automaton for DFA<T> {
1116
    #[inline]
1117
0
    fn is_special_state(&self, id: StateID) -> bool {
1118
0
        self.special.is_special_state(id)
1119
0
    }
1120
1121
    #[inline]
1122
0
    fn is_dead_state(&self, id: StateID) -> bool {
1123
0
        self.special.is_dead_state(id)
1124
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<&[u8]> as regex_automata::dfa::automaton::Automaton>::is_dead_state
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<_> as regex_automata::dfa::automaton::Automaton>::is_dead_state
1125
1126
    #[inline]
1127
0
    fn is_quit_state(&self, id: StateID) -> bool {
1128
0
        self.special.is_quit_state(id)
1129
0
    }
1130
1131
    #[inline]
1132
0
    fn is_match_state(&self, id: StateID) -> bool {
1133
0
        self.special.is_match_state(id)
1134
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<&[u8]> as regex_automata::dfa::automaton::Automaton>::is_match_state
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<_> as regex_automata::dfa::automaton::Automaton>::is_match_state
1135
1136
    #[inline]
1137
0
    fn is_start_state(&self, id: StateID) -> bool {
1138
0
        self.special.is_start_state(id)
1139
0
    }
1140
1141
    #[inline]
1142
0
    fn is_accel_state(&self, id: StateID) -> bool {
1143
0
        self.special.is_accel_state(id)
1144
0
    }
1145
1146
    // This is marked as inline to help dramatically boost sparse searching,
1147
    // which decodes each state it enters to follow the next transition.
1148
    #[inline(always)]
1149
0
    fn next_state(&self, current: StateID, input: u8) -> StateID {
1150
0
        let input = self.trans.classes.get(input);
1151
0
        self.trans.state(current).next(input)
1152
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<&[u8]> as regex_automata::dfa::automaton::Automaton>::next_state
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<_> as regex_automata::dfa::automaton::Automaton>::next_state
1153
1154
    #[inline]
1155
0
    unsafe fn next_state_unchecked(
1156
0
        &self,
1157
0
        current: StateID,
1158
0
        input: u8,
1159
0
    ) -> StateID {
1160
0
        self.next_state(current, input)
1161
0
    }
1162
1163
    #[inline]
1164
0
    fn next_eoi_state(&self, current: StateID) -> StateID {
1165
0
        self.trans.state(current).next_eoi()
1166
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<&[u8]> as regex_automata::dfa::automaton::Automaton>::next_eoi_state
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<_> as regex_automata::dfa::automaton::Automaton>::next_eoi_state
1167
1168
    #[inline]
1169
0
    fn pattern_count(&self) -> usize {
1170
0
        self.trans.patterns
1171
0
    }
1172
1173
    #[inline]
1174
0
    fn match_count(&self, id: StateID) -> usize {
1175
0
        self.trans.state(id).pattern_count()
1176
0
    }
1177
1178
    #[inline]
1179
0
    fn match_pattern(&self, id: StateID, match_index: usize) -> PatternID {
1180
0
        // This is an optimization for the very common case of a DFA with a
1181
0
        // single pattern. This conditional avoids a somewhat more costly path
1182
0
        // that finds the pattern ID from the state machine, which requires
1183
0
        // a bit of slicing/pointer-chasing. This optimization tends to only
1184
0
        // matter when matches are frequent.
1185
0
        if self.trans.patterns == 1 {
1186
0
            return PatternID::ZERO;
1187
0
        }
1188
0
        self.trans.state(id).pattern_id(match_index)
1189
0
    }
1190
1191
    #[inline]
1192
0
    fn start_state_forward(
1193
0
        &self,
1194
0
        pattern_id: Option<PatternID>,
1195
0
        bytes: &[u8],
1196
0
        start: usize,
1197
0
        end: usize,
1198
0
    ) -> StateID {
1199
0
        let index = Start::from_position_fwd(bytes, start, end);
1200
0
        self.starts.start(index, pattern_id)
1201
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<&[u8]> as regex_automata::dfa::automaton::Automaton>::start_state_forward
Unexecuted instantiation: <regex_automata::dfa::sparse::DFA<_> as regex_automata::dfa::automaton::Automaton>::start_state_forward
1202
1203
    #[inline]
1204
0
    fn start_state_reverse(
1205
0
        &self,
1206
0
        pattern_id: Option<PatternID>,
1207
0
        bytes: &[u8],
1208
0
        start: usize,
1209
0
        end: usize,
1210
0
    ) -> StateID {
1211
0
        let index = Start::from_position_rev(bytes, start, end);
1212
0
        self.starts.start(index, pattern_id)
1213
0
    }
1214
1215
    #[inline]
1216
0
    fn accelerator(&self, id: StateID) -> &[u8] {
1217
0
        self.trans.state(id).accelerator()
1218
0
    }
1219
}
1220
1221
/// The transition table portion of a sparse DFA.
1222
///
1223
/// The transition table is the core part of the DFA in that it describes how
1224
/// to move from one state to another based on the input sequence observed.
1225
///
1226
/// Unlike a typical dense table based DFA, states in a sparse transition
1227
/// table have variable size. That is, states with more transitions use more
1228
/// space than states with fewer transitions. This means that finding the next
1229
/// transition takes more work than with a dense DFA, but also typically uses
1230
/// much less space.
1231
#[derive(Clone)]
1232
struct Transitions<T> {
1233
    /// The raw encoding of each state in this DFA.
1234
    ///
1235
    /// Each state has the following information:
1236
    ///
1237
    /// * A set of transitions to subsequent states. Transitions to the dead
1238
    ///   state are omitted.
1239
    /// * If the state can be accelerated, then any additional accelerator
1240
    ///   information.
1241
    /// * If the state is a match state, then the state contains all pattern
1242
    ///   IDs that match when in that state.
1243
    ///
1244
    /// To decode a state, use Transitions::state.
1245
    ///
1246
    /// In practice, T is either Vec<u8> or &[u8].
1247
    sparse: T,
1248
    /// A set of equivalence classes, where a single equivalence class
1249
    /// represents a set of bytes that never discriminate between a match
1250
    /// and a non-match in the DFA. Each equivalence class corresponds to a
1251
    /// single character in this DFA's alphabet, where the maximum number of
1252
    /// characters is 257 (each possible value of a byte plus the special
1253
    /// EOI transition). Consequently, the number of equivalence classes
1254
    /// corresponds to the number of transitions for each DFA state. Note
1255
    /// though that the *space* used by each DFA state in the transition table
1256
    /// may be larger. The total space used by each DFA state is known as the
1257
    /// stride and is documented above.
1258
    ///
1259
    /// The only time the number of equivalence classes is fewer than 257 is
1260
    /// if the DFA's kind uses byte classes which is the default. Equivalence
1261
    /// classes should generally only be disabled when debugging, so that
1262
    /// the transitions themselves aren't obscured. Disabling them has no
1263
    /// other benefit, since the equivalence class map is always used while
1264
    /// searching. In the vast majority of cases, the number of equivalence
1265
    /// classes is substantially smaller than 257, particularly when large
1266
    /// Unicode classes aren't used.
1267
    ///
1268
    /// N.B. Equivalence classes aren't particularly useful in a sparse DFA
1269
    /// in the current implementation, since equivalence classes generally tend
1270
    /// to correspond to continuous ranges of bytes that map to the same
1271
    /// transition. So in a sparse DFA, equivalence classes don't really lead
1272
    /// to a space savings. In the future, it would be good to try and remove
1273
    /// them from sparse DFAs entirely, but requires a bit of work since sparse
1274
    /// DFAs are built from dense DFAs, which are in turn built on top of
1275
    /// equivalence classes.
1276
    classes: ByteClasses,
1277
    /// The total number of states in this DFA. Note that a DFA always has at
1278
    /// least one state---the dead state---even the empty DFA. In particular,
1279
    /// the dead state always has ID 0 and is correspondingly always the first
1280
    /// state. The dead state is never a match state.
1281
    count: usize,
1282
    /// The total number of unique patterns represented by these match states.
1283
    patterns: usize,
1284
}
1285
1286
impl<'a> Transitions<&'a [u8]> {
1287
0
    unsafe fn from_bytes_unchecked(
1288
0
        mut slice: &'a [u8],
1289
0
    ) -> Result<(Transitions<&'a [u8]>, usize), DeserializeError> {
1290
0
        let slice_start = slice.as_ptr() as usize;
1291
1292
0
        let (state_count, nr) =
1293
0
            bytes::try_read_u32_as_usize(&slice, "state count")?;
1294
0
        slice = &slice[nr..];
1295
1296
0
        let (pattern_count, nr) =
1297
0
            bytes::try_read_u32_as_usize(&slice, "pattern count")?;
1298
0
        slice = &slice[nr..];
1299
1300
0
        let (classes, nr) = ByteClasses::from_bytes(&slice)?;
1301
0
        slice = &slice[nr..];
1302
1303
0
        let (len, nr) =
1304
0
            bytes::try_read_u32_as_usize(&slice, "sparse transitions length")?;
1305
0
        slice = &slice[nr..];
1306
0
1307
0
        bytes::check_slice_len(slice, len, "sparse states byte length")?;
1308
0
        let sparse = &slice[..len];
1309
0
        slice = &slice[len..];
1310
0
1311
0
        let trans = Transitions {
1312
0
            sparse,
1313
0
            classes,
1314
0
            count: state_count,
1315
0
            patterns: pattern_count,
1316
0
        };
1317
0
        Ok((trans, slice.as_ptr() as usize - slice_start))
1318
0
    }
1319
}
1320
1321
impl<T: AsRef<[u8]>> Transitions<T> {
1322
    /// Writes a serialized form of this transition table to the buffer given.
1323
    /// If the buffer is too small, then an error is returned. To determine
1324
    /// how big the buffer must be, use `write_to_len`.
1325
0
    fn write_to<E: Endian>(
1326
0
        &self,
1327
0
        mut dst: &mut [u8],
1328
0
    ) -> Result<usize, SerializeError> {
1329
0
        let nwrite = self.write_to_len();
1330
0
        if dst.len() < nwrite {
1331
0
            return Err(SerializeError::buffer_too_small(
1332
0
                "sparse transition table",
1333
0
            ));
1334
0
        }
1335
0
        dst = &mut dst[..nwrite];
1336
0
1337
0
        // write state count
1338
0
        E::write_u32(u32::try_from(self.count).unwrap(), dst);
1339
0
        dst = &mut dst[size_of::<u32>()..];
1340
0
1341
0
        // write pattern count
1342
0
        E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
1343
0
        dst = &mut dst[size_of::<u32>()..];
1344
1345
        // write byte class map
1346
0
        let n = self.classes.write_to(dst)?;
1347
0
        dst = &mut dst[n..];
1348
0
1349
0
        // write number of bytes in sparse transitions
1350
0
        E::write_u32(u32::try_from(self.sparse().len()).unwrap(), dst);
1351
0
        dst = &mut dst[size_of::<u32>()..];
1352
0
1353
0
        // write actual transitions
1354
0
        dst.copy_from_slice(self.sparse());
1355
0
        Ok(nwrite)
1356
0
    }
1357
1358
    /// Returns the number of bytes the serialized form of this transition
1359
    /// table will use.
1360
0
    fn write_to_len(&self) -> usize {
1361
0
        size_of::<u32>()   // state count
1362
0
        + size_of::<u32>() // pattern count
1363
0
        + self.classes.write_to_len()
1364
0
        + size_of::<u32>() // sparse transitions length
1365
0
        + self.sparse().len()
1366
0
    }
1367
1368
    /// Validates that every state ID in this transition table is valid.
1369
    ///
1370
    /// That is, every state ID can be used to correctly index a state in this
1371
    /// table.
1372
0
    fn validate(&self) -> Result<(), DeserializeError> {
1373
        // In order to validate everything, we not only need to make sure we
1374
        // can decode every state, but that every transition in every state
1375
        // points to a valid state. There are many duplicative transitions, so
1376
        // we record state IDs that we've verified so that we don't redo the
1377
        // decoding work.
1378
        //
1379
        // Except, when in no_std mode, we don't have dynamic memory allocation
1380
        // available to us, so we skip this optimization. It's not clear
1381
        // whether doing something more clever is worth it just yet. If you're
1382
        // profiling this code and need it to run faster, please file an issue.
1383
        //
1384
        // ---AG
1385
        struct Seen {
1386
            #[cfg(feature = "alloc")]
1387
            set: BTreeSet<StateID>,
1388
            #[cfg(not(feature = "alloc"))]
1389
            set: core::marker::PhantomData<StateID>,
1390
        }
1391
1392
        #[cfg(feature = "alloc")]
1393
        impl Seen {
1394
            fn new() -> Seen {
1395
                Seen { set: BTreeSet::new() }
1396
            }
1397
            fn insert(&mut self, id: StateID) {
1398
                self.set.insert(id);
1399
            }
1400
            fn contains(&self, id: &StateID) -> bool {
1401
                self.set.contains(id)
1402
            }
1403
        }
1404
1405
        #[cfg(not(feature = "alloc"))]
1406
        impl Seen {
1407
0
            fn new() -> Seen {
1408
0
                Seen { set: core::marker::PhantomData }
1409
0
            }
1410
0
            fn insert(&mut self, _id: StateID) {}
1411
0
            fn contains(&self, _id: &StateID) -> bool {
1412
0
                false
1413
0
            }
1414
        }
1415
1416
0
        let mut verified: Seen = Seen::new();
1417
0
        // We need to make sure that we decode the correct number of states.
1418
0
        // Otherwise, an empty set of transitions would validate even if the
1419
0
        // recorded state count is non-empty.
1420
0
        let mut count = 0;
1421
0
        // We can't use the self.states() iterator because it assumes the state
1422
0
        // encodings are valid. It could panic if they aren't.
1423
0
        let mut id = DEAD;
1424
0
        while id.as_usize() < self.sparse().len() {
1425
0
            let state = self.try_state(id)?;
1426
0
            verified.insert(id);
1427
0
            // The next ID should be the offset immediately following `state`.
1428
0
            id = StateID::new(bytes::add(
1429
0
                id.as_usize(),
1430
0
                state.bytes_len(),
1431
0
                "next state ID offset",
1432
0
            )?)
1433
0
            .map_err(|err| {
1434
0
                DeserializeError::state_id_error(err, "next state ID offset")
1435
0
            })?;
1436
0
            count += 1;
1437
1438
            // Now check that all transitions in this state are correct.
1439
0
            for i in 0..state.ntrans {
1440
0
                let to = state.next_at(i);
1441
0
                if verified.contains(&to) {
1442
0
                    continue;
1443
0
                }
1444
0
                let _ = self.try_state(to)?;
1445
0
                verified.insert(id);
1446
            }
1447
        }
1448
0
        if count != self.count {
1449
0
            return Err(DeserializeError::generic(
1450
0
                "mismatching sparse state count",
1451
0
            ));
1452
0
        }
1453
0
        Ok(())
1454
0
    }
1455
1456
    /// Converts these transitions to a borrowed value.
1457
0
    fn as_ref(&self) -> Transitions<&'_ [u8]> {
1458
0
        Transitions {
1459
0
            sparse: self.sparse(),
1460
0
            classes: self.classes.clone(),
1461
0
            count: self.count,
1462
0
            patterns: self.patterns,
1463
0
        }
1464
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<&[u8]>>::as_ref
Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<_>>::as_ref
1465
1466
    /// Converts these transitions to an owned value.
1467
    #[cfg(feature = "alloc")]
1468
    fn to_owned(&self) -> Transitions<Vec<u8>> {
1469
        Transitions {
1470
            sparse: self.sparse().to_vec(),
1471
            classes: self.classes.clone(),
1472
            count: self.count,
1473
            patterns: self.patterns,
1474
        }
1475
    }
1476
1477
    /// Return a convenient representation of the given state.
1478
    ///
1479
    /// This panics if the state is invalid.
1480
    ///
1481
    /// This is marked as inline to help dramatically boost sparse searching,
1482
    /// which decodes each state it enters to follow the next transition. Other
1483
    /// functions involved are also inlined, which should hopefully eliminate
1484
    /// a lot of the extraneous decoding that is never needed just to follow
1485
    /// the next transition.
1486
    #[inline(always)]
1487
0
    fn state(&self, id: StateID) -> State<'_> {
1488
0
        let mut state = &self.sparse()[id.as_usize()..];
1489
0
        let mut ntrans = bytes::read_u16(&state) as usize;
1490
0
        let is_match = (1 << 15) & ntrans != 0;
1491
0
        ntrans &= !(1 << 15);
1492
0
        state = &state[2..];
1493
0
1494
0
        let (input_ranges, state) = state.split_at(ntrans * 2);
1495
0
        let (next, state) = state.split_at(ntrans * StateID::SIZE);
1496
0
        let (pattern_ids, state) = if is_match {
1497
0
            let npats = bytes::read_u32(&state) as usize;
1498
0
            state[4..].split_at(npats * 4)
1499
        } else {
1500
0
            (&[][..], state)
1501
        };
1502
1503
0
        let accel_len = state[0] as usize;
1504
0
        let accel = &state[1..accel_len + 1];
1505
0
        State { id, is_match, ntrans, input_ranges, next, pattern_ids, accel }
1506
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<&[u8]>>::state
Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<&[u8]>>::state
Unexecuted instantiation: <regex_automata::dfa::sparse::Transitions<_>>::state
1507
1508
    /// Like `state`, but will return an error if the state encoding is
1509
    /// invalid. This is useful for verifying states after deserialization,
1510
    /// which is required for a safe deserialization API.
1511
    ///
1512
    /// Note that this only verifies that this state is decodable and that
1513
    /// all of its data is consistent. It does not verify that its state ID
1514
    /// transitions point to valid states themselves, nor does it verify that
1515
    /// every pattern ID is valid.
1516
0
    fn try_state(&self, id: StateID) -> Result<State<'_>, DeserializeError> {
1517
0
        if id.as_usize() > self.sparse().len() {
1518
0
            return Err(DeserializeError::generic("invalid sparse state ID"));
1519
0
        }
1520
0
        let mut state = &self.sparse()[id.as_usize()..];
1521
        // Encoding format starts with a u16 that stores the total number of
1522
        // transitions in this state.
1523
0
        let (mut ntrans, _) =
1524
0
            bytes::try_read_u16_as_usize(state, "state transition count")?;
1525
0
        let is_match = ((1 << 15) & ntrans) != 0;
1526
0
        ntrans &= !(1 << 15);
1527
0
        state = &state[2..];
1528
0
        if ntrans > 257 || ntrans == 0 {
1529
0
            return Err(DeserializeError::generic("invalid transition count"));
1530
0
        }
1531
0
1532
0
        // Each transition has two pieces: an inclusive range of bytes on which
1533
0
        // it is defined, and the state ID that those bytes transition to. The
1534
0
        // pairs come first, followed by a corresponding sequence of state IDs.
1535
0
        let input_ranges_len = ntrans.checked_mul(2).unwrap();
1536
0
        bytes::check_slice_len(state, input_ranges_len, "sparse byte pairs")?;
1537
0
        let (input_ranges, state) = state.split_at(input_ranges_len);
1538
        // Every range should be of the form A-B, where A<=B.
1539
0
        for pair in input_ranges.chunks(2) {
1540
0
            let (start, end) = (pair[0], pair[1]);
1541
0
            if start > end {
1542
0
                return Err(DeserializeError::generic("invalid input range"));
1543
0
            }
1544
        }
1545
1546
        // And now extract the corresponding sequence of state IDs. We leave
1547
        // this sequence as a &[u8] instead of a &[S] because sparse DFAs do
1548
        // not have any alignment requirements.
1549
0
        let next_len = ntrans
1550
0
            .checked_mul(self.id_len())
1551
0
            .expect("state size * #trans should always fit in a usize");
1552
0
        bytes::check_slice_len(state, next_len, "sparse trans state IDs")?;
1553
0
        let (next, state) = state.split_at(next_len);
1554
        // We can at least verify that every state ID is in bounds.
1555
0
        for idbytes in next.chunks(self.id_len()) {
1556
0
            let (id, _) =
1557
0
                bytes::read_state_id(idbytes, "sparse state ID in try_state")?;
1558
0
            bytes::check_slice_len(
1559
0
                self.sparse(),
1560
0
                id.as_usize(),
1561
0
                "invalid sparse state ID",
1562
0
            )?;
1563
        }
1564
1565
        // If this is a match state, then read the pattern IDs for this state.
1566
        // Pattern IDs is a u32-length prefixed sequence of native endian
1567
        // encoded 32-bit integers.
1568
0
        let (pattern_ids, state) = if is_match {
1569
0
            let (npats, nr) =
1570
0
                bytes::try_read_u32_as_usize(state, "pattern ID count")?;
1571
0
            let state = &state[nr..];
1572
1573
0
            let pattern_ids_len =
1574
0
                bytes::mul(npats, 4, "sparse pattern ID byte length")?;
1575
0
            bytes::check_slice_len(
1576
0
                state,
1577
0
                pattern_ids_len,
1578
0
                "sparse pattern IDs",
1579
0
            )?;
1580
0
            let (pattern_ids, state) = state.split_at(pattern_ids_len);
1581
0
            for patbytes in pattern_ids.chunks(PatternID::SIZE) {
1582
0
                bytes::read_pattern_id(
1583
0
                    patbytes,
1584
0
                    "sparse pattern ID in try_state",
1585
0
                )?;
1586
            }
1587
0
            (pattern_ids, state)
1588
        } else {
1589
0
            (&[][..], state)
1590
        };
1591
1592
        // Now read this state's accelerator info. The first byte is the length
1593
        // of the accelerator, which is typically 0 (for no acceleration) but
1594
        // is no bigger than 3. The length indicates the number of bytes that
1595
        // follow, where each byte corresponds to a transition out of this
1596
        // state.
1597
0
        if state.is_empty() {
1598
0
            return Err(DeserializeError::generic("no accelerator length"));
1599
0
        }
1600
0
        let (accel_len, state) = (state[0] as usize, &state[1..]);
1601
0
1602
0
        if accel_len > 3 {
1603
0
            return Err(DeserializeError::generic(
1604
0
                "sparse invalid accelerator length",
1605
0
            ));
1606
0
        }
1607
0
        bytes::check_slice_len(
1608
0
            state,
1609
0
            accel_len,
1610
0
            "sparse corrupt accelerator length",
1611
0
        )?;
1612
0
        let (accel, _) = (&state[..accel_len], &state[accel_len..]);
1613
0
1614
0
        Ok(State {
1615
0
            id,
1616
0
            is_match,
1617
0
            ntrans,
1618
0
            input_ranges,
1619
0
            next,
1620
0
            pattern_ids,
1621
0
            accel,
1622
0
        })
1623
0
    }
1624
1625
    /// Return an iterator over all of the states in this DFA.
1626
    ///
1627
    /// The iterator returned yields tuples, where the first element is the
1628
    /// state ID and the second element is the state itself.
1629
0
    fn states(&self) -> StateIter<'_, T> {
1630
0
        StateIter { trans: self, id: DEAD.as_usize() }
1631
0
    }
1632
1633
    /// Returns the sparse transitions as raw bytes.
1634
0
    fn sparse(&self) -> &[u8] {
1635
0
        self.sparse.as_ref()
1636
0
    }
1637
1638
    /// Returns the number of bytes represented by a single state ID.
1639
0
    fn id_len(&self) -> usize {
1640
0
        StateID::SIZE
1641
0
    }
1642
1643
    /// Return the memory usage, in bytes, of these transitions.
1644
    ///
1645
    /// This does not include the size of a `Transitions` value itself.
1646
0
    fn memory_usage(&self) -> usize {
1647
0
        self.sparse().len()
1648
0
    }
1649
}
1650
1651
#[cfg(feature = "alloc")]
1652
impl<T: AsMut<[u8]>> Transitions<T> {
1653
    /// Return a convenient mutable representation of the given state.
1654
    /// This panics if the state is invalid.
1655
    fn state_mut(&mut self, id: StateID) -> StateMut<'_> {
1656
        let mut state = &mut self.sparse_mut()[id.as_usize()..];
1657
        let mut ntrans = bytes::read_u16(&state) as usize;
1658
        let is_match = (1 << 15) & ntrans != 0;
1659
        ntrans &= !(1 << 15);
1660
        state = &mut state[2..];
1661
1662
        let (input_ranges, state) = state.split_at_mut(ntrans * 2);
1663
        let (next, state) = state.split_at_mut(ntrans * StateID::SIZE);
1664
        let (pattern_ids, state) = if is_match {
1665
            let npats = bytes::read_u32(&state) as usize;
1666
            state[4..].split_at_mut(npats * 4)
1667
        } else {
1668
            (&mut [][..], state)
1669
        };
1670
1671
        let accel_len = state[0] as usize;
1672
        let accel = &mut state[1..accel_len + 1];
1673
        StateMut {
1674
            id,
1675
            is_match,
1676
            ntrans,
1677
            input_ranges,
1678
            next,
1679
            pattern_ids,
1680
            accel,
1681
        }
1682
    }
1683
1684
    /// Returns the sparse transitions as raw mutable bytes.
1685
    fn sparse_mut(&mut self) -> &mut [u8] {
1686
        self.sparse.as_mut()
1687
    }
1688
}
1689
1690
/// The set of all possible starting states in a DFA.
1691
///
1692
/// See the eponymous type in the `dense` module for more details. This type
1693
/// is very similar to `dense::StartTable`, except that its underlying
1694
/// representation is `&[u8]` instead of `&[S]`. (The latter would require
1695
/// sparse DFAs to be aligned, which is explicitly something we do not require
1696
/// because we don't really need it.)
1697
#[derive(Clone)]
1698
struct StartTable<T> {
1699
    /// The initial start state IDs as a contiguous table of native endian
1700
    /// encoded integers, represented by `S`.
1701
    ///
1702
    /// In practice, T is either Vec<u8> or &[u8] and has no alignment
1703
    /// requirements.
1704
    ///
1705
    /// The first `stride` (currently always 4) entries always correspond to
1706
    /// the start states for the entire DFA. After that, there are
1707
    /// `stride * patterns` state IDs, where `patterns` may be zero in the
1708
    /// case of a DFA with no patterns or in the case where the DFA was built
1709
    /// without enabling starting states for each pattern.
1710
    table: T,
1711
    /// The number of starting state IDs per pattern.
1712
    stride: usize,
1713
    /// The total number of patterns for which starting states are encoded.
1714
    /// This may be zero for non-empty DFAs when the DFA was built without
1715
    /// start states for each pattern.
1716
    patterns: usize,
1717
}
1718
1719
#[cfg(feature = "alloc")]
1720
impl StartTable<Vec<u8>> {
1721
    fn new(patterns: usize) -> StartTable<Vec<u8>> {
1722
        let stride = Start::count();
1723
        // This is OK since the only way we're here is if a dense DFA could be
1724
        // constructed successfully, which uses the same space.
1725
        let len = stride
1726
            .checked_mul(patterns)
1727
            .unwrap()
1728
            .checked_add(stride)
1729
            .unwrap()
1730
            .checked_mul(StateID::SIZE)
1731
            .unwrap();
1732
        StartTable { table: vec![0; len], stride, patterns }
1733
    }
1734
1735
    fn from_dense_dfa<T: AsRef<[u32]>>(
1736
        dfa: &dense::DFA<T>,
1737
        remap: &[StateID],
1738
    ) -> Result<StartTable<Vec<u8>>, Error> {
1739
        // Unless the DFA has start states compiled for each pattern, then
1740
        // as far as the starting state table is concerned, there are zero
1741
        // patterns to account for. It will instead only store starting states
1742
        // for the entire DFA.
1743
        let start_pattern_count = if dfa.has_starts_for_each_pattern() {
1744
            dfa.pattern_count()
1745
        } else {
1746
            0
1747
        };
1748
        let mut sl = StartTable::new(start_pattern_count);
1749
        for (old_start_id, sty, pid) in dfa.starts() {
1750
            let new_start_id = remap[dfa.to_index(old_start_id)];
1751
            sl.set_start(sty, pid, new_start_id);
1752
        }
1753
        Ok(sl)
1754
    }
1755
}
1756
1757
impl<'a> StartTable<&'a [u8]> {
1758
0
    unsafe fn from_bytes_unchecked(
1759
0
        mut slice: &'a [u8],
1760
0
    ) -> Result<(StartTable<&'a [u8]>, usize), DeserializeError> {
1761
0
        let slice_start = slice.as_ptr() as usize;
1762
1763
0
        let (stride, nr) =
1764
0
            bytes::try_read_u32_as_usize(slice, "sparse start table stride")?;
1765
0
        slice = &slice[nr..];
1766
1767
0
        let (patterns, nr) = bytes::try_read_u32_as_usize(
1768
0
            slice,
1769
0
            "sparse start table patterns",
1770
0
        )?;
1771
0
        slice = &slice[nr..];
1772
0
1773
0
        if stride != Start::count() {
1774
0
            return Err(DeserializeError::generic(
1775
0
                "invalid sparse starting table stride",
1776
0
            ));
1777
0
        }
1778
0
        if patterns > PatternID::LIMIT {
1779
0
            return Err(DeserializeError::generic(
1780
0
                "sparse invalid number of patterns",
1781
0
            ));
1782
0
        }
1783
0
        let pattern_table_size =
1784
0
            bytes::mul(stride, patterns, "sparse invalid pattern count")?;
1785
        // Our start states always start with a single stride of start states
1786
        // for the entire automaton which permit it to match any pattern. What
1787
        // follows it are an optional set of start states for each pattern.
1788
0
        let start_state_count = bytes::add(
1789
0
            stride,
1790
0
            pattern_table_size,
1791
0
            "sparse invalid 'any' pattern starts size",
1792
0
        )?;
1793
0
        let table_bytes_len = bytes::mul(
1794
0
            start_state_count,
1795
0
            StateID::SIZE,
1796
0
            "sparse pattern table bytes length",
1797
0
        )?;
1798
0
        bytes::check_slice_len(
1799
0
            slice,
1800
0
            table_bytes_len,
1801
0
            "sparse start ID table",
1802
0
        )?;
1803
0
        let table_bytes = &slice[..table_bytes_len];
1804
0
        slice = &slice[table_bytes_len..];
1805
0
1806
0
        let sl = StartTable { table: table_bytes, stride, patterns };
1807
0
        Ok((sl, slice.as_ptr() as usize - slice_start))
1808
0
    }
1809
}
1810
1811
impl<T: AsRef<[u8]>> StartTable<T> {
1812
0
    fn write_to<E: Endian>(
1813
0
        &self,
1814
0
        mut dst: &mut [u8],
1815
0
    ) -> Result<usize, SerializeError> {
1816
0
        let nwrite = self.write_to_len();
1817
0
        if dst.len() < nwrite {
1818
0
            return Err(SerializeError::buffer_too_small(
1819
0
                "sparse starting table ids",
1820
0
            ));
1821
0
        }
1822
0
        dst = &mut dst[..nwrite];
1823
0
1824
0
        // write stride
1825
0
        E::write_u32(u32::try_from(self.stride).unwrap(), dst);
1826
0
        dst = &mut dst[size_of::<u32>()..];
1827
0
        // write pattern count
1828
0
        E::write_u32(u32::try_from(self.patterns).unwrap(), dst);
1829
0
        dst = &mut dst[size_of::<u32>()..];
1830
0
        // write start IDs
1831
0
        dst.copy_from_slice(self.table());
1832
0
        Ok(nwrite)
1833
0
    }
1834
1835
    /// Returns the number of bytes the serialized form of this transition
1836
    /// table will use.
1837
0
    fn write_to_len(&self) -> usize {
1838
0
        size_of::<u32>() // stride
1839
0
        + size_of::<u32>() // # patterns
1840
0
        + self.table().len()
1841
0
    }
1842
1843
    /// Validates that every starting state ID in this table is valid.
1844
    ///
1845
    /// That is, every starting state ID can be used to correctly decode a
1846
    /// state in the DFA's sparse transitions.
1847
0
    fn validate(
1848
0
        &self,
1849
0
        trans: &Transitions<T>,
1850
0
    ) -> Result<(), DeserializeError> {
1851
0
        for (id, _, _) in self.iter() {
1852
0
            let _ = trans.try_state(id)?;
1853
        }
1854
0
        Ok(())
1855
0
    }
1856
1857
    /// Converts this start list to a borrowed value.
1858
0
    fn as_ref(&self) -> StartTable<&'_ [u8]> {
1859
0
        StartTable {
1860
0
            table: self.table(),
1861
0
            stride: self.stride,
1862
0
            patterns: self.patterns,
1863
0
        }
1864
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::StartTable<&[u8]>>::as_ref
Unexecuted instantiation: <regex_automata::dfa::sparse::StartTable<_>>::as_ref
1865
1866
    /// Converts this start list to an owned value.
1867
    #[cfg(feature = "alloc")]
1868
    fn to_owned(&self) -> StartTable<Vec<u8>> {
1869
        StartTable {
1870
            table: self.table().to_vec(),
1871
            stride: self.stride,
1872
            patterns: self.patterns,
1873
        }
1874
    }
1875
1876
    /// Return the start state for the given index and pattern ID. If the
1877
    /// pattern ID is None, then the corresponding start state for the entire
1878
    /// DFA is returned. If the pattern ID is not None, then the corresponding
1879
    /// starting state for the given pattern is returned. If this start table
1880
    /// does not have individual starting states for each pattern, then this
1881
    /// panics.
1882
0
    fn start(&self, index: Start, pattern_id: Option<PatternID>) -> StateID {
1883
0
        let start_index = index.as_usize();
1884
0
        let index = match pattern_id {
1885
0
            None => start_index,
1886
0
            Some(pid) => {
1887
0
                let pid = pid.as_usize();
1888
0
                assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
1889
0
                self.stride
1890
0
                    .checked_mul(pid)
1891
0
                    .unwrap()
1892
0
                    .checked_add(self.stride)
1893
0
                    .unwrap()
1894
0
                    .checked_add(start_index)
1895
0
                    .unwrap()
1896
            }
1897
        };
1898
0
        let start = index * StateID::SIZE;
1899
0
        // This OK since we're allowed to assume that the start table contains
1900
0
        // valid StateIDs.
1901
0
        bytes::read_state_id_unchecked(&self.table()[start..]).0
1902
0
    }
Unexecuted instantiation: <regex_automata::dfa::sparse::StartTable<&[u8]>>::start
Unexecuted instantiation: <regex_automata::dfa::sparse::StartTable<_>>::start
1903
1904
    /// Return an iterator over all start IDs in this table.
1905
0
    fn iter(&self) -> StartStateIter<'_, T> {
1906
0
        StartStateIter { st: self, i: 0 }
1907
0
    }
1908
1909
    /// Returns the total number of start state IDs in this table.
1910
0
    fn len(&self) -> usize {
1911
0
        self.table().len() / StateID::SIZE
1912
0
    }
1913
1914
    /// Returns the table as a raw slice of bytes.
1915
0
    fn table(&self) -> &[u8] {
1916
0
        self.table.as_ref()
1917
0
    }
1918
1919
    /// Return the memory usage, in bytes, of this start list.
1920
    ///
1921
    /// This does not include the size of a `StartTable` value itself.
1922
0
    fn memory_usage(&self) -> usize {
1923
0
        self.table().len()
1924
0
    }
1925
}
1926
1927
#[cfg(feature = "alloc")]
1928
impl<T: AsMut<[u8]>> StartTable<T> {
1929
    /// Set the start state for the given index and pattern.
1930
    ///
1931
    /// If the pattern ID or state ID are not valid, then this will panic.
1932
    fn set_start(
1933
        &mut self,
1934
        index: Start,
1935
        pattern_id: Option<PatternID>,
1936
        id: StateID,
1937
    ) {
1938
        let start_index = index.as_usize();
1939
        let index = match pattern_id {
1940
            None => start_index,
1941
            Some(pid) => {
1942
                let pid = pid.as_usize();
1943
                assert!(pid < self.patterns, "invalid pattern ID {:?}", pid);
1944
                self.stride
1945
                    .checked_mul(pid)
1946
                    .unwrap()
1947
                    .checked_add(self.stride)
1948
                    .unwrap()
1949
                    .checked_add(start_index)
1950
                    .unwrap()
1951
            }
1952
        };
1953
        let start = index * StateID::SIZE;
1954
        let end = start + StateID::SIZE;
1955
        bytes::write_state_id::<bytes::NE>(
1956
            id,
1957
            &mut self.table.as_mut()[start..end],
1958
        );
1959
    }
1960
}
1961
1962
/// An iterator over all state state IDs in a sparse DFA.
1963
struct StartStateIter<'a, T> {
1964
    st: &'a StartTable<T>,
1965
    i: usize,
1966
}
1967
1968
impl<'a, T: AsRef<[u8]>> Iterator for StartStateIter<'a, T> {
1969
    type Item = (StateID, Start, Option<PatternID>);
1970
1971
0
    fn next(&mut self) -> Option<(StateID, Start, Option<PatternID>)> {
1972
0
        let i = self.i;
1973
0
        if i >= self.st.len() {
1974
0
            return None;
1975
0
        }
1976
0
        self.i += 1;
1977
0
1978
0
        // This unwrap is okay since the stride of any DFA must always match
1979
0
        // the number of start state types.
1980
0
        let start_type = Start::from_usize(i % self.st.stride).unwrap();
1981
0
        let pid = if i < self.st.stride {
1982
            // This means we don't have start states for each pattern.
1983
0
            None
1984
        } else {
1985
            // These unwraps are OK since we may assume our table and stride
1986
            // is correct.
1987
0
            let pid = i
1988
0
                .checked_sub(self.st.stride)
1989
0
                .unwrap()
1990
0
                .checked_div(self.st.stride)
1991
0
                .unwrap();
1992
0
            Some(PatternID::new(pid).unwrap())
1993
        };
1994
0
        let start = i * StateID::SIZE;
1995
0
        let end = start + StateID::SIZE;
1996
0
        let bytes = self.st.table()[start..end].try_into().unwrap();
1997
0
        // This is OK since we're allowed to assume that any IDs in this start
1998
0
        // table are correct and valid for this DFA.
1999
0
        let id = StateID::from_ne_bytes_unchecked(bytes);
2000
0
        Some((id, start_type, pid))
2001
0
    }
2002
}
2003
2004
impl<'a, T> fmt::Debug for StartStateIter<'a, T> {
2005
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2006
0
        f.debug_struct("StartStateIter").field("i", &self.i).finish()
2007
0
    }
2008
}
2009
2010
/// An iterator over all states in a sparse DFA.
2011
///
2012
/// This iterator yields tuples, where the first element is the state ID and
2013
/// the second element is the state itself.
2014
struct StateIter<'a, T> {
2015
    trans: &'a Transitions<T>,
2016
    id: usize,
2017
}
2018
2019
impl<'a, T: AsRef<[u8]>> Iterator for StateIter<'a, T> {
2020
    type Item = State<'a>;
2021
2022
0
    fn next(&mut self) -> Option<State<'a>> {
2023
0
        if self.id >= self.trans.sparse().len() {
2024
0
            return None;
2025
0
        }
2026
0
        let state = self.trans.state(StateID::new_unchecked(self.id));
2027
0
        self.id = self.id + state.bytes_len();
2028
0
        Some(state)
2029
0
    }
2030
}
2031
2032
impl<'a, T> fmt::Debug for StateIter<'a, T> {
2033
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
2034
0
        f.debug_struct("StateIter").field("id", &self.id).finish()
2035
0
    }
2036
}
2037
2038
/// A representation of a sparse DFA state that can be cheaply materialized
2039
/// from a state identifier.
2040
#[derive(Clone)]
2041
struct State<'a> {
2042
    /// The identifier of this state.
2043
    id: StateID,
2044
    /// Whether this is a match state or not.
2045
    is_match: bool,
2046
    /// The number of transitions in this state.
2047
    ntrans: usize,
2048
    /// Pairs of input ranges, where there is one pair for each transition.
2049
    /// Each pair specifies an inclusive start and end byte range for the
2050
    /// corresponding transition.
2051
    input_ranges: &'a [u8],
2052
    /// Transitions to the next state. This slice contains native endian
2053
    /// encoded state identifiers, with `S` as the representation. Thus, there
2054
    /// are `ntrans * size_of::<S>()` bytes in this slice.
2055
    next: &'a [u8],
2056
    /// If this is a match state, then this contains the pattern IDs that match
2057
    /// when the DFA is in this state.
2058
    ///
2059
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
2060
    pattern_ids: &'a [u8],
2061
    /// An accelerator for this state, if present. If this state has no
2062
    /// accelerator, then this is an empty slice. When non-empty, this slice
2063
    /// has length at most 3 and corresponds to the exhaustive set of bytes
2064
    /// that must be seen in order to transition out of this state.
2065
    accel: &'a [u8],
2066
}
2067
2068
impl<'a> State<'a> {
2069
    /// Searches for the next transition given an input byte. If no such
2070
    /// transition could be found, then a dead state is returned.
2071
    ///
2072
    /// This is marked as inline to help dramatically boost sparse searching,
2073
    /// which decodes each state it enters to follow the next transition.
2074
    #[inline(always)]
2075
0
    fn next(&self, input: u8) -> StateID {
2076
        // This straight linear search was observed to be much better than
2077
        // binary search on ASCII haystacks, likely because a binary search
2078
        // visits the ASCII case last but a linear search sees it first. A
2079
        // binary search does do a little better on non-ASCII haystacks, but
2080
        // not by much. There might be a better trade off lurking here.
2081
0
        for i in 0..(self.ntrans - 1) {
2082
0
            let (start, end) = self.range(i);
2083
0
            if start <= input && input <= end {
2084
0
                return self.next_at(i);
2085
0
            }
2086
            // We could bail early with an extra branch: if input < b1, then
2087
            // we know we'll never find a matching transition. Interestingly,
2088
            // this extra branch seems to not help performance, or will even
2089
            // hurt it. It's likely very dependent on the DFA itself and what
2090
            // is being searched.
2091
        }
2092
0
        DEAD
2093
0
    }
2094
2095
    /// Returns the next state ID for the special EOI transition.
2096
0
    fn next_eoi(&self) -> StateID {
2097
0
        self.next_at(self.ntrans - 1)
2098
0
    }
2099
2100
    /// Returns the identifier for this state.
2101
0
    fn id(&self) -> StateID {
2102
0
        self.id
2103
0
    }
2104
2105
    /// Returns the inclusive input byte range for the ith transition in this
2106
    /// state.
2107
0
    fn range(&self, i: usize) -> (u8, u8) {
2108
0
        (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1])
2109
0
    }
2110
2111
    /// Returns the next state for the ith transition in this state.
2112
0
    fn next_at(&self, i: usize) -> StateID {
2113
0
        let start = i * StateID::SIZE;
2114
0
        let end = start + StateID::SIZE;
2115
0
        let bytes = self.next[start..end].try_into().unwrap();
2116
0
        StateID::from_ne_bytes_unchecked(bytes)
2117
0
    }
2118
2119
    /// Returns the pattern ID for the given match index. If the match index
2120
    /// is invalid, then this panics.
2121
0
    fn pattern_id(&self, match_index: usize) -> PatternID {
2122
0
        let start = match_index * PatternID::SIZE;
2123
0
        bytes::read_pattern_id_unchecked(&self.pattern_ids[start..]).0
2124
0
    }
2125
2126
    /// Returns the total number of pattern IDs for this state. This is always
2127
    /// zero when `is_match` is false.
2128
0
    fn pattern_count(&self) -> usize {
2129
0
        assert_eq!(0, self.pattern_ids.len() % 4);
2130
0
        self.pattern_ids.len() / 4
2131
0
    }
2132
2133
    /// Return the total number of bytes that this state consumes in its
2134
    /// encoded form.
2135
0
    fn bytes_len(&self) -> usize {
2136
0
        let mut len = 2
2137
0
            + (self.ntrans * 2)
2138
0
            + (self.ntrans * StateID::SIZE)
2139
0
            + (1 + self.accel.len());
2140
0
        if self.is_match {
2141
0
            len += size_of::<u32>() + self.pattern_ids.len();
2142
0
        }
2143
0
        len
2144
0
    }
2145
2146
    /// Return an accelerator for this state.
2147
0
    fn accelerator(&self) -> &'a [u8] {
2148
0
        self.accel
2149
0
    }
2150
}
2151
2152
impl<'a> fmt::Debug for State<'a> {
2153
0
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2154
0
        let mut printed = false;
2155
0
        for i in 0..(self.ntrans - 1) {
2156
0
            let next = self.next_at(i);
2157
0
            if next == DEAD {
2158
0
                continue;
2159
0
            }
2160
0
2161
0
            if printed {
2162
0
                write!(f, ", ")?;
2163
0
            }
2164
0
            let (start, end) = self.range(i);
2165
0
            if start == end {
2166
0
                write!(f, "{:?} => {:?}", DebugByte(start), next)?;
2167
            } else {
2168
0
                write!(
2169
0
                    f,
2170
0
                    "{:?}-{:?} => {:?}",
2171
0
                    DebugByte(start),
2172
0
                    DebugByte(end),
2173
0
                    next,
2174
0
                )?;
2175
            }
2176
0
            printed = true;
2177
        }
2178
0
        let eoi = self.next_at(self.ntrans - 1);
2179
0
        if eoi != DEAD {
2180
0
            if printed {
2181
0
                write!(f, ", ")?;
2182
0
            }
2183
0
            write!(f, "EOI => {:?}", eoi)?;
2184
0
        }
2185
0
        Ok(())
2186
0
    }
2187
}
2188
2189
/// A representation of a mutable sparse DFA state that can be cheaply
2190
/// materialized from a state identifier.
2191
#[cfg(feature = "alloc")]
2192
struct StateMut<'a> {
2193
    /// The identifier of this state.
2194
    id: StateID,
2195
    /// Whether this is a match state or not.
2196
    is_match: bool,
2197
    /// The number of transitions in this state.
2198
    ntrans: usize,
2199
    /// Pairs of input ranges, where there is one pair for each transition.
2200
    /// Each pair specifies an inclusive start and end byte range for the
2201
    /// corresponding transition.
2202
    input_ranges: &'a mut [u8],
2203
    /// Transitions to the next state. This slice contains native endian
2204
    /// encoded state identifiers, with `S` as the representation. Thus, there
2205
    /// are `ntrans * size_of::<S>()` bytes in this slice.
2206
    next: &'a mut [u8],
2207
    /// If this is a match state, then this contains the pattern IDs that match
2208
    /// when the DFA is in this state.
2209
    ///
2210
    /// This is a contiguous sequence of 32-bit native endian encoded integers.
2211
    pattern_ids: &'a [u8],
2212
    /// An accelerator for this state, if present. If this state has no
2213
    /// accelerator, then this is an empty slice. When non-empty, this slice
2214
    /// has length at most 3 and corresponds to the exhaustive set of bytes
2215
    /// that must be seen in order to transition out of this state.
2216
    accel: &'a mut [u8],
2217
}
2218
2219
#[cfg(feature = "alloc")]
2220
impl<'a> StateMut<'a> {
2221
    /// Sets the ith transition to the given state.
2222
    fn set_next_at(&mut self, i: usize, next: StateID) {
2223
        let start = i * StateID::SIZE;
2224
        let end = start + StateID::SIZE;
2225
        bytes::write_state_id::<bytes::NE>(next, &mut self.next[start..end]);
2226
    }
2227
}
2228
2229
#[cfg(feature = "alloc")]
2230
impl<'a> fmt::Debug for StateMut<'a> {
2231
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
2232
        let state = State {
2233
            id: self.id,
2234
            is_match: self.is_match,
2235
            ntrans: self.ntrans,
2236
            input_ranges: self.input_ranges,
2237
            next: self.next,
2238
            pattern_ids: self.pattern_ids,
2239
            accel: self.accel,
2240
        };
2241
        fmt::Debug::fmt(&state, f)
2242
    }
2243
}
2244
2245
/// A binary search routine specialized specifically to a sparse DFA state's
2246
/// transitions. Specifically, the transitions are defined as a set of pairs
2247
/// of input bytes that delineate an inclusive range of bytes. If the input
2248
/// byte is in the range, then the corresponding transition is a match.
2249
///
2250
/// This binary search accepts a slice of these pairs and returns the position
2251
/// of the matching pair (the ith transition), or None if no matching pair
2252
/// could be found.
2253
///
2254
/// Note that this routine is not currently used since it was observed to
2255
/// either decrease performance when searching ASCII, or did not provide enough
2256
/// of a boost on non-ASCII haystacks to be worth it. However, we leave it here
2257
/// for posterity in case we can find a way to use it.
2258
///
2259
/// In theory, we could use the standard library's search routine if we could
2260
/// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently
2261
/// guaranteed to be safe and is thus UB (since I don't think the in-memory
2262
/// representation of `(u8, u8)` has been nailed down). One could define a
2263
/// repr(C) type, but the casting doesn't seem justified.
2264
#[allow(dead_code)]
2265
#[inline(always)]
2266
0
fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> {
2267
0
    debug_assert!(ranges.len() % 2 == 0, "ranges must have even length");
2268
0
    debug_assert!(ranges.len() <= 512, "ranges should be short");
2269
2270
0
    let (mut left, mut right) = (0, ranges.len() / 2);
2271
0
    while left < right {
2272
0
        let mid = (left + right) / 2;
2273
0
        let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]);
2274
0
        if needle < b1 {
2275
0
            right = mid;
2276
0
        } else if needle > b2 {
2277
0
            left = mid + 1;
2278
0
        } else {
2279
0
            return Some(mid);
2280
        }
2281
    }
2282
0
    None
2283
0
}