/rust/registry/src/index.crates.io-1949cf8c6b5b557f/fst-0.4.7/src/automaton/mod.rs
Line | Count | Source |
1 | | #[cfg(feature = "levenshtein")] |
2 | | pub use self::levenshtein::{Levenshtein, LevenshteinError}; |
3 | | |
4 | | #[cfg(feature = "levenshtein")] |
5 | | mod levenshtein; |
6 | | |
7 | | /// Automaton describes types that behave as a finite automaton. |
8 | | /// |
9 | | /// All implementors of this trait are represented by *byte based* automata. |
10 | | /// Stated differently, all transitions in the automata correspond to a single |
11 | | /// byte in the input. |
12 | | /// |
13 | | /// This implementation choice is important for a couple reasons: |
14 | | /// |
15 | | /// 1. The set of possible transitions in each node is small, which may make |
16 | | /// efficient memory usage easier. |
17 | | /// 2. The finite state transducers in this crate are all byte based, so any |
18 | | /// automata used on them must also be byte based. |
19 | | /// |
20 | | /// In practice, this does present somewhat of a problem, for example, if |
21 | | /// you're storing UTF-8 encoded strings in a finite state transducer. Consider |
22 | | /// using a `Levenshtein` automaton, which accepts a query string and an edit |
23 | | /// distance. The edit distance should apply to some notion of *character*, |
24 | | /// which could be represented by at least 1-4 bytes in a UTF-8 encoding (for |
25 | | /// some definition of "character"). Therefore, the automaton must have UTF-8 |
26 | | /// decoding built into it. This can be tricky to implement, so you may find |
27 | | /// the [`utf8-ranges`](https://crates.io/crates/utf8-ranges) crate useful. |
28 | | pub trait Automaton { |
29 | | /// The type of the state used in the automaton. |
30 | | type State; |
31 | | |
32 | | /// Returns a single start state for this automaton. |
33 | | /// |
34 | | /// This method should always return the same value for each |
35 | | /// implementation. |
36 | | fn start(&self) -> Self::State; |
37 | | |
38 | | /// Returns true if and only if `state` is a match state. |
39 | | fn is_match(&self, state: &Self::State) -> bool; |
40 | | |
41 | | /// Returns true if and only if `state` can lead to a match in zero or more |
42 | | /// steps. |
43 | | /// |
44 | | /// If this returns `false`, then no sequence of inputs from this state |
45 | | /// should ever produce a match. If this does not follow, then those match |
46 | | /// states may never be reached. In other words, behavior may be incorrect. |
47 | | /// |
48 | | /// If this returns `true` even when no match is possible, then behavior |
49 | | /// will be correct, but callers may be forced to do additional work. |
50 | 0 | fn can_match(&self, _state: &Self::State) -> bool { |
51 | 0 | true |
52 | 0 | } |
53 | | |
54 | | /// Returns true if and only if `state` matches and must match no matter |
55 | | /// what steps are taken. |
56 | | /// |
57 | | /// If this returns `true`, then every sequence of inputs from this state |
58 | | /// produces a match. If this does not follow, then those match states may |
59 | | /// never be reached. In other words, behavior may be incorrect. |
60 | | /// |
61 | | /// If this returns `false` even when every sequence of inputs will lead to |
62 | | /// a match, then behavior will be correct, but callers may be forced to do |
63 | | /// additional work. |
64 | 0 | fn will_always_match(&self, _state: &Self::State) -> bool { |
65 | 0 | false |
66 | 0 | } |
67 | | |
68 | | /// Return the next state given `state` and an input. |
69 | | fn accept(&self, state: &Self::State, byte: u8) -> Self::State; |
70 | | |
71 | | /// If applicable, return the next state when the end of a key is seen. |
72 | 0 | fn accept_eof(&self, _: &Self::State) -> Option<Self::State> { |
73 | 0 | None |
74 | 0 | } |
75 | | |
76 | | /// Returns an automaton that matches the strings that start with something |
77 | | /// this automaton matches. |
78 | 0 | fn starts_with(self) -> StartsWith<Self> |
79 | 0 | where |
80 | 0 | Self: Sized, |
81 | | { |
82 | 0 | StartsWith(self) |
83 | 0 | } |
84 | | |
85 | | /// Returns an automaton that matches the strings matched by either this or |
86 | | /// the other automaton. |
87 | 0 | fn union<Rhs: Automaton>(self, rhs: Rhs) -> Union<Self, Rhs> |
88 | 0 | where |
89 | 0 | Self: Sized, |
90 | | { |
91 | 0 | Union(self, rhs) |
92 | 0 | } |
93 | | |
94 | | /// Returns an automaton that matches the strings matched by both this and |
95 | | /// the other automaton. |
96 | 0 | fn intersection<Rhs: Automaton>(self, rhs: Rhs) -> Intersection<Self, Rhs> |
97 | 0 | where |
98 | 0 | Self: Sized, |
99 | | { |
100 | 0 | Intersection(self, rhs) |
101 | 0 | } |
102 | | |
103 | | /// Returns an automaton that matches the strings not matched by this |
104 | | /// automaton. |
105 | 0 | fn complement(self) -> Complement<Self> |
106 | 0 | where |
107 | 0 | Self: Sized, |
108 | | { |
109 | 0 | Complement(self) |
110 | 0 | } |
111 | | } |
112 | | |
113 | | impl<'a, T: Automaton> Automaton for &'a T { |
114 | | type State = T::State; |
115 | | |
116 | 0 | fn start(&self) -> T::State { |
117 | 0 | (*self).start() |
118 | 0 | } |
119 | | |
120 | 0 | fn is_match(&self, state: &T::State) -> bool { |
121 | 0 | (*self).is_match(state) |
122 | 0 | } |
123 | | |
124 | 0 | fn can_match(&self, state: &T::State) -> bool { |
125 | 0 | (*self).can_match(state) |
126 | 0 | } |
127 | | |
128 | 0 | fn will_always_match(&self, state: &T::State) -> bool { |
129 | 0 | (*self).will_always_match(state) |
130 | 0 | } |
131 | | |
132 | 0 | fn accept(&self, state: &T::State, byte: u8) -> T::State { |
133 | 0 | (*self).accept(state, byte) |
134 | 0 | } |
135 | | |
136 | 0 | fn accept_eof(&self, state: &Self::State) -> Option<Self::State> { |
137 | 0 | (*self).accept_eof(state) |
138 | 0 | } |
139 | | } |
140 | | |
141 | | /// An automaton that matches if the input equals to a specific string. |
142 | | /// |
143 | | /// It can be used in combination with [`StartsWith`] to search strings |
144 | | /// starting with a given prefix. |
145 | | /// |
146 | | /// ```rust |
147 | | /// extern crate fst; |
148 | | /// |
149 | | /// use fst::{Automaton, IntoStreamer, Streamer, Set}; |
150 | | /// use fst::automaton::Str; |
151 | | /// |
152 | | /// # fn main() { example().unwrap(); } |
153 | | /// fn example() -> Result<(), Box<dyn std::error::Error>> { |
154 | | /// let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"]; |
155 | | /// let set = Set::from_iter(paths)?; |
156 | | /// |
157 | | /// // Build our prefix query. |
158 | | /// let prefix = Str::new("/home").starts_with(); |
159 | | /// |
160 | | /// // Apply our query to the set we built. |
161 | | /// let mut stream = set.search(prefix).into_stream(); |
162 | | /// |
163 | | /// let matches = stream.into_strs()?; |
164 | | /// assert_eq!(matches, vec!["/home/projects/bar", "/home/projects/foo"]); |
165 | | /// Ok(()) |
166 | | /// } |
167 | | /// ``` |
168 | | #[derive(Clone, Debug)] |
169 | | pub struct Str<'a> { |
170 | | string: &'a [u8], |
171 | | } |
172 | | |
173 | | impl<'a> Str<'a> { |
174 | | /// Constructs automaton that matches an exact string. |
175 | | #[inline] |
176 | 0 | pub fn new(string: &'a str) -> Str<'a> { |
177 | 0 | Str { string: string.as_bytes() } |
178 | 0 | } |
179 | | } |
180 | | |
181 | | impl<'a> Automaton for Str<'a> { |
182 | | type State = Option<usize>; |
183 | | |
184 | | #[inline] |
185 | 0 | fn start(&self) -> Option<usize> { |
186 | 0 | Some(0) |
187 | 0 | } |
188 | | |
189 | | #[inline] |
190 | 0 | fn is_match(&self, pos: &Option<usize>) -> bool { |
191 | 0 | *pos == Some(self.string.len()) |
192 | 0 | } |
193 | | |
194 | | #[inline] |
195 | 0 | fn can_match(&self, pos: &Option<usize>) -> bool { |
196 | 0 | pos.is_some() |
197 | 0 | } |
198 | | |
199 | | #[inline] |
200 | 0 | fn accept(&self, pos: &Option<usize>, byte: u8) -> Option<usize> { |
201 | | // if we aren't already past the end... |
202 | 0 | if let Some(pos) = *pos { |
203 | | // and there is still a matching byte at the current position... |
204 | 0 | if self.string.get(pos).cloned() == Some(byte) { |
205 | | // then move forward |
206 | 0 | return Some(pos + 1); |
207 | 0 | } |
208 | 0 | } |
209 | | // otherwise we're either past the end or didn't match the byte |
210 | 0 | None |
211 | 0 | } |
212 | | } |
213 | | |
214 | | /// An automaton that matches if the input contains a specific subsequence. |
215 | | /// |
216 | | /// It can be used to build a simple fuzzy-finder. |
217 | | /// |
218 | | /// ```rust |
219 | | /// extern crate fst; |
220 | | /// |
221 | | /// use fst::{IntoStreamer, Streamer, Set}; |
222 | | /// use fst::automaton::Subsequence; |
223 | | /// |
224 | | /// # fn main() { example().unwrap(); } |
225 | | /// fn example() -> Result<(), Box<dyn std::error::Error>> { |
226 | | /// let paths = vec!["/home/projects/bar", "/home/projects/foo", "/tmp/foo"]; |
227 | | /// let set = Set::from_iter(paths)?; |
228 | | /// |
229 | | /// // Build our fuzzy query. |
230 | | /// let subseq = Subsequence::new("hpf"); |
231 | | /// |
232 | | /// // Apply our fuzzy query to the set we built. |
233 | | /// let mut stream = set.search(subseq).into_stream(); |
234 | | /// |
235 | | /// let matches = stream.into_strs()?; |
236 | | /// assert_eq!(matches, vec!["/home/projects/foo"]); |
237 | | /// Ok(()) |
238 | | /// } |
239 | | /// ``` |
240 | | #[derive(Clone, Debug)] |
241 | | pub struct Subsequence<'a> { |
242 | | subseq: &'a [u8], |
243 | | } |
244 | | |
245 | | impl<'a> Subsequence<'a> { |
246 | | /// Constructs automaton that matches input containing the |
247 | | /// specified subsequence. |
248 | | #[inline] |
249 | 0 | pub fn new(subsequence: &'a str) -> Subsequence<'a> { |
250 | 0 | Subsequence { subseq: subsequence.as_bytes() } |
251 | 0 | } |
252 | | } |
253 | | |
254 | | impl<'a> Automaton for Subsequence<'a> { |
255 | | type State = usize; |
256 | | |
257 | | #[inline] |
258 | 0 | fn start(&self) -> usize { |
259 | 0 | 0 |
260 | 0 | } |
261 | | |
262 | | #[inline] |
263 | 0 | fn is_match(&self, &state: &usize) -> bool { |
264 | 0 | state == self.subseq.len() |
265 | 0 | } |
266 | | |
267 | | #[inline] |
268 | 0 | fn can_match(&self, _: &usize) -> bool { |
269 | 0 | true |
270 | 0 | } |
271 | | |
272 | | #[inline] |
273 | 0 | fn will_always_match(&self, &state: &usize) -> bool { |
274 | 0 | state == self.subseq.len() |
275 | 0 | } |
276 | | |
277 | | #[inline] |
278 | 0 | fn accept(&self, &state: &usize, byte: u8) -> usize { |
279 | 0 | if state == self.subseq.len() { |
280 | 0 | return state; |
281 | 0 | } |
282 | 0 | state + (byte == self.subseq[state]) as usize |
283 | 0 | } |
284 | | } |
285 | | |
286 | | /// An automaton that always matches. |
287 | | /// |
288 | | /// This is useful in a generic context as a way to express that no automaton |
289 | | /// should be used. |
290 | | #[derive(Clone, Debug)] |
291 | | pub struct AlwaysMatch; |
292 | | |
293 | | impl Automaton for AlwaysMatch { |
294 | | type State = (); |
295 | | |
296 | | #[inline] |
297 | 0 | fn start(&self) -> () { |
298 | 0 | () |
299 | 0 | } |
300 | | #[inline] |
301 | 0 | fn is_match(&self, _: &()) -> bool { |
302 | 0 | true |
303 | 0 | } |
304 | | #[inline] |
305 | 0 | fn can_match(&self, _: &()) -> bool { |
306 | 0 | true |
307 | 0 | } |
308 | | #[inline] |
309 | 0 | fn will_always_match(&self, _: &()) -> bool { |
310 | 0 | true |
311 | 0 | } |
312 | | #[inline] |
313 | 0 | fn accept(&self, _: &(), _: u8) -> () { |
314 | 0 | () |
315 | 0 | } |
316 | | } |
317 | | |
318 | | /// An automaton that matches a string that begins with something that the |
319 | | /// wrapped automaton matches. |
320 | | #[derive(Clone, Debug)] |
321 | | pub struct StartsWith<A>(A); |
322 | | |
323 | | /// The `Automaton` state for `StartsWith<A>`. |
324 | | pub struct StartsWithState<A: Automaton>(StartsWithStateKind<A>); |
325 | | |
326 | | enum StartsWithStateKind<A: Automaton> { |
327 | | Done, |
328 | | Running(A::State), |
329 | | } |
330 | | |
331 | | impl<A: Automaton> Automaton for StartsWith<A> { |
332 | | type State = StartsWithState<A>; |
333 | | |
334 | 0 | fn start(&self) -> StartsWithState<A> { |
335 | | StartsWithState({ |
336 | 0 | let inner = self.0.start(); |
337 | 0 | if self.0.is_match(&inner) { |
338 | 0 | StartsWithStateKind::Done |
339 | | } else { |
340 | 0 | StartsWithStateKind::Running(inner) |
341 | | } |
342 | | }) |
343 | 0 | } |
344 | | |
345 | 0 | fn is_match(&self, state: &StartsWithState<A>) -> bool { |
346 | 0 | match state.0 { |
347 | 0 | StartsWithStateKind::Done => true, |
348 | 0 | StartsWithStateKind::Running(_) => false, |
349 | | } |
350 | 0 | } |
351 | | |
352 | 0 | fn can_match(&self, state: &StartsWithState<A>) -> bool { |
353 | 0 | match state.0 { |
354 | 0 | StartsWithStateKind::Done => true, |
355 | 0 | StartsWithStateKind::Running(ref inner) => self.0.can_match(inner), |
356 | | } |
357 | 0 | } |
358 | | |
359 | 0 | fn will_always_match(&self, state: &StartsWithState<A>) -> bool { |
360 | 0 | match state.0 { |
361 | 0 | StartsWithStateKind::Done => true, |
362 | 0 | StartsWithStateKind::Running(_) => false, |
363 | | } |
364 | 0 | } |
365 | | |
366 | 0 | fn accept( |
367 | 0 | &self, |
368 | 0 | state: &StartsWithState<A>, |
369 | 0 | byte: u8, |
370 | 0 | ) -> StartsWithState<A> { |
371 | 0 | StartsWithState(match state.0 { |
372 | 0 | StartsWithStateKind::Done => StartsWithStateKind::Done, |
373 | 0 | StartsWithStateKind::Running(ref inner) => { |
374 | 0 | let next_inner = self.0.accept(inner, byte); |
375 | 0 | if self.0.is_match(&next_inner) { |
376 | 0 | StartsWithStateKind::Done |
377 | | } else { |
378 | 0 | StartsWithStateKind::Running(next_inner) |
379 | | } |
380 | | } |
381 | | }) |
382 | 0 | } |
383 | | } |
384 | | |
385 | | /// An automaton that matches when one of its component automata match. |
386 | | #[derive(Clone, Debug)] |
387 | | pub struct Union<A, B>(A, B); |
388 | | |
389 | | /// The `Automaton` state for `Union<A, B>`. |
390 | | pub struct UnionState<A: Automaton, B: Automaton>(A::State, B::State); |
391 | | |
392 | | impl<A: Automaton, B: Automaton> Automaton for Union<A, B> { |
393 | | type State = UnionState<A, B>; |
394 | | |
395 | 0 | fn start(&self) -> UnionState<A, B> { |
396 | 0 | UnionState(self.0.start(), self.1.start()) |
397 | 0 | } |
398 | | |
399 | 0 | fn is_match(&self, state: &UnionState<A, B>) -> bool { |
400 | 0 | self.0.is_match(&state.0) || self.1.is_match(&state.1) |
401 | 0 | } |
402 | | |
403 | 0 | fn can_match(&self, state: &UnionState<A, B>) -> bool { |
404 | 0 | self.0.can_match(&state.0) || self.1.can_match(&state.1) |
405 | 0 | } |
406 | | |
407 | 0 | fn will_always_match(&self, state: &UnionState<A, B>) -> bool { |
408 | 0 | self.0.will_always_match(&state.0) |
409 | 0 | || self.1.will_always_match(&state.1) |
410 | 0 | } |
411 | | |
412 | 0 | fn accept(&self, state: &UnionState<A, B>, byte: u8) -> UnionState<A, B> { |
413 | 0 | UnionState( |
414 | 0 | self.0.accept(&state.0, byte), |
415 | 0 | self.1.accept(&state.1, byte), |
416 | 0 | ) |
417 | 0 | } |
418 | | } |
419 | | |
420 | | /// An automaton that matches when both of its component automata match. |
421 | | #[derive(Clone, Debug)] |
422 | | pub struct Intersection<A, B>(A, B); |
423 | | |
424 | | /// The `Automaton` state for `Intersection<A, B>`. |
425 | | pub struct IntersectionState<A: Automaton, B: Automaton>(A::State, B::State); |
426 | | |
427 | | impl<A: Automaton, B: Automaton> Automaton for Intersection<A, B> { |
428 | | type State = IntersectionState<A, B>; |
429 | | |
430 | 0 | fn start(&self) -> IntersectionState<A, B> { |
431 | 0 | IntersectionState(self.0.start(), self.1.start()) |
432 | 0 | } |
433 | | |
434 | 0 | fn is_match(&self, state: &IntersectionState<A, B>) -> bool { |
435 | 0 | self.0.is_match(&state.0) && self.1.is_match(&state.1) |
436 | 0 | } |
437 | | |
438 | 0 | fn can_match(&self, state: &IntersectionState<A, B>) -> bool { |
439 | 0 | self.0.can_match(&state.0) && self.1.can_match(&state.1) |
440 | 0 | } |
441 | | |
442 | 0 | fn will_always_match(&self, state: &IntersectionState<A, B>) -> bool { |
443 | 0 | self.0.will_always_match(&state.0) |
444 | 0 | && self.1.will_always_match(&state.1) |
445 | 0 | } |
446 | | |
447 | 0 | fn accept( |
448 | 0 | &self, |
449 | 0 | state: &IntersectionState<A, B>, |
450 | 0 | byte: u8, |
451 | 0 | ) -> IntersectionState<A, B> { |
452 | 0 | IntersectionState( |
453 | 0 | self.0.accept(&state.0, byte), |
454 | 0 | self.1.accept(&state.1, byte), |
455 | 0 | ) |
456 | 0 | } |
457 | | } |
458 | | |
459 | | /// An automaton that matches exactly when the automaton it wraps does not. |
460 | | #[derive(Clone, Debug)] |
461 | | pub struct Complement<A>(A); |
462 | | |
463 | | /// The `Automaton` state for `Complement<A>`. |
464 | | pub struct ComplementState<A: Automaton>(A::State); |
465 | | |
466 | | impl<A: Automaton> Automaton for Complement<A> { |
467 | | type State = ComplementState<A>; |
468 | | |
469 | 0 | fn start(&self) -> ComplementState<A> { |
470 | 0 | ComplementState(self.0.start()) |
471 | 0 | } |
472 | | |
473 | 0 | fn is_match(&self, state: &ComplementState<A>) -> bool { |
474 | 0 | !self.0.is_match(&state.0) |
475 | 0 | } |
476 | | |
477 | 0 | fn can_match(&self, state: &ComplementState<A>) -> bool { |
478 | 0 | !self.0.will_always_match(&state.0) |
479 | 0 | } |
480 | | |
481 | 0 | fn will_always_match(&self, state: &ComplementState<A>) -> bool { |
482 | 0 | !self.0.can_match(&state.0) |
483 | 0 | } |
484 | | |
485 | 0 | fn accept( |
486 | 0 | &self, |
487 | 0 | state: &ComplementState<A>, |
488 | 0 | byte: u8, |
489 | 0 | ) -> ComplementState<A> { |
490 | 0 | ComplementState(self.0.accept(&state.0, byte)) |
491 | 0 | } |
492 | | } |