/rust/registry/src/index.crates.io-1949cf8c6b5b557f/fst-0.4.7/src/map.rs
Line | Count | Source |
1 | | use std::fmt; |
2 | | use std::io; |
3 | | use std::iter::{self, FromIterator}; |
4 | | |
5 | | use crate::automaton::{AlwaysMatch, Automaton}; |
6 | | use crate::raw; |
7 | | pub use crate::raw::IndexedValue; |
8 | | use crate::stream::{IntoStreamer, Streamer}; |
9 | | use crate::Result; |
10 | | |
11 | | /// Map is a lexicographically ordered map from byte strings to integers. |
12 | | /// |
13 | | /// A `Map` is constructed with the `MapBuilder` type. Alternatively, a `Map` |
14 | | /// can be constructed in memory from a lexicographically ordered iterator |
15 | | /// of key-value pairs (`Map::from_iter`). |
16 | | /// |
17 | | /// A key feature of `Map` is that it can be serialized to disk compactly. Its |
18 | | /// underlying representation is built such that the `Map` can be memory mapped |
19 | | /// and searched without necessarily loading the entire map into memory. |
20 | | /// |
21 | | /// It supports most common operations associated with maps, such as key |
22 | | /// lookup and search. It also supports set operations on its keys along with |
23 | | /// the ability to specify how conflicting values are merged together. Maps |
24 | | /// also support range queries and automata based searches (e.g. a regular |
25 | | /// expression). |
26 | | /// |
27 | | /// Maps are represented by a finite state transducer where inputs are the keys |
28 | | /// and outputs are the values. As such, maps have the following invariants: |
29 | | /// |
30 | | /// 1. Once constructed, a `Map` can never be modified. |
31 | | /// 2. Maps must be constructed with lexicographically ordered byte sequences. |
32 | | /// There is no restricting on the ordering of values. |
33 | | /// |
34 | | /// # Differences with sets |
35 | | /// |
36 | | /// Maps and sets are represented by the same underlying data structure: the |
37 | | /// finite state transducer. The principal difference between them is that |
38 | | /// sets always have their output values set to `0`. This has an impact on the |
39 | | /// representation size and is reflected in the type system for convenience. |
40 | | /// A secondary but subtle difference is that duplicate keys can be added |
41 | | /// to a set, but it is an error to do so with maps. That is, a set can have |
42 | | /// the same key added sequentially, but a map can't. |
43 | | /// |
44 | | /// # The future |
45 | | /// |
46 | | /// It is regrettable that the output value is fixed to `u64`. Indeed, it is |
47 | | /// not necessary, but it was a major simplification in the implementation. |
48 | | /// In the future, the value type may become generic to an extent (outputs must |
49 | | /// satisfy a basic algebra). |
50 | | /// |
51 | | /// Keys will always be byte strings; however, we may grow more conveniences |
52 | | /// around dealing with them (such as a serialization/deserialization step, |
53 | | /// although it isn't clear where exactly this should live). |
54 | | #[derive(Clone)] |
55 | | pub struct Map<D>(raw::Fst<D>); |
56 | | |
57 | | impl Map<Vec<u8>> { |
58 | | /// Create a `Map` from an iterator of lexicographically ordered byte |
59 | | /// strings and associated values. |
60 | | /// |
61 | | /// If the iterator does not yield unique keys in lexicographic order, then |
62 | | /// an error is returned. |
63 | | /// |
64 | | /// Note that this is a convenience function to build a map in memory. |
65 | | /// To build a map that streams to an arbitrary `io::Write`, use |
66 | | /// `MapBuilder`. |
67 | 0 | pub fn from_iter<K, I>(iter: I) -> Result<Map<Vec<u8>>> |
68 | 0 | where |
69 | 0 | K: AsRef<[u8]>, |
70 | 0 | I: IntoIterator<Item = (K, u64)>, |
71 | | { |
72 | 0 | let mut builder = MapBuilder::memory(); |
73 | 0 | builder.extend_iter(iter)?; |
74 | 0 | Map::new(builder.into_inner()?) |
75 | 0 | } |
76 | | } |
77 | | |
78 | | impl<D: AsRef<[u8]>> Map<D> { |
79 | | /// Creates a map from its representation as a raw byte sequence. |
80 | | /// |
81 | | /// This accepts anything that can be cheaply converted to a `&[u8]`. The |
82 | | /// caller is responsible for guaranteeing that the given bytes refer to |
83 | | /// a valid FST. While memory safety will not be violated by invalid input, |
84 | | /// a panic could occur while reading the FST at any point. |
85 | | /// |
86 | | /// # Example |
87 | | /// |
88 | | /// ```no_run |
89 | | /// use fst::Map; |
90 | | /// |
91 | | /// // File written from a build script using MapBuilder. |
92 | | /// # const IGNORE: &str = stringify! { |
93 | | /// static FST: &[u8] = include_bytes!(concat!(env!("OUT_DIR"), "/map.fst")); |
94 | | /// # }; |
95 | | /// # static FST: &[u8] = &[]; |
96 | | /// |
97 | | /// let map = Map::new(FST).unwrap(); |
98 | | /// ``` |
99 | 0 | pub fn new(data: D) -> Result<Map<D>> { |
100 | 0 | raw::Fst::new(data).map(Map) |
101 | 0 | } |
102 | | |
103 | | /// Tests the membership of a single key. |
104 | | /// |
105 | | /// # Example |
106 | | /// |
107 | | /// ```rust |
108 | | /// use fst::Map; |
109 | | /// |
110 | | /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); |
111 | | /// |
112 | | /// assert_eq!(map.contains_key("b"), true); |
113 | | /// assert_eq!(map.contains_key("z"), false); |
114 | | /// ``` |
115 | 0 | pub fn contains_key<K: AsRef<[u8]>>(&self, key: K) -> bool { |
116 | 0 | self.0.contains_key(key) |
117 | 0 | } |
118 | | |
119 | | /// Retrieves the value associated with a key. |
120 | | /// |
121 | | /// If the key does not exist, then `None` is returned. |
122 | | /// |
123 | | /// # Example |
124 | | /// |
125 | | /// ```rust |
126 | | /// use fst::Map; |
127 | | /// |
128 | | /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); |
129 | | /// |
130 | | /// assert_eq!(map.get("b"), Some(2)); |
131 | | /// assert_eq!(map.get("z"), None); |
132 | | /// ``` |
133 | 0 | pub fn get<K: AsRef<[u8]>>(&self, key: K) -> Option<u64> { |
134 | 0 | self.0.get(key).map(|output| output.value()) |
135 | 0 | } |
136 | | |
137 | | /// Return a lexicographically ordered stream of all key-value pairs in |
138 | | /// this map. |
139 | | /// |
140 | | /// While this is a stream, it does require heap space proportional to the |
141 | | /// longest key in the map. |
142 | | /// |
143 | | /// If the map is memory mapped, then no further heap space is needed. |
144 | | /// Note though that your operating system may fill your page cache |
145 | | /// (which will cause the resident memory usage of the process to go up |
146 | | /// correspondingly). |
147 | | /// |
148 | | /// # Example |
149 | | /// |
150 | | /// Since streams are not iterators, the traditional `for` loop cannot be |
151 | | /// used. `while let` is useful instead: |
152 | | /// |
153 | | /// ```rust |
154 | | /// use fst::{IntoStreamer, Streamer, Map}; |
155 | | /// |
156 | | /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); |
157 | | /// let mut stream = map.stream(); |
158 | | /// |
159 | | /// let mut kvs = vec![]; |
160 | | /// while let Some((k, v)) = stream.next() { |
161 | | /// kvs.push((k.to_vec(), v)); |
162 | | /// } |
163 | | /// assert_eq!(kvs, vec![ |
164 | | /// (b"a".to_vec(), 1), |
165 | | /// (b"b".to_vec(), 2), |
166 | | /// (b"c".to_vec(), 3), |
167 | | /// ]); |
168 | | /// ``` |
169 | | #[inline] |
170 | 0 | pub fn stream(&self) -> Stream<'_> { |
171 | 0 | Stream(self.0.stream()) |
172 | 0 | } |
173 | | |
174 | | /// Return a lexicographically ordered stream of all keys in this map. |
175 | | /// |
176 | | /// Memory requirements are the same as described on `Map::stream`. |
177 | | /// |
178 | | /// # Example |
179 | | /// |
180 | | /// ```rust |
181 | | /// use fst::{IntoStreamer, Streamer, Map}; |
182 | | /// |
183 | | /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); |
184 | | /// let mut stream = map.keys(); |
185 | | /// |
186 | | /// let mut keys = vec![]; |
187 | | /// while let Some(k) = stream.next() { |
188 | | /// keys.push(k.to_vec()); |
189 | | /// } |
190 | | /// assert_eq!(keys, vec![b"a", b"b", b"c"]); |
191 | | /// ``` |
192 | | #[inline] |
193 | 0 | pub fn keys(&self) -> Keys<'_> { |
194 | 0 | Keys(self.0.stream()) |
195 | 0 | } |
196 | | |
197 | | /// Return a stream of all values in this map ordered lexicographically |
198 | | /// by each value's corresponding key. |
199 | | /// |
200 | | /// Memory requirements are the same as described on `Map::stream`. |
201 | | /// |
202 | | /// # Example |
203 | | /// |
204 | | /// ```rust |
205 | | /// use fst::{IntoStreamer, Streamer, Map}; |
206 | | /// |
207 | | /// let map = Map::from_iter(vec![("a", 1), ("b", 2), ("c", 3)]).unwrap(); |
208 | | /// let mut stream = map.values(); |
209 | | /// |
210 | | /// let mut values = vec![]; |
211 | | /// while let Some(v) = stream.next() { |
212 | | /// values.push(v); |
213 | | /// } |
214 | | /// assert_eq!(values, vec![1, 2, 3]); |
215 | | /// ``` |
216 | | #[inline] |
217 | 0 | pub fn values(&self) -> Values<'_> { |
218 | 0 | Values(self.0.stream()) |
219 | 0 | } |
220 | | |
221 | | /// Return a builder for range queries. |
222 | | /// |
223 | | /// A range query returns a subset of key-value pairs in this map in a |
224 | | /// range given in lexicographic order. |
225 | | /// |
226 | | /// Memory requirements are the same as described on `Map::stream`. |
227 | | /// Notably, only the keys in the range are read; keys outside the range |
228 | | /// are not. |
229 | | /// |
230 | | /// # Example |
231 | | /// |
232 | | /// Returns only the key-value pairs in the range given. |
233 | | /// |
234 | | /// ```rust |
235 | | /// use fst::{IntoStreamer, Streamer, Map}; |
236 | | /// |
237 | | /// let map = Map::from_iter(vec![ |
238 | | /// ("a", 1), ("b", 2), ("c", 3), ("d", 4), ("e", 5), |
239 | | /// ]).unwrap(); |
240 | | /// let mut stream = map.range().ge("b").lt("e").into_stream(); |
241 | | /// |
242 | | /// let mut kvs = vec![]; |
243 | | /// while let Some((k, v)) = stream.next() { |
244 | | /// kvs.push((k.to_vec(), v)); |
245 | | /// } |
246 | | /// assert_eq!(kvs, vec![ |
247 | | /// (b"b".to_vec(), 2), |
248 | | /// (b"c".to_vec(), 3), |
249 | | /// (b"d".to_vec(), 4), |
250 | | /// ]); |
251 | | /// ``` |
252 | | #[inline] |
253 | 0 | pub fn range(&self) -> StreamBuilder<'_> { |
254 | 0 | StreamBuilder(self.0.range()) |
255 | 0 | } |
256 | | |
257 | | /// Executes an automaton on the keys of this map. |
258 | | /// |
259 | | /// Note that this returns a `StreamBuilder`, which can be used to |
260 | | /// add a range query to the search (see the `range` method). |
261 | | /// |
262 | | /// Memory requirements are the same as described on `Map::stream`. |
263 | | /// |
264 | | /// # Example |
265 | | /// |
266 | | /// An implementation of regular expressions for `Automaton` is available |
267 | | /// in the `regex-automata` crate with the `fst1` feature enabled, which |
268 | | /// can be used to search maps. |
269 | | /// |
270 | | /// # Example |
271 | | /// |
272 | | /// An implementation of subsequence search for `Automaton` can be used |
273 | | /// to search maps: |
274 | | /// |
275 | | /// ```rust |
276 | | /// use fst::automaton::Subsequence; |
277 | | /// use fst::{IntoStreamer, Streamer, Map}; |
278 | | /// |
279 | | /// # fn main() { example().unwrap(); } |
280 | | /// fn example() -> Result<(), Box<dyn std::error::Error>> { |
281 | | /// let map = Map::from_iter(vec![ |
282 | | /// ("a foo bar", 1), |
283 | | /// ("foo", 2), |
284 | | /// ("foo1", 3), |
285 | | /// ("foo2", 4), |
286 | | /// ("foo3", 5), |
287 | | /// ("foobar", 6), |
288 | | /// ]).unwrap(); |
289 | | /// |
290 | | /// let matcher = Subsequence::new("for"); |
291 | | /// let mut stream = map.search(&matcher).into_stream(); |
292 | | /// |
293 | | /// let mut kvs = vec![]; |
294 | | /// while let Some((k, v)) = stream.next() { |
295 | | /// kvs.push((String::from_utf8(k.to_vec())?, v)); |
296 | | /// } |
297 | | /// assert_eq!(kvs, vec![ |
298 | | /// ("a foo bar".to_string(), 1), ("foobar".to_string(), 6), |
299 | | /// ]); |
300 | | /// |
301 | | /// Ok(()) |
302 | | /// } |
303 | | /// ``` |
304 | 0 | pub fn search<A: Automaton>(&self, aut: A) -> StreamBuilder<'_, A> { |
305 | 0 | StreamBuilder(self.0.search(aut)) |
306 | 0 | } |
307 | | |
308 | | /// Executes an automaton on the keys of this map and yields matching |
309 | | /// keys along with the corresponding matching states in the given |
310 | | /// automaton. |
311 | | /// |
312 | | /// Note that this returns a `StreamWithStateBuilder`, which can be used to |
313 | | /// add a range query to the search (see the `range` method). |
314 | | /// |
315 | | /// Memory requirements are the same as described on `Map::stream`. |
316 | | /// |
317 | | #[cfg_attr( |
318 | | feature = "levenshtein", |
319 | | doc = r##" |
320 | | # Example |
321 | | |
322 | | An implementation of fuzzy search using Levenshtein automata can be used |
323 | | to search maps: |
324 | | |
325 | | ```rust |
326 | | use fst::automaton::Levenshtein; |
327 | | use fst::{IntoStreamer, Streamer, Map}; |
328 | | |
329 | | # fn main() { example().unwrap(); } |
330 | | fn example() -> Result<(), Box<dyn std::error::Error>> { |
331 | | let map = Map::from_iter(vec![ |
332 | | ("foo", 1), |
333 | | ("foob", 2), |
334 | | ("foobar", 3), |
335 | | ("fozb", 4), |
336 | | ]).unwrap(); |
337 | | |
338 | | let query = Levenshtein::new("foo", 2)?; |
339 | | let mut stream = map.search_with_state(&query).into_stream(); |
340 | | |
341 | | let mut kvs = vec![]; |
342 | | while let Some((k, v, s)) = stream.next() { |
343 | | kvs.push((String::from_utf8(k.to_vec())?, v, s)); |
344 | | } |
345 | | // Currently, there isn't much interesting that you can do with the states. |
346 | | assert_eq!(kvs, vec![ |
347 | | ("foo".to_string(), 1, Some(183)), |
348 | | ("foob".to_string(), 2, Some(123)), |
349 | | ("fozb".to_string(), 4, Some(83)), |
350 | | ]); |
351 | | |
352 | | Ok(()) |
353 | | } |
354 | | ``` |
355 | | "## |
356 | | )] |
357 | 0 | pub fn search_with_state<A: Automaton>( |
358 | 0 | &self, |
359 | 0 | aut: A, |
360 | 0 | ) -> StreamWithStateBuilder<'_, A> { |
361 | 0 | StreamWithStateBuilder(self.0.search_with_state(aut)) |
362 | 0 | } |
363 | | |
364 | | /// Returns the number of elements in this map. |
365 | | #[inline] |
366 | 0 | pub fn len(&self) -> usize { |
367 | 0 | self.0.len() |
368 | 0 | } |
369 | | |
370 | | /// Returns true if and only if this map is empty. |
371 | | #[inline] |
372 | 0 | pub fn is_empty(&self) -> bool { |
373 | 0 | self.0.is_empty() |
374 | 0 | } |
375 | | |
376 | | /// Creates a new map operation with this map added to it. |
377 | | /// |
378 | | /// The `OpBuilder` type can be used to add additional map streams |
379 | | /// and perform set operations like union, intersection, difference and |
380 | | /// symmetric difference on the keys of the map. These set operations also |
381 | | /// allow one to specify how conflicting values are merged in the stream. |
382 | | /// |
383 | | /// # Example |
384 | | /// |
385 | | /// This example demonstrates a union on multiple map streams. Notice that |
386 | | /// the stream returned from the union is not a sequence of key-value |
387 | | /// pairs, but rather a sequence of keys associated with one or more |
388 | | /// values. Namely, a key is associated with each value associated with |
389 | | /// that same key in the all of the streams. |
390 | | /// |
391 | | /// ```rust |
392 | | /// use fst::{Streamer, Map}; |
393 | | /// use fst::map::IndexedValue; |
394 | | /// |
395 | | /// let map1 = Map::from_iter(vec![ |
396 | | /// ("a", 1), ("b", 2), ("c", 3), |
397 | | /// ]).unwrap(); |
398 | | /// let map2 = Map::from_iter(vec![ |
399 | | /// ("a", 10), ("y", 11), ("z", 12), |
400 | | /// ]).unwrap(); |
401 | | /// |
402 | | /// let mut union = map1.op().add(&map2).union(); |
403 | | /// |
404 | | /// let mut kvs = vec![]; |
405 | | /// while let Some((k, vs)) = union.next() { |
406 | | /// kvs.push((k.to_vec(), vs.to_vec())); |
407 | | /// } |
408 | | /// assert_eq!(kvs, vec![ |
409 | | /// (b"a".to_vec(), vec![ |
410 | | /// IndexedValue { index: 0, value: 1 }, |
411 | | /// IndexedValue { index: 1, value: 10 }, |
412 | | /// ]), |
413 | | /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]), |
414 | | /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]), |
415 | | /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 11 }]), |
416 | | /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 12 }]), |
417 | | /// ]); |
418 | | /// ``` |
419 | | #[inline] |
420 | 0 | pub fn op(&self) -> OpBuilder<'_> { |
421 | 0 | OpBuilder::new().add(self) |
422 | 0 | } |
423 | | |
424 | | /// Returns a reference to the underlying raw finite state transducer. |
425 | | #[inline] |
426 | 0 | pub fn as_fst(&self) -> &raw::Fst<D> { |
427 | 0 | &self.0 |
428 | 0 | } |
429 | | |
430 | | /// Returns the underlying raw finite state transducer. |
431 | | #[inline] |
432 | 0 | pub fn into_fst(self) -> raw::Fst<D> { |
433 | 0 | self.0 |
434 | 0 | } |
435 | | |
436 | | /// Maps the underlying data of the fst Map to another data type. |
437 | | /// |
438 | | /// # Example |
439 | | /// |
440 | | /// This example shows that you can map an fst Map based on a `Vec<u8>` |
441 | | /// into an fst Map based on a `Cow<[u8]>`, it can also work with a |
442 | | /// reference counted type (e.g. `Arc`, `Rc`). |
443 | | /// |
444 | | /// ``` |
445 | | /// use std::borrow::Cow; |
446 | | /// |
447 | | /// use fst::Map; |
448 | | /// |
449 | | /// let map: Map<Vec<u8>> = Map::from_iter( |
450 | | /// [("hello", 12), ("world", 42)].iter().cloned(), |
451 | | /// ).unwrap(); |
452 | | /// |
453 | | /// let map_on_cow: Map<Cow<[u8]>> = map.map_data(Cow::Owned).unwrap(); |
454 | | /// ``` |
455 | | #[inline] |
456 | 0 | pub fn map_data<F, T>(self, f: F) -> Result<Map<T>> |
457 | 0 | where |
458 | 0 | F: FnMut(D) -> T, |
459 | 0 | T: AsRef<[u8]>, |
460 | | { |
461 | 0 | self.into_fst().map_data(f).map(Map::from) |
462 | 0 | } |
463 | | } |
464 | | |
465 | | impl Default for Map<Vec<u8>> { |
466 | | #[inline] |
467 | 0 | fn default() -> Map<Vec<u8>> { |
468 | 0 | Map::from_iter(iter::empty::<(&[u8], u64)>()).unwrap() |
469 | 0 | } |
470 | | } |
471 | | |
472 | | impl<D: AsRef<[u8]>> fmt::Debug for Map<D> { |
473 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
474 | 0 | write!(f, "Map([")?; |
475 | 0 | let mut stream = self.stream(); |
476 | 0 | let mut first = true; |
477 | 0 | while let Some((k, v)) = stream.next() { |
478 | 0 | if !first { |
479 | 0 | write!(f, ", ")?; |
480 | 0 | } |
481 | 0 | first = false; |
482 | 0 | write!(f, "({}, {})", String::from_utf8_lossy(k), v)?; |
483 | | } |
484 | 0 | write!(f, "])") |
485 | 0 | } |
486 | | } |
487 | | |
488 | | // Construct a map from an Fst object. |
489 | | impl<D: AsRef<[u8]>> From<raw::Fst<D>> for Map<D> { |
490 | | #[inline] |
491 | 0 | fn from(fst: raw::Fst<D>) -> Map<D> { |
492 | 0 | Map(fst) |
493 | 0 | } |
494 | | } |
495 | | |
496 | | /// Returns the underlying finite state transducer. |
497 | | impl<D: AsRef<[u8]>> AsRef<raw::Fst<D>> for Map<D> { |
498 | | #[inline] |
499 | 0 | fn as_ref(&self) -> &raw::Fst<D> { |
500 | 0 | &self.0 |
501 | 0 | } |
502 | | } |
503 | | |
504 | | impl<'m, 'a, D: AsRef<[u8]>> IntoStreamer<'a> for &'m Map<D> { |
505 | | type Item = (&'a [u8], u64); |
506 | | type Into = Stream<'m>; |
507 | | |
508 | | #[inline] |
509 | 0 | fn into_stream(self) -> Stream<'m> { |
510 | 0 | Stream(self.0.stream()) |
511 | 0 | } |
512 | | } |
513 | | |
514 | | /// A builder for creating a map. |
515 | | /// |
516 | | /// This is not your average everyday builder. It has two important qualities |
517 | | /// that make it a bit unique from what you might expect: |
518 | | /// |
519 | | /// 1. All keys must be added in lexicographic order. Adding a key out of order |
520 | | /// will result in an error. Additionally, adding a duplicate key will also |
521 | | /// result in an error. That is, once a key is associated with a value, |
522 | | /// that association can never be modified or deleted. |
523 | | /// 2. The representation of a map is streamed to *any* `io::Write` as it is |
524 | | /// built. For an in memory representation, this can be a `Vec<u8>`. |
525 | | /// |
526 | | /// Point (2) is especially important because it means that a map can be |
527 | | /// constructed *without storing the entire map in memory*. Namely, since it |
528 | | /// works with any `io::Write`, it can be streamed directly to a file. |
529 | | /// |
530 | | /// With that said, the builder does use memory, but **memory usage is bounded |
531 | | /// to a constant size**. The amount of memory used trades off with the |
532 | | /// compression ratio. Currently, the implementation hard codes this trade off |
533 | | /// which can result in about 5-20MB of heap usage during construction. (N.B. |
534 | | /// Guaranteeing a maximal compression ratio requires memory proportional to |
535 | | /// the size of the map, which defeats some of the benefit of streaming |
536 | | /// it to disk. In practice, a small bounded amount of memory achieves |
537 | | /// close-to-minimal compression ratios.) |
538 | | /// |
539 | | /// The algorithmic complexity of map construction is `O(n)` where `n` is the |
540 | | /// number of elements added to the map. |
541 | | /// |
542 | | /// # Example: build in memory |
543 | | /// |
544 | | /// This shows how to use the builder to construct a map in memory. Note that |
545 | | /// `Map::from_iter` provides a convenience function that achieves this same |
546 | | /// goal without needing to explicitly use `MapBuilder`. |
547 | | /// |
548 | | /// ```rust |
549 | | /// use fst::{IntoStreamer, Streamer, Map, MapBuilder}; |
550 | | /// |
551 | | /// let mut build = MapBuilder::memory(); |
552 | | /// build.insert("bruce", 1).unwrap(); |
553 | | /// build.insert("clarence", 2).unwrap(); |
554 | | /// build.insert("stevie", 3).unwrap(); |
555 | | /// |
556 | | /// // You could also call `finish()` here, but since we're building the map in |
557 | | /// // memory, there would be no way to get the `Vec<u8>` back. |
558 | | /// let bytes = build.into_inner().unwrap(); |
559 | | /// |
560 | | /// // At this point, the map has been constructed, but here's how to read it. |
561 | | /// let map = Map::new(bytes).unwrap(); |
562 | | /// let mut stream = map.into_stream(); |
563 | | /// let mut kvs = vec![]; |
564 | | /// while let Some((k, v)) = stream.next() { |
565 | | /// kvs.push((k.to_vec(), v)); |
566 | | /// } |
567 | | /// assert_eq!(kvs, vec![ |
568 | | /// (b"bruce".to_vec(), 1), |
569 | | /// (b"clarence".to_vec(), 2), |
570 | | /// (b"stevie".to_vec(), 3), |
571 | | /// ]); |
572 | | /// ``` |
573 | | /// |
574 | | /// # Example: stream to file |
575 | | /// |
576 | | /// This shows how to do stream construction of a map to a file. |
577 | | /// |
578 | | /// ```rust,no_run |
579 | | /// use std::fs::File; |
580 | | /// use std::io; |
581 | | /// |
582 | | /// use fst::{IntoStreamer, Streamer, Map, MapBuilder}; |
583 | | /// |
584 | | /// let mut wtr = io::BufWriter::new(File::create("map.fst").unwrap()); |
585 | | /// let mut build = MapBuilder::new(wtr).unwrap(); |
586 | | /// build.insert("bruce", 1).unwrap(); |
587 | | /// build.insert("clarence", 2).unwrap(); |
588 | | /// build.insert("stevie", 3).unwrap(); |
589 | | /// |
590 | | /// // If you want the writer back, then call `into_inner`. Otherwise, this |
591 | | /// // will finish construction and call `flush`. |
592 | | /// build.finish().unwrap(); |
593 | | /// |
594 | | /// // At this point, the map has been constructed, but here's how to read it. |
595 | | /// // NOTE: Normally, one would memory map a file instead of reading its |
596 | | /// // entire contents on to the heap. |
597 | | /// let map = Map::new(std::fs::read("map.fst").unwrap()).unwrap(); |
598 | | /// let mut stream = map.into_stream(); |
599 | | /// let mut kvs = vec![]; |
600 | | /// while let Some((k, v)) = stream.next() { |
601 | | /// kvs.push((k.to_vec(), v)); |
602 | | /// } |
603 | | /// assert_eq!(kvs, vec![ |
604 | | /// (b"bruce".to_vec(), 1), |
605 | | /// (b"clarence".to_vec(), 2), |
606 | | /// (b"stevie".to_vec(), 3), |
607 | | /// ]); |
608 | | /// ``` |
609 | | pub struct MapBuilder<W>(raw::Builder<W>); |
610 | | |
611 | | impl MapBuilder<Vec<u8>> { |
612 | | /// Create a builder that builds a map in memory. |
613 | | #[inline] |
614 | 0 | pub fn memory() -> MapBuilder<Vec<u8>> { |
615 | 0 | MapBuilder(raw::Builder::memory()) |
616 | 0 | } |
617 | | |
618 | | /// Finishes the construction of the map and returns it. |
619 | | #[inline] |
620 | 0 | pub fn into_map(self) -> Map<Vec<u8>> { |
621 | 0 | Map(self.0.into_fst()) |
622 | 0 | } |
623 | | } |
624 | | |
625 | | impl<W: io::Write> MapBuilder<W> { |
626 | | /// Create a builder that builds a map by writing it to `wtr` in a |
627 | | /// streaming fashion. |
628 | 0 | pub fn new(wtr: W) -> Result<MapBuilder<W>> { |
629 | 0 | raw::Builder::new_type(wtr, 0).map(MapBuilder) |
630 | 0 | } |
631 | | |
632 | | /// Insert a new key-value pair into the map. |
633 | | /// |
634 | | /// Keys must be convertible to byte strings. Values must be a `u64`, which |
635 | | /// is a restriction of the current implementation of finite state |
636 | | /// transducers. (Values may one day be expanded to other types.) |
637 | | /// |
638 | | /// If a key is inserted that is less than or equal to any previous key |
639 | | /// added, then an error is returned. Similarly, if there was a problem |
640 | | /// writing to the underlying writer, an error is returned. |
641 | 0 | pub fn insert<K: AsRef<[u8]>>(&mut self, key: K, val: u64) -> Result<()> { |
642 | 0 | self.0.insert(key, val) |
643 | 0 | } |
644 | | |
645 | | /// Calls insert on each item in the iterator. |
646 | | /// |
647 | | /// If an error occurred while adding an element, processing is stopped |
648 | | /// and the error is returned. |
649 | | /// |
650 | | /// If a key is inserted that is less than or equal to any previous key |
651 | | /// added, then an error is returned. Similarly, if there was a problem |
652 | | /// writing to the underlying writer, an error is returned. |
653 | 0 | pub fn extend_iter<K, I>(&mut self, iter: I) -> Result<()> |
654 | 0 | where |
655 | 0 | K: AsRef<[u8]>, |
656 | 0 | I: IntoIterator<Item = (K, u64)>, |
657 | | { |
658 | 0 | self.0.extend_iter( |
659 | 0 | iter.into_iter().map(|(k, v)| (k, raw::Output::new(v))), |
660 | | ) |
661 | 0 | } |
662 | | |
663 | | /// Calls insert on each item in the stream. |
664 | | /// |
665 | | /// Note that unlike `extend_iter`, this is not generic on the items in |
666 | | /// the stream. |
667 | | /// |
668 | | /// If a key is inserted that is less than or equal to any previous key |
669 | | /// added, then an error is returned. Similarly, if there was a problem |
670 | | /// writing to the underlying writer, an error is returned. |
671 | 0 | pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()> |
672 | 0 | where |
673 | 0 | I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, |
674 | 0 | S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, |
675 | | { |
676 | 0 | self.0.extend_stream(StreamOutput(stream.into_stream())) |
677 | 0 | } |
678 | | |
679 | | /// Finishes the construction of the map and flushes the underlying |
680 | | /// writer. After completion, the data written to `W` may be read using |
681 | | /// one of `Map`'s constructor methods. |
682 | 0 | pub fn finish(self) -> Result<()> { |
683 | 0 | self.0.finish() |
684 | 0 | } |
685 | | |
686 | | /// Just like `finish`, except it returns the underlying writer after |
687 | | /// flushing it. |
688 | 0 | pub fn into_inner(self) -> Result<W> { |
689 | 0 | self.0.into_inner() |
690 | 0 | } |
691 | | |
692 | | /// Gets a reference to the underlying writer. |
693 | 0 | pub fn get_ref(&self) -> &W { |
694 | 0 | self.0.get_ref() |
695 | 0 | } |
696 | | |
697 | | /// Returns the number of bytes written to the underlying writer |
698 | 0 | pub fn bytes_written(&self) -> u64 { |
699 | 0 | self.0.bytes_written() |
700 | 0 | } |
701 | | } |
702 | | |
703 | | /// A lexicographically ordered stream of key-value pairs from a map. |
704 | | /// |
705 | | /// The `A` type parameter corresponds to an optional automaton to filter |
706 | | /// the stream. By default, no filtering is done. |
707 | | /// |
708 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
709 | | pub struct Stream<'m, A = AlwaysMatch>(raw::Stream<'m, A>) |
710 | | where |
711 | | A: Automaton; |
712 | | |
713 | | impl<'a, 'm, A: Automaton> Streamer<'a> for Stream<'m, A> { |
714 | | type Item = (&'a [u8], u64); |
715 | | |
716 | 0 | fn next(&'a mut self) -> Option<(&'a [u8], u64)> { |
717 | 0 | self.0.next().map(|(key, out)| (key, out.value())) |
718 | 0 | } |
719 | | } |
720 | | |
721 | | impl<'m, A: Automaton> Stream<'m, A> { |
722 | | /// Convert this stream into a vector of byte strings and outputs. |
723 | | /// |
724 | | /// Note that this creates a new allocation for every key in the stream. |
725 | 0 | pub fn into_byte_vec(self) -> Vec<(Vec<u8>, u64)> { |
726 | 0 | self.0.into_byte_vec() |
727 | 0 | } |
728 | | |
729 | | /// Convert this stream into a vector of Unicode strings and outputs. |
730 | | /// |
731 | | /// If any key is not valid UTF-8, then iteration on the stream is stopped |
732 | | /// and a UTF-8 decoding error is returned. |
733 | | /// |
734 | | /// Note that this creates a new allocation for every key in the stream. |
735 | 0 | pub fn into_str_vec(self) -> Result<Vec<(String, u64)>> { |
736 | 0 | self.0.into_str_vec() |
737 | 0 | } |
738 | | |
739 | | /// Convert this stream into a vector of byte strings. |
740 | | /// |
741 | | /// Note that this creates a new allocation for every key in the stream. |
742 | 0 | pub fn into_byte_keys(self) -> Vec<Vec<u8>> { |
743 | 0 | self.0.into_byte_keys() |
744 | 0 | } |
745 | | |
746 | | /// Convert this stream into a vector of Unicode strings. |
747 | | /// |
748 | | /// If any key is not valid UTF-8, then iteration on the stream is stopped |
749 | | /// and a UTF-8 decoding error is returned. |
750 | | /// |
751 | | /// Note that this creates a new allocation for every key in the stream. |
752 | 0 | pub fn into_str_keys(self) -> Result<Vec<String>> { |
753 | 0 | self.0.into_str_keys() |
754 | 0 | } |
755 | | |
756 | | /// Convert this stream into a vector of outputs. |
757 | 0 | pub fn into_values(self) -> Vec<u64> { |
758 | 0 | self.0.into_values() |
759 | 0 | } |
760 | | } |
761 | | |
762 | | /// A lexicographically ordered stream of key-value-state triples from a map |
763 | | /// and an automaton. |
764 | | /// |
765 | | /// The key-values are from the map while the states are from the automaton. |
766 | | /// |
767 | | /// The `A` type parameter corresponds to an optional automaton to filter |
768 | | /// the stream. By default, no filtering is done. |
769 | | /// |
770 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
771 | | pub struct StreamWithState<'m, A = AlwaysMatch>(raw::StreamWithState<'m, A>) |
772 | | where |
773 | | A: Automaton; |
774 | | |
775 | | impl<'a, 'm, A: 'a + Automaton> Streamer<'a> for StreamWithState<'m, A> |
776 | | where |
777 | | A::State: Clone, |
778 | | { |
779 | | type Item = (&'a [u8], u64, A::State); |
780 | | |
781 | 0 | fn next(&'a mut self) -> Option<(&'a [u8], u64, A::State)> { |
782 | 0 | self.0.next().map(|(key, out, state)| (key, out.value(), state)) |
783 | 0 | } |
784 | | } |
785 | | |
786 | | /// A lexicographically ordered stream of keys from a map. |
787 | | /// |
788 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
789 | | pub struct Keys<'m>(raw::Stream<'m>); |
790 | | |
791 | | impl<'a, 'm> Streamer<'a> for Keys<'m> { |
792 | | type Item = &'a [u8]; |
793 | | |
794 | | #[inline] |
795 | 0 | fn next(&'a mut self) -> Option<&'a [u8]> { |
796 | 0 | self.0.next().map(|(key, _)| key) |
797 | 0 | } |
798 | | } |
799 | | |
800 | | /// A stream of values from a map, lexicographically ordered by each value's |
801 | | /// corresponding key. |
802 | | /// |
803 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
804 | | pub struct Values<'m>(raw::Stream<'m>); |
805 | | |
806 | | impl<'a, 'm> Streamer<'a> for Values<'m> { |
807 | | type Item = u64; |
808 | | |
809 | | #[inline] |
810 | 0 | fn next(&'a mut self) -> Option<u64> { |
811 | 0 | self.0.next().map(|(_, out)| out.value()) |
812 | 0 | } |
813 | | } |
814 | | |
815 | | /// A builder for constructing range queries on streams. |
816 | | /// |
817 | | /// Once all bounds are set, one should call `into_stream` to get a |
818 | | /// `Stream`. |
819 | | /// |
820 | | /// Bounds are not additive. That is, if `ge` is called twice on the same |
821 | | /// builder, then the second setting wins. |
822 | | /// |
823 | | /// The `A` type parameter corresponds to an optional automaton to filter |
824 | | /// the stream. By default, no filtering is done. |
825 | | /// |
826 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
827 | | pub struct StreamBuilder<'m, A = AlwaysMatch>(raw::StreamBuilder<'m, A>); |
828 | | |
829 | | impl<'m, A: Automaton> StreamBuilder<'m, A> { |
830 | | /// Specify a greater-than-or-equal-to bound. |
831 | 0 | pub fn ge<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'m, A> { |
832 | 0 | StreamBuilder(self.0.ge(bound)) |
833 | 0 | } |
834 | | |
835 | | /// Specify a greater-than bound. |
836 | 0 | pub fn gt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'m, A> { |
837 | 0 | StreamBuilder(self.0.gt(bound)) |
838 | 0 | } |
839 | | |
840 | | /// Specify a less-than-or-equal-to bound. |
841 | 0 | pub fn le<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'m, A> { |
842 | 0 | StreamBuilder(self.0.le(bound)) |
843 | 0 | } |
844 | | |
845 | | /// Specify a less-than bound. |
846 | 0 | pub fn lt<T: AsRef<[u8]>>(self, bound: T) -> StreamBuilder<'m, A> { |
847 | 0 | StreamBuilder(self.0.lt(bound)) |
848 | 0 | } |
849 | | } |
850 | | |
851 | | impl<'m, 'a, A: Automaton> IntoStreamer<'a> for StreamBuilder<'m, A> { |
852 | | type Item = (&'a [u8], u64); |
853 | | type Into = Stream<'m, A>; |
854 | | |
855 | 0 | fn into_stream(self) -> Stream<'m, A> { |
856 | 0 | Stream(self.0.into_stream()) |
857 | 0 | } |
858 | | } |
859 | | |
860 | | /// A builder for constructing range queries on streams that include automaton |
861 | | /// states. |
862 | | /// |
863 | | /// In general, one should use `StreamBuilder` unless you have a specific need |
864 | | /// for accessing the states of the underlying automaton that is being used to |
865 | | /// filter this stream. |
866 | | /// |
867 | | /// Once all bounds are set, one should call `into_stream` to get a |
868 | | /// `Stream`. |
869 | | /// |
870 | | /// Bounds are not additive. That is, if `ge` is called twice on the same |
871 | | /// builder, then the second setting wins. |
872 | | /// |
873 | | /// The `A` type parameter corresponds to an optional automaton to filter |
874 | | /// the stream. By default, no filtering is done. |
875 | | /// |
876 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
877 | | pub struct StreamWithStateBuilder<'m, A = AlwaysMatch>( |
878 | | raw::StreamWithStateBuilder<'m, A>, |
879 | | ); |
880 | | |
881 | | impl<'m, A: Automaton> StreamWithStateBuilder<'m, A> { |
882 | | /// Specify a greater-than-or-equal-to bound. |
883 | 0 | pub fn ge<T: AsRef<[u8]>>( |
884 | 0 | self, |
885 | 0 | bound: T, |
886 | 0 | ) -> StreamWithStateBuilder<'m, A> { |
887 | 0 | StreamWithStateBuilder(self.0.ge(bound)) |
888 | 0 | } |
889 | | |
890 | | /// Specify a greater-than bound. |
891 | 0 | pub fn gt<T: AsRef<[u8]>>( |
892 | 0 | self, |
893 | 0 | bound: T, |
894 | 0 | ) -> StreamWithStateBuilder<'m, A> { |
895 | 0 | StreamWithStateBuilder(self.0.gt(bound)) |
896 | 0 | } |
897 | | |
898 | | /// Specify a less-than-or-equal-to bound. |
899 | 0 | pub fn le<T: AsRef<[u8]>>( |
900 | 0 | self, |
901 | 0 | bound: T, |
902 | 0 | ) -> StreamWithStateBuilder<'m, A> { |
903 | 0 | StreamWithStateBuilder(self.0.le(bound)) |
904 | 0 | } |
905 | | |
906 | | /// Specify a less-than bound. |
907 | 0 | pub fn lt<T: AsRef<[u8]>>( |
908 | 0 | self, |
909 | 0 | bound: T, |
910 | 0 | ) -> StreamWithStateBuilder<'m, A> { |
911 | 0 | StreamWithStateBuilder(self.0.lt(bound)) |
912 | 0 | } |
913 | | } |
914 | | |
915 | | impl<'m, 'a, A: 'a + Automaton> IntoStreamer<'a> |
916 | | for StreamWithStateBuilder<'m, A> |
917 | | where |
918 | | A::State: Clone, |
919 | | { |
920 | | type Item = (&'a [u8], u64, A::State); |
921 | | type Into = StreamWithState<'m, A>; |
922 | | |
923 | 0 | fn into_stream(self) -> StreamWithState<'m, A> { |
924 | 0 | StreamWithState(self.0.into_stream()) |
925 | 0 | } |
926 | | } |
927 | | |
928 | | /// A builder for collecting map streams on which to perform set operations |
929 | | /// on the keys of maps. |
930 | | /// |
931 | | /// Set operations include intersection, union, difference and symmetric |
932 | | /// difference. The result of each set operation is itself a stream that emits |
933 | | /// pairs of keys and a sequence of each occurrence of that key in the |
934 | | /// participating streams. This information allows one to perform set |
935 | | /// operations on maps and customize how conflicting output values are handled. |
936 | | /// |
937 | | /// All set operations work efficiently on an arbitrary number of |
938 | | /// streams with memory proportional to the number of streams. |
939 | | /// |
940 | | /// The algorithmic complexity of all set operations is `O(n1 + n2 + n3 + ...)` |
941 | | /// where `n1, n2, n3, ...` correspond to the number of elements in each |
942 | | /// stream. |
943 | | /// |
944 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying set. |
945 | | pub struct OpBuilder<'m>(raw::OpBuilder<'m>); |
946 | | |
947 | | impl<'m> OpBuilder<'m> { |
948 | | /// Create a new set operation builder. |
949 | | #[inline] |
950 | 0 | pub fn new() -> OpBuilder<'m> { |
951 | 0 | OpBuilder(raw::OpBuilder::new()) |
952 | 0 | } |
953 | | |
954 | | /// Add a stream to this set operation. |
955 | | /// |
956 | | /// This is useful for a chaining style pattern, e.g., |
957 | | /// `builder.add(stream1).add(stream2).union()`. |
958 | | /// |
959 | | /// The stream must emit a lexicographically ordered sequence of key-value |
960 | | /// pairs. |
961 | 0 | pub fn add<I, S>(mut self, streamable: I) -> OpBuilder<'m> |
962 | 0 | where |
963 | 0 | I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, |
964 | 0 | S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, |
965 | | { |
966 | 0 | self.push(streamable); |
967 | 0 | self |
968 | 0 | } |
969 | | |
970 | | /// Add a stream to this set operation. |
971 | | /// |
972 | | /// The stream must emit a lexicographically ordered sequence of key-value |
973 | | /// pairs. |
974 | 0 | pub fn push<I, S>(&mut self, streamable: I) |
975 | 0 | where |
976 | 0 | I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, |
977 | 0 | S: 'm + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, |
978 | | { |
979 | 0 | self.0.push(StreamOutput(streamable.into_stream())); |
980 | 0 | } |
981 | | |
982 | | /// Performs a union operation on all streams that have been added. |
983 | | /// |
984 | | /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The |
985 | | /// first element of the tuple is the byte string key. The second element |
986 | | /// of the tuple is a list of all occurrences of that key in participating |
987 | | /// streams. The `IndexedValue` contains an index and the value associated |
988 | | /// with that key in that stream. The index uniquely identifies each |
989 | | /// stream, which is an integer that is auto-incremented when a stream |
990 | | /// is added to this operation (starting at `0`). |
991 | | /// |
992 | | /// # Example |
993 | | /// |
994 | | /// ```rust |
995 | | /// use fst::{IntoStreamer, Streamer, Map}; |
996 | | /// use fst::map::IndexedValue; |
997 | | /// |
998 | | /// let map1 = Map::from_iter(vec![ |
999 | | /// ("a", 1), ("b", 2), ("c", 3), |
1000 | | /// ]).unwrap(); |
1001 | | /// let map2 = Map::from_iter(vec![ |
1002 | | /// ("a", 11), ("y", 12), ("z", 13), |
1003 | | /// ]).unwrap(); |
1004 | | /// |
1005 | | /// let mut union = map1.op().add(&map2).union(); |
1006 | | /// |
1007 | | /// let mut kvs = vec![]; |
1008 | | /// while let Some((k, vs)) = union.next() { |
1009 | | /// kvs.push((k.to_vec(), vs.to_vec())); |
1010 | | /// } |
1011 | | /// assert_eq!(kvs, vec![ |
1012 | | /// (b"a".to_vec(), vec![ |
1013 | | /// IndexedValue { index: 0, value: 1 }, |
1014 | | /// IndexedValue { index: 1, value: 11 }, |
1015 | | /// ]), |
1016 | | /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]), |
1017 | | /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]), |
1018 | | /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]), |
1019 | | /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]), |
1020 | | /// ]); |
1021 | | /// ``` |
1022 | | #[inline] |
1023 | 0 | pub fn union(self) -> Union<'m> { |
1024 | 0 | Union(self.0.union()) |
1025 | 0 | } |
1026 | | |
1027 | | /// Performs an intersection operation on all streams that have been added. |
1028 | | /// |
1029 | | /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The |
1030 | | /// first element of the tuple is the byte string key. The second element |
1031 | | /// of the tuple is a list of all occurrences of that key in participating |
1032 | | /// streams. The `IndexedValue` contains an index and the value associated |
1033 | | /// with that key in that stream. The index uniquely identifies each |
1034 | | /// stream, which is an integer that is auto-incremented when a stream |
1035 | | /// is added to this operation (starting at `0`). |
1036 | | /// |
1037 | | /// # Example |
1038 | | /// |
1039 | | /// ```rust |
1040 | | /// use fst::{IntoStreamer, Streamer, Map}; |
1041 | | /// use fst::map::IndexedValue; |
1042 | | /// |
1043 | | /// let map1 = Map::from_iter(vec![ |
1044 | | /// ("a", 1), ("b", 2), ("c", 3), |
1045 | | /// ]).unwrap(); |
1046 | | /// let map2 = Map::from_iter(vec![ |
1047 | | /// ("a", 11), ("y", 12), ("z", 13), |
1048 | | /// ]).unwrap(); |
1049 | | /// |
1050 | | /// let mut intersection = map1.op().add(&map2).intersection(); |
1051 | | /// |
1052 | | /// let mut kvs = vec![]; |
1053 | | /// while let Some((k, vs)) = intersection.next() { |
1054 | | /// kvs.push((k.to_vec(), vs.to_vec())); |
1055 | | /// } |
1056 | | /// assert_eq!(kvs, vec![ |
1057 | | /// (b"a".to_vec(), vec![ |
1058 | | /// IndexedValue { index: 0, value: 1 }, |
1059 | | /// IndexedValue { index: 1, value: 11 }, |
1060 | | /// ]), |
1061 | | /// ]); |
1062 | | /// ``` |
1063 | | #[inline] |
1064 | 0 | pub fn intersection(self) -> Intersection<'m> { |
1065 | 0 | Intersection(self.0.intersection()) |
1066 | 0 | } |
1067 | | |
1068 | | /// Performs a difference operation with respect to the first stream added. |
1069 | | /// That is, this returns a stream of all elements in the first stream |
1070 | | /// that don't exist in any other stream that has been added. |
1071 | | /// |
1072 | | /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The |
1073 | | /// first element of the tuple is the byte string key. The second element |
1074 | | /// of the tuple is a list of all occurrences of that key in participating |
1075 | | /// streams. The `IndexedValue` contains an index and the value associated |
1076 | | /// with that key in that stream. The index uniquely identifies each |
1077 | | /// stream, which is an integer that is auto-incremented when a stream |
1078 | | /// is added to this operation (starting at `0`). |
1079 | | /// |
1080 | | /// While the interface is the same for all the operations combining multiple |
1081 | | /// maps, due to the nature of `difference` there's exactly one `IndexValue` |
1082 | | /// for each yielded value. |
1083 | | /// |
1084 | | /// # Example |
1085 | | /// |
1086 | | /// ```rust |
1087 | | /// use fst::{Streamer, Map}; |
1088 | | /// use fst::map::IndexedValue; |
1089 | | /// |
1090 | | /// let map1 = Map::from_iter(vec![ |
1091 | | /// ("a", 1), ("b", 2), ("c", 3), |
1092 | | /// ]).unwrap(); |
1093 | | /// let map2 = Map::from_iter(vec![ |
1094 | | /// ("a", 11), ("y", 12), ("z", 13), |
1095 | | /// ]).unwrap(); |
1096 | | /// |
1097 | | /// let mut difference = map1.op().add(&map2).difference(); |
1098 | | /// |
1099 | | /// let mut kvs = vec![]; |
1100 | | /// while let Some((k, vs)) = difference.next() { |
1101 | | /// kvs.push((k.to_vec(), vs.to_vec())); |
1102 | | /// } |
1103 | | /// assert_eq!(kvs, vec![ |
1104 | | /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]), |
1105 | | /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]), |
1106 | | /// ]); |
1107 | | /// ``` |
1108 | | #[inline] |
1109 | 0 | pub fn difference(self) -> Difference<'m> { |
1110 | 0 | Difference(self.0.difference()) |
1111 | 0 | } |
1112 | | |
1113 | | /// Performs a symmetric difference operation on all of the streams that |
1114 | | /// have been added. |
1115 | | /// |
1116 | | /// When there are only two streams, then the keys returned correspond to |
1117 | | /// keys that are in either stream but *not* in both streams. |
1118 | | /// |
1119 | | /// More generally, for any number of streams, keys that occur in an odd |
1120 | | /// number of streams are returned. |
1121 | | /// |
1122 | | /// Note that this returns a stream of `(&[u8], &[IndexedValue])`. The |
1123 | | /// first element of the tuple is the byte string key. The second element |
1124 | | /// of the tuple is a list of all occurrences of that key in participating |
1125 | | /// streams. The `IndexedValue` contains an index and the value associated |
1126 | | /// with that key in that stream. The index uniquely identifies each |
1127 | | /// stream, which is an integer that is auto-incremented when a stream |
1128 | | /// is added to this operation (starting at `0`). |
1129 | | /// |
1130 | | /// # Example |
1131 | | /// |
1132 | | /// ```rust |
1133 | | /// use fst::{IntoStreamer, Streamer, Map}; |
1134 | | /// use fst::map::IndexedValue; |
1135 | | /// |
1136 | | /// let map1 = Map::from_iter(vec![ |
1137 | | /// ("a", 1), ("b", 2), ("c", 3), |
1138 | | /// ]).unwrap(); |
1139 | | /// let map2 = Map::from_iter(vec![ |
1140 | | /// ("a", 11), ("y", 12), ("z", 13), |
1141 | | /// ]).unwrap(); |
1142 | | /// |
1143 | | /// let mut sym_difference = map1.op().add(&map2).symmetric_difference(); |
1144 | | /// |
1145 | | /// let mut kvs = vec![]; |
1146 | | /// while let Some((k, vs)) = sym_difference.next() { |
1147 | | /// kvs.push((k.to_vec(), vs.to_vec())); |
1148 | | /// } |
1149 | | /// assert_eq!(kvs, vec![ |
1150 | | /// (b"b".to_vec(), vec![IndexedValue { index: 0, value: 2 }]), |
1151 | | /// (b"c".to_vec(), vec![IndexedValue { index: 0, value: 3 }]), |
1152 | | /// (b"y".to_vec(), vec![IndexedValue { index: 1, value: 12 }]), |
1153 | | /// (b"z".to_vec(), vec![IndexedValue { index: 1, value: 13 }]), |
1154 | | /// ]); |
1155 | | /// ``` |
1156 | | #[inline] |
1157 | 0 | pub fn symmetric_difference(self) -> SymmetricDifference<'m> { |
1158 | 0 | SymmetricDifference(self.0.symmetric_difference()) |
1159 | 0 | } |
1160 | | } |
1161 | | |
1162 | | impl<'f, I, S> Extend<I> for OpBuilder<'f> |
1163 | | where |
1164 | | I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, |
1165 | | S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, |
1166 | | { |
1167 | 0 | fn extend<T>(&mut self, it: T) |
1168 | 0 | where |
1169 | 0 | T: IntoIterator<Item = I>, |
1170 | | { |
1171 | 0 | for stream in it { |
1172 | 0 | self.push(stream); |
1173 | 0 | } |
1174 | 0 | } |
1175 | | } |
1176 | | |
1177 | | impl<'f, I, S> FromIterator<I> for OpBuilder<'f> |
1178 | | where |
1179 | | I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], u64)>, |
1180 | | S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], u64)>, |
1181 | | { |
1182 | 0 | fn from_iter<T>(it: T) -> OpBuilder<'f> |
1183 | 0 | where |
1184 | 0 | T: IntoIterator<Item = I>, |
1185 | | { |
1186 | 0 | let mut op = OpBuilder::new(); |
1187 | 0 | op.extend(it); |
1188 | 0 | op |
1189 | 0 | } |
1190 | | } |
1191 | | |
1192 | | /// A stream of set union over multiple map streams in lexicographic order. |
1193 | | /// |
1194 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
1195 | | pub struct Union<'m>(raw::Union<'m>); |
1196 | | |
1197 | | impl<'a, 'm> Streamer<'a> for Union<'m> { |
1198 | | type Item = (&'a [u8], &'a [IndexedValue]); |
1199 | | |
1200 | | #[inline] |
1201 | 0 | fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { |
1202 | 0 | self.0.next() |
1203 | 0 | } |
1204 | | } |
1205 | | |
1206 | | /// A stream of set intersection over multiple map streams in lexicographic |
1207 | | /// order. |
1208 | | /// |
1209 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
1210 | | pub struct Intersection<'m>(raw::Intersection<'m>); |
1211 | | |
1212 | | impl<'a, 'm> Streamer<'a> for Intersection<'m> { |
1213 | | type Item = (&'a [u8], &'a [IndexedValue]); |
1214 | | |
1215 | | #[inline] |
1216 | 0 | fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { |
1217 | 0 | self.0.next() |
1218 | 0 | } |
1219 | | } |
1220 | | |
1221 | | /// A stream of set difference over multiple map streams in lexicographic |
1222 | | /// order. |
1223 | | /// |
1224 | | /// The difference operation is taken with respect to the first stream and the |
1225 | | /// rest of the streams. i.e., All elements in the first stream that do not |
1226 | | /// appear in any other streams. |
1227 | | /// |
1228 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
1229 | | pub struct Difference<'m>(raw::Difference<'m>); |
1230 | | |
1231 | | impl<'a, 'm> Streamer<'a> for Difference<'m> { |
1232 | | type Item = (&'a [u8], &'a [IndexedValue]); |
1233 | | |
1234 | | #[inline] |
1235 | 0 | fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { |
1236 | 0 | self.0.next() |
1237 | 0 | } |
1238 | | } |
1239 | | |
1240 | | /// A stream of set symmetric difference over multiple map streams in |
1241 | | /// lexicographic order. |
1242 | | /// |
1243 | | /// The `'m` lifetime parameter refers to the lifetime of the underlying map. |
1244 | | pub struct SymmetricDifference<'m>(raw::SymmetricDifference<'m>); |
1245 | | |
1246 | | impl<'a, 'm> Streamer<'a> for SymmetricDifference<'m> { |
1247 | | type Item = (&'a [u8], &'a [IndexedValue]); |
1248 | | |
1249 | | #[inline] |
1250 | 0 | fn next(&'a mut self) -> Option<(&'a [u8], &'a [IndexedValue])> { |
1251 | 0 | self.0.next() |
1252 | 0 | } |
1253 | | } |
1254 | | |
1255 | | /// A specialized stream for mapping map streams (`(&[u8], u64)`) to streams |
1256 | | /// used by raw fsts (`(&[u8], Output)`). |
1257 | | /// |
1258 | | /// If this were iterators, we could use `iter::Map`, but doing this on streams |
1259 | | /// requires HKT, so we need to write out the monomorphization ourselves. |
1260 | | struct StreamOutput<S>(S); |
1261 | | |
1262 | | impl<'a, S> Streamer<'a> for StreamOutput<S> |
1263 | | where |
1264 | | S: Streamer<'a, Item = (&'a [u8], u64)>, |
1265 | | { |
1266 | | type Item = (&'a [u8], raw::Output); |
1267 | | |
1268 | 0 | fn next(&'a mut self) -> Option<(&'a [u8], raw::Output)> { |
1269 | 0 | self.0.next().map(|(k, v)| (k, raw::Output::new(v))) |
1270 | 0 | } |
1271 | | } |