/rust/registry/src/index.crates.io-1949cf8c6b5b557f/fst-0.4.7/src/raw/build.rs
Line | Count | Source |
1 | | use std::io; |
2 | | |
3 | | use crate::bytes; |
4 | | use crate::error::Result; |
5 | | use crate::raw::counting_writer::CountingWriter; |
6 | | use crate::raw::error::Error; |
7 | | use crate::raw::registry::{Registry, RegistryEntry}; |
8 | | use crate::raw::{ |
9 | | CompiledAddr, Fst, FstType, Output, Transition, EMPTY_ADDRESS, |
10 | | NONE_ADDRESS, VERSION, |
11 | | }; |
12 | | // use raw::registry_minimal::{Registry, RegistryEntry}; |
13 | | use crate::stream::{IntoStreamer, Streamer}; |
14 | | |
15 | | /// A builder for creating a finite state transducer. |
16 | | /// |
17 | | /// This is not your average everyday builder. It has two important qualities |
18 | | /// that make it a bit unique from what you might expect: |
19 | | /// |
20 | | /// 1. All keys must be added in lexicographic order. Adding a key out of order |
21 | | /// will result in an error. Additionally, adding a duplicate key with an |
22 | | /// output value will also result in an error. That is, once a key is |
23 | | /// associated with a value, that association can never be modified or |
24 | | /// deleted. |
25 | | /// 2. The representation of an fst is streamed to *any* `io::Write` as it is |
26 | | /// built. For an in memory representation, this can be a `Vec<u8>`. |
27 | | /// |
28 | | /// Point (2) is especially important because it means that an fst can be |
29 | | /// constructed *without storing the entire fst in memory*. Namely, since it |
30 | | /// works with any `io::Write`, it can be streamed directly to a file. |
31 | | /// |
32 | | /// With that said, the builder does use memory, but **memory usage is bounded |
33 | | /// to a constant size**. The amount of memory used trades off with the |
34 | | /// compression ratio. Currently, the implementation hard codes this trade off |
35 | | /// which can result in about 5-20MB of heap usage during construction. (N.B. |
36 | | /// Guaranteeing a maximal compression ratio requires memory proportional to |
37 | | /// the size of the fst, which defeats some of the benefit of streaming |
38 | | /// it to disk. In practice, a small bounded amount of memory achieves |
39 | | /// close-to-minimal compression ratios.) |
40 | | /// |
41 | | /// The algorithmic complexity of fst construction is `O(n)` where `n` is the |
42 | | /// number of elements added to the fst. |
43 | | pub struct Builder<W> { |
44 | | /// The FST raw data is written directly to `wtr`. |
45 | | /// |
46 | | /// No internal buffering is done. |
47 | | wtr: CountingWriter<W>, |
48 | | /// The stack of unfinished nodes. |
49 | | /// |
50 | | /// An unfinished node is a node that could potentially have a new |
51 | | /// transition added to it when a new word is added to the dictionary. |
52 | | unfinished: UnfinishedNodes, |
53 | | /// A map of finished nodes. |
54 | | /// |
55 | | /// A finished node is one that has been compiled and written to `wtr`. |
56 | | /// After this point, the node is considered immutable and will never |
57 | | /// change. |
58 | | registry: Registry, |
59 | | /// The last word added. |
60 | | /// |
61 | | /// This is used to enforce the invariant that words are added in sorted |
62 | | /// order. |
63 | | last: Option<Vec<u8>>, |
64 | | /// The address of the last compiled node. |
65 | | /// |
66 | | /// This is used to optimize states with one transition that point |
67 | | /// to the previously compiled node. (The previously compiled node in |
68 | | /// this case actually corresponds to the next state for the transition, |
69 | | /// since states are compiled in reverse.) |
70 | | last_addr: CompiledAddr, |
71 | | /// The number of keys added. |
72 | | len: usize, |
73 | | } |
74 | | |
75 | | #[derive(Debug)] |
76 | | struct UnfinishedNodes { |
77 | | stack: Vec<BuilderNodeUnfinished>, |
78 | | } |
79 | | |
80 | | #[derive(Debug)] |
81 | | struct BuilderNodeUnfinished { |
82 | | node: BuilderNode, |
83 | | last: Option<LastTransition>, |
84 | | } |
85 | | |
86 | | #[derive(Debug, Hash, Eq, PartialEq)] |
87 | | pub struct BuilderNode { |
88 | | pub is_final: bool, |
89 | | pub final_output: Output, |
90 | | pub trans: Vec<Transition>, |
91 | | } |
92 | | |
93 | | #[derive(Debug)] |
94 | | struct LastTransition { |
95 | | inp: u8, |
96 | | out: Output, |
97 | | } |
98 | | |
99 | | impl Builder<Vec<u8>> { |
100 | | /// Create a builder that builds an fst in memory. |
101 | | #[inline] |
102 | 0 | pub fn memory() -> Builder<Vec<u8>> { |
103 | 0 | Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap() |
104 | 0 | } |
105 | | |
106 | | /// Finishes construction of the FST and returns it. |
107 | | #[inline] |
108 | 0 | pub fn into_fst(self) -> Fst<Vec<u8>> { |
109 | 0 | self.into_inner().and_then(Fst::new).unwrap() |
110 | 0 | } |
111 | | } |
112 | | |
113 | | impl<W: io::Write> Builder<W> { |
114 | | /// Create a builder that builds an fst by writing it to `wtr` in a |
115 | | /// streaming fashion. |
116 | 0 | pub fn new(wtr: W) -> Result<Builder<W>> { |
117 | 0 | Builder::new_type(wtr, 0) |
118 | 0 | } |
119 | | |
120 | | /// The same as `new`, except it sets the type of the fst to the type |
121 | | /// given. |
122 | 0 | pub fn new_type(wtr: W, ty: FstType) -> Result<Builder<W>> { |
123 | 0 | let mut wtr = CountingWriter::new(wtr); |
124 | | // Don't allow any nodes to have address 0-7. We use these to encode |
125 | | // the API version. We also use addresses `0` and `1` as special |
126 | | // sentinel values, so they should never correspond to a real node. |
127 | 0 | bytes::io_write_u64_le(VERSION, &mut wtr)?; |
128 | | // Similarly for 8-15 for the fst type. |
129 | 0 | bytes::io_write_u64_le(ty, &mut wtr)?; |
130 | 0 | Ok(Builder { |
131 | 0 | wtr, |
132 | 0 | unfinished: UnfinishedNodes::new(), |
133 | 0 | registry: Registry::new(10_000, 2), |
134 | 0 | last: None, |
135 | 0 | last_addr: NONE_ADDRESS, |
136 | 0 | len: 0, |
137 | 0 | }) |
138 | 0 | } |
139 | | |
140 | | /// Adds a byte string to this FST with a zero output value. |
141 | 0 | pub fn add<B>(&mut self, bs: B) -> Result<()> |
142 | 0 | where |
143 | 0 | B: AsRef<[u8]>, |
144 | | { |
145 | 0 | self.check_last_key(bs.as_ref(), false)?; |
146 | 0 | self.insert_output(bs, None) |
147 | 0 | } |
148 | | |
149 | | /// Insert a new key-value pair into the fst. |
150 | | /// |
151 | | /// Keys must be convertible to byte strings. Values must be a `u64`, which |
152 | | /// is a restriction of the current implementation of finite state |
153 | | /// transducers. (Values may one day be expanded to other types.) |
154 | | /// |
155 | | /// If a key is inserted that is less than or equal to any previous key |
156 | | /// added, then an error is returned. Similarly, if there was a problem |
157 | | /// writing to the underlying writer, an error is returned. |
158 | 0 | pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()> |
159 | 0 | where |
160 | 0 | B: AsRef<[u8]>, |
161 | | { |
162 | 0 | self.check_last_key(bs.as_ref(), true)?; |
163 | 0 | self.insert_output(bs, Some(Output::new(val))) |
164 | 0 | } |
165 | | |
166 | | /// Calls insert on each item in the iterator. |
167 | | /// |
168 | | /// If an error occurred while adding an element, processing is stopped |
169 | | /// and the error is returned. |
170 | | /// |
171 | | /// If a key is inserted that is less than or equal to any previous key |
172 | | /// added, then an error is returned. Similarly, if there was a problem |
173 | | /// writing to the underlying writer, an error is returned. |
174 | 0 | pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()> |
175 | 0 | where |
176 | 0 | T: AsRef<[u8]>, |
177 | 0 | I: IntoIterator<Item = (T, Output)>, |
178 | | { |
179 | 0 | for (key, out) in iter { |
180 | 0 | self.insert(key, out.value())?; |
181 | | } |
182 | 0 | Ok(()) |
183 | 0 | } |
184 | | |
185 | | /// Calls insert on each item in the stream. |
186 | | /// |
187 | | /// Note that unlike `extend_iter`, this is not generic on the items in |
188 | | /// the stream. |
189 | | /// |
190 | | /// If a key is inserted that is less than or equal to any previous key |
191 | | /// added, then an error is returned. Similarly, if there was a problem |
192 | | /// writing to the underlying writer, an error is returned. |
193 | 0 | pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()> |
194 | 0 | where |
195 | 0 | I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>, |
196 | 0 | S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>, |
197 | | { |
198 | 0 | let mut stream = stream.into_stream(); |
199 | 0 | while let Some((key, out)) = stream.next() { |
200 | 0 | self.insert(key, out.value())?; |
201 | | } |
202 | 0 | Ok(()) |
203 | 0 | } |
204 | | |
205 | | /// Finishes the construction of the fst and flushes the underlying |
206 | | /// writer. After completion, the data written to `W` may be read using |
207 | | /// one of `Fst`'s constructor methods. |
208 | 0 | pub fn finish(self) -> Result<()> { |
209 | 0 | self.into_inner()?; |
210 | 0 | Ok(()) |
211 | 0 | } |
212 | | |
213 | | /// Just like `finish`, except it returns the underlying writer after |
214 | | /// flushing it. |
215 | 0 | pub fn into_inner(mut self) -> Result<W> { |
216 | 0 | self.compile_from(0)?; |
217 | 0 | let root_node = self.unfinished.pop_root(); |
218 | 0 | let root_addr = self.compile(&root_node)?; |
219 | 0 | bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?; |
220 | 0 | bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?; |
221 | | |
222 | 0 | let sum = self.wtr.masked_checksum(); |
223 | 0 | let mut wtr = self.wtr.into_inner(); |
224 | 0 | bytes::io_write_u32_le(sum, &mut wtr)?; |
225 | 0 | wtr.flush()?; |
226 | 0 | Ok(wtr) |
227 | 0 | } |
228 | | |
229 | 0 | fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()> |
230 | 0 | where |
231 | 0 | B: AsRef<[u8]>, |
232 | | { |
233 | 0 | let bs = bs.as_ref(); |
234 | 0 | if bs.is_empty() { |
235 | 0 | self.len = 1; // must be first key, so length is always 1 |
236 | 0 | self.unfinished.set_root_output(out.unwrap_or(Output::zero())); |
237 | 0 | return Ok(()); |
238 | 0 | } |
239 | 0 | let (prefix_len, out) = if let Some(out) = out { |
240 | 0 | self.unfinished.find_common_prefix_and_set_output(bs, out) |
241 | | } else { |
242 | 0 | (self.unfinished.find_common_prefix(bs), Output::zero()) |
243 | | }; |
244 | 0 | if prefix_len == bs.len() { |
245 | | // If the prefix found consumes the entire set of bytes, then |
246 | | // the prefix *equals* the bytes given. This means it is a |
247 | | // duplicate value with no output. So we can give up here. |
248 | | // |
249 | | // If the below assert fails, then that means we let a duplicate |
250 | | // value through even when inserting outputs. |
251 | 0 | assert!(out.is_zero()); |
252 | 0 | return Ok(()); |
253 | 0 | } |
254 | 0 | self.len += 1; |
255 | 0 | self.compile_from(prefix_len)?; |
256 | 0 | self.unfinished.add_suffix(&bs[prefix_len..], out); |
257 | 0 | Ok(()) |
258 | 0 | } |
259 | | |
260 | 0 | fn compile_from(&mut self, istate: usize) -> Result<()> { |
261 | 0 | let mut addr = NONE_ADDRESS; |
262 | 0 | while istate + 1 < self.unfinished.len() { |
263 | 0 | let node = if addr == NONE_ADDRESS { |
264 | 0 | self.unfinished.pop_empty() |
265 | | } else { |
266 | 0 | self.unfinished.pop_freeze(addr) |
267 | | }; |
268 | 0 | addr = self.compile(&node)?; |
269 | 0 | assert!(addr != NONE_ADDRESS); |
270 | | } |
271 | 0 | self.unfinished.top_last_freeze(addr); |
272 | 0 | Ok(()) |
273 | 0 | } |
274 | | |
275 | 0 | fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> { |
276 | 0 | if node.is_final |
277 | 0 | && node.trans.is_empty() |
278 | 0 | && node.final_output.is_zero() |
279 | | { |
280 | 0 | return Ok(EMPTY_ADDRESS); |
281 | 0 | } |
282 | 0 | let entry = self.registry.entry(&node); |
283 | 0 | if let RegistryEntry::Found(ref addr) = entry { |
284 | 0 | return Ok(*addr); |
285 | 0 | } |
286 | 0 | let start_addr = self.wtr.count() as CompiledAddr; |
287 | 0 | node.compile_to(&mut self.wtr, self.last_addr, start_addr)?; |
288 | 0 | self.last_addr = self.wtr.count() as CompiledAddr - 1; |
289 | 0 | if let RegistryEntry::NotFound(cell) = entry { |
290 | 0 | cell.insert(self.last_addr); |
291 | 0 | } |
292 | 0 | Ok(self.last_addr) |
293 | 0 | } |
294 | | |
295 | 0 | fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> { |
296 | 0 | if let Some(ref mut last) = self.last { |
297 | 0 | if check_dupe && bs == &**last { |
298 | 0 | return Err(Error::DuplicateKey { got: bs.to_vec() }.into()); |
299 | 0 | } |
300 | 0 | if bs < &**last { |
301 | 0 | return Err(Error::OutOfOrder { |
302 | 0 | previous: last.to_vec(), |
303 | 0 | got: bs.to_vec(), |
304 | 0 | } |
305 | 0 | .into()); |
306 | 0 | } |
307 | 0 | last.clear(); |
308 | 0 | for &b in bs { |
309 | 0 | last.push(b); |
310 | 0 | } |
311 | 0 | } else { |
312 | 0 | self.last = Some(bs.to_vec()); |
313 | 0 | } |
314 | 0 | Ok(()) |
315 | 0 | } |
316 | | |
317 | | /// Gets a reference to the underlying writer. |
318 | 0 | pub fn get_ref(&self) -> &W { |
319 | 0 | self.wtr.get_ref() |
320 | 0 | } |
321 | | |
322 | | /// Returns the number of bytes written to the underlying writer |
323 | 0 | pub fn bytes_written(&self) -> u64 { |
324 | 0 | self.wtr.count() |
325 | 0 | } |
326 | | } |
327 | | |
328 | | impl UnfinishedNodes { |
329 | 0 | fn new() -> UnfinishedNodes { |
330 | 0 | let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) }; |
331 | 0 | unfinished.push_empty(false); |
332 | 0 | unfinished |
333 | 0 | } |
334 | | |
335 | 0 | fn len(&self) -> usize { |
336 | 0 | self.stack.len() |
337 | 0 | } |
338 | | |
339 | 0 | fn push_empty(&mut self, is_final: bool) { |
340 | 0 | self.stack.push(BuilderNodeUnfinished { |
341 | 0 | node: BuilderNode { is_final, ..BuilderNode::default() }, |
342 | 0 | last: None, |
343 | 0 | }); |
344 | 0 | } |
345 | | |
346 | 0 | fn pop_root(&mut self) -> BuilderNode { |
347 | 0 | assert!(self.stack.len() == 1); |
348 | 0 | assert!(self.stack[0].last.is_none()); |
349 | 0 | self.stack.pop().unwrap().node |
350 | 0 | } |
351 | | |
352 | 0 | fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode { |
353 | 0 | let mut unfinished = self.stack.pop().unwrap(); |
354 | 0 | unfinished.last_compiled(addr); |
355 | 0 | unfinished.node |
356 | 0 | } |
357 | | |
358 | 0 | fn pop_empty(&mut self) -> BuilderNode { |
359 | 0 | let unfinished = self.stack.pop().unwrap(); |
360 | 0 | assert!(unfinished.last.is_none()); |
361 | 0 | unfinished.node |
362 | 0 | } |
363 | | |
364 | 0 | fn set_root_output(&mut self, out: Output) { |
365 | 0 | self.stack[0].node.is_final = true; |
366 | 0 | self.stack[0].node.final_output = out; |
367 | 0 | } |
368 | | |
369 | 0 | fn top_last_freeze(&mut self, addr: CompiledAddr) { |
370 | 0 | let last = self.stack.len().checked_sub(1).unwrap(); |
371 | 0 | self.stack[last].last_compiled(addr); |
372 | 0 | } |
373 | | |
374 | 0 | fn add_suffix(&mut self, bs: &[u8], out: Output) { |
375 | 0 | if bs.is_empty() { |
376 | 0 | return; |
377 | 0 | } |
378 | 0 | let last = self.stack.len().checked_sub(1).unwrap(); |
379 | 0 | assert!(self.stack[last].last.is_none()); |
380 | 0 | self.stack[last].last = Some(LastTransition { inp: bs[0], out }); |
381 | 0 | for &b in &bs[1..] { |
382 | 0 | self.stack.push(BuilderNodeUnfinished { |
383 | 0 | node: BuilderNode::default(), |
384 | 0 | last: Some(LastTransition { inp: b, out: Output::zero() }), |
385 | 0 | }); |
386 | 0 | } |
387 | 0 | self.push_empty(true); |
388 | 0 | } |
389 | | |
390 | 0 | fn find_common_prefix(&mut self, bs: &[u8]) -> usize { |
391 | 0 | bs.iter() |
392 | 0 | .zip(&self.stack) |
393 | 0 | .take_while(|&(&b, ref node)| { |
394 | 0 | node.last.as_ref().map(|t| t.inp == b).unwrap_or(false) |
395 | 0 | }) |
396 | 0 | .count() |
397 | 0 | } |
398 | | |
399 | 0 | fn find_common_prefix_and_set_output( |
400 | 0 | &mut self, |
401 | 0 | bs: &[u8], |
402 | 0 | mut out: Output, |
403 | 0 | ) -> (usize, Output) { |
404 | 0 | let mut i = 0; |
405 | 0 | while i < bs.len() { |
406 | 0 | let add_prefix = match self.stack[i].last.as_mut() { |
407 | 0 | Some(ref mut t) if t.inp == bs[i] => { |
408 | 0 | i += 1; |
409 | 0 | let common_pre = t.out.prefix(out); |
410 | 0 | let add_prefix = t.out.sub(common_pre); |
411 | 0 | out = out.sub(common_pre); |
412 | 0 | t.out = common_pre; |
413 | 0 | add_prefix |
414 | | } |
415 | 0 | _ => break, |
416 | | }; |
417 | 0 | if !add_prefix.is_zero() { |
418 | 0 | self.stack[i].add_output_prefix(add_prefix); |
419 | 0 | } |
420 | | } |
421 | 0 | (i, out) |
422 | 0 | } |
423 | | } |
424 | | |
425 | | impl BuilderNodeUnfinished { |
426 | 0 | fn last_compiled(&mut self, addr: CompiledAddr) { |
427 | 0 | if let Some(trans) = self.last.take() { |
428 | 0 | self.node.trans.push(Transition { |
429 | 0 | inp: trans.inp, |
430 | 0 | out: trans.out, |
431 | 0 | addr, |
432 | 0 | }); |
433 | 0 | } |
434 | 0 | } |
435 | | |
436 | 0 | fn add_output_prefix(&mut self, prefix: Output) { |
437 | 0 | if self.node.is_final { |
438 | 0 | self.node.final_output = prefix.cat(self.node.final_output); |
439 | 0 | } |
440 | 0 | for t in &mut self.node.trans { |
441 | 0 | t.out = prefix.cat(t.out); |
442 | 0 | } |
443 | 0 | if let Some(ref mut t) = self.last { |
444 | 0 | t.out = prefix.cat(t.out); |
445 | 0 | } |
446 | 0 | } |
447 | | } |
448 | | |
449 | | impl Clone for BuilderNode { |
450 | 0 | fn clone(&self) -> BuilderNode { |
451 | 0 | BuilderNode { |
452 | 0 | is_final: self.is_final, |
453 | 0 | final_output: self.final_output, |
454 | 0 | trans: self.trans.clone(), |
455 | 0 | } |
456 | 0 | } |
457 | | |
458 | 0 | fn clone_from(&mut self, source: &BuilderNode) { |
459 | 0 | self.is_final = source.is_final; |
460 | 0 | self.final_output = source.final_output; |
461 | 0 | self.trans.clear(); |
462 | 0 | self.trans.extend(source.trans.iter()); |
463 | 0 | } |
464 | | } |
465 | | |
466 | | impl Default for BuilderNode { |
467 | 0 | fn default() -> BuilderNode { |
468 | 0 | BuilderNode { |
469 | 0 | is_final: false, |
470 | 0 | final_output: Output::zero(), |
471 | 0 | trans: vec![], |
472 | 0 | } |
473 | 0 | } |
474 | | } |