Coverage Report

Created: 2025-10-10 07:11

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/rust/registry/src/index.crates.io-1949cf8c6b5b557f/fst-0.4.7/src/raw/build.rs
Line
Count
Source
1
use std::io;
2
3
use crate::bytes;
4
use crate::error::Result;
5
use crate::raw::counting_writer::CountingWriter;
6
use crate::raw::error::Error;
7
use crate::raw::registry::{Registry, RegistryEntry};
8
use crate::raw::{
9
    CompiledAddr, Fst, FstType, Output, Transition, EMPTY_ADDRESS,
10
    NONE_ADDRESS, VERSION,
11
};
12
// use raw::registry_minimal::{Registry, RegistryEntry};
13
use crate::stream::{IntoStreamer, Streamer};
14
15
/// A builder for creating a finite state transducer.
16
///
17
/// This is not your average everyday builder. It has two important qualities
18
/// that make it a bit unique from what you might expect:
19
///
20
/// 1. All keys must be added in lexicographic order. Adding a key out of order
21
///    will result in an error. Additionally, adding a duplicate key with an
22
///    output value will also result in an error. That is, once a key is
23
///    associated with a value, that association can never be modified or
24
///    deleted.
25
/// 2. The representation of an fst is streamed to *any* `io::Write` as it is
26
///    built. For an in memory representation, this can be a `Vec<u8>`.
27
///
28
/// Point (2) is especially important because it means that an fst can be
29
/// constructed *without storing the entire fst in memory*. Namely, since it
30
/// works with any `io::Write`, it can be streamed directly to a file.
31
///
32
/// With that said, the builder does use memory, but **memory usage is bounded
33
/// to a constant size**. The amount of memory used trades off with the
34
/// compression ratio. Currently, the implementation hard codes this trade off
35
/// which can result in about 5-20MB of heap usage during construction. (N.B.
36
/// Guaranteeing a maximal compression ratio requires memory proportional to
37
/// the size of the fst, which defeats some of the benefit of streaming
38
/// it to disk. In practice, a small bounded amount of memory achieves
39
/// close-to-minimal compression ratios.)
40
///
41
/// The algorithmic complexity of fst construction is `O(n)` where `n` is the
42
/// number of elements added to the fst.
43
pub struct Builder<W> {
44
    /// The FST raw data is written directly to `wtr`.
45
    ///
46
    /// No internal buffering is done.
47
    wtr: CountingWriter<W>,
48
    /// The stack of unfinished nodes.
49
    ///
50
    /// An unfinished node is a node that could potentially have a new
51
    /// transition added to it when a new word is added to the dictionary.
52
    unfinished: UnfinishedNodes,
53
    /// A map of finished nodes.
54
    ///
55
    /// A finished node is one that has been compiled and written to `wtr`.
56
    /// After this point, the node is considered immutable and will never
57
    /// change.
58
    registry: Registry,
59
    /// The last word added.
60
    ///
61
    /// This is used to enforce the invariant that words are added in sorted
62
    /// order.
63
    last: Option<Vec<u8>>,
64
    /// The address of the last compiled node.
65
    ///
66
    /// This is used to optimize states with one transition that point
67
    /// to the previously compiled node. (The previously compiled node in
68
    /// this case actually corresponds to the next state for the transition,
69
    /// since states are compiled in reverse.)
70
    last_addr: CompiledAddr,
71
    /// The number of keys added.
72
    len: usize,
73
}
74
75
#[derive(Debug)]
76
struct UnfinishedNodes {
77
    stack: Vec<BuilderNodeUnfinished>,
78
}
79
80
#[derive(Debug)]
81
struct BuilderNodeUnfinished {
82
    node: BuilderNode,
83
    last: Option<LastTransition>,
84
}
85
86
#[derive(Debug, Hash, Eq, PartialEq)]
87
pub struct BuilderNode {
88
    pub is_final: bool,
89
    pub final_output: Output,
90
    pub trans: Vec<Transition>,
91
}
92
93
#[derive(Debug)]
94
struct LastTransition {
95
    inp: u8,
96
    out: Output,
97
}
98
99
impl Builder<Vec<u8>> {
100
    /// Create a builder that builds an fst in memory.
101
    #[inline]
102
0
    pub fn memory() -> Builder<Vec<u8>> {
103
0
        Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap()
104
0
    }
105
106
    /// Finishes construction of the FST and returns it.
107
    #[inline]
108
0
    pub fn into_fst(self) -> Fst<Vec<u8>> {
109
0
        self.into_inner().and_then(Fst::new).unwrap()
110
0
    }
111
}
112
113
impl<W: io::Write> Builder<W> {
114
    /// Create a builder that builds an fst by writing it to `wtr` in a
115
    /// streaming fashion.
116
0
    pub fn new(wtr: W) -> Result<Builder<W>> {
117
0
        Builder::new_type(wtr, 0)
118
0
    }
119
120
    /// The same as `new`, except it sets the type of the fst to the type
121
    /// given.
122
0
    pub fn new_type(wtr: W, ty: FstType) -> Result<Builder<W>> {
123
0
        let mut wtr = CountingWriter::new(wtr);
124
        // Don't allow any nodes to have address 0-7. We use these to encode
125
        // the API version. We also use addresses `0` and `1` as special
126
        // sentinel values, so they should never correspond to a real node.
127
0
        bytes::io_write_u64_le(VERSION, &mut wtr)?;
128
        // Similarly for 8-15 for the fst type.
129
0
        bytes::io_write_u64_le(ty, &mut wtr)?;
130
0
        Ok(Builder {
131
0
            wtr,
132
0
            unfinished: UnfinishedNodes::new(),
133
0
            registry: Registry::new(10_000, 2),
134
0
            last: None,
135
0
            last_addr: NONE_ADDRESS,
136
0
            len: 0,
137
0
        })
138
0
    }
139
140
    /// Adds a byte string to this FST with a zero output value.
141
0
    pub fn add<B>(&mut self, bs: B) -> Result<()>
142
0
    where
143
0
        B: AsRef<[u8]>,
144
    {
145
0
        self.check_last_key(bs.as_ref(), false)?;
146
0
        self.insert_output(bs, None)
147
0
    }
148
149
    /// Insert a new key-value pair into the fst.
150
    ///
151
    /// Keys must be convertible to byte strings. Values must be a `u64`, which
152
    /// is a restriction of the current implementation of finite state
153
    /// transducers. (Values may one day be expanded to other types.)
154
    ///
155
    /// If a key is inserted that is less than or equal to any previous key
156
    /// added, then an error is returned. Similarly, if there was a problem
157
    /// writing to the underlying writer, an error is returned.
158
0
    pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()>
159
0
    where
160
0
        B: AsRef<[u8]>,
161
    {
162
0
        self.check_last_key(bs.as_ref(), true)?;
163
0
        self.insert_output(bs, Some(Output::new(val)))
164
0
    }
165
166
    /// Calls insert on each item in the iterator.
167
    ///
168
    /// If an error occurred while adding an element, processing is stopped
169
    /// and the error is returned.
170
    ///
171
    /// If a key is inserted that is less than or equal to any previous key
172
    /// added, then an error is returned. Similarly, if there was a problem
173
    /// writing to the underlying writer, an error is returned.
174
0
    pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
175
0
    where
176
0
        T: AsRef<[u8]>,
177
0
        I: IntoIterator<Item = (T, Output)>,
178
    {
179
0
        for (key, out) in iter {
180
0
            self.insert(key, out.value())?;
181
        }
182
0
        Ok(())
183
0
    }
184
185
    /// Calls insert on each item in the stream.
186
    ///
187
    /// Note that unlike `extend_iter`, this is not generic on the items in
188
    /// the stream.
189
    ///
190
    /// If a key is inserted that is less than or equal to any previous key
191
    /// added, then an error is returned. Similarly, if there was a problem
192
    /// writing to the underlying writer, an error is returned.
193
0
    pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
194
0
    where
195
0
        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
196
0
        S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
197
    {
198
0
        let mut stream = stream.into_stream();
199
0
        while let Some((key, out)) = stream.next() {
200
0
            self.insert(key, out.value())?;
201
        }
202
0
        Ok(())
203
0
    }
204
205
    /// Finishes the construction of the fst and flushes the underlying
206
    /// writer. After completion, the data written to `W` may be read using
207
    /// one of `Fst`'s constructor methods.
208
0
    pub fn finish(self) -> Result<()> {
209
0
        self.into_inner()?;
210
0
        Ok(())
211
0
    }
212
213
    /// Just like `finish`, except it returns the underlying writer after
214
    /// flushing it.
215
0
    pub fn into_inner(mut self) -> Result<W> {
216
0
        self.compile_from(0)?;
217
0
        let root_node = self.unfinished.pop_root();
218
0
        let root_addr = self.compile(&root_node)?;
219
0
        bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?;
220
0
        bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?;
221
222
0
        let sum = self.wtr.masked_checksum();
223
0
        let mut wtr = self.wtr.into_inner();
224
0
        bytes::io_write_u32_le(sum, &mut wtr)?;
225
0
        wtr.flush()?;
226
0
        Ok(wtr)
227
0
    }
228
229
0
    fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>
230
0
    where
231
0
        B: AsRef<[u8]>,
232
    {
233
0
        let bs = bs.as_ref();
234
0
        if bs.is_empty() {
235
0
            self.len = 1; // must be first key, so length is always 1
236
0
            self.unfinished.set_root_output(out.unwrap_or(Output::zero()));
237
0
            return Ok(());
238
0
        }
239
0
        let (prefix_len, out) = if let Some(out) = out {
240
0
            self.unfinished.find_common_prefix_and_set_output(bs, out)
241
        } else {
242
0
            (self.unfinished.find_common_prefix(bs), Output::zero())
243
        };
244
0
        if prefix_len == bs.len() {
245
            // If the prefix found consumes the entire set of bytes, then
246
            // the prefix *equals* the bytes given. This means it is a
247
            // duplicate value with no output. So we can give up here.
248
            //
249
            // If the below assert fails, then that means we let a duplicate
250
            // value through even when inserting outputs.
251
0
            assert!(out.is_zero());
252
0
            return Ok(());
253
0
        }
254
0
        self.len += 1;
255
0
        self.compile_from(prefix_len)?;
256
0
        self.unfinished.add_suffix(&bs[prefix_len..], out);
257
0
        Ok(())
258
0
    }
259
260
0
    fn compile_from(&mut self, istate: usize) -> Result<()> {
261
0
        let mut addr = NONE_ADDRESS;
262
0
        while istate + 1 < self.unfinished.len() {
263
0
            let node = if addr == NONE_ADDRESS {
264
0
                self.unfinished.pop_empty()
265
            } else {
266
0
                self.unfinished.pop_freeze(addr)
267
            };
268
0
            addr = self.compile(&node)?;
269
0
            assert!(addr != NONE_ADDRESS);
270
        }
271
0
        self.unfinished.top_last_freeze(addr);
272
0
        Ok(())
273
0
    }
274
275
0
    fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> {
276
0
        if node.is_final
277
0
            && node.trans.is_empty()
278
0
            && node.final_output.is_zero()
279
        {
280
0
            return Ok(EMPTY_ADDRESS);
281
0
        }
282
0
        let entry = self.registry.entry(&node);
283
0
        if let RegistryEntry::Found(ref addr) = entry {
284
0
            return Ok(*addr);
285
0
        }
286
0
        let start_addr = self.wtr.count() as CompiledAddr;
287
0
        node.compile_to(&mut self.wtr, self.last_addr, start_addr)?;
288
0
        self.last_addr = self.wtr.count() as CompiledAddr - 1;
289
0
        if let RegistryEntry::NotFound(cell) = entry {
290
0
            cell.insert(self.last_addr);
291
0
        }
292
0
        Ok(self.last_addr)
293
0
    }
294
295
0
    fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> {
296
0
        if let Some(ref mut last) = self.last {
297
0
            if check_dupe && bs == &**last {
298
0
                return Err(Error::DuplicateKey { got: bs.to_vec() }.into());
299
0
            }
300
0
            if bs < &**last {
301
0
                return Err(Error::OutOfOrder {
302
0
                    previous: last.to_vec(),
303
0
                    got: bs.to_vec(),
304
0
                }
305
0
                .into());
306
0
            }
307
0
            last.clear();
308
0
            for &b in bs {
309
0
                last.push(b);
310
0
            }
311
0
        } else {
312
0
            self.last = Some(bs.to_vec());
313
0
        }
314
0
        Ok(())
315
0
    }
316
317
    /// Gets a reference to the underlying writer.
318
0
    pub fn get_ref(&self) -> &W {
319
0
        self.wtr.get_ref()
320
0
    }
321
322
    /// Returns the number of bytes written to the underlying writer
323
0
    pub fn bytes_written(&self) -> u64 {
324
0
        self.wtr.count()
325
0
    }
326
}
327
328
impl UnfinishedNodes {
329
0
    fn new() -> UnfinishedNodes {
330
0
        let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) };
331
0
        unfinished.push_empty(false);
332
0
        unfinished
333
0
    }
334
335
0
    fn len(&self) -> usize {
336
0
        self.stack.len()
337
0
    }
338
339
0
    fn push_empty(&mut self, is_final: bool) {
340
0
        self.stack.push(BuilderNodeUnfinished {
341
0
            node: BuilderNode { is_final, ..BuilderNode::default() },
342
0
            last: None,
343
0
        });
344
0
    }
345
346
0
    fn pop_root(&mut self) -> BuilderNode {
347
0
        assert!(self.stack.len() == 1);
348
0
        assert!(self.stack[0].last.is_none());
349
0
        self.stack.pop().unwrap().node
350
0
    }
351
352
0
    fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode {
353
0
        let mut unfinished = self.stack.pop().unwrap();
354
0
        unfinished.last_compiled(addr);
355
0
        unfinished.node
356
0
    }
357
358
0
    fn pop_empty(&mut self) -> BuilderNode {
359
0
        let unfinished = self.stack.pop().unwrap();
360
0
        assert!(unfinished.last.is_none());
361
0
        unfinished.node
362
0
    }
363
364
0
    fn set_root_output(&mut self, out: Output) {
365
0
        self.stack[0].node.is_final = true;
366
0
        self.stack[0].node.final_output = out;
367
0
    }
368
369
0
    fn top_last_freeze(&mut self, addr: CompiledAddr) {
370
0
        let last = self.stack.len().checked_sub(1).unwrap();
371
0
        self.stack[last].last_compiled(addr);
372
0
    }
373
374
0
    fn add_suffix(&mut self, bs: &[u8], out: Output) {
375
0
        if bs.is_empty() {
376
0
            return;
377
0
        }
378
0
        let last = self.stack.len().checked_sub(1).unwrap();
379
0
        assert!(self.stack[last].last.is_none());
380
0
        self.stack[last].last = Some(LastTransition { inp: bs[0], out });
381
0
        for &b in &bs[1..] {
382
0
            self.stack.push(BuilderNodeUnfinished {
383
0
                node: BuilderNode::default(),
384
0
                last: Some(LastTransition { inp: b, out: Output::zero() }),
385
0
            });
386
0
        }
387
0
        self.push_empty(true);
388
0
    }
389
390
0
    fn find_common_prefix(&mut self, bs: &[u8]) -> usize {
391
0
        bs.iter()
392
0
            .zip(&self.stack)
393
0
            .take_while(|&(&b, ref node)| {
394
0
                node.last.as_ref().map(|t| t.inp == b).unwrap_or(false)
395
0
            })
396
0
            .count()
397
0
    }
398
399
0
    fn find_common_prefix_and_set_output(
400
0
        &mut self,
401
0
        bs: &[u8],
402
0
        mut out: Output,
403
0
    ) -> (usize, Output) {
404
0
        let mut i = 0;
405
0
        while i < bs.len() {
406
0
            let add_prefix = match self.stack[i].last.as_mut() {
407
0
                Some(ref mut t) if t.inp == bs[i] => {
408
0
                    i += 1;
409
0
                    let common_pre = t.out.prefix(out);
410
0
                    let add_prefix = t.out.sub(common_pre);
411
0
                    out = out.sub(common_pre);
412
0
                    t.out = common_pre;
413
0
                    add_prefix
414
                }
415
0
                _ => break,
416
            };
417
0
            if !add_prefix.is_zero() {
418
0
                self.stack[i].add_output_prefix(add_prefix);
419
0
            }
420
        }
421
0
        (i, out)
422
0
    }
423
}
424
425
impl BuilderNodeUnfinished {
426
0
    fn last_compiled(&mut self, addr: CompiledAddr) {
427
0
        if let Some(trans) = self.last.take() {
428
0
            self.node.trans.push(Transition {
429
0
                inp: trans.inp,
430
0
                out: trans.out,
431
0
                addr,
432
0
            });
433
0
        }
434
0
    }
435
436
0
    fn add_output_prefix(&mut self, prefix: Output) {
437
0
        if self.node.is_final {
438
0
            self.node.final_output = prefix.cat(self.node.final_output);
439
0
        }
440
0
        for t in &mut self.node.trans {
441
0
            t.out = prefix.cat(t.out);
442
0
        }
443
0
        if let Some(ref mut t) = self.last {
444
0
            t.out = prefix.cat(t.out);
445
0
        }
446
0
    }
447
}
448
449
impl Clone for BuilderNode {
450
0
    fn clone(&self) -> BuilderNode {
451
0
        BuilderNode {
452
0
            is_final: self.is_final,
453
0
            final_output: self.final_output,
454
0
            trans: self.trans.clone(),
455
0
        }
456
0
    }
457
458
0
    fn clone_from(&mut self, source: &BuilderNode) {
459
0
        self.is_final = source.is_final;
460
0
        self.final_output = source.final_output;
461
0
        self.trans.clear();
462
0
        self.trans.extend(source.trans.iter());
463
0
    }
464
}
465
466
impl Default for BuilderNode {
467
0
    fn default() -> BuilderNode {
468
0
        BuilderNode {
469
0
            is_final: false,
470
0
            final_output: Output::zero(),
471
0
            trans: vec![],
472
0
        }
473
0
    }
474
}