/rust/registry/src/index.crates.io-1949cf8c6b5b557f/fst-0.4.7/src/raw/build.rs

Source
use std::io;

use crate::bytes;
use crate::error::Result;
use crate::raw::counting_writer::CountingWriter;
use crate::raw::error::Error;
use crate::raw::registry::{Registry, RegistryEntry};
use crate::raw::{
    CompiledAddr, Fst, FstType, Output, Transition, EMPTY_ADDRESS,
    NONE_ADDRESS, VERSION,
};
// use raw::registry_minimal::{Registry, RegistryEntry};
use crate::stream::{IntoStreamer, Streamer};

/// A builder for creating a finite state transducer.
///
/// This is not your average everyday builder. It has two important qualities
/// that make it a bit unique from what you might expect:
///
/// 1. All keys must be added in lexicographic order. Adding a key out of order
///    will result in an error. Additionally, adding a duplicate key with an
///    output value will also result in an error. That is, once a key is
///    associated with a value, that association can never be modified or
///    deleted.
/// 2. The representation of an fst is streamed to *any* `io::Write` as it is
///    built. For an in memory representation, this can be a `Vec<u8>`.
///
/// Point (2) is especially important because it means that an fst can be
/// constructed *without storing the entire fst in memory*. Namely, since it
/// works with any `io::Write`, it can be streamed directly to a file.
///
/// With that said, the builder does use memory, but **memory usage is bounded
/// to a constant size**. The amount of memory used trades off with the
/// compression ratio. Currently, the implementation hard codes this trade off
/// which can result in about 5-20MB of heap usage during construction. (N.B.
/// Guaranteeing a maximal compression ratio requires memory proportional to
/// the size of the fst, which defeats some of the benefit of streaming
/// it to disk. In practice, a small bounded amount of memory achieves
/// close-to-minimal compression ratios.)
///
/// The algorithmic complexity of fst construction is `O(n)` where `n` is the
/// number of elements added to the fst.
pub struct Builder<W> {
    /// The FST raw data is written directly to `wtr`.
    ///
    /// No internal buffering is done.
    wtr: CountingWriter<W>,
    /// The stack of unfinished nodes.
    ///
    /// An unfinished node is a node that could potentially have a new
    /// transition added to it when a new word is added to the dictionary.
    unfinished: UnfinishedNodes,
    /// A map of finished nodes.
    ///
    /// A finished node is one that has been compiled and written to `wtr`.
    /// After this point, the node is considered immutable and will never
    /// change.
    registry: Registry,
    /// The last word added.
    ///
    /// This is used to enforce the invariant that words are added in sorted
    /// order.
    last: Option<Vec<u8>>,
    /// The address of the last compiled node.
    ///
    /// This is used to optimize states with one transition that point
    /// to the previously compiled node. (The previously compiled node in
    /// this case actually corresponds to the next state for the transition,
    /// since states are compiled in reverse.)
    last_addr: CompiledAddr,
    /// The number of keys added.
    len: usize,
}

#[derive(Debug)]
struct UnfinishedNodes {
    stack: Vec<BuilderNodeUnfinished>,
}

#[derive(Debug)]
struct BuilderNodeUnfinished {
    node: BuilderNode,
    last: Option<LastTransition>,
}

#[derive(Debug, Hash, Eq, PartialEq)]
pub struct BuilderNode {
    pub is_final: bool,
    pub final_output: Output,
    pub trans: Vec<Transition>,
}

#[derive(Debug)]
struct LastTransition {
    inp: u8,
    out: Output,
}

impl Builder<Vec<u8>> {
    /// Create a builder that builds an fst in memory.
    #[inline]
    pub fn memory() -> Builder<Vec<u8>> {
        Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap()
    }

    /// Finishes construction of the FST and returns it.
    #[inline]
    pub fn into_fst(self) -> Fst<Vec<u8>> {
        self.into_inner().and_then(Fst::new).unwrap()
    }
}

impl<W: io::Write> Builder<W> {
    /// Create a builder that builds an fst by writing it to `wtr` in a
    /// streaming fashion.
    pub fn new(wtr: W) -> Result<Builder<W>> {
        Builder::new_type(wtr, 0)
    }

    /// The same as `new`, except it sets the type of the fst to the type
    /// given.
    pub fn new_type(wtr: W, ty: FstType) -> Result<Builder<W>> {
        let mut wtr = CountingWriter::new(wtr);
        // Don't allow any nodes to have address 0-7. We use these to encode
        // the API version. We also use addresses `0` and `1` as special
        // sentinel values, so they should never correspond to a real node.
        bytes::io_write_u64_le(VERSION, &mut wtr)?;
        // Similarly for 8-15 for the fst type.
        bytes::io_write_u64_le(ty, &mut wtr)?;
        Ok(Builder {
            wtr,
            unfinished: UnfinishedNodes::new(),
            registry: Registry::new(10_000, 2),
            last: None,
            last_addr: NONE_ADDRESS,
            len: 0,
        })
    }

    /// Adds a byte string to this FST with a zero output value.
    pub fn add<B>(&mut self, bs: B) -> Result<()>
    where
        B: AsRef<[u8]>,
    {
        self.check_last_key(bs.as_ref(), false)?;
        self.insert_output(bs, None)
    }

    /// Insert a new key-value pair into the fst.
    ///
    /// Keys must be convertible to byte strings. Values must be a `u64`, which
    /// is a restriction of the current implementation of finite state
    /// transducers. (Values may one day be expanded to other types.)
    ///
    /// If a key is inserted that is less than or equal to any previous key
    /// added, then an error is returned. Similarly, if there was a problem
    /// writing to the underlying writer, an error is returned.
    pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()>
    where
        B: AsRef<[u8]>,
    {
        self.check_last_key(bs.as_ref(), true)?;
        self.insert_output(bs, Some(Output::new(val)))
    }

    /// Calls insert on each item in the iterator.
    ///
    /// If an error occurred while adding an element, processing is stopped
    /// and the error is returned.
    ///
    /// If a key is inserted that is less than or equal to any previous key
    /// added, then an error is returned. Similarly, if there was a problem
    /// writing to the underlying writer, an error is returned.
    pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
    where
        T: AsRef<[u8]>,
        I: IntoIterator<Item = (T, Output)>,
    {
        for (key, out) in iter {
            self.insert(key, out.value())?;
        }
        Ok(())
    }

    /// Calls insert on each item in the stream.
    ///
    /// Note that unlike `extend_iter`, this is not generic on the items in
    /// the stream.
    ///
    /// If a key is inserted that is less than or equal to any previous key
    /// added, then an error is returned. Similarly, if there was a problem
    /// writing to the underlying writer, an error is returned.
    pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
    where
        I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
        S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
    {
        let mut stream = stream.into_stream();
        while let Some((key, out)) = stream.next() {
            self.insert(key, out.value())?;
        }
        Ok(())
    }

    /// Finishes the construction of the fst and flushes the underlying
    /// writer. After completion, the data written to `W` may be read using
    /// one of `Fst`'s constructor methods.
    pub fn finish(self) -> Result<()> {
        self.into_inner()?;
        Ok(())
    }

    /// Just like `finish`, except it returns the underlying writer after
    /// flushing it.
    pub fn into_inner(mut self) -> Result<W> {
        self.compile_from(0)?;
        let root_node = self.unfinished.pop_root();
        let root_addr = self.compile(&root_node)?;
        bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?;
        bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?;

        let sum = self.wtr.masked_checksum();
        let mut wtr = self.wtr.into_inner();
        bytes::io_write_u32_le(sum, &mut wtr)?;
        wtr.flush()?;
        Ok(wtr)
    }

    fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>
    where
        B: AsRef<[u8]>,
    {
        let bs = bs.as_ref();
        if bs.is_empty() {
            self.len = 1; // must be first key, so length is always 1
            self.unfinished.set_root_output(out.unwrap_or(Output::zero()));
            return Ok(());
        }
        let (prefix_len, out) = if let Some(out) = out {
            self.unfinished.find_common_prefix_and_set_output(bs, out)
        } else {
            (self.unfinished.find_common_prefix(bs), Output::zero())
        };
        if prefix_len == bs.len() {
            // If the prefix found consumes the entire set of bytes, then
            // the prefix *equals* the bytes given. This means it is a
            // duplicate value with no output. So we can give up here.
            //
            // If the below assert fails, then that means we let a duplicate
            // value through even when inserting outputs.
            assert!(out.is_zero());
            return Ok(());
        }
        self.len += 1;
        self.compile_from(prefix_len)?;
        self.unfinished.add_suffix(&bs[prefix_len..], out);
        Ok(())
    }

    fn compile_from(&mut self, istate: usize) -> Result<()> {
        let mut addr = NONE_ADDRESS;
        while istate + 1 < self.unfinished.len() {
            let node = if addr == NONE_ADDRESS {
                self.unfinished.pop_empty()
            } else {
                self.unfinished.pop_freeze(addr)
            };
            addr = self.compile(&node)?;
            assert!(addr != NONE_ADDRESS);
        }
        self.unfinished.top_last_freeze(addr);
        Ok(())
    }

    fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> {
        if node.is_final
            && node.trans.is_empty()
            && node.final_output.is_zero()
        {
            return Ok(EMPTY_ADDRESS);
        }
        let entry = self.registry.entry(&node);
        if let RegistryEntry::Found(ref addr) = entry {
            return Ok(*addr);
        }
        let start_addr = self.wtr.count() as CompiledAddr;
        node.compile_to(&mut self.wtr, self.last_addr, start_addr)?;
        self.last_addr = self.wtr.count() as CompiledAddr - 1;
        if let RegistryEntry::NotFound(cell) = entry {
            cell.insert(self.last_addr);
        }
        Ok(self.last_addr)
    }

    fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> {
        if let Some(ref mut last) = self.last {
            if check_dupe && bs == &**last {
                return Err(Error::DuplicateKey { got: bs.to_vec() }.into());
            }
            if bs < &**last {
                return Err(Error::OutOfOrder {
                    previous: last.to_vec(),
                    got: bs.to_vec(),
                }
                .into());
            }
            last.clear();
            for &b in bs {
                last.push(b);
            }
        } else {
            self.last = Some(bs.to_vec());
        }
        Ok(())
    }

    /// Gets a reference to the underlying writer.
    pub fn get_ref(&self) -> &W {
        self.wtr.get_ref()
    }

    /// Returns the number of bytes written to the underlying writer
    pub fn bytes_written(&self) -> u64 {
        self.wtr.count()
    }
}

impl UnfinishedNodes {
    fn new() -> UnfinishedNodes {
        let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) };
        unfinished.push_empty(false);
        unfinished
    }

    fn len(&self) -> usize {
        self.stack.len()
    }

    fn push_empty(&mut self, is_final: bool) {
        self.stack.push(BuilderNodeUnfinished {
            node: BuilderNode { is_final, ..BuilderNode::default() },
            last: None,
        });
    }

    fn pop_root(&mut self) -> BuilderNode {
        assert!(self.stack.len() == 1);
        assert!(self.stack[0].last.is_none());
        self.stack.pop().unwrap().node
    }

    fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode {
        let mut unfinished = self.stack.pop().unwrap();
        unfinished.last_compiled(addr);
        unfinished.node
    }

    fn pop_empty(&mut self) -> BuilderNode {
        let unfinished = self.stack.pop().unwrap();
        assert!(unfinished.last.is_none());
        unfinished.node
    }

    fn set_root_output(&mut self, out: Output) {
        self.stack[0].node.is_final = true;
        self.stack[0].node.final_output = out;
    }

    fn top_last_freeze(&mut self, addr: CompiledAddr) {
        let last = self.stack.len().checked_sub(1).unwrap();
        self.stack[last].last_compiled(addr);
    }

    fn add_suffix(&mut self, bs: &[u8], out: Output) {
        if bs.is_empty() {
            return;
        }
        let last = self.stack.len().checked_sub(1).unwrap();
        assert!(self.stack[last].last.is_none());
        self.stack[last].last = Some(LastTransition { inp: bs[0], out });
        for &b in &bs[1..] {
            self.stack.push(BuilderNodeUnfinished {
                node: BuilderNode::default(),
                last: Some(LastTransition { inp: b, out: Output::zero() }),
            });
        }
        self.push_empty(true);
    }

    fn find_common_prefix(&mut self, bs: &[u8]) -> usize {
        bs.iter()
            .zip(&self.stack)
            .take_while(|&(&b, ref node)| {
                node.last.as_ref().map(|t| t.inp == b).unwrap_or(false)
            })
            .count()
    }

    fn find_common_prefix_and_set_output(
        &mut self,
        bs: &[u8],
        mut out: Output,
    ) -> (usize, Output) {
        let mut i = 0;
        while i < bs.len() {
            let add_prefix = match self.stack[i].last.as_mut() {
                Some(ref mut t) if t.inp == bs[i] => {
                    i += 1;
                    let common_pre = t.out.prefix(out);
                    let add_prefix = t.out.sub(common_pre);
                    out = out.sub(common_pre);
                    t.out = common_pre;
                    add_prefix
                }
                _ => break,
            };
            if !add_prefix.is_zero() {
                self.stack[i].add_output_prefix(add_prefix);
            }
        }
        (i, out)
    }
}

impl BuilderNodeUnfinished {
    fn last_compiled(&mut self, addr: CompiledAddr) {
        if let Some(trans) = self.last.take() {
            self.node.trans.push(Transition {
                inp: trans.inp,
                out: trans.out,
                addr,
            });
        }
    }

    fn add_output_prefix(&mut self, prefix: Output) {
        if self.node.is_final {
            self.node.final_output = prefix.cat(self.node.final_output);
        }
        for t in &mut self.node.trans {
            t.out = prefix.cat(t.out);
        }
        if let Some(ref mut t) = self.last {
            t.out = prefix.cat(t.out);
        }
    }
}

impl Clone for BuilderNode {
    fn clone(&self) -> BuilderNode {
        BuilderNode {
            is_final: self.is_final,
            final_output: self.final_output,
            trans: self.trans.clone(),
        }
    }

    fn clone_from(&mut self, source: &BuilderNode) {
        self.is_final = source.is_final;
        self.final_output = source.final_output;
        self.trans.clear();
        self.trans.extend(source.trans.iter());
    }
}

impl Default for BuilderNode {
    fn default() -> BuilderNode {
        BuilderNode {
            is_final: false,
            final_output: Output::zero(),
            trans: vec![],
        }
    }
}

Coverage Report

Created: 2025-10-10 07:11

Line	Count	Source
1		use std::io;
2
3		use crate::bytes;
4		use crate::error::Result;
5		use crate::raw::counting_writer::CountingWriter;
6		use crate::raw::error::Error;
7		use crate::raw::registry::{Registry, RegistryEntry};
8		use crate::raw::{
9		CompiledAddr, Fst, FstType, Output, Transition, EMPTY_ADDRESS,
10		NONE_ADDRESS, VERSION,
11		};
12		// use raw::registry_minimal::{Registry, RegistryEntry};
13		use crate::stream::{IntoStreamer, Streamer};
14
15		/// A builder for creating a finite state transducer.
16		///
17		/// This is not your average everyday builder. It has two important qualities
18		/// that make it a bit unique from what you might expect:
19		///
20		/// 1. All keys must be added in lexicographic order. Adding a key out of order
21		/// will result in an error. Additionally, adding a duplicate key with an
22		/// output value will also result in an error. That is, once a key is
23		/// associated with a value, that association can never be modified or
24		/// deleted.
25		/// 2. The representation of an fst is streamed to any `io::Write` as it is
26		/// built. For an in memory representation, this can be a `Vec<u8>`.
27		///
28		/// Point (2) is especially important because it means that an fst can be
29		/// constructed without storing the entire fst in memory. Namely, since it
30		/// works with any `io::Write`, it can be streamed directly to a file.
31		///
32		/// With that said, the builder does use memory, but **memory usage is bounded
33		/// to a constant size**. The amount of memory used trades off with the
34		/// compression ratio. Currently, the implementation hard codes this trade off
35		/// which can result in about 5-20MB of heap usage during construction. (N.B.
36		/// Guaranteeing a maximal compression ratio requires memory proportional to
37		/// the size of the fst, which defeats some of the benefit of streaming
38		/// it to disk. In practice, a small bounded amount of memory achieves
39		/// close-to-minimal compression ratios.)
40		///
41		/// The algorithmic complexity of fst construction is `O(n)` where `n` is the
42		/// number of elements added to the fst.
43		pub struct Builder<W> {
44		/// The FST raw data is written directly to `wtr`.
45		///
46		/// No internal buffering is done.
47		wtr: CountingWriter<W>,
48		/// The stack of unfinished nodes.
49		///
50		/// An unfinished node is a node that could potentially have a new
51		/// transition added to it when a new word is added to the dictionary.
52		unfinished: UnfinishedNodes,
53		/// A map of finished nodes.
54		///
55		/// A finished node is one that has been compiled and written to `wtr`.
56		/// After this point, the node is considered immutable and will never
57		/// change.
58		registry: Registry,
59		/// The last word added.
60		///
61		/// This is used to enforce the invariant that words are added in sorted
62		/// order.
63		last: Option<Vec<u8>>,
64		/// The address of the last compiled node.
65		///
66		/// This is used to optimize states with one transition that point
67		/// to the previously compiled node. (The previously compiled node in
68		/// this case actually corresponds to the next state for the transition,
69		/// since states are compiled in reverse.)
70		last_addr: CompiledAddr,
71		/// The number of keys added.
72		len: usize,
73		}
74
75		#[derive(Debug)]
76		struct UnfinishedNodes {
77		stack: Vec<BuilderNodeUnfinished>,
78		}
79
80		#[derive(Debug)]
81		struct BuilderNodeUnfinished {
82		node: BuilderNode,
83		last: Option<LastTransition>,
84		}
85
86		#[derive(Debug, Hash, Eq, PartialEq)]
87		pub struct BuilderNode {
88		pub is_final: bool,
89		pub final_output: Output,
90		pub trans: Vec<Transition>,
91		}
92
93		#[derive(Debug)]
94		struct LastTransition {
95		inp: u8,
96		out: Output,
97		}
98
99		impl Builder<Vec<u8>> {
100		/// Create a builder that builds an fst in memory.
101		#[inline]
102	0	pub fn memory() -> Builder<Vec<u8>> {
103	0	Builder::new(Vec::with_capacity(10 * (1 << 10))).unwrap()
104	0	}
105
106		/// Finishes construction of the FST and returns it.
107		#[inline]
108	0	pub fn into_fst(self) -> Fst<Vec<u8>> {
109	0	self.into_inner().and_then(Fst::new).unwrap()
110	0	}
111		}
112
113		impl<W: io::Write> Builder<W> {
114		/// Create a builder that builds an fst by writing it to `wtr` in a
115		/// streaming fashion.
116	0	pub fn new(wtr: W) -> Result<Builder<W>> {
117	0	Builder::new_type(wtr, 0)
118	0	}
119
120		/// The same as `new`, except it sets the type of the fst to the type
121		/// given.
122	0	pub fn new_type(wtr: W, ty: FstType) -> Result<Builder<W>> {
123	0	let mut wtr = CountingWriter::new(wtr);
124		// Don't allow any nodes to have address 0-7. We use these to encode
125		// the API version. We also use addresses `0` and `1` as special
126		// sentinel values, so they should never correspond to a real node.
127	0	bytes::io_write_u64_le(VERSION, &mut wtr)?;
128		// Similarly for 8-15 for the fst type.
129	0	bytes::io_write_u64_le(ty, &mut wtr)?;
130	0	Ok(Builder {
131	0	wtr,
132	0	unfinished: UnfinishedNodes::new(),
133	0	registry: Registry::new(10_000, 2),
134	0	last: None,
135	0	last_addr: NONE_ADDRESS,
136	0	len: 0,
137	0	})
138	0	}
139
140		/// Adds a byte string to this FST with a zero output value.
141	0	pub fn add<B>(&mut self, bs: B) -> Result<()>
142	0	where
143	0	B: AsRef<[u8]>,
144		{
145	0	self.check_last_key(bs.as_ref(), false)?;
146	0	self.insert_output(bs, None)
147	0	}
148
149		/// Insert a new key-value pair into the fst.
150		///
151		/// Keys must be convertible to byte strings. Values must be a `u64`, which
152		/// is a restriction of the current implementation of finite state
153		/// transducers. (Values may one day be expanded to other types.)
154		///
155		/// If a key is inserted that is less than or equal to any previous key
156		/// added, then an error is returned. Similarly, if there was a problem
157		/// writing to the underlying writer, an error is returned.
158	0	pub fn insert<B>(&mut self, bs: B, val: u64) -> Result<()>
159	0	where
160	0	B: AsRef<[u8]>,
161		{
162	0	self.check_last_key(bs.as_ref(), true)?;
163	0	self.insert_output(bs, Some(Output::new(val)))
164	0	}
165
166		/// Calls insert on each item in the iterator.
167		///
168		/// If an error occurred while adding an element, processing is stopped
169		/// and the error is returned.
170		///
171		/// If a key is inserted that is less than or equal to any previous key
172		/// added, then an error is returned. Similarly, if there was a problem
173		/// writing to the underlying writer, an error is returned.
174	0	pub fn extend_iter<T, I>(&mut self, iter: I) -> Result<()>
175	0	where
176	0	T: AsRef<[u8]>,
177	0	I: IntoIterator<Item = (T, Output)>,
178		{
179	0	for (key, out) in iter {
180	0	self.insert(key, out.value())?;
181		}
182	0	Ok(())
183	0	}
184
185		/// Calls insert on each item in the stream.
186		///
187		/// Note that unlike `extend_iter`, this is not generic on the items in
188		/// the stream.
189		///
190		/// If a key is inserted that is less than or equal to any previous key
191		/// added, then an error is returned. Similarly, if there was a problem
192		/// writing to the underlying writer, an error is returned.
193	0	pub fn extend_stream<'f, I, S>(&mut self, stream: I) -> Result<()>
194	0	where
195	0	I: for<'a> IntoStreamer<'a, Into = S, Item = (&'a [u8], Output)>,
196	0	S: 'f + for<'a> Streamer<'a, Item = (&'a [u8], Output)>,
197		{
198	0	let mut stream = stream.into_stream();
199	0	while let Some((key, out)) = stream.next() {
200	0	self.insert(key, out.value())?;
201		}
202	0	Ok(())
203	0	}
204
205		/// Finishes the construction of the fst and flushes the underlying
206		/// writer. After completion, the data written to `W` may be read using
207		/// one of `Fst`'s constructor methods.
208	0	pub fn finish(self) -> Result<()> {
209	0	self.into_inner()?;
210	0	Ok(())
211	0	}
212
213		/// Just like `finish`, except it returns the underlying writer after
214		/// flushing it.
215	0	pub fn into_inner(mut self) -> Result<W> {
216	0	self.compile_from(0)?;
217	0	let root_node = self.unfinished.pop_root();
218	0	let root_addr = self.compile(&root_node)?;
219	0	bytes::io_write_u64_le(self.len as u64, &mut self.wtr)?;
220	0	bytes::io_write_u64_le(root_addr as u64, &mut self.wtr)?;
221
222	0	let sum = self.wtr.masked_checksum();
223	0	let mut wtr = self.wtr.into_inner();
224	0	bytes::io_write_u32_le(sum, &mut wtr)?;
225	0	wtr.flush()?;
226	0	Ok(wtr)
227	0	}
228
229	0	fn insert_output<B>(&mut self, bs: B, out: Option<Output>) -> Result<()>
230	0	where
231	0	B: AsRef<[u8]>,
232		{
233	0	let bs = bs.as_ref();
234	0	if bs.is_empty() {
235	0	self.len = 1; // must be first key, so length is always 1
236	0	self.unfinished.set_root_output(out.unwrap_or(Output::zero()));
237	0	return Ok(());
238	0	}
239	0	let (prefix_len, out) = if let Some(out) = out {
240	0	self.unfinished.find_common_prefix_and_set_output(bs, out)
241		} else {
242	0	(self.unfinished.find_common_prefix(bs), Output::zero())
243		};
244	0	if prefix_len == bs.len() {
245		// If the prefix found consumes the entire set of bytes, then
246		// the prefix equals the bytes given. This means it is a
247		// duplicate value with no output. So we can give up here.
248		//
249		// If the below assert fails, then that means we let a duplicate
250		// value through even when inserting outputs.
251	0	assert!(out.is_zero());
252	0	return Ok(());
253	0	}
254	0	self.len += 1;
255	0	self.compile_from(prefix_len)?;
256	0	self.unfinished.add_suffix(&bs[prefix_len..], out);
257	0	Ok(())
258	0	}
259
260	0	fn compile_from(&mut self, istate: usize) -> Result<()> {
261	0	let mut addr = NONE_ADDRESS;
262	0	while istate + 1 < self.unfinished.len() {
263	0	let node = if addr == NONE_ADDRESS {
264	0	self.unfinished.pop_empty()
265		} else {
266	0	self.unfinished.pop_freeze(addr)
267		};
268	0	addr = self.compile(&node)?;
269	0	assert!(addr != NONE_ADDRESS);
270		}
271	0	self.unfinished.top_last_freeze(addr);
272	0	Ok(())
273	0	}
274
275	0	fn compile(&mut self, node: &BuilderNode) -> Result<CompiledAddr> {
276	0	if node.is_final
277	0	&& node.trans.is_empty()
278	0	&& node.final_output.is_zero()
279		{
280	0	return Ok(EMPTY_ADDRESS);
281	0	}
282	0	let entry = self.registry.entry(&node);
283	0	if let RegistryEntry::Found(ref addr) = entry {
284	0	return Ok(*addr);
285	0	}
286	0	let start_addr = self.wtr.count() as CompiledAddr;
287	0	node.compile_to(&mut self.wtr, self.last_addr, start_addr)?;
288	0	self.last_addr = self.wtr.count() as CompiledAddr - 1;
289	0	if let RegistryEntry::NotFound(cell) = entry {
290	0	cell.insert(self.last_addr);
291	0	}
292	0	Ok(self.last_addr)
293	0	}
294
295	0	fn check_last_key(&mut self, bs: &[u8], check_dupe: bool) -> Result<()> {
296	0	if let Some(ref mut last) = self.last {
297	0	if check_dupe && bs == &**last {
298	0	return Err(Error::DuplicateKey { got: bs.to_vec() }.into());
299	0	}
300	0	if bs < &**last {
301	0	return Err(Error::OutOfOrder {
302	0	previous: last.to_vec(),
303	0	got: bs.to_vec(),
304	0	}
305	0	.into());
306	0	}
307	0	last.clear();
308	0	for &b in bs {
309	0	last.push(b);
310	0	}
311	0	} else {
312	0	self.last = Some(bs.to_vec());
313	0	}
314	0	Ok(())
315	0	}
316
317		/// Gets a reference to the underlying writer.
318	0	pub fn get_ref(&self) -> &W {
319	0	self.wtr.get_ref()
320	0	}
321
322		/// Returns the number of bytes written to the underlying writer
323	0	pub fn bytes_written(&self) -> u64 {
324	0	self.wtr.count()
325	0	}
326		}
327
328		impl UnfinishedNodes {
329	0	fn new() -> UnfinishedNodes {
330	0	let mut unfinished = UnfinishedNodes { stack: Vec::with_capacity(64) };
331	0	unfinished.push_empty(false);
332	0	unfinished
333	0	}
334
335	0	fn len(&self) -> usize {
336	0	self.stack.len()
337	0	}
338
339	0	fn push_empty(&mut self, is_final: bool) {
340	0	self.stack.push(BuilderNodeUnfinished {
341	0	node: BuilderNode { is_final, ..BuilderNode::default() },
342	0	last: None,
343	0	});
344	0	}
345
346	0	fn pop_root(&mut self) -> BuilderNode {
347	0	assert!(self.stack.len() == 1);
348	0	assert!(self.stack[0].last.is_none());
349	0	self.stack.pop().unwrap().node
350	0	}
351
352	0	fn pop_freeze(&mut self, addr: CompiledAddr) -> BuilderNode {
353	0	let mut unfinished = self.stack.pop().unwrap();
354	0	unfinished.last_compiled(addr);
355	0	unfinished.node
356	0	}
357
358	0	fn pop_empty(&mut self) -> BuilderNode {
359	0	let unfinished = self.stack.pop().unwrap();
360	0	assert!(unfinished.last.is_none());
361	0	unfinished.node
362	0	}
363
364	0	fn set_root_output(&mut self, out: Output) {
365	0	self.stack[0].node.is_final = true;
366	0	self.stack[0].node.final_output = out;
367	0	}
368
369	0	fn top_last_freeze(&mut self, addr: CompiledAddr) {
370	0	let last = self.stack.len().checked_sub(1).unwrap();
371	0	self.stack[last].last_compiled(addr);
372	0	}
373
374	0	fn add_suffix(&mut self, bs: &[u8], out: Output) {
375	0	if bs.is_empty() {
376	0	return;
377	0	}
378	0	let last = self.stack.len().checked_sub(1).unwrap();
379	0	assert!(self.stack[last].last.is_none());
380	0	self.stack[last].last = Some(LastTransition { inp: bs[0], out });
381	0	for &b in &bs[1..] {
382	0	self.stack.push(BuilderNodeUnfinished {
383	0	node: BuilderNode::default(),
384	0	last: Some(LastTransition { inp: b, out: Output::zero() }),
385	0	});
386	0	}
387	0	self.push_empty(true);
388	0	}
389
390	0	fn find_common_prefix(&mut self, bs: &[u8]) -> usize {
391	0	bs.iter()
392	0	.zip(&self.stack)
393	0	.take_while(\|&(&b, ref node)\| {
394	0	node.last.as_ref().map(\|t\| t.inp == b).unwrap_or(false)
395	0	})
396	0	.count()
397	0	}
398
399	0	fn find_common_prefix_and_set_output(
400	0	&mut self,
401	0	bs: &[u8],
402	0	mut out: Output,
403	0	) -> (usize, Output) {
404	0	let mut i = 0;
405	0	while i < bs.len() {
406	0	let add_prefix = match self.stack[i].last.as_mut() {
407	0	Some(ref mut t) if t.inp == bs[i] => {
408	0	i += 1;
409	0	let common_pre = t.out.prefix(out);
410	0	let add_prefix = t.out.sub(common_pre);
411	0	out = out.sub(common_pre);
412	0	t.out = common_pre;
413	0	add_prefix
414		}
415	0	_ => break,
416		};
417	0	if !add_prefix.is_zero() {
418	0	self.stack[i].add_output_prefix(add_prefix);
419	0	}
420		}
421	0	(i, out)
422	0	}
423		}
424
425		impl BuilderNodeUnfinished {
426	0	fn last_compiled(&mut self, addr: CompiledAddr) {
427	0	if let Some(trans) = self.last.take() {
428	0	self.node.trans.push(Transition {
429	0	inp: trans.inp,
430	0	out: trans.out,
431	0	addr,
432	0	});
433	0	}
434	0	}
435
436	0	fn add_output_prefix(&mut self, prefix: Output) {
437	0	if self.node.is_final {
438	0	self.node.final_output = prefix.cat(self.node.final_output);
439	0	}
440	0	for t in &mut self.node.trans {
441	0	t.out = prefix.cat(t.out);
442	0	}
443	0	if let Some(ref mut t) = self.last {
444	0	t.out = prefix.cat(t.out);
445	0	}
446	0	}
447		}
448
449		impl Clone for BuilderNode {
450	0	fn clone(&self) -> BuilderNode {
451	0	BuilderNode {
452	0	is_final: self.is_final,
453	0	final_output: self.final_output,
454	0	trans: self.trans.clone(),
455	0	}
456	0	}
457
458	0	fn clone_from(&mut self, source: &BuilderNode) {
459	0	self.is_final = source.is_final;
460	0	self.final_output = source.final_output;
461	0	self.trans.clear();
462	0	self.trans.extend(source.trans.iter());
463	0	}
464		}
465
466		impl Default for BuilderNode {
467	0	fn default() -> BuilderNode {
468	0	BuilderNode {
469	0	is_final: false,
470	0	final_output: Output::zero(),
471	0	trans: vec![],
472	0	}
473	0	}
474		}