/rust/registry/src/index.crates.io-1949cf8c6b5b557f/fst-0.4.7/src/raw/node.rs
Line | Count | Source |
1 | | use std::cmp; |
2 | | use std::fmt; |
3 | | use std::io; |
4 | | use std::ops::Range; |
5 | | |
6 | | use crate::bytes; |
7 | | use crate::raw::build::BuilderNode; |
8 | | use crate::raw::common_inputs::{COMMON_INPUTS, COMMON_INPUTS_INV}; |
9 | | use crate::raw::{ |
10 | | u64_to_usize, CompiledAddr, Output, Transition, EMPTY_ADDRESS, |
11 | | }; |
12 | | |
13 | | /// The threshold (in number of transitions) at which an index is created for |
14 | | /// a node's transitions. This speeds up lookup time at the expense of FST |
15 | | /// size. |
16 | | const TRANS_INDEX_THRESHOLD: usize = 32; |
17 | | |
18 | | /// Node represents a single state in a finite state transducer. |
19 | | /// |
20 | | /// Nodes are very cheap to construct. Notably, they satisfy the `Copy` trait. |
21 | | #[derive(Clone, Copy)] |
22 | | pub struct Node<'f> { |
23 | | data: &'f [u8], |
24 | | version: u64, |
25 | | state: State, |
26 | | start: CompiledAddr, |
27 | | end: usize, |
28 | | is_final: bool, |
29 | | ntrans: usize, |
30 | | sizes: PackSizes, |
31 | | final_output: Output, |
32 | | } |
33 | | |
34 | | impl<'f> fmt::Debug for Node<'f> { |
35 | 0 | fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { |
36 | 0 | writeln!(f, "NODE@{}", self.start)?; |
37 | 0 | writeln!(f, " end_addr: {}", self.end)?; |
38 | 0 | writeln!(f, " size: {} bytes", self.as_slice().len())?; |
39 | 0 | writeln!(f, " state: {:?}", self.state)?; |
40 | 0 | writeln!(f, " is_final: {}", self.is_final())?; |
41 | 0 | writeln!(f, " final_output: {:?}", self.final_output())?; |
42 | 0 | writeln!(f, " # transitions: {}", self.len())?; |
43 | 0 | writeln!(f, " transitions:")?; |
44 | 0 | for t in self.transitions() { |
45 | 0 | writeln!(f, " {:?}", t)?; |
46 | | } |
47 | 0 | Ok(()) |
48 | 0 | } |
49 | | } |
50 | | |
51 | | impl<'f> Node<'f> { |
52 | | /// Creates a new note at the address given. |
53 | | /// |
54 | | /// `data` should be a slice to an entire FST. |
55 | 0 | pub(crate) fn new( |
56 | 0 | version: u64, |
57 | 0 | addr: CompiledAddr, |
58 | 0 | data: &[u8], |
59 | 0 | ) -> Node<'_> { |
60 | 0 | let state = State::new(data, addr); |
61 | 0 | match state { |
62 | 0 | State::EmptyFinal => Node { |
63 | 0 | data: &[], |
64 | 0 | version, |
65 | 0 | state: State::EmptyFinal, |
66 | 0 | start: EMPTY_ADDRESS, |
67 | 0 | end: EMPTY_ADDRESS, |
68 | 0 | is_final: true, |
69 | 0 | ntrans: 0, |
70 | 0 | sizes: PackSizes::new(), |
71 | 0 | final_output: Output::zero(), |
72 | 0 | }, |
73 | 0 | State::OneTransNext(s) => { |
74 | 0 | let data = &data[..addr + 1]; |
75 | 0 | Node { |
76 | 0 | data, |
77 | 0 | version, |
78 | 0 | state, |
79 | 0 | start: addr, |
80 | 0 | end: s.end_addr(data), |
81 | 0 | is_final: false, |
82 | 0 | sizes: PackSizes::new(), |
83 | 0 | ntrans: 1, |
84 | 0 | final_output: Output::zero(), |
85 | 0 | } |
86 | | } |
87 | 0 | State::OneTrans(s) => { |
88 | 0 | let data = &data[..addr + 1]; |
89 | 0 | let sizes = s.sizes(data); |
90 | 0 | Node { |
91 | 0 | data, |
92 | 0 | version, |
93 | 0 | state, |
94 | 0 | start: addr, |
95 | 0 | end: s.end_addr(data, sizes), |
96 | 0 | is_final: false, |
97 | 0 | ntrans: 1, |
98 | 0 | sizes, |
99 | 0 | final_output: Output::zero(), |
100 | 0 | } |
101 | | } |
102 | 0 | State::AnyTrans(s) => { |
103 | 0 | let data = &data[..addr + 1]; |
104 | 0 | let sizes = s.sizes(data); |
105 | 0 | let ntrans = s.ntrans(data); |
106 | 0 | Node { |
107 | 0 | data, |
108 | 0 | version, |
109 | 0 | state, |
110 | 0 | start: addr, |
111 | 0 | end: s.end_addr(version, data, sizes, ntrans), |
112 | 0 | is_final: s.is_final_state(), |
113 | 0 | ntrans, |
114 | 0 | sizes, |
115 | 0 | final_output: s.final_output(version, data, sizes, ntrans), |
116 | 0 | } |
117 | | } |
118 | | } |
119 | 0 | } |
120 | | /// Returns an iterator over all transitions in this node in lexicographic |
121 | | /// order. |
122 | | #[inline] |
123 | 0 | pub fn transitions<'n>(&'n self) -> Transitions<'f, 'n> { |
124 | 0 | Transitions { node: self, range: 0..self.len() } |
125 | 0 | } |
126 | | |
127 | | /// Returns the transition at index `i`. |
128 | | #[inline(always)] |
129 | 0 | pub fn transition(&self, i: usize) -> Transition { |
130 | | // The `inline(always)` annotation on this function appears to |
131 | | // dramatically speed up FST traversal. In particular, measuring the |
132 | | // time it takes to run `fst range something-big.fst` shows almost a 2x |
133 | | // improvement with this single `inline(always)` annotation. |
134 | | |
135 | 0 | match self.state { |
136 | 0 | State::OneTransNext(s) => { |
137 | 0 | assert!(i == 0); |
138 | 0 | Transition { |
139 | 0 | inp: s.input(self), |
140 | 0 | out: Output::zero(), |
141 | 0 | addr: s.trans_addr(self), |
142 | 0 | } |
143 | | } |
144 | 0 | State::OneTrans(s) => { |
145 | 0 | assert!(i == 0); |
146 | 0 | Transition { |
147 | 0 | inp: s.input(self), |
148 | 0 | out: s.output(self), |
149 | 0 | addr: s.trans_addr(self), |
150 | 0 | } |
151 | | } |
152 | 0 | State::AnyTrans(s) => Transition { |
153 | 0 | inp: s.input(self, i), |
154 | 0 | out: s.output(self, i), |
155 | 0 | addr: s.trans_addr(self, i), |
156 | 0 | }, |
157 | 0 | State::EmptyFinal => panic!("out of bounds"), |
158 | | } |
159 | 0 | } |
160 | | |
161 | | /// Returns the transition address of the `i`th transition. |
162 | | #[inline] |
163 | 0 | pub fn transition_addr(&self, i: usize) -> CompiledAddr { |
164 | 0 | match self.state { |
165 | 0 | State::OneTransNext(s) => { |
166 | 0 | assert!(i == 0); |
167 | 0 | s.trans_addr(self) |
168 | | } |
169 | 0 | State::OneTrans(s) => { |
170 | 0 | assert!(i == 0); |
171 | 0 | s.trans_addr(self) |
172 | | } |
173 | 0 | State::AnyTrans(s) => s.trans_addr(self, i), |
174 | 0 | State::EmptyFinal => panic!("out of bounds"), |
175 | | } |
176 | 0 | } |
177 | | |
178 | | /// Finds the `i`th transition corresponding to the given input byte. |
179 | | /// |
180 | | /// If no transition for this byte exists, then `None` is returned. |
181 | | #[inline] |
182 | 0 | pub fn find_input(&self, b: u8) -> Option<usize> { |
183 | 0 | match self.state { |
184 | 0 | State::OneTransNext(s) if s.input(self) == b => Some(0), |
185 | 0 | State::OneTransNext(_) => None, |
186 | 0 | State::OneTrans(s) if s.input(self) == b => Some(0), |
187 | 0 | State::OneTrans(_) => None, |
188 | 0 | State::AnyTrans(s) => s.find_input(self, b), |
189 | 0 | State::EmptyFinal => None, |
190 | | } |
191 | 0 | } |
192 | | |
193 | | /// If this node is final and has a terminal output value, then it is |
194 | | /// returned. Otherwise, a zero output is returned. |
195 | | #[inline] |
196 | 0 | pub fn final_output(&self) -> Output { |
197 | 0 | self.final_output |
198 | 0 | } |
199 | | |
200 | | /// Returns true if and only if this node corresponds to a final or "match" |
201 | | /// state in the finite state transducer. |
202 | | #[inline] |
203 | 0 | pub fn is_final(&self) -> bool { |
204 | 0 | self.is_final |
205 | 0 | } |
206 | | |
207 | | /// Returns the number of transitions in this node. |
208 | | /// |
209 | | /// The maximum number of transitions is 256. |
210 | | #[inline] |
211 | 0 | pub fn len(&self) -> usize { |
212 | 0 | self.ntrans |
213 | 0 | } |
214 | | |
215 | | /// Returns true if and only if this node has zero transitions. |
216 | | #[inline] |
217 | 0 | pub fn is_empty(&self) -> bool { |
218 | 0 | self.ntrans == 0 |
219 | 0 | } |
220 | | |
221 | | /// Return the address of this node. |
222 | | #[inline] |
223 | 0 | pub fn addr(&self) -> CompiledAddr { |
224 | 0 | self.start |
225 | 0 | } |
226 | | |
227 | | #[doc(hidden)] |
228 | | #[inline] |
229 | 0 | pub fn as_slice(&self) -> &[u8] { |
230 | 0 | &self.data[self.end..] |
231 | 0 | } |
232 | | |
233 | | #[doc(hidden)] |
234 | | #[inline] |
235 | 0 | pub fn state(&self) -> &'static str { |
236 | 0 | match self.state { |
237 | 0 | State::OneTransNext(_) => "OTN", |
238 | 0 | State::OneTrans(_) => "OT", |
239 | 0 | State::AnyTrans(_) => "AT", |
240 | 0 | State::EmptyFinal => "EF", |
241 | | } |
242 | 0 | } |
243 | | |
244 | 0 | fn compile<W: io::Write>( |
245 | 0 | wtr: W, |
246 | 0 | last_addr: CompiledAddr, |
247 | 0 | addr: CompiledAddr, |
248 | 0 | node: &BuilderNode, |
249 | 0 | ) -> io::Result<()> { |
250 | 0 | assert!(node.trans.len() <= 256); |
251 | 0 | if node.trans.is_empty() |
252 | 0 | && node.is_final |
253 | 0 | && node.final_output.is_zero() |
254 | | { |
255 | 0 | return Ok(()); |
256 | 0 | } else if node.trans.len() != 1 || node.is_final { |
257 | 0 | StateAnyTrans::compile(wtr, addr, node) |
258 | | } else { |
259 | 0 | if node.trans[0].addr == last_addr && node.trans[0].out.is_zero() { |
260 | 0 | StateOneTransNext::compile(wtr, addr, node.trans[0].inp) |
261 | | } else { |
262 | 0 | StateOneTrans::compile(wtr, addr, node.trans[0]) |
263 | | } |
264 | | } |
265 | 0 | } |
266 | | } |
267 | | |
268 | | impl BuilderNode { |
269 | 0 | pub fn compile_to<W: io::Write>( |
270 | 0 | &self, |
271 | 0 | wtr: W, |
272 | 0 | last_addr: CompiledAddr, |
273 | 0 | addr: CompiledAddr, |
274 | 0 | ) -> io::Result<()> { |
275 | 0 | Node::compile(wtr, last_addr, addr, self) |
276 | 0 | } |
277 | | } |
278 | | |
279 | | #[derive(Clone, Copy, Debug)] |
280 | | enum State { |
281 | | OneTransNext(StateOneTransNext), |
282 | | OneTrans(StateOneTrans), |
283 | | AnyTrans(StateAnyTrans), |
284 | | EmptyFinal, |
285 | | } |
286 | | |
287 | | // one trans flag (1), next flag (1), common input |
288 | | #[derive(Clone, Copy, Debug)] |
289 | | struct StateOneTransNext(u8); |
290 | | // one trans flag (1), next flag (0), common input |
291 | | #[derive(Clone, Copy, Debug)] |
292 | | struct StateOneTrans(u8); |
293 | | // one trans flag (0), final flag, # transitions |
294 | | #[derive(Clone, Copy, Debug)] |
295 | | struct StateAnyTrans(u8); |
296 | | |
297 | | impl State { |
298 | 0 | fn new(data: &[u8], addr: CompiledAddr) -> State { |
299 | 0 | if addr == EMPTY_ADDRESS { |
300 | 0 | return State::EmptyFinal; |
301 | 0 | } |
302 | 0 | let v = data[addr]; |
303 | 0 | match (v & 0b11_000000) >> 6 { |
304 | 0 | 0b11 => State::OneTransNext(StateOneTransNext(v)), |
305 | 0 | 0b10 => State::OneTrans(StateOneTrans(v)), |
306 | 0 | _ => State::AnyTrans(StateAnyTrans(v)), |
307 | | } |
308 | 0 | } |
309 | | } |
310 | | |
311 | | impl StateOneTransNext { |
312 | 0 | fn compile<W: io::Write>( |
313 | 0 | mut wtr: W, |
314 | 0 | _: CompiledAddr, |
315 | 0 | input: u8, |
316 | 0 | ) -> io::Result<()> { |
317 | 0 | let mut state = StateOneTransNext::new(); |
318 | 0 | state.set_common_input(input); |
319 | 0 | if state.common_input().is_none() { |
320 | 0 | wtr.write_all(&[input])?; |
321 | 0 | } |
322 | 0 | wtr.write_all(&[state.0])?; |
323 | 0 | Ok(()) |
324 | 0 | } |
325 | | |
326 | | #[inline] |
327 | 0 | fn new() -> StateOneTransNext { |
328 | 0 | StateOneTransNext(0b11_000000) |
329 | 0 | } |
330 | | |
331 | | #[inline] |
332 | 0 | fn set_common_input(&mut self, input: u8) { |
333 | 0 | self.0 = (self.0 & 0b11_000000) | common_idx(input, 0b111111); |
334 | 0 | } |
335 | | |
336 | | #[inline] |
337 | 0 | fn common_input(&self) -> Option<u8> { |
338 | 0 | common_input(self.0 & 0b00_111111) |
339 | 0 | } |
340 | | |
341 | | #[inline] |
342 | 0 | fn input_len(&self) -> usize { |
343 | 0 | if self.common_input().is_none() { |
344 | 0 | 1 |
345 | | } else { |
346 | 0 | 0 |
347 | | } |
348 | 0 | } |
349 | | |
350 | | #[inline] |
351 | 0 | fn end_addr(&self, data: &[u8]) -> usize { |
352 | 0 | data.len() - 1 - self.input_len() |
353 | 0 | } |
354 | | |
355 | | #[inline] |
356 | 0 | fn input(&self, node: &Node<'_>) -> u8 { |
357 | 0 | if let Some(inp) = self.common_input() { |
358 | 0 | inp |
359 | | } else { |
360 | 0 | node.data[node.start - 1] |
361 | | } |
362 | 0 | } |
363 | | |
364 | | #[inline] |
365 | 0 | fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr { |
366 | 0 | node.end as CompiledAddr - 1 |
367 | 0 | } |
368 | | } |
369 | | |
370 | | impl StateOneTrans { |
371 | 0 | fn compile<W: io::Write>( |
372 | 0 | mut wtr: W, |
373 | 0 | addr: CompiledAddr, |
374 | 0 | trans: Transition, |
375 | 0 | ) -> io::Result<()> { |
376 | 0 | let out = trans.out.value(); |
377 | 0 | let output_pack_size = |
378 | 0 | if out == 0 { 0 } else { bytes::pack_uint(&mut wtr, out)? }; |
379 | 0 | let trans_pack_size = pack_delta(&mut wtr, addr, trans.addr)?; |
380 | | |
381 | 0 | let mut pack_sizes = PackSizes::new(); |
382 | 0 | pack_sizes.set_output_pack_size(output_pack_size); |
383 | 0 | pack_sizes.set_transition_pack_size(trans_pack_size); |
384 | 0 | wtr.write_all(&[pack_sizes.encode()])?; |
385 | | |
386 | 0 | let mut state = StateOneTrans::new(); |
387 | 0 | state.set_common_input(trans.inp); |
388 | 0 | if state.common_input().is_none() { |
389 | 0 | wtr.write_all(&[trans.inp])?; |
390 | 0 | } |
391 | 0 | wtr.write_all(&[state.0])?; |
392 | 0 | Ok(()) |
393 | 0 | } |
394 | | |
395 | | #[inline] |
396 | 0 | fn new() -> StateOneTrans { |
397 | 0 | StateOneTrans(0b10_000000) |
398 | 0 | } |
399 | | |
400 | | #[inline] |
401 | 0 | fn set_common_input(&mut self, input: u8) { |
402 | 0 | self.0 = (self.0 & 0b10_000000) | common_idx(input, 0b111111); |
403 | 0 | } |
404 | | |
405 | | #[inline] |
406 | 0 | fn common_input(&self) -> Option<u8> { |
407 | 0 | common_input(self.0 & 0b00_111111) |
408 | 0 | } |
409 | | |
410 | | #[inline] |
411 | 0 | fn input_len(&self) -> usize { |
412 | 0 | if self.common_input().is_none() { |
413 | 0 | 1 |
414 | | } else { |
415 | 0 | 0 |
416 | | } |
417 | 0 | } |
418 | | |
419 | | #[inline] |
420 | 0 | fn sizes(&self, data: &[u8]) -> PackSizes { |
421 | 0 | let i = data.len() - 1 - self.input_len() - 1; |
422 | 0 | PackSizes::decode(data[i]) |
423 | 0 | } |
424 | | |
425 | | #[inline] |
426 | 0 | fn end_addr(&self, data: &[u8], sizes: PackSizes) -> usize { |
427 | 0 | data.len() - 1 |
428 | 0 | - self.input_len() |
429 | 0 | - 1 // pack size |
430 | 0 | - sizes.transition_pack_size() |
431 | 0 | - sizes.output_pack_size() |
432 | 0 | } |
433 | | |
434 | | #[inline] |
435 | 0 | fn input(&self, node: &Node<'_>) -> u8 { |
436 | 0 | if let Some(inp) = self.common_input() { |
437 | 0 | inp |
438 | | } else { |
439 | 0 | node.data[node.start - 1] |
440 | | } |
441 | 0 | } |
442 | | |
443 | | #[inline] |
444 | 0 | fn output(&self, node: &Node<'_>) -> Output { |
445 | 0 | let osize = node.sizes.output_pack_size(); |
446 | 0 | if osize == 0 { |
447 | 0 | return Output::zero(); |
448 | 0 | } |
449 | 0 | let tsize = node.sizes.transition_pack_size(); |
450 | 0 | let i = node.start |
451 | 0 | - self.input_len() |
452 | 0 | - 1 // pack size |
453 | 0 | - tsize - osize; |
454 | 0 | Output::new(bytes::unpack_uint(&node.data[i..], osize as u8)) |
455 | 0 | } |
456 | | |
457 | | #[inline] |
458 | 0 | fn trans_addr(&self, node: &Node<'_>) -> CompiledAddr { |
459 | 0 | let tsize = node.sizes.transition_pack_size(); |
460 | 0 | let i = node.start |
461 | 0 | - self.input_len() |
462 | 0 | - 1 // pack size |
463 | 0 | - tsize; |
464 | 0 | unpack_delta(&node.data[i..], tsize, node.end) |
465 | 0 | } |
466 | | } |
467 | | |
468 | | impl StateAnyTrans { |
469 | 0 | fn compile<W: io::Write>( |
470 | 0 | mut wtr: W, |
471 | 0 | addr: CompiledAddr, |
472 | 0 | node: &BuilderNode, |
473 | 0 | ) -> io::Result<()> { |
474 | 0 | assert!(node.trans.len() <= 256); |
475 | | |
476 | 0 | let mut tsize = 0; |
477 | 0 | let mut osize = bytes::pack_size(node.final_output.value()); |
478 | 0 | let mut any_outs = !node.final_output.is_zero(); |
479 | 0 | for t in &node.trans { |
480 | 0 | tsize = cmp::max(tsize, pack_delta_size(addr, t.addr)); |
481 | 0 | osize = cmp::max(osize, bytes::pack_size(t.out.value())); |
482 | 0 | any_outs = any_outs || !t.out.is_zero(); |
483 | | } |
484 | | |
485 | 0 | let mut pack_sizes = PackSizes::new(); |
486 | 0 | if any_outs { |
487 | 0 | pack_sizes.set_output_pack_size(osize); |
488 | 0 | } else { |
489 | 0 | pack_sizes.set_output_pack_size(0); |
490 | 0 | } |
491 | 0 | pack_sizes.set_transition_pack_size(tsize); |
492 | | |
493 | 0 | let mut state = StateAnyTrans::new(); |
494 | 0 | state.set_final_state(node.is_final); |
495 | 0 | state.set_state_ntrans(node.trans.len() as u8); |
496 | | |
497 | 0 | if any_outs { |
498 | 0 | if node.is_final { |
499 | 0 | bytes::pack_uint_in( |
500 | 0 | &mut wtr, |
501 | 0 | node.final_output.value(), |
502 | 0 | osize, |
503 | 0 | )?; |
504 | 0 | } |
505 | 0 | for t in node.trans.iter().rev() { |
506 | 0 | bytes::pack_uint_in(&mut wtr, t.out.value(), osize)?; |
507 | | } |
508 | 0 | } |
509 | 0 | for t in node.trans.iter().rev() { |
510 | 0 | pack_delta_in(&mut wtr, addr, t.addr, tsize)?; |
511 | | } |
512 | 0 | for t in node.trans.iter().rev() { |
513 | 0 | wtr.write_all(&[t.inp])?; |
514 | | } |
515 | 0 | if node.trans.len() > TRANS_INDEX_THRESHOLD { |
516 | | // A value of 255 indicates that no transition exists for the byte |
517 | | // at that index. (Except when there are 256 transitions.) Namely, |
518 | | // any value greater than or equal to the number of transitions in |
519 | | // this node indicates an absent transition. |
520 | 0 | let mut index = [255u8; 256]; |
521 | 0 | for (i, t) in node.trans.iter().enumerate() { |
522 | 0 | index[t.inp as usize] = i as u8; |
523 | 0 | } |
524 | 0 | wtr.write_all(&index)?; |
525 | 0 | } |
526 | | |
527 | 0 | wtr.write_all(&[pack_sizes.encode()])?; |
528 | 0 | if state.state_ntrans().is_none() { |
529 | 0 | if node.trans.len() == 256 { |
530 | | // 256 can't be represented in a u8, so we abuse the fact that |
531 | | // the # of transitions can never be 1 here, since 1 is always |
532 | | // encoded in the state byte. |
533 | 0 | wtr.write_all(&[1])?; |
534 | | } else { |
535 | 0 | wtr.write_all(&[node.trans.len() as u8])?; |
536 | | } |
537 | 0 | } |
538 | 0 | wtr.write_all(&[state.0])?; |
539 | 0 | Ok(()) |
540 | 0 | } |
541 | | |
542 | | #[inline] |
543 | 0 | fn new() -> StateAnyTrans { |
544 | 0 | StateAnyTrans(0b00_000000) |
545 | 0 | } |
546 | | |
547 | | #[inline] |
548 | 0 | fn set_final_state(&mut self, yes: bool) { |
549 | 0 | if yes { |
550 | 0 | self.0 |= 0b01_000000; |
551 | 0 | } |
552 | 0 | } |
553 | | |
554 | | #[inline] |
555 | 0 | fn is_final_state(&self) -> bool { |
556 | 0 | self.0 & 0b01_000000 == 0b01_000000 |
557 | 0 | } |
558 | | |
559 | | #[inline] |
560 | 0 | fn set_state_ntrans(&mut self, n: u8) { |
561 | 0 | if n <= 0b00_111111 { |
562 | 0 | self.0 = (self.0 & 0b11_000000) | n; |
563 | 0 | } |
564 | 0 | } |
565 | | |
566 | | #[inline] |
567 | 0 | fn state_ntrans(&self) -> Option<u8> { |
568 | 0 | let n = self.0 & 0b00_111111; |
569 | 0 | if n == 0 { |
570 | 0 | None |
571 | | } else { |
572 | 0 | Some(n) |
573 | | } |
574 | 0 | } |
575 | | |
576 | | #[inline] |
577 | 0 | fn sizes(&self, data: &[u8]) -> PackSizes { |
578 | 0 | let i = data.len() - 1 - self.ntrans_len() - 1; |
579 | 0 | PackSizes::decode(data[i]) |
580 | 0 | } |
581 | | |
582 | | #[inline] |
583 | 0 | fn total_trans_size( |
584 | 0 | &self, |
585 | 0 | version: u64, |
586 | 0 | sizes: PackSizes, |
587 | 0 | ntrans: usize, |
588 | 0 | ) -> usize { |
589 | 0 | let index_size = self.trans_index_size(version, ntrans); |
590 | 0 | ntrans + (ntrans * sizes.transition_pack_size()) + index_size |
591 | 0 | } |
592 | | |
593 | | #[inline] |
594 | 0 | fn trans_index_size(&self, version: u64, ntrans: usize) -> usize { |
595 | 0 | if version >= 2 && ntrans > TRANS_INDEX_THRESHOLD { |
596 | 0 | 256 |
597 | | } else { |
598 | 0 | 0 |
599 | | } |
600 | 0 | } |
601 | | |
602 | | #[inline] |
603 | 0 | fn ntrans_len(&self) -> usize { |
604 | 0 | if self.state_ntrans().is_none() { |
605 | 0 | 1 |
606 | | } else { |
607 | 0 | 0 |
608 | | } |
609 | 0 | } |
610 | | |
611 | | #[inline] |
612 | 0 | fn ntrans(&self, data: &[u8]) -> usize { |
613 | 0 | if let Some(n) = self.state_ntrans() { |
614 | 0 | n as usize |
615 | | } else { |
616 | 0 | let n = data[data.len() - 2] as usize; |
617 | 0 | if n == 1 { |
618 | | // "1" is never a normal legal value here, because if there |
619 | | // is only 1 transition, then it is encoded in the state byte. |
620 | 0 | 256 |
621 | | } else { |
622 | 0 | n |
623 | | } |
624 | | } |
625 | 0 | } |
626 | | |
627 | | #[inline] |
628 | 0 | fn final_output( |
629 | 0 | &self, |
630 | 0 | version: u64, |
631 | 0 | data: &[u8], |
632 | 0 | sizes: PackSizes, |
633 | 0 | ntrans: usize, |
634 | 0 | ) -> Output { |
635 | 0 | let osize = sizes.output_pack_size(); |
636 | 0 | if osize == 0 || !self.is_final_state() { |
637 | 0 | return Output::zero(); |
638 | 0 | } |
639 | 0 | let at = data.len() - 1 |
640 | 0 | - self.ntrans_len() |
641 | 0 | - 1 // pack size |
642 | 0 | - self.total_trans_size(version, sizes, ntrans) |
643 | 0 | - (ntrans * osize) // output values |
644 | 0 | - osize; // the desired output value |
645 | 0 | Output::new(bytes::unpack_uint(&data[at..], osize as u8)) |
646 | 0 | } |
647 | | |
648 | | #[inline] |
649 | 0 | fn end_addr( |
650 | 0 | &self, |
651 | 0 | version: u64, |
652 | 0 | data: &[u8], |
653 | 0 | sizes: PackSizes, |
654 | 0 | ntrans: usize, |
655 | 0 | ) -> usize { |
656 | 0 | let osize = sizes.output_pack_size(); |
657 | 0 | let final_osize = if !self.is_final_state() { 0 } else { osize }; |
658 | 0 | data.len() - 1 |
659 | 0 | - self.ntrans_len() |
660 | 0 | - 1 // pack size |
661 | 0 | - self.total_trans_size(version, sizes, ntrans) |
662 | 0 | - (ntrans * osize) // output values |
663 | 0 | - final_osize // final output |
664 | 0 | } |
665 | | |
666 | | #[inline] |
667 | 0 | fn trans_addr(&self, node: &Node<'_>, i: usize) -> CompiledAddr { |
668 | 0 | assert!(i < node.ntrans); |
669 | 0 | let tsize = node.sizes.transition_pack_size(); |
670 | 0 | let at = node.start |
671 | 0 | - self.ntrans_len() |
672 | 0 | - 1 // pack size |
673 | 0 | - self.trans_index_size(node.version, node.ntrans) |
674 | 0 | - node.ntrans // inputs |
675 | 0 | - (i * tsize) // the previous transition addresses |
676 | 0 | - tsize; // the desired transition address |
677 | 0 | unpack_delta(&node.data[at..], tsize, node.end) |
678 | 0 | } |
679 | | |
680 | | #[inline] |
681 | 0 | fn input(&self, node: &Node<'_>, i: usize) -> u8 { |
682 | 0 | let at = node.start |
683 | 0 | - self.ntrans_len() |
684 | 0 | - 1 // pack size |
685 | 0 | - self.trans_index_size(node.version, node.ntrans) |
686 | 0 | - i |
687 | 0 | - 1; // the input byte |
688 | 0 | node.data[at] |
689 | 0 | } |
690 | | |
691 | | #[inline] |
692 | 0 | fn find_input(&self, node: &Node<'_>, b: u8) -> Option<usize> { |
693 | 0 | if node.version >= 2 && node.ntrans > TRANS_INDEX_THRESHOLD { |
694 | 0 | let start = node.start |
695 | 0 | - self.ntrans_len() |
696 | 0 | - 1 // pack size |
697 | 0 | - self.trans_index_size(node.version, node.ntrans); |
698 | 0 | let i = node.data[start + b as usize] as usize; |
699 | 0 | if i >= node.ntrans { |
700 | 0 | None |
701 | | } else { |
702 | 0 | Some(i) |
703 | | } |
704 | | } else { |
705 | 0 | let start = node.start |
706 | 0 | - self.ntrans_len() |
707 | 0 | - 1 // pack size |
708 | 0 | - node.ntrans; // inputs |
709 | 0 | let end = start + node.ntrans; |
710 | 0 | let inputs = &node.data[start..end]; |
711 | 0 | inputs.iter().position(|&b2| b == b2).map(|i| node.ntrans - i - 1) |
712 | | } |
713 | 0 | } |
714 | | |
715 | | #[inline] |
716 | 0 | fn output(&self, node: &Node<'_>, i: usize) -> Output { |
717 | 0 | let osize = node.sizes.output_pack_size(); |
718 | 0 | if osize == 0 { |
719 | 0 | return Output::zero(); |
720 | 0 | } |
721 | 0 | let at = node.start |
722 | 0 | - self.ntrans_len() |
723 | 0 | - 1 // pack size |
724 | 0 | - self.total_trans_size(node.version, node.sizes, node.ntrans) |
725 | 0 | - (i * osize) // the previous outputs |
726 | 0 | - osize; // the desired output value |
727 | 0 | Output::new(bytes::unpack_uint(&node.data[at..], osize as u8)) |
728 | 0 | } |
729 | | } |
730 | | |
731 | | // high 4 bits is transition address packed size. |
732 | | // low 4 bits is output value packed size. |
733 | | // |
734 | | // `0` is a legal value which means there are no transitions/outputs. |
735 | | #[derive(Clone, Copy, Debug)] |
736 | | struct PackSizes(u8); |
737 | | |
738 | | impl PackSizes { |
739 | | #[inline] |
740 | 0 | fn new() -> PackSizes { |
741 | 0 | PackSizes(0) |
742 | 0 | } |
743 | | |
744 | | #[inline] |
745 | 0 | fn decode(v: u8) -> PackSizes { |
746 | 0 | PackSizes(v) |
747 | 0 | } |
748 | | |
749 | | #[inline] |
750 | 0 | fn encode(&self) -> u8 { |
751 | 0 | self.0 |
752 | 0 | } |
753 | | |
754 | | #[inline] |
755 | 0 | fn set_transition_pack_size(&mut self, size: u8) { |
756 | 0 | assert!(size <= 8); |
757 | 0 | self.0 = (self.0 & 0b0000_1111) | (size << 4); |
758 | 0 | } |
759 | | |
760 | | #[inline] |
761 | 0 | fn transition_pack_size(&self) -> usize { |
762 | 0 | ((self.0 & 0b1111_0000) >> 4) as usize |
763 | 0 | } |
764 | | |
765 | | #[inline] |
766 | 0 | fn set_output_pack_size(&mut self, size: u8) { |
767 | 0 | assert!(size <= 8); |
768 | 0 | self.0 = (self.0 & 0b1111_0000) | size; |
769 | 0 | } |
770 | | |
771 | | #[inline] |
772 | 0 | fn output_pack_size(&self) -> usize { |
773 | 0 | (self.0 & 0b0000_1111) as usize |
774 | 0 | } |
775 | | } |
776 | | |
777 | | /// An iterator over all transitions in a node. |
778 | | /// |
779 | | /// `'f` is the lifetime of the underlying fst and `'n` is the lifetime of |
780 | | /// the underlying `Node`. |
781 | | pub struct Transitions<'f, 'n> { |
782 | | node: &'n Node<'f>, |
783 | | range: Range<usize>, |
784 | | } |
785 | | |
786 | | impl<'f, 'n> Iterator for Transitions<'f, 'n> { |
787 | | type Item = Transition; |
788 | | |
789 | | #[inline] |
790 | 0 | fn next(&mut self) -> Option<Transition> { |
791 | 0 | self.range.next().map(|i| self.node.transition(i)) |
792 | 0 | } |
793 | | } |
794 | | |
795 | | /// common_idx translate a byte to an index in the COMMON_INPUTS_INV array. |
796 | | /// |
797 | | /// I wonder if it would be prudent to store this mapping in the FST itself. |
798 | | /// The advantage of doing so would mean that common inputs would reflect the |
799 | | /// specific data in the FST. The problem of course is that this table has to |
800 | | /// be computed up front, which is pretty much at odds with the streaming |
801 | | /// nature of the builder. |
802 | | /// |
803 | | /// Nevertheless, the *caller* may have a priori knowledge that could be |
804 | | /// supplied to the builder manually, which could then be embedded in the FST. |
805 | | #[inline] |
806 | 0 | fn common_idx(input: u8, max: u8) -> u8 { |
807 | 0 | let val = ((COMMON_INPUTS[input as usize] as u32 + 1) % 256) as u8; |
808 | 0 | if val > max { |
809 | 0 | 0 |
810 | | } else { |
811 | 0 | val |
812 | | } |
813 | 0 | } |
814 | | |
815 | | /// common_input translates a common input index stored in a serialized FST |
816 | | /// to the corresponding byte. |
817 | | #[inline] |
818 | 0 | fn common_input(idx: u8) -> Option<u8> { |
819 | 0 | if idx == 0 { |
820 | 0 | None |
821 | | } else { |
822 | 0 | Some(COMMON_INPUTS_INV[(idx - 1) as usize]) |
823 | | } |
824 | 0 | } |
825 | | |
826 | | #[inline] |
827 | 0 | fn pack_delta<W: io::Write>( |
828 | 0 | wtr: W, |
829 | 0 | node_addr: CompiledAddr, |
830 | 0 | trans_addr: CompiledAddr, |
831 | 0 | ) -> io::Result<u8> { |
832 | 0 | let nbytes = pack_delta_size(node_addr, trans_addr); |
833 | 0 | pack_delta_in(wtr, node_addr, trans_addr, nbytes)?; |
834 | 0 | Ok(nbytes) |
835 | 0 | } |
836 | | |
837 | | #[inline] |
838 | 0 | fn pack_delta_in<W: io::Write>( |
839 | 0 | wtr: W, |
840 | 0 | node_addr: CompiledAddr, |
841 | 0 | trans_addr: CompiledAddr, |
842 | 0 | nbytes: u8, |
843 | 0 | ) -> io::Result<()> { |
844 | 0 | let delta_addr = if trans_addr == EMPTY_ADDRESS { |
845 | 0 | EMPTY_ADDRESS |
846 | | } else { |
847 | 0 | node_addr - trans_addr |
848 | | }; |
849 | 0 | bytes::pack_uint_in(wtr, delta_addr as u64, nbytes) |
850 | 0 | } |
851 | | |
852 | | #[inline] |
853 | 0 | fn pack_delta_size(node_addr: CompiledAddr, trans_addr: CompiledAddr) -> u8 { |
854 | 0 | let delta_addr = if trans_addr == EMPTY_ADDRESS { |
855 | 0 | EMPTY_ADDRESS |
856 | | } else { |
857 | 0 | node_addr - trans_addr |
858 | | }; |
859 | 0 | bytes::pack_size(delta_addr as u64) |
860 | 0 | } |
861 | | |
862 | | #[inline] |
863 | 0 | fn unpack_delta( |
864 | 0 | slice: &[u8], |
865 | 0 | trans_pack_size: usize, |
866 | 0 | node_addr: usize, |
867 | 0 | ) -> CompiledAddr { |
868 | 0 | let delta = bytes::unpack_uint(slice, trans_pack_size as u8); |
869 | 0 | let delta_addr = u64_to_usize(delta); |
870 | 0 | if delta_addr == EMPTY_ADDRESS { |
871 | 0 | EMPTY_ADDRESS |
872 | | } else { |
873 | 0 | node_addr - delta_addr |
874 | | } |
875 | 0 | } |
876 | | |
877 | | #[cfg(test)] |
878 | | mod tests { |
879 | | use quickcheck::{quickcheck, TestResult}; |
880 | | |
881 | | use crate::raw::build::BuilderNode; |
882 | | use crate::raw::node::Node; |
883 | | use crate::raw::{Builder, CompiledAddr, Output, Transition, VERSION}; |
884 | | use crate::stream::Streamer; |
885 | | |
886 | | const NEVER_LAST: CompiledAddr = std::u64::MAX as CompiledAddr; |
887 | | |
888 | | #[test] |
889 | | fn prop_emits_inputs() { |
890 | | fn p(mut bs: Vec<Vec<u8>>) -> TestResult { |
891 | | bs.sort(); |
892 | | bs.dedup(); |
893 | | |
894 | | let mut bfst = Builder::memory(); |
895 | | for word in &bs { |
896 | | bfst.add(word).unwrap(); |
897 | | } |
898 | | let fst = bfst.into_fst(); |
899 | | let mut rdr = fst.stream(); |
900 | | let mut words = vec![]; |
901 | | while let Some(w) = rdr.next() { |
902 | | words.push(w.0.to_owned()); |
903 | | } |
904 | | TestResult::from_bool(bs == words) |
905 | | } |
906 | | quickcheck(p as fn(Vec<Vec<u8>>) -> TestResult) |
907 | | } |
908 | | |
909 | | fn nodes_equal(compiled: &Node, uncompiled: &BuilderNode) -> bool { |
910 | | println!("{:?}", compiled); |
911 | | assert_eq!(compiled.is_final(), uncompiled.is_final); |
912 | | assert_eq!(compiled.len(), uncompiled.trans.len()); |
913 | | assert_eq!(compiled.final_output(), uncompiled.final_output); |
914 | | for (ct, ut) in |
915 | | compiled.transitions().zip(uncompiled.trans.iter().cloned()) |
916 | | { |
917 | | assert_eq!(ct.inp, ut.inp); |
918 | | assert_eq!(ct.out, ut.out); |
919 | | assert_eq!(ct.addr, ut.addr); |
920 | | } |
921 | | true |
922 | | } |
923 | | |
924 | | fn compile(node: &BuilderNode) -> (CompiledAddr, Vec<u8>) { |
925 | | let mut buf = vec![0; 24]; |
926 | | node.compile_to(&mut buf, NEVER_LAST, 24).unwrap(); |
927 | | (buf.len() as CompiledAddr - 1, buf) |
928 | | } |
929 | | |
930 | | fn roundtrip(bnode: &BuilderNode) -> bool { |
931 | | let (addr, bytes) = compile(bnode); |
932 | | let node = Node::new(VERSION, addr, &bytes); |
933 | | nodes_equal(&node, &bnode) |
934 | | } |
935 | | |
936 | | fn trans(addr: CompiledAddr, inp: u8) -> Transition { |
937 | | Transition { inp, out: Output::zero(), addr } |
938 | | } |
939 | | |
940 | | #[test] |
941 | | fn bin_no_trans() { |
942 | | let bnode = BuilderNode { |
943 | | is_final: false, |
944 | | final_output: Output::zero(), |
945 | | trans: vec![], |
946 | | }; |
947 | | let (addr, buf) = compile(&bnode); |
948 | | let node = Node::new(VERSION, addr, &buf); |
949 | | assert_eq!(node.as_slice().len(), 3); |
950 | | roundtrip(&bnode); |
951 | | } |
952 | | |
953 | | #[test] |
954 | | fn bin_one_trans_common() { |
955 | | let bnode = BuilderNode { |
956 | | is_final: false, |
957 | | final_output: Output::zero(), |
958 | | trans: vec![trans(20, b'a')], |
959 | | }; |
960 | | let (addr, buf) = compile(&bnode); |
961 | | let node = Node::new(VERSION, addr, &buf); |
962 | | assert_eq!(node.as_slice().len(), 3); |
963 | | roundtrip(&bnode); |
964 | | } |
965 | | |
966 | | #[test] |
967 | | fn bin_one_trans_not_common() { |
968 | | let bnode = BuilderNode { |
969 | | is_final: false, |
970 | | final_output: Output::zero(), |
971 | | trans: vec![trans(2, b'\xff')], |
972 | | }; |
973 | | let (addr, buf) = compile(&bnode); |
974 | | let node = Node::new(VERSION, addr, &buf); |
975 | | assert_eq!(node.as_slice().len(), 4); |
976 | | roundtrip(&bnode); |
977 | | } |
978 | | |
979 | | #[test] |
980 | | fn bin_many_trans() { |
981 | | let bnode = BuilderNode { |
982 | | is_final: false, |
983 | | final_output: Output::zero(), |
984 | | trans: vec![ |
985 | | trans(2, b'a'), |
986 | | trans(3, b'b'), |
987 | | trans(4, b'c'), |
988 | | trans(5, b'd'), |
989 | | trans(6, b'e'), |
990 | | trans(7, b'f'), |
991 | | ], |
992 | | }; |
993 | | let (addr, buf) = compile(&bnode); |
994 | | let node = Node::new(VERSION, addr, &buf); |
995 | | assert_eq!(node.as_slice().len(), 14); |
996 | | roundtrip(&bnode); |
997 | | } |
998 | | |
999 | | #[test] |
1000 | | fn node_max_trans() { |
1001 | | let bnode = BuilderNode { |
1002 | | is_final: false, |
1003 | | final_output: Output::zero(), |
1004 | | trans: (0..256).map(|i| trans(0, i as u8)).collect(), |
1005 | | }; |
1006 | | let (addr, buf) = compile(&bnode); |
1007 | | let node = Node::new(VERSION, addr, &buf); |
1008 | | assert_eq!(node.transitions().count(), 256); |
1009 | | assert_eq!(node.len(), node.transitions().count()); |
1010 | | roundtrip(&bnode); |
1011 | | } |
1012 | | } |